1 /*
2  * Copyright © 2023 Advanced Micro Devices, Inc.
3  *
4  * SPDX-License-Identifier: MIT
5  */
6 
7 /* Introduction
8  * ============
9  *
10  * This pass optimizes varyings between 2 shaders, which means dead input/
11  * output removal, constant and uniform load propagation, deduplication,
12  * compaction, and inter-shader code motion. This is used during the shader
13  * linking process.
14  *
15  *
16  * Notes on behavior
17  * =================
18  *
19  * The pass operates on scalar varyings using 32-bit and 16-bit types. Vector
20  * varyings are not allowed.
21  *
22  * Indirectly-indexed varying slots (not vertices) are not optimized or
23  * compacted, but unused slots of indirectly-indexed varyings are still filled
24  * with directly-indexed varyings during compaction. Indirectly-indexed
25  * varyings are still removed if they are unused by the other shader.
26  *
27  * Indirectly-indexed vertices don't disallow optimizations, but compromises
28  * are made depending on how they are accessed. They are common in TCS, TES,
29  * and GS, so there is a desire to optimize them as much as possible. More on
30  * that in various sections below.
31  *
32  * Transform feedback doesn't prevent most optimizations such as constant
33  * propagation and compaction. Shaders can be left with output stores that set
34  * the no_varying flag, meaning the output is not consumed by the next shader,
35  * which means that optimizations did their job and now the output is only
36  * consumed by transform feedback.
37  *
38  * All legacy varying slots are optimized where allowed.
39  *
40  *
41  * Convergence property of shader outputs
42  * ======================================
43  *
44  * When an output stores an SSA that is convergent, all stores of that
45  * output appear in unconditional blocks or in conditional blocks with
46  * a convergent entry condition, and the shader is not GS, then all
47  * vertices of that output have the same value; therefore, the output can be
48  * promoted to flat because all interpolation modes lead to the same result
49  * as flat. Such outputs are opportunistically compacted with both flat and
50  * non-flat varyings based on whichever has unused slots in their vec4s. This
51  * pass refers to such inputs, outputs, and varyings as "convergent" (meaning
52  * all vertices are always equal).
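 *
 * For example, the following output is convergent, because the stored value
 * ("scale" is assumed to be a uniform here) can't vary between vertices of
 * the same primitive, so the matching FS input can be promoted to flat:
 * ```
 *    out_value = scale * 2.0;
 * ```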
53  *
54  * By default, flat varyings are the only ones that are not considered convergent
55  * because we want the flexibility to pack convergent varyings with both flat
56  * and non-flat varyings, and since flat varyings can contain integers and
57  * doubles, we can never interpolate them as FP32 or FP16. Optimizations start
58  * with separate interpolated, flat, and convergent groups of varyings, and
59  * they choose whether they want to promote convergent to interpolated or
60  * flat, or whether to leave that decision to the end when the compaction
61  * happens.
62  *
63  * The above default behavior doesn't apply when the hw supports convergent
64  * flat loads with interpolated vec4 slots. (there is a NIR option)
65  *
66  * TES patch inputs are always convergent because they are uniform within
67  * a primitive.
68  *
69  *
70  * Optimization steps
71  * ==================
72  *
73  * 1. Determine which varying slots can be optimized and how.
74  *
75  *    * When a varying is said to be "optimized" in the following text, it
76  *      means all optimizations are performed, such as removal, constant
77  *      propagation, and deduplication.
78  *    * All VARn, PATCHn, and FOGC varyings are always optimized and
79  *      compacted.
80  *    * PRIMITIVE_ID is treated as VARn in (GS, FS).
81  *    * TEXn are removed if they are dead (except TEXn inputs, which can't be
82  *      removed because they are affected by the coord replace state). TEXn
83  *      also can't be optimized or compacted because of the coord replace
84  *      state. TEXn not consumed by FS are treated as VARn.
85  *    * COLn and BFCn only propagate constants if they are between 0 and 1
86  *      because of the clamp vertex color state, and they are only
87  *      deduplicated and compacted among themselves because they are affected
88  *      by the flat shade, provoking vertex, two-side color selection, and
89  *      clamp vertex color states. COLn and BFCn not consumed by FS are
90  *      treated as VARn.
91  *    * All system value outputs like POS, PSIZ, CLIP_DISTn, etc. can't be
92  *      removed, but they are demoted to sysval-only outputs by setting
93  *      the "no_varying" flag (i.e. they can be removed as varyings), so
94  *      drivers should look at the "no_varying" flag. If an output is not
95  *      a sysval output in a specific stage, it's treated as VARn. (such as
96  *      POS in TCS)
97  *    * TESS_LEVEL_* inputs in TES can't be touched if TCS is missing.
98  *
99  * 2. Remove unused inputs and outputs
100  *
101  *    * Outputs not used in the next shader are removed.
102  *    * Inputs not initialized by the previous shader are replaced with undef
103  *      except:
104  *      * LAYER and VIEWPORT are replaced with 0 in FS.
105  *      * TEXn.xy is untouched because the coord replace state can set it, and
106  *        TEXn.zw is replaced by (0, 1), which is equal to the coord replace
107  *        value.
108  *    * Output loads that have no output stores anywhere in the shader are
109  *      replaced with undef. (for TCS, though it works with any shader)
110  *    * Output stores with transform feedback are preserved, but get
111  *      the "no_varying" flag, meaning they are not consumed by the next
112  *      shader stage. Later, transform-feedback-only varyings are compacted
113  *      (relocated) such that they are always last.
114  *    * TCS outputs that are read by TCS, but not used by TES get
115  *      the "no_varying" flag to indicate that they are only read by TCS and
116  *      not consumed by TES. Later, such TCS outputs are compacted (relocated)
117  *      such that they are always last to keep all outputs consumed by TES
118  *      consecutive without holes.
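 *
 *    For example (hypothetical VS -> FS pair):
 *    ```
 *       VS:  out_a = ...;      // never read by FS -> the store is removed
 *            out_b = ...;      // read by FS -> kept
 *       FS:  x = load(in_c);   // never written by VS -> replaced with undef
 *    ```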
119  *
120  * 3. Constant, uniform, UBO load, and uniform expression propagation
121  *
122  *    * Define "uniform expressions" as ALU expressions only sourcing
123  *      constants, uniforms, and UBO loads.
124  *    * Constants, uniforms, UBO loads, and uniform expressions stored
125  *      in outputs are moved into the next shader, and the outputs are removed.
126  *    * The same propagation is done from output stores to output loads.
127  *      (for TCS, though it works with any shader)
128  *    * If there are multiple stores to the same output, all such stores
129  *      should store the same constant, uniform, UBO load, or uniform
130  *      expression for the expression to be propagated. If an output has
131  *      multiple vertices, all vertices should store the same expression.
132  *    * nir->options has callbacks that are used to estimate the cost of
133  *      uniform expressions that drivers can set to control the complexity of
134  *      uniform expressions that are propagated. This is to ensure that
135  *      we don't increase the GPU overhead measurably by moving code across
136  *      pipeline stages that amplify GPU work.
137  *    * Special cases:
138  *      * Constant COLn and BFCn are propagated only if the constants are
139  *        in the [0, 1] range because of the clamp vertex color state.
140  *        If both COLn and BFCn are written, they must write the same
141  *        constant. If BFCn is written but not COLn, the constant is
142  *        propagated from BFCn to COLn.
143  *      * TEX.xy is untouched because of the coord replace state.
144  *        If TEX.zw is (0, 1), only those constants are propagated because
145  *        they match the coord replace values.
146  *      * CLIP_DISTn, LAYER and VIEWPORT are always propagated.
147  *        Eliminated output stores get the "no_varying" flag if they are also
148  *        xfb stores or write sysval outputs.
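 *
 *    For example ("scale" is assumed to be a uniform; the propagation is
 *    subject to the cost callbacks mentioned above):
 *    ```
 *       VS:  out_a = scale * 0.5;   // uniform expression -> output removed
 *       FS:  x = load(in_a);        // replaced by scale * 0.5 computed in FS
 *    ```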
149  *
150  * 4. Remove duplicated output components
151  *
152  *    * By comparing SSA defs.
153  *    * If there are multiple stores to the same output, all such stores
154  *      should store the same SSA as all stores of another output for
155  *      the output to be considered duplicated. If an output has multiple
156  *      vertices, all vertices should store the same SSA.
157  *    * Deduplication can only be done between outputs of the same category.
158  *      Those are: interpolated, patch, flat, interpolated color, flat color,
159  *                 and conditionally interpolated color based on the flat
160  *                 shade state
161  *    * Everything is deduplicated except TEXn due to the coord replace state.
162  *    * Eliminated output stores get the "no_varying" flag if they are also
163  *      xfb stores or write sysval outputs.
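 *
 *    For example:
 *    ```
 *       VS:  out_a = value;
 *            out_b = value;      // duplicate of out_a -> the store is removed
 *       FS:  y = load(in_b);     // rewritten to load in_a instead
 *    ```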
164  *
165  * 5. Backward inter-shader code motion
166  *
167  *    "Backward" refers to moving code against the direction in which shaders
168  *    are executed, i.e. moving code from the consumer to the producer.
169  *
170  *    Fragment shader example:
171  *    ```
172  *       result = input0 * uniform + input1 * constant + UBO.variable;
173  *    ```
174  *
175  *    The computation of "result" in the above example can be moved into
176  *    the previous shader and both inputs can be replaced with a new input
177  *    holding the value of "result", thus making the shader smaller and
178  *    possibly reducing the number of inputs, uniforms, and UBOs by 1.
179  *
180  *    Such code motion can be performed for any expression sourcing only
181  *    inputs, constants, and uniforms except for fragment shaders, which can
182  *    also do it but with the following limitations:
183  *    * Only the following transformations, and any composition of them (such
184  *      as lerp, shown after this list), can be performed with interpolated
185  *      inputs; all of them can be proven mathematically:
186  *      * interp(x, i, j) + interp(y, i, j) = interp(x + y, i, j)
187  *      * interp(x, i, j) + convergent_expr = interp(x + convergent_expr, i, j)
188  *      * interp(x, i, j) * convergent_expr = interp(x * convergent_expr, i, j)
189  *        * all of these transformations are considered "inexact" in NIR
190  *        * interp interpolates an input according to the barycentric
191  *          coordinates (i, j), which are different for perspective,
192  *          noperspective, center, centroid, sample, at_offset, and at_sample
193  *          modes.
194  *        * convergent_expr is any expression sourcing only constants,
195  *          uniforms, and convergent inputs. The only requirement on
196  *          convergent_expr is that it doesn't vary between vertices of
197  *          the same primitive, but it can vary between primitives.
198  *    * If inputs are flat or convergent, there are no limitations on
199  *      expressions that can be moved.
200  *    * Interpolated and flat inputs can't mix in the same expression, but
201  *      convergent inputs can mix with both.
202  *    * The interpolation qualifier of the new input is inherited from
203  *      the removed non-convergent inputs that should all have the same (i, j).
204  *      If there are no non-convergent inputs, then the new input is declared
205  *      as flat (for simplicity; we can't choose the barycentric coordinates
206  *      at random because AMD doesn't like when there are multiple sets of
207  *      barycentric coordinates in the same shader unnecessarily).
208  *    * Inf values break code motion across interpolation. See the section
209  *      discussing how we handle it near the end.
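 *
 *    As an example of composing the allowed transformations, a lerp with
 *    a convergent factor "t" can be moved across interpolation, because
 *    (1 - t) is itself a convergent expression:
 *    ```
 *       lerp(interp(x, i, j), interp(y, i, j), t)
 *          = interp(x, i, j) * (1 - t) + interp(y, i, j) * t
 *          = interp(x * (1 - t), i, j) + interp(y * t, i, j)
 *          = interp(x * (1 - t) + y * t, i, j)
 *          = interp(lerp(x, y, t), i, j)
 *    ```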
210  *
211  *    The above rules also apply to open-coded TES input interpolation, which
212  *    is handled the same as FS input interpolation. The only differences are:
213  *    * Open-coded TES input interpolation must match one of the allowed
214  *      equations. Different interpolation equations are treated the same as
215  *      different interpolation qualifiers in FS.
216  *    * Patch varyings are always treated as convergent.
217  *
218  *    Prerequisites:
219  *    * We need a post-dominator tree that is constructed from a graph where
220  *      vertices are instructions and directed edges going into them are
221  *      the values of their source operands. This is different from how NIR
222  *      dominance works, which represents all instructions within a basic
223  *      block as a linear chain of vertices in the graph.
224  *      In our graph, all loads without source operands and all constants are
225  *      entry nodes in the graph, and all stores and discards are exit nodes
226  *      in the graph. Each shader can have multiple disjoint graphs where
227  *      the Lowest Common Ancestor of 2 instructions doesn't exist.
228  *    * Given the above definition, the instruction whose result is the best
229  *      candidate for a new input is the farthest instruction that
230  *      post-dominates one or more inputs and is movable between shaders.
231  *
232  *    Algorithm Idea Part 1: Search
233  *    * Pick any input load that is hypothetically movable and call it
234  *      the iterator.
235  *    * Get the immediate post-dominator of the iterator, and if it's movable,
236  *      replace the iterator with it.
237  *    * Repeat the previous step until the obtained immediate post-dominator
238  *      is not movable.
239  *    * The iterator now contains the farthest post-dominator that is movable.
240  *    * Gather all input loads that the post-dominator consumes.
241  *    * For each of those input loads, all matching output stores must be
242  *      in the same block (because they will be replaced by a single store).
243  *
244  *    Algorithm Idea Part 2: Code Motion
245  *    * Clone the post-dominator in the producer except input loads, which
246  *      should be replaced by stored output values. Uniform and UBO loads,
247  *      if any, should be cloned too.
248  *    * Remove the original output stores.
249  *    * Replace the post-dominator from the consumer with a new input load.
250  *    * The step above makes the post-dominated input load that we picked
251  *      at the beginning dead, but other input loads used by the post-
252  *      dominator might still have other uses (shown in the example below).
253  *
254  *    Example SSA-use graph - initial shader and the result:
255  *    ```
256  *          input0 input1             input0 input1
257  *              \   / \                  |      \
258  *    constant   alu  ...    ======>     |     ...
259  *           \   /
260  *            alu
261  *      (post-dominator)
262  *    ```
263  *
264  *    Description:
265  *       On the right, the algorithm moved the constant and both ALU opcodes
266  *       into the previous shader and input0 now contains the value of
267  *       the post-dominator. input1 stays the same because it still has one
268  *       use left. If input1 hadn't had the other use, it would have been
269  *       removed.
270  *
271  *    If the algorithm moves any code, the algorithm is repeated until there
272  *    is no code that it can move.
273  *
274  *    Which shader pairs are supported:
275  *    * (VS, FS), (TES, FS): yes, fully
276  *      * Limitation: If Infs must be preserved, no code is moved across
277  *                    interpolation, so only flat varyings are optimized.
278  *    * (VS, TCS), (VS, GS), (TES, GS): no, but possible -- TODO
279  *      * Current behavior:
280  *        * Per-vertex inputs are rejected.
281  *      * Possible solution:
282  *        * All input loads used by an accepted post-dominator must use
283  *          the same vertex index. The post-dominator must use all loads with
284  *          that vertex index.
285  *        * If a post-dominator is found for an input load from a specific
286  *          slot, all other input loads from that slot must also have
287  *          an accepted post-dominator, and all such post-dominators should
288  *          be identical expressions.
289  *    * (TCS, TES), (VS, TES): yes, with limitations
290  *      * Limitations:
291  *        * Only 1 store and 1 load per slot allowed.
292  *        * No output loads allowed.
293  *        * All stores used by an accepted post-dominator must be in
294  *          the same block.
295  *        * TCS barriers don't matter because there are no output loads.
296  *        * Patch varyings are handled trivially with the above constraints.
297  *        * Per-vertex outputs should only be indexed by gl_InvocationID.
298  *        * An interpolated TES load is any ALU instruction that computes
299  *          the result of linear interpolation of per-vertex inputs from
300  *          the same slot using gl_TessCoord. If such an ALU instruction is
301  *          found, it must be the only one, and all per-vertex input loads
302  *          from that slot must feed into it. The interpolation equation must
303  *          be equal to one of the allowed equations. Then the same rules as
304  *          for interpolated FS inputs are used, treating different
305  *          interpolation equations just like different interpolation
306  *          qualifiers.
307  *        * Patch inputs are treated as convergent, which means they are
308  *          allowed to be in the same movable expression as interpolated TES
309  *          inputs, and the same rules as for convergent FS inputs apply.
310  *    * (GS, FS), (MS, FS): no
311  *      * Workaround: Add a passthrough VS between GS/MS and FS, run
312  *                    the pass on the (VS, FS) pair to move code out of FS,
313  *                    and inline that VS at the end of your hw-specific
314  *                    GS/MS if it's possible.
315  *    * (TS, MS): no
316  *
317  *    The disadvantage of using the post-dominator tree is that it's a tree,
318  *    which means there is only 1 post-dominator of each input. This example
319  *    shows a case that could be optimized by replacing 3 inputs with 2 inputs,
320  *    reducing the number of inputs by 1, but the immediate post-dominator of
321  *    all input loads is NULL:
322  *    ```
323  *        temp0 = input0 + input1 + input2;
324  *        temp1 = input0 + input1 * const1 + input2 * const2;
325  *    ```
326  *
327  *    If there is a graph algorithm that returns the best solution to
328  *    the above case (which is temp0 and temp1 to replace all 3 inputs), let
329  *    us know.
330  *
331  * 6. Forward inter-shader code motion
332  *
333  *    TODO: Not implemented. The text below is a draft of the description.
334  *
335  *    "Forward" refers to moving code in the direction that shaders are
336  *    executed, i.e. moving code from the producer to the consumer.
337  *
338  *    Vertex shader example:
339  *    ```
340  *       output0 = value + 1;
341  *       output1 = value * 2;
342  *    ```
343  *
344  *    Both outputs can be replaced by 1 output storing "value", and both ALU
345  *    operations can be moved into the next shader.
346  *
347  *    The same dominance algorithm as in the previous optimization is used,
348  *    except that:
349  *    * Instead of inputs, we use outputs.
350  *    * Instead of a post-dominator tree, we use a dominator tree of the exact
351  *      same graph.
352  *
353  *    The algorithm idea is: For each pair of output stores, find their
354  *    Lowest Common Ancestor in the dominator tree, and that's a candidate
355  *    for a new output. All movable loads like load_const should be removed
356  *    from the graph, otherwise the LCA wouldn't exist.
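 *
 *    In the vertex shader example above, after removing the movable
 *    load_const operands from the graph, the LCA of the two output stores is
 *    "value", so "value" becomes the new output and both ALU instructions can
 *    be moved into the next shader.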
357  *
358  *    The limitations on instructions that can be moved between shaders across
359  *    interpolated loads are exactly the same as the previous optimization.
360  *
361  *    nir->options has callbacks that are used to estimate the cost of
362  *    expressions that drivers can set to control the complexity of
363  *    expressions that can be moved to later shaders. This is to ensure that
364  *    we don't increase the GPU overhead measurably by moving code across
365  *    pipeline stages that amplify GPU work.
366  *
367  * 7. Compaction to vec4 slots (AKA packing)
368  *
369  *    First, varyings are divided into these groups, and components from each
370  *    group are assigned locations in this order (effectively forcing
371  *    components from the same group to be in the same vec4 slot or adjacent
372  *    vec4 slots) with some exceptions listed below:
373  *
374  *    Non-FS groups (patch and non-patch are packed separately):
375  *    * 32-bit cross-invocation (TCS inputs using cross-invocation access)
376  *    * 16-bit cross-invocation (TCS inputs using cross-invocation access)
377  *    * 32-bit flat
378  *    * 16-bit flat
379  *    * 32-bit no-varying (TCS outputs read by TCS but not TES)
380  *    * 16-bit no-varying (TCS outputs read by TCS but not TES)
381  *
382  *    FS groups:
383  *    * 32-bit interpolated (always FP32)
384  *    * 32-bit flat
385  *    * 32-bit convergent (always FP32)
386  *    * 16-bit interpolated (always FP16)
387  *    * 16-bit flat
388  *    * 16-bit convergent (always FP16)
389  *    * 32-bit transform feedback only
390  *    * 16-bit transform feedback only
391  *
392  *    When the driver/hw can't mix different interpolation qualifiers
393  *    in the same vec4, the interpolated groups are further split into 6
394  *    groups, one for each qualifier.
395  *
396  *    Then, all scalar varyings are relocated into new slots, starting from
397  *    VAR0.x and increasing the scalar slot offset in 32-bit or 16-bit
398  *    increments. Rules:
399  *    * Both 32-bit and 16-bit flat varyings are packed in the same vec4.
400  *    * Convergent varyings can be packed with interpolated varyings of
401  *      the same type or flat. The group to pack with is chosen based on
402  *      whichever has unused scalar slots because we want to reduce the total
403  *      number of vec4s. After filling all unused scalar slots, the remaining
404  *      convergent varyings are packed as flat.
405  *    * Transform-feedback-only slots and no-varying slots are packed last,
406  *      so that they are consecutive and not intermixed with varyings consumed
407  *      by the next shader stage, and 32-bit and 16-bit slots are packed in
408  *      the same vec4. This allows reducing memory for outputs by ignoring
409  *      the trailing outputs that the next shader stage doesn't read.
410  *
411  *    In the end, we should end up with these groups for FS:
412  *    * 32-bit interpolated (always FP32) on separate vec4s
413  *    * 16-bit interpolated (always FP16) on separate vec4s
414  *    * 32-bit flat and 16-bit flat, mixed in the same vec4
415  *    * 32-bit and 16-bit transform feedback only, sharing vec4s with flat
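 *
 *    For example, given 5 interpolated FP32 scalars, 2 convergent FP32
 *    scalars, 2 flat 32-bit scalars, and 1 flat 16-bit scalar, one possible
 *    result of compaction is:
 *    ```
 *       VAR0.xyzw   = 4 interpolated FP32 scalars
 *       VAR1.x      = the 5th interpolated FP32 scalar
 *       VAR1.yz     = the 2 convergent scalars (interpolated as FP32)
 *       VAR2.xy     = the 2 flat 32-bit scalars
 *       VAR2.z (lo) = the flat 16-bit scalar
 *    ```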
416  *
417  *    Colors are compacted the same but separately because they can't be mixed
418  *    with VARn. Colors are divided into 3 FS groups. They are:
419  *    * 32-bit maybe-interpolated (affected by the flat-shade state)
420  *    * 32-bit interpolated (not affected by the flat-shade state)
421  *    * 32-bit flat (not affected by the flat-shade state)
422  *
423  *    To facilitate driver-specific output merging, color channels are
424  *    assigned in a rotated order depending on which VARn channel is the first
425  *    unused one. For example, if the first unused VARn channel is VAR0.z,
426  *    color channels are allocated in this order:
427  *       COL0.z, COL0.w, COL0.x, COL0.y, COL1.z, COL1.w, COL1.x, COL1.y
428  *    The reason is that some drivers merge outputs if each output sets
429  *    different components, for example 2 outputs defining VAR0.xy and COL0.z.
430  *    If drivers do interpolation in the fragment shader and color
431  *    interpolation can differ for each component, VAR0.xy and COL0.z can be
432  *    stored in the same output storage slot, and the consumer can load VAR0
433  *    and COL0 from the same slot.
434  *
435  *    If COLn, BFCn, and TEXn are transform-feedback-only, they are moved to
436  *    VARn. PRIMITIVE_ID in (GS, FS) and FOGC in (xx, FS) are always moved to
437  *    VARn for better packing.
438  *
439  *
440  * Issue: Interpolation converts Infs to NaNs
441  * ==========================================
442  *
443  * Interpolation converts Infs to NaNs, i.e. interp(Inf, i, j) = NaN, which
444  * impacts and limits backward inter-shader code motion, uniform expression
445  * propagation, and compaction.
446  *
447  * When we decide not to interpolate a varying, we need to convert Infs to
448  * NaNs manually. Infs can be converted to NaNs like this: x*0 + x
449  * (suggested by Ian Romanick, the multiplication must be "exact")
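 *
 * This works because, in IEEE arithmetic, x*0 + x = x for any finite x, while
 * for x = +-Inf we get Inf*0 = NaN and NaN + Inf = NaN, and for x = NaN
 * the result stays NaN. The expression is therefore an identity on finite
 * values and maps Infs (and NaNs) to NaNs.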
450  *
451  * Changes to optimizations:
452  * - When we propagate a uniform expression and NaNs must be preserved,
453  *   convert Infs in the result to NaNs using "x*0 + x" in the consumer.
454  * - When we change interpolation to flat for convergent varyings and NaNs
455  *   must be preserved, apply "x*0 + x" to the stored output value
456  *   in the producer.
457  * - There is no solution for backward inter-shader code motion with
458  *   interpolation if Infs must be preserved. As an alternative, we can allow
459  *   code motion across interpolation only for specific shader hashes in
460  *   can_move_alu_across_interp. We can use shader-db to automatically produce
461  *   a list of shader hashes that benefit from this optimization.
462  *
463  *
464  * Usage
465  * =====
466  *
467  * Requirements:
468  * - ALUs should be scalarized
469  * - Dot products and other vector opcodes should be lowered (recommended)
470  * - Input loads and output stores should be scalarized
471  * - 64-bit varyings should be lowered to 32 bits
472  * - nir_vertex_divergence_analysis must be called on the producer if
473  *   the consumer is a fragment shader
474  *
475  * It's recommended to first run this for all shader pairs in order from
476  * the first shader to the last (to propagate constants etc.). If optimizing
477  * the (S1, S2) pair leads to changes in S1, remember the highest such S1.
478  * Then re-run this for all shader pairs in descending order from S1 to VS.
479  *
480  * NIR optimizations should be performed after every run that changes the IR.
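 *
 * A sketch of that order in pseudocode ("link_pair" is a hypothetical helper
 * that runs this pass on a (producer, consumer) pair and then runs NIR
 * optimizations on whatever changed):
 * ```
 *    // forward: from the first stage to the last
 *    highest_changed = -1;
 *    for (s = first; s < last; s++) {
 *       if (link_pair(shader[s], shader[s + 1]) changed shader[s])
 *          highest_changed = s;
 *    }
 *    // backward: in descending order from the highest changed stage to VS
 *    for (s = highest_changed; s > first; s--)
 *       link_pair(shader[s - 1], shader[s]);
 * ```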
481  *
482  *
483  * Analyzing the optimization potential of linking separate shaders
484  * ================================================================
485  *
486  * We can use this pass in an analysis pass that decides whether a separate
487  * shader has the potential to benefit from full draw-time linking. The way
488  * it would work is that we would create a passthrough shader adjacent to
489  * the separate shader, run this pass on both shaders, and check if the number
490  * of varyings decreased. This way we can decide to perform the draw-time
491  * linking only if we are confident that it would help performance.
492  *
493  * TODO: not implemented, mention the pass that implements it
494  */
495 
496 #include "nir.h"
497 #include "nir_builder.h"
498 #include "util/hash_table.h"
499 #include "util/u_math.h"
500 #include "util/u_memory.h"
501 
502 /* nir_opt_varyings works at scalar 16-bit granularity across all varyings.
503  *
504  * Slots (i % 8 == 0,2,4,6) are 32-bit channels or low bits of 16-bit channels.
505  * Slots (i % 8 == 1,3,5,7) are high bits of 16-bit channels. 32-bit channels
506  * don't set these slots as used in bitmasks.
507  */
508 #define NUM_SCALAR_SLOTS  (NUM_TOTAL_VARYING_SLOTS * 8)
509 
510 /* Fragment shader input slots can be packed with indirectly-indexed vec4
511  * slots if there are unused components, but only if the vec4 slot has
512  * the same interpolation type. There are only 3 types: FLAT, FP32, FP16.
513  */
514 enum fs_vec4_type {
515    FS_VEC4_TYPE_NONE = 0,
516    FS_VEC4_TYPE_FLAT,
517    FS_VEC4_TYPE_INTERP_EXPLICIT,
518    FS_VEC4_TYPE_INTERP_EXPLICIT_STRICT,
519    FS_VEC4_TYPE_PER_PRIMITIVE,
520    /* When nir_io_has_flexible_input_interpolation_except_flat is set: */
521    FS_VEC4_TYPE_INTERP_FP32,
522    FS_VEC4_TYPE_INTERP_FP16,
523    FS_VEC4_TYPE_INTERP_COLOR, /* only for glShadeModel, i.e. INTERP_MODE_NONE */
524    /* When nir_io_has_flexible_input_interpolation_except_flat is not set: */
525    FS_VEC4_TYPE_INTERP_FP32_PERSP_PIXEL,
526    FS_VEC4_TYPE_INTERP_FP32_PERSP_CENTROID,
527    FS_VEC4_TYPE_INTERP_FP32_PERSP_SAMPLE,
528    FS_VEC4_TYPE_INTERP_FP32_LINEAR_PIXEL,
529    FS_VEC4_TYPE_INTERP_FP32_LINEAR_CENTROID,
530    FS_VEC4_TYPE_INTERP_FP32_LINEAR_SAMPLE,
531    FS_VEC4_TYPE_INTERP_FP16_PERSP_PIXEL,
532    FS_VEC4_TYPE_INTERP_FP16_PERSP_CENTROID,
533    FS_VEC4_TYPE_INTERP_FP16_PERSP_SAMPLE,
534    FS_VEC4_TYPE_INTERP_FP16_LINEAR_PIXEL,
535    FS_VEC4_TYPE_INTERP_FP16_LINEAR_CENTROID,
536    FS_VEC4_TYPE_INTERP_FP16_LINEAR_SAMPLE,
537    FS_VEC4_TYPE_INTERP_COLOR_PIXEL,    /* only for glShadeModel, i.e. INTERP_MODE_NONE */
538    FS_VEC4_TYPE_INTERP_COLOR_CENTROID, /* same */
539    FS_VEC4_TYPE_INTERP_COLOR_SAMPLE,   /* same */
540 };
541 
542 enum {
543    PERSP_PIXEL,
544    PERSP_CENTROID,
545    PERSP_SAMPLE,
546    LINEAR_PIXEL,
547    LINEAR_CENTROID,
548    LINEAR_SAMPLE,
549    NUM_INTERP_QUALIFIERS,
550 };
551 
552 enum {
553    COLOR_PIXEL,
554    COLOR_CENTROID,
555    COLOR_SAMPLE,
556    NUM_COLOR_QUALIFIERS,
557 };
558 
559 #if PRINT_RELOCATE_SLOT
560 static const char *fs_vec4_type_strings[] = {
561    "NONE",
562    "FLAT",
563    "INTERP_EXPLICIT",
564    "INTERP_EXPLICIT_STRICT",
565    "PER_PRIMITIVE",
566    "INTERP_FP32",
567    "INTERP_FP16",
568    "INTERP_COLOR",
569    "INTERP_FP32_PERSP_PIXEL",
570    "INTERP_FP32_PERSP_CENTROID",
571    "INTERP_FP32_PERSP_SAMPLE",
572    "INTERP_FP32_LINEAR_PIXEL",
573    "INTERP_FP32_LINEAR_CENTROID",
574    "INTERP_FP32_LINEAR_SAMPLE",
575    "INTERP_FP16_PERSP_PIXEL",
576    "INTERP_FP16_PERSP_CENTROID",
577    "INTERP_FP16_PERSP_SAMPLE",
578    "INTERP_FP16_LINEAR_PIXEL",
579    "INTERP_FP16_LINEAR_CENTROID",
580    "INTERP_FP16_LINEAR_SAMPLE",
581    "INTERP_COLOR_PIXEL",
582    "INTERP_COLOR_CENTROID",
583    "INTERP_COLOR_SAMPLE",
584 };
585 #endif // PRINT_RELOCATE_SLOT
586 
587 typedef BITSET_WORD INTERP_QUAL_BITSET[NUM_INTERP_QUALIFIERS][BITSET_WORDS(NUM_SCALAR_SLOTS)];
588 typedef BITSET_WORD COLOR_QUAL_BITSET[NUM_COLOR_QUALIFIERS][BITSET_WORDS(NUM_SCALAR_SLOTS)];
589 
590 static unsigned
591 get_scalar_16bit_slot(nir_io_semantics sem, unsigned component)
592 {
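   /* e.g. the high 16-bit half of component .z (component == 2) of vec4 slot
    * "sem.location" maps to scalar slot sem.location * 8 + 2*2 + 1.
    */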
593    return sem.location * 8 + component * 2 + sem.high_16bits;
594 }
595 
596 static unsigned
597 intr_get_scalar_16bit_slot(nir_intrinsic_instr *intr)
598 {
599     return get_scalar_16bit_slot(nir_intrinsic_io_semantics(intr),
600                                  nir_intrinsic_component(intr));
601 }
602 
603 static unsigned
604 vec4_slot(unsigned scalar_slot)
605 {
606    return scalar_slot / 8;
607 }
608 
609 struct list_node {
610    struct list_head head;
611    nir_intrinsic_instr *instr;
612 };
613 
614 /* Information about 1 scalar varying slot for both shader stages. */
615 struct scalar_slot {
616    struct {
617       /* Linked list of all store instructions writing into the scalar slot
618        * in the producer.
619        */
620       struct list_head stores;
621 
622       /* Only for TCS: Linked list of all load instructions reading the scalar
623        * slot in the producer.
624        */
625       struct list_head loads;
626 
627       /* If there is only one store instruction or if all store instructions
628        * store the same value in the producer, this is the instruction
629        * computing the stored value. Used by constant and uniform propagation
630        * to the next shader.
631        */
632       nir_instr *value;
633    } producer;
634 
635    struct {
636       /* Linked list of all load instructions loading from the scalar slot
637        * in the consumer.
638        */
639       struct list_head loads;
640 
641       /* The result of TES input interpolation. */
642       nir_alu_instr *tes_interp_load;
643       unsigned tes_interp_mode;  /* FLAG_INTERP_TES_* */
644       nir_def *tes_load_tess_coord;
645    } consumer;
646 
647    /* The number of accessed slots if this slot has indirect indexing. */
648    unsigned num_slots;
649 };
650 
651 struct linkage_info {
652    struct scalar_slot slot[NUM_SCALAR_SLOTS];
653 
654    bool spirv;
655    bool can_move_uniforms;
656    bool can_move_ubos;
657    bool can_mix_convergent_flat_with_interpolated;
658    bool has_flexible_interp;
659    bool always_interpolate_convergent_fs_inputs;
660 
661    gl_shader_stage producer_stage;
662    gl_shader_stage consumer_stage;
663    nir_builder producer_builder;
664    nir_builder consumer_builder;
665    unsigned max_varying_expression_cost;
666    unsigned (*varying_estimate_instr_cost)(struct nir_instr *instr);
667 
668    /* Memory context for linear_alloc_child (fast allocation). */
669    void *linear_mem_ctx;
670 
671    /* Hash table for efficiently cloning instructions between shaders. */
672    struct hash_table *clones_ht;
673 
674    /* If any component of a vec4 slot is accessed indirectly, this is its
675     * FS vec4 qualifier type, which is either FLAT, FP32, or FP16.
676     * Components with different qualifier types can't be compacted
677     * in the same vec4.
678     */
679    uint8_t fs_vec4_type[NUM_TOTAL_VARYING_SLOTS];
680 
681    /* Mask of all varyings that can be removed. Only a few non-VARn non-PATCHn
682     * varyings can't be removed.
683     */
684    BITSET_DECLARE(removable_mask, NUM_SCALAR_SLOTS);
685 
686    /* Mask of all slots that have transform feedback info. */
687    BITSET_DECLARE(xfb_mask, NUM_SCALAR_SLOTS);
688 
689    /* Mask of all slots that have transform feedback info, but are not used
690     * by the next shader. Separate masks for 32-bit and 16-bit outputs.
691     */
692    BITSET_DECLARE(xfb32_only_mask, NUM_SCALAR_SLOTS);
693    BITSET_DECLARE(xfb16_only_mask, NUM_SCALAR_SLOTS);
694 
695    /* Mask of all TCS inputs using cross-invocation access. */
696    BITSET_DECLARE(tcs_cross_invoc32_mask, NUM_SCALAR_SLOTS);
697    BITSET_DECLARE(tcs_cross_invoc16_mask, NUM_SCALAR_SLOTS);
698 
699    /* Mask of all TCS->TES slots that are read by TCS, but not TES. */
700    BITSET_DECLARE(no_varying32_mask, NUM_SCALAR_SLOTS);
701    BITSET_DECLARE(no_varying16_mask, NUM_SCALAR_SLOTS);
702 
703    /* Mask of all slots accessed with indirect indexing. */
704    BITSET_DECLARE(indirect_mask, NUM_SCALAR_SLOTS);
705 
706    /* The following masks only contain slots that can be compacted and
707     * describe the groups in which they should be compacted. Non-fragment
708     * shaders only use the flat bitmasks.
709     *
710     * Some legacy varyings are excluded when they can't be compacted due to
711     * being affected by pipeline states (like coord replace). That only
712     * applies to xx->FS shader pairs. Other shader pairs get all legacy
713     * varyings compacted and relocated to VARn.
714     *
715     * Indirectly-indexed varyings are also excluded because they are not
716     * compacted.
717     */
718    BITSET_DECLARE(interp_fp32_mask, NUM_SCALAR_SLOTS);
719    BITSET_DECLARE(interp_fp16_mask, NUM_SCALAR_SLOTS);
720    BITSET_DECLARE(flat32_mask, NUM_SCALAR_SLOTS);
721    BITSET_DECLARE(flat16_mask, NUM_SCALAR_SLOTS);
722    BITSET_DECLARE(interp_explicit32_mask, NUM_SCALAR_SLOTS);
723    BITSET_DECLARE(interp_explicit16_mask, NUM_SCALAR_SLOTS);
724    BITSET_DECLARE(interp_explicit_strict32_mask, NUM_SCALAR_SLOTS);
725    BITSET_DECLARE(interp_explicit_strict16_mask, NUM_SCALAR_SLOTS);
726    BITSET_DECLARE(per_primitive32_mask, NUM_SCALAR_SLOTS);
727    BITSET_DECLARE(per_primitive16_mask, NUM_SCALAR_SLOTS);
728 
729    /* Color interpolation unqualified (follows the flat-shade state). */
730    BITSET_DECLARE(color32_mask, NUM_SCALAR_SLOTS);
731 
732    /* A separate bitmask for each qualifier when
733     * nir_io_has_flexible_input_interpolation_except_flat is not set.
734     */
735    INTERP_QUAL_BITSET interp_fp32_qual_masks;
736    INTERP_QUAL_BITSET interp_fp16_qual_masks;
737    COLOR_QUAL_BITSET color32_qual_masks;
738 
739    /* Mask of output components that have only one store instruction, or if
740     * they have multiple store instructions, all those instructions store
741     * the same value. If the output has multiple vertices, all vertices store
742     * the same value. This is a useful property for:
743     * - constant and uniform propagation to the next shader
744     * - deduplicating outputs
745     */
746    BITSET_DECLARE(output_equal_mask, NUM_SCALAR_SLOTS);
747 
748    /* Mask of output components that store values that are convergent,
749     * i.e. all values stored into the outputs are equal within a primitive.
750     *
751     * This is different from output_equal_mask, which says that all stores
752     * to the same slot in the same thread are equal, while this says that
753     * each store to the same slot can be different, but it always stores
754     * a convergent value, which means the stored value is equal among all
755     * threads within a primitive.
756     *
757     * The advantage is that these varyings can always be promoted to flat
758     * regardless of the original interpolation mode, and they can always be
759     * compacted with both interpolated and flat varyings.
760     */
761    BITSET_DECLARE(convergent32_mask, NUM_SCALAR_SLOTS);
762    BITSET_DECLARE(convergent16_mask, NUM_SCALAR_SLOTS);
763 };
764 
765 /******************************************************************
766  * HELPERS
767  ******************************************************************/
768 
769 /* Return whether the low or high 16-bit half of the 32-bit slot is set. */
770 #define BITSET_TEST32(m, b) \
771    (BITSET_TEST(m, (b) & ~0x1) || BITSET_TEST(m, ((b) & ~0x1) + 1))
772 
773 #define BITSET3_TEST_ANY(bitsets, b) (BITSET_TEST((bitsets)[0], (b)) || \
774                                       BITSET_TEST((bitsets)[1], (b)) || \
775                                       BITSET_TEST((bitsets)[2], (b)))
776 #define BITSET6_TEST_ANY(bitsets, b) (BITSET3_TEST_ANY((bitsets), (b)) || \
777                                       BITSET3_TEST_ANY(&(bitsets)[3], (b)))
778 
779 static void
780 print_linkage(struct linkage_info *linkage)
781 {
782    printf("Linkage: %s -> %s\n",
783           _mesa_shader_stage_to_abbrev(linkage->producer_stage),
784           _mesa_shader_stage_to_abbrev(linkage->consumer_stage));
785 
786    for (unsigned i = 0; i < NUM_SCALAR_SLOTS; i++) {
787       struct scalar_slot *slot = &linkage->slot[i];
788 
789       if (!slot->num_slots &&
790           list_is_empty(&slot->producer.stores) &&
791           list_is_empty(&slot->producer.loads) &&
792           list_is_empty(&slot->consumer.loads) &&
793           !BITSET_TEST(linkage->removable_mask, i) &&
794           !BITSET_TEST(linkage->indirect_mask, i) &&
795           !BITSET_TEST(linkage->xfb32_only_mask, i) &&
796           !BITSET_TEST(linkage->xfb16_only_mask, i) &&
797           !BITSET_TEST(linkage->tcs_cross_invoc32_mask, i) &&
798           !BITSET_TEST(linkage->tcs_cross_invoc16_mask, i) &&
799           !BITSET_TEST(linkage->no_varying32_mask, i) &&
800           !BITSET_TEST(linkage->no_varying16_mask, i) &&
801           !BITSET_TEST(linkage->interp_fp32_mask, i) &&
802           !BITSET_TEST(linkage->interp_fp16_mask, i) &&
803           !BITSET6_TEST_ANY(linkage->interp_fp32_qual_masks, i) &&
804           !BITSET6_TEST_ANY(linkage->interp_fp16_qual_masks, i) &&
805           !BITSET_TEST(linkage->color32_mask, i) &&
806           !BITSET3_TEST_ANY(linkage->color32_qual_masks, i) &&
807           !BITSET_TEST(linkage->flat32_mask, i) &&
808           !BITSET_TEST(linkage->flat16_mask, i) &&
809           !BITSET_TEST(linkage->interp_explicit32_mask, i) &&
810           !BITSET_TEST(linkage->interp_explicit16_mask, i) &&
811           !BITSET_TEST(linkage->interp_explicit_strict32_mask, i) &&
812           !BITSET_TEST(linkage->interp_explicit_strict16_mask, i) &&
813           !BITSET_TEST(linkage->per_primitive32_mask, i) &&
814           !BITSET_TEST(linkage->per_primitive16_mask, i) &&
815           !BITSET_TEST(linkage->convergent32_mask, i) &&
816           !BITSET_TEST(linkage->convergent16_mask, i) &&
817           !BITSET_TEST(linkage->output_equal_mask, i))
818          continue;
819 
820       printf("  %7s.%c.%s: num_slots=%2u%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s\n",
821              gl_varying_slot_name_for_stage(vec4_slot(i),
822                                             linkage->producer_stage) + 13,
823              "xyzw"[(i / 2) % 4],
824              i % 2 ? "hi" : "lo",
825              slot->num_slots,
826              BITSET_TEST(linkage->removable_mask, i) ? " removable" : "",
827              BITSET_TEST(linkage->indirect_mask, i) ? " indirect" : "",
828              BITSET_TEST(linkage->xfb32_only_mask, i) ? " xfb32_only" : "",
829              BITSET_TEST(linkage->xfb16_only_mask, i) ? " xfb16_only" : "",
830              BITSET_TEST(linkage->tcs_cross_invoc32_mask, i) ? " tcs_cross_invoc32" : "",
831              BITSET_TEST(linkage->tcs_cross_invoc16_mask, i) ? " tcs_cross_invoc16" : "",
832              BITSET_TEST(linkage->no_varying32_mask, i) ? " no_varying32" : "",
833              BITSET_TEST(linkage->no_varying16_mask, i) ? " no_varying16" : "",
834              BITSET_TEST(linkage->interp_fp32_mask, i) ? " interp_fp32" : "",
835              BITSET_TEST(linkage->interp_fp32_qual_masks[0], i) ? " interp_fp32_persp_pixel" : "",
836              BITSET_TEST(linkage->interp_fp32_qual_masks[1], i) ? " interp_fp32_persp_centroid" : "",
837              BITSET_TEST(linkage->interp_fp32_qual_masks[2], i) ? " interp_fp32_persp_sample" : "",
838              BITSET_TEST(linkage->interp_fp32_qual_masks[3], i) ? " interp_fp32_linear_pixel" : "",
839              BITSET_TEST(linkage->interp_fp32_qual_masks[4], i) ? " interp_fp32_linear_centroid" : "",
840              BITSET_TEST(linkage->interp_fp32_qual_masks[5], i) ? " interp_fp32_linear_sample" : "",
841              BITSET_TEST(linkage->interp_fp16_mask, i) ? " interp_fp16" : "",
842              BITSET_TEST(linkage->interp_fp16_qual_masks[0], i) ? " interp_fp16_persp_pixel" : "",
843              BITSET_TEST(linkage->interp_fp16_qual_masks[1], i) ? " interp_fp16_persp_centroid" : "",
844              BITSET_TEST(linkage->interp_fp16_qual_masks[2], i) ? " interp_fp16_persp_sample" : "",
845              BITSET_TEST(linkage->interp_fp16_qual_masks[3], i) ? " interp_fp16_linear_pixel" : "",
846              BITSET_TEST(linkage->interp_fp16_qual_masks[4], i) ? " interp_fp16_linear_centroid" : "",
847              BITSET_TEST(linkage->interp_fp16_qual_masks[5], i) ? " interp_fp16_linear_sample" : "",
848              BITSET_TEST(linkage->color32_mask, i) ? " color32" : "",
849              BITSET_TEST(linkage->color32_qual_masks[0], i) ? " color32_pixel" : "",
850              BITSET_TEST(linkage->color32_qual_masks[1], i) ? " color32_centroid" : "",
851              BITSET_TEST(linkage->color32_qual_masks[2], i) ? " color32_sample" : "",
852              BITSET_TEST(linkage->flat32_mask, i) ? " flat32" : "",
853              BITSET_TEST(linkage->flat16_mask, i) ? " flat16" : "",
854              BITSET_TEST(linkage->interp_explicit32_mask, i) ? " interp_explicit32" : "",
855              BITSET_TEST(linkage->interp_explicit16_mask, i) ? " interp_explicit16" : "",
856              BITSET_TEST(linkage->interp_explicit_strict32_mask, i) ? " interp_explicit_strict32" : "",
857              BITSET_TEST(linkage->interp_explicit_strict16_mask, i) ? " interp_explicit_strict16" : "",
858              BITSET_TEST(linkage->per_primitive32_mask, i) ? " per_primitive32" : "",
859              BITSET_TEST(linkage->per_primitive16_mask, i) ? " per_primitive16" : "",
860              BITSET_TEST(linkage->convergent32_mask, i) ? " convergent32" : "",
861              BITSET_TEST(linkage->convergent16_mask, i) ? " convergent16" : "",
862              BITSET_TEST(linkage->output_equal_mask, i) ? " output_equal" : "",
863              !list_is_empty(&slot->producer.stores) ? " producer_stores" : "",
864              !list_is_empty(&slot->producer.loads) ? " producer_loads" : "",
865              !list_is_empty(&slot->consumer.loads) ? " consumer_loads" : "");
866    }
867 }
868 
869 static void
870 slot_disable_optimizations_and_compaction(struct linkage_info *linkage,
871                                           unsigned i)
872 {
873    BITSET_CLEAR(linkage->output_equal_mask, i);
874    BITSET_CLEAR(linkage->convergent32_mask, i);
875    BITSET_CLEAR(linkage->convergent16_mask, i);
876    BITSET_CLEAR(linkage->interp_fp32_mask, i);
877    BITSET_CLEAR(linkage->interp_fp16_mask, i);
878    for (unsigned b = 0; b < NUM_INTERP_QUALIFIERS; b++) {
879       BITSET_CLEAR(linkage->interp_fp32_qual_masks[b], i);
880       BITSET_CLEAR(linkage->interp_fp16_qual_masks[b], i);
881    }
882    BITSET_CLEAR(linkage->flat32_mask, i);
883    BITSET_CLEAR(linkage->flat16_mask, i);
884    BITSET_CLEAR(linkage->interp_explicit32_mask, i);
885    BITSET_CLEAR(linkage->interp_explicit16_mask, i);
886    BITSET_CLEAR(linkage->interp_explicit_strict32_mask, i);
887    BITSET_CLEAR(linkage->interp_explicit_strict16_mask, i);
888    BITSET_CLEAR(linkage->per_primitive32_mask, i);
889    BITSET_CLEAR(linkage->per_primitive16_mask, i);
890    BITSET_CLEAR(linkage->tcs_cross_invoc32_mask, i);
891    BITSET_CLEAR(linkage->tcs_cross_invoc16_mask, i);
892    BITSET_CLEAR(linkage->no_varying32_mask, i);
893    BITSET_CLEAR(linkage->no_varying16_mask, i);
894    BITSET_CLEAR(linkage->color32_mask, i);
895    for (unsigned b = 0; b < NUM_COLOR_QUALIFIERS; b++)
896       BITSET_CLEAR(linkage->color32_qual_masks[b], i);
897 }
898 
899 static void
900 clear_slot_info_after_removal(struct linkage_info *linkage, unsigned i, bool uses_xfb)
901 {
902    slot_disable_optimizations_and_compaction(linkage, i);
903 
904    if (uses_xfb)
905       return;
906 
907    linkage->slot[i].num_slots = 0;
908 
909    BITSET_CLEAR(linkage->indirect_mask, i);
910    BITSET_CLEAR(linkage->removable_mask, i);
911 
912    /* Transform feedback stores can't be removed. */
913    assert(!BITSET_TEST(linkage->xfb32_only_mask, i));
914    assert(!BITSET_TEST(linkage->xfb16_only_mask, i));
915 }
916 
917 static bool
918 has_xfb(nir_intrinsic_instr *intr)
919 {
920    /* This only says whether the intrinsic is ABLE to have xfb info. */
921    if (!nir_intrinsic_has_io_xfb(intr))
922       return false;
923 
924    unsigned comp = nir_intrinsic_component(intr);
925 
926    if (comp >= 2)
927       return nir_intrinsic_io_xfb2(intr).out[comp - 2].num_components > 0;
928    else
929       return nir_intrinsic_io_xfb(intr).out[comp].num_components > 0;
930 }
931 
932 static bool
933 is_interpolated_color(struct linkage_info *linkage, unsigned i)
934 {
935    if (linkage->consumer_stage != MESA_SHADER_FRAGMENT)
936       return false;
937 
938    /* BFCn stores are bunched in the COLn slots with COLn, so we should never
939     * get BFCn here.
940     */
941    assert(vec4_slot(i) != VARYING_SLOT_BFC0 &&
942           vec4_slot(i) != VARYING_SLOT_BFC1);
943 
944    return vec4_slot(i) == VARYING_SLOT_COL0 ||
945           vec4_slot(i) == VARYING_SLOT_COL1;
946 }
947 
948 static bool
949 is_interpolated_texcoord(struct linkage_info *linkage, unsigned i)
950 {
951    if (linkage->consumer_stage != MESA_SHADER_FRAGMENT)
952       return false;
953 
954    return vec4_slot(i) >= VARYING_SLOT_TEX0 &&
955           vec4_slot(i) <= VARYING_SLOT_TEX7;
956 }
957 
958 static bool
959 color_uses_shade_model(struct linkage_info *linkage, unsigned i)
960 {
961    if (!is_interpolated_color(linkage, i))
962       return false;
963 
964    list_for_each_entry(struct list_node, iter,
965                        &linkage->slot[i].consumer.loads, head) {
966       assert(iter->instr->intrinsic == nir_intrinsic_load_interpolated_input);
967 
968       nir_intrinsic_instr *baryc =
969          nir_instr_as_intrinsic(iter->instr->src[0].ssa->parent_instr);
970       if (nir_intrinsic_interp_mode(baryc) == INTERP_MODE_NONE)
971          return true;
972    }
973 
974    return false;
975 }
976 
977 static enum fs_vec4_type
978 get_interp_vec4_type(struct linkage_info *linkage, unsigned slot,
979                      nir_intrinsic_instr *load)
980 {
981    assert(!linkage->has_flexible_interp);
982    assert(load->intrinsic == nir_intrinsic_load_interpolated_input);
983 
984    nir_intrinsic_instr *baryc =
985       nir_instr_as_intrinsic(load->src[0].ssa->parent_instr);
986    enum fs_vec4_type base;
987 
988    if (color_uses_shade_model(linkage, slot))
989       base = FS_VEC4_TYPE_INTERP_COLOR_PIXEL;
990    else if (load->def.bit_size == 32)
991       base = FS_VEC4_TYPE_INTERP_FP32_PERSP_PIXEL;
992    else if (load->def.bit_size == 16)
993       base = FS_VEC4_TYPE_INTERP_FP16_PERSP_PIXEL;
994    else
995       unreachable("invalid load_interpolated_input type");
996 
997    bool linear = nir_intrinsic_interp_mode(baryc) == INTERP_MODE_NOPERSPECTIVE;
998 
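   /* The interpolated vec4 types are laid out as PERSP_{PIXEL,CENTROID,SAMPLE}
    * followed by LINEAR_{PIXEL,CENTROID,SAMPLE}, so "+= 3" switches to the
    * LINEAR variants and the "+ 1"/"+ 2" below select CENTROID/SAMPLE.
    */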
999    if (linear)
1000       base += 3;
1001 
1002    switch (baryc->intrinsic) {
1003    case nir_intrinsic_load_barycentric_pixel:
1004    case nir_intrinsic_load_barycentric_at_offset:
1005    case nir_intrinsic_load_barycentric_at_sample:
1006       return base;
1007    case nir_intrinsic_load_barycentric_centroid:
1008       return base + 1;
1009    case nir_intrinsic_load_barycentric_sample:
1010       return base + 2;
1011    default:
1012       unreachable("unexpected barycentric intrinsic");
1013    }
1014 }
1015 
1016 static bool
1017 preserve_infs_nans(nir_shader *nir, unsigned bit_size)
1018 {
1019    unsigned mode = nir->info.float_controls_execution_mode;
1020 
1021    return nir_is_float_control_inf_preserve(mode, bit_size) ||
1022           nir_is_float_control_nan_preserve(mode, bit_size);
1023 }
1024 
1025 static bool
1026 preserve_nans(nir_shader *nir, unsigned bit_size)
1027 {
1028    unsigned mode = nir->info.float_controls_execution_mode;
1029 
1030    return nir_is_float_control_nan_preserve(mode, bit_size);
1031 }
1032 
1033 static nir_def *
1034 build_convert_inf_to_nan(nir_builder *b, nir_def *x)
1035 {
1036    /* Do x*0 + x. Marking the fma exact ensures the multiplication by 0 isn't optimized out. */
1037    nir_def *fma = nir_ffma_imm1(b, x, 0, x);
1038    nir_instr_as_alu(fma->parent_instr)->exact = true;
1039    return fma;
1040 }
1041 
1042 static bool
1043 is_sysval(nir_instr *instr, gl_system_value sysval)
1044 {
1045    if (instr->type == nir_instr_type_intrinsic) {
1046       nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
1047 
1048       if (intr->intrinsic == nir_intrinsic_from_system_value(sysval))
1049          return true;
1050 
1051       if (intr->intrinsic == nir_intrinsic_load_deref) {
1052           nir_deref_instr *deref =
1053             nir_instr_as_deref(intr->src[0].ssa->parent_instr);
1054 
1055           return nir_deref_mode_is_one_of(deref, nir_var_system_value) &&
1056                  nir_deref_instr_get_variable(deref)->data.location == sysval;
1057       }
1058    }
1059 
1060    return false;
1061 }
1062 
1063 /******************************************************************
1064  * GATHERING INPUTS & OUTPUTS
1065  ******************************************************************/
1066 
1067 static bool
1068 is_active_sysval_output(struct linkage_info *linkage, unsigned slot,
1069                         nir_intrinsic_instr *intr)
1070 {
1071    return nir_slot_is_sysval_output(vec4_slot(slot),
1072                                     linkage->consumer_stage) &&
1073           !nir_intrinsic_io_semantics(intr).no_sysval_output;
1074 }
1075 
1076 /**
1077  * This function acts like a filter. The pass won't touch varyings that
1078  * return false here, and the return value is saved in the linkage bitmasks,
1079  * so that all subpasses will *automatically* skip such varyings.
1080  */
1081 static bool
1082 can_remove_varying(struct linkage_info *linkage, gl_varying_slot location)
1083 {
1084    if (linkage->consumer_stage == MESA_SHADER_FRAGMENT) {
1085       /* User-defined varyings and fog coordinates can always be removed. */
1086       if (location >= VARYING_SLOT_VAR0 ||
1087           location == VARYING_SLOT_FOGC)
1088          return true;
1089 
1090       /* These can be removed as varyings, which means they will be demoted to
1091        * sysval-only outputs keeping their culling/rasterization functions
1092        * while not passing the values to FS. Drivers should handle
1093        * the "no_varying" semantic to benefit from this.
1094        *
1095        * Note: When removing unset LAYER and VIEWPORT FS inputs, they will
1096        *       be replaced by 0 instead of undef.
1097        */
1098       if (location == VARYING_SLOT_CLIP_DIST0 ||
1099           location == VARYING_SLOT_CLIP_DIST1 ||
1100           location == VARYING_SLOT_CULL_DIST0 ||
1101           location == VARYING_SLOT_CULL_DIST1 ||
1102           location == VARYING_SLOT_LAYER ||
1103           location == VARYING_SLOT_VIEWPORT)
1104          return true;
1105 
1106       /* COLn inputs can be removed only if both COLn and BFCn are not
1107        * written. Both COLn and BFCn outputs can be removed if COLn inputs
1108        * aren't read.
1109        *
1110        * TEXn inputs can never be removed in FS because of the coord replace
1111        * state, but TEXn outputs can be removed if they are not read by FS.
1112        */
1113       if (location == VARYING_SLOT_COL0 ||
1114           location == VARYING_SLOT_COL1 ||
1115           location == VARYING_SLOT_BFC0 ||
1116           location == VARYING_SLOT_BFC1 ||
1117           (location >= VARYING_SLOT_TEX0 && location <= VARYING_SLOT_TEX7))
1118          return true;
1119 
1120       /* GS->FS and MS->FS can remove the primitive ID if it's not written or not read. */
1121       if ((linkage->producer_stage == MESA_SHADER_GEOMETRY ||
1122            linkage->producer_stage == MESA_SHADER_MESH) &&
1123           location == VARYING_SLOT_PRIMITIVE_ID)
1124          return true;
1125 
1126       /* No other varyings can be removed. */
1127       return false;
1128    } else if (linkage->consumer_stage == MESA_SHADER_TESS_EVAL) {
1129       /* Only VS->TES shouldn't remove TESS_LEVEL_* inputs because the values
1130        * come from glPatchParameterfv.
1131        *
1132        * For TCS->TES, TESS_LEVEL_* outputs can be removed as varyings, which
1133        * means they will be demoted to sysval-only outputs, so that drivers
1134        * know that TES doesn't read them.
1135        */
1136       if (linkage->producer_stage == MESA_SHADER_VERTEX &&
1137           (location == VARYING_SLOT_TESS_LEVEL_INNER ||
1138            location == VARYING_SLOT_TESS_LEVEL_OUTER))
1139          return false;
1140 
1141       return true;
1142    }
1143 
1144    /* All other varyings can be removed. */
1145    return true;
1146 }
1147 
1148 struct opt_options {
1149    bool propagate_uniform_expr:1;
1150    bool deduplicate:1;
1151    bool inter_shader_code_motion:1;
1152    bool compact:1;
1153    bool disable_all:1;
1154 };
1155 
1156 /**
1157  * Return which optimizations are allowed.
1158  */
1159 static struct opt_options
1160 can_optimize_varying(struct linkage_info *linkage, gl_varying_slot location)
1161 {
1162    struct opt_options options_var = {
1163       .propagate_uniform_expr = true,
1164       .deduplicate = true,
1165       .inter_shader_code_motion = true,
1166       .compact = true,
1167    };
1168    struct opt_options options_color = {
1169       .propagate_uniform_expr = true, /* only constants in [0, 1] */
1170       .deduplicate = true,
1171       .compact = true,
1172    };
1173    struct opt_options options_tex = {
1174       .propagate_uniform_expr = true, /* only TEX.zw if equal to (0, 1) */
1175    };
1176    struct opt_options options_sysval_output = {
1177       .propagate_uniform_expr = true,
1178       .deduplicate = true,
1179    };
1180    struct opt_options options_tess_levels = {
1181       .propagate_uniform_expr = true,
1182       .deduplicate = true,
1183    };
1184    struct opt_options options_disable_all = {
1185       .disable_all = true,
1186    };
1187 
1188    assert(can_remove_varying(linkage, location));
1189 
1190    if (linkage->consumer_stage == MESA_SHADER_FRAGMENT) {
1191       /* xx -> FS */
1192       /* User-defined varyings and fog coordinates can always be optimized. */
1193       if (location >= VARYING_SLOT_VAR0 ||
1194           location == VARYING_SLOT_FOGC)
1195          return options_var;
1196 
1197       /* The primitive ID can always be optimized in GS -> FS and MS -> FS. */
1198       if ((linkage->producer_stage == MESA_SHADER_GEOMETRY ||
1199            linkage->producer_stage == MESA_SHADER_MESH) &&
1200           location == VARYING_SLOT_PRIMITIVE_ID)
1201          return options_var;
1202 
1203       /* Colors can only do constant propagation if COLn and BFCn store the
1204        * same constant and the constant is between 0 and 1 (because clamp
1205        * vertex color state is unknown). Uniform propagation isn't possible
1206        * because of the clamping.
1207        *
1208        * Color components can only be deduplicated and compacted among
1209        * themselves if they have the same interpolation qualifier, and can't
1210        * be mixed with other varyings.
1211        */
1212       if (location == VARYING_SLOT_COL0 ||
1213           location == VARYING_SLOT_COL1 ||
1214           location == VARYING_SLOT_BFC0 ||
1215           location == VARYING_SLOT_BFC1)
1216          return options_color;
1217 
1218       /* TEXn.zw can only be constant-propagated if the value is (0, 1)
1219        * because it matches the coord replace values.
1220        */
1221       if (location >= VARYING_SLOT_TEX0 && location <= VARYING_SLOT_TEX7)
1222          return options_tex;
1223 
1224       /* LAYER, VIEWPORT, CLIP_DISTn, and CULL_DISTn can only propagate
1225        * uniform expressions and be compacted (moved to VARn while keeping
1226        * the sysval outputs where they are).
1227        */
1228       if (location == VARYING_SLOT_LAYER ||
1229           location == VARYING_SLOT_VIEWPORT ||
1230           location == VARYING_SLOT_CLIP_DIST0 ||
1231           location == VARYING_SLOT_CLIP_DIST1 ||
1232           location == VARYING_SLOT_CULL_DIST0 ||
1233           location == VARYING_SLOT_CULL_DIST1)
1234          return options_sysval_output;
1235 
1236       /* Everything else can't be read by the consumer, such as POS, PSIZ,
1237        * CLIP_VERTEX, EDGE, PRIMITIVE_SHADING_RATE, etc.
1238        */
1239       return options_disable_all;
1240    }
1241 
1242    if (linkage->producer_stage == MESA_SHADER_TESS_CTRL) {
1243       /* TESS_LEVEL_* can only propagate uniform expressions.
1244        * Compaction is disabled because AMD doesn't want the varying to be
1245        * moved to PATCHn while keeping the sysval output where it is.
1246        */
1247       if (location == VARYING_SLOT_TESS_LEVEL_INNER ||
1248           location == VARYING_SLOT_TESS_LEVEL_OUTER)
1249          return options_tess_levels;
1250    }
1251 
1252    /* All other shader pairs, which are (VS, TCS), (TCS, TES), (VS, TES),
1253     * (TES, GS), and (VS, GS) can compact and optimize all varyings.
1254     */
1255    return options_var;
1256 }
1257 
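/* Illustrative usage sketch (hypothetical helper, not part of the pass):
 * each subpass queries just the bit it needs from the returned options,
 * e.g. compaction-related code could be gated like this. vec4_slot() maps
 * a scalar 16-bit slot index back to its gl_varying_slot.
 */
#if 0
static bool
example_slot_is_compactable(struct linkage_info *linkage, unsigned scalar_slot)
{
   struct opt_options opts =
      can_optimize_varying(linkage, vec4_slot(scalar_slot));

   return !opts.disable_all && opts.compact;
}
#endif
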
1258 static bool
1259 gather_inputs(struct nir_builder *builder, nir_intrinsic_instr *intr, void *cb_data)
1260 {
1261    struct linkage_info *linkage = (struct linkage_info *)cb_data;
1262 
1263    if (intr->intrinsic != nir_intrinsic_load_input &&
1264        intr->intrinsic != nir_intrinsic_load_per_vertex_input &&
1265        intr->intrinsic != nir_intrinsic_load_per_primitive_input &&
1266        intr->intrinsic != nir_intrinsic_load_interpolated_input &&
1267        intr->intrinsic != nir_intrinsic_load_input_vertex)
1268       return false;
1269 
1270    /* nir_lower_io_to_scalar is required before this */
1271    assert(intr->def.num_components == 1);
1272    /* Non-zero constant offsets should have been folded by
1273     * nir_io_add_const_offset_to_base.
1274     */
1275    nir_src offset = *nir_get_io_offset_src(intr);
1276    assert(!nir_src_is_const(offset) || nir_src_as_uint(offset) == 0);
1277 
1278    nir_io_semantics sem = nir_intrinsic_io_semantics(intr);
1279 
1280    if (!can_remove_varying(linkage, sem.location))
1281       return false;
1282 
1283    /* Insert the load into the list of loads for this scalar slot. */
1284    unsigned slot = intr_get_scalar_16bit_slot(intr);
1285    struct scalar_slot *in = &linkage->slot[slot];
1286    struct list_node *node = linear_alloc_child(linkage->linear_mem_ctx,
1287                                                sizeof(struct list_node));
1288    node->instr = intr;
1289    list_addtail(&node->head, &in->consumer.loads);
1290    in->num_slots = MAX2(in->num_slots, sem.num_slots);
1291 
1292    BITSET_SET(linkage->removable_mask, slot);
1293 
1294    enum fs_vec4_type fs_vec4_type = FS_VEC4_TYPE_NONE;
1295 
1296    /* Determine the type of the input for compaction. Other inputs
1297     * can be compacted with indirectly-indexed vec4 slots if they
1298     * have unused components, but only if they are of the same type.
1299     */
1300    if (linkage->consumer_stage == MESA_SHADER_FRAGMENT) {
1301       switch (intr->intrinsic) {
1302       case nir_intrinsic_load_input:
1303          fs_vec4_type = FS_VEC4_TYPE_FLAT;
1304          break;
1305       case nir_intrinsic_load_per_primitive_input:
1306          fs_vec4_type = FS_VEC4_TYPE_PER_PRIMITIVE;
1307          break;
1308       case nir_intrinsic_load_input_vertex:
1309          if (sem.interp_explicit_strict)
1310             fs_vec4_type = FS_VEC4_TYPE_INTERP_EXPLICIT_STRICT;
1311          else
1312             fs_vec4_type = FS_VEC4_TYPE_INTERP_EXPLICIT;
1313          break;
1314       case nir_intrinsic_load_interpolated_input:
1315          if (linkage->has_flexible_interp) {
1316             if (color_uses_shade_model(linkage, slot))
1317                fs_vec4_type = FS_VEC4_TYPE_INTERP_COLOR;
1318             else if (intr->def.bit_size == 32)
1319                fs_vec4_type = FS_VEC4_TYPE_INTERP_FP32;
1320             else if (intr->def.bit_size == 16)
1321                fs_vec4_type = FS_VEC4_TYPE_INTERP_FP16;
1322             else
1323                unreachable("invalid load_interpolated_input type");
1324          } else {
1325             fs_vec4_type = get_interp_vec4_type(linkage, slot, intr);
1326          }
1327          break;
1328       default:
1329          unreachable("unexpected input load intrinsic");
1330       }
1331 
1332       linkage->fs_vec4_type[sem.location] = fs_vec4_type;
1333    }
1334 
1335    /* Indirect indexing. */
1336    if (!nir_src_is_const(offset)) {
1337       /* Only the indirectly-indexed component is marked as indirect. */
1338       for (unsigned i = 0; i < sem.num_slots; i++)
1339          BITSET_SET(linkage->indirect_mask, slot + i * 8);
1340 
1341       /* Set the same vec4 type as the first element in all slots. */
1342       if (linkage->consumer_stage == MESA_SHADER_FRAGMENT) {
1343          for (unsigned i = 1; i < sem.num_slots; i++)
1344             linkage->fs_vec4_type[sem.location + i] = fs_vec4_type;
1345       }
1346       return false;
1347    }
1348 
1349    if (!can_optimize_varying(linkage, sem.location).compact)
1350       return false;
1351 
1352    /* Record inputs that can be compacted. */
1353    if (linkage->consumer_stage == MESA_SHADER_FRAGMENT) {
1354       unsigned i;
1355       assert(intr->def.bit_size == 32 || intr->def.bit_size == 16);
1356 
1357       switch (fs_vec4_type) {
1358       case FS_VEC4_TYPE_FLAT:
1359          if (intr->def.bit_size == 32)
1360             BITSET_SET(linkage->flat32_mask, slot);
1361          else
1362             BITSET_SET(linkage->flat16_mask, slot);
1363          break;
1364       case FS_VEC4_TYPE_INTERP_EXPLICIT:
1365          if (intr->def.bit_size == 32)
1366             BITSET_SET(linkage->interp_explicit32_mask, slot);
1367          else
1368             BITSET_SET(linkage->interp_explicit16_mask, slot);
1369          break;
1370       case FS_VEC4_TYPE_INTERP_EXPLICIT_STRICT:
1371          if (intr->def.bit_size == 32)
1372             BITSET_SET(linkage->interp_explicit_strict32_mask, slot);
1373          else
1374             BITSET_SET(linkage->interp_explicit_strict16_mask, slot);
1375          break;
1376       case FS_VEC4_TYPE_PER_PRIMITIVE:
1377          if (intr->def.bit_size == 32)
1378             BITSET_SET(linkage->per_primitive32_mask, slot);
1379          else
1380             BITSET_SET(linkage->per_primitive16_mask, slot);
1381          break;
1382 
1383       case FS_VEC4_TYPE_INTERP_FP32:
1384          BITSET_SET(linkage->interp_fp32_mask, slot);
1385          break;
1386       case FS_VEC4_TYPE_INTERP_FP16:
1387          BITSET_SET(linkage->interp_fp16_mask, slot);
1388          break;
1389       case FS_VEC4_TYPE_INTERP_COLOR:
1390          BITSET_SET(linkage->color32_mask, slot);
1391          break;
1392 
1393       case FS_VEC4_TYPE_INTERP_FP32_PERSP_PIXEL:
1394       case FS_VEC4_TYPE_INTERP_FP32_PERSP_CENTROID:
1395       case FS_VEC4_TYPE_INTERP_FP32_PERSP_SAMPLE:
1396       case FS_VEC4_TYPE_INTERP_FP32_LINEAR_PIXEL:
1397       case FS_VEC4_TYPE_INTERP_FP32_LINEAR_CENTROID:
1398       case FS_VEC4_TYPE_INTERP_FP32_LINEAR_SAMPLE:
1399          i = fs_vec4_type - FS_VEC4_TYPE_INTERP_FP32_PERSP_PIXEL;
1400          BITSET_SET(linkage->interp_fp32_qual_masks[i], slot);
1401          break;
1402 
1403       case FS_VEC4_TYPE_INTERP_FP16_PERSP_PIXEL:
1404       case FS_VEC4_TYPE_INTERP_FP16_PERSP_CENTROID:
1405       case FS_VEC4_TYPE_INTERP_FP16_PERSP_SAMPLE:
1406       case FS_VEC4_TYPE_INTERP_FP16_LINEAR_PIXEL:
1407       case FS_VEC4_TYPE_INTERP_FP16_LINEAR_CENTROID:
1408       case FS_VEC4_TYPE_INTERP_FP16_LINEAR_SAMPLE:
1409          i = fs_vec4_type - FS_VEC4_TYPE_INTERP_FP16_PERSP_PIXEL;
1410          BITSET_SET(linkage->interp_fp16_qual_masks[i], slot);
1411          break;
1412 
1413       case FS_VEC4_TYPE_INTERP_COLOR_PIXEL:
1414       case FS_VEC4_TYPE_INTERP_COLOR_CENTROID:
1415       case FS_VEC4_TYPE_INTERP_COLOR_SAMPLE:
1416          i = fs_vec4_type - FS_VEC4_TYPE_INTERP_COLOR_PIXEL;
1417          BITSET_SET(linkage->color32_qual_masks[i], slot);
1418          break;
1419 
1420       case FS_VEC4_TYPE_NONE:
1421          unreachable("unexpected fs_vec4_type");
1422       }
1423 
1424       if (!linkage->has_flexible_interp &&
1425           intr->intrinsic == nir_intrinsic_load_interpolated_input) {
1426          /* interpolateAtCentroid can occur simultaneously with any other
1427           * qualifier. If centroid is flagged with any other qualifier,
1428           * unflag centroid. Even though we track such outputs as the other
1429           * qualifier, the load_barycentric_centroid intrinsic must be
1430           * preserved by all optimizations. The only case when it's not
1431           * preserved is when the input is convergent, in which case
1432           * all qualifiers have the same behavior and we opportunistically
1433           * change it during compaction.
1434           */
1435          if (color_uses_shade_model(linkage, slot)) {
1436             if (BITSET_TEST(linkage->color32_qual_masks[COLOR_CENTROID], slot) &&
1437                 (BITSET_TEST(linkage->color32_qual_masks[COLOR_PIXEL], slot) ||
1438                  BITSET_TEST(linkage->color32_qual_masks[COLOR_SAMPLE], slot)))
1439                BITSET_CLEAR(linkage->color32_qual_masks[COLOR_CENTROID], slot);
1440          } else {
1441             INTERP_QUAL_BITSET *bitsets =
1442                intr->def.bit_size == 32 ? &linkage->interp_fp32_qual_masks :
1443                                           &linkage->interp_fp16_qual_masks;
1444 
1445             if (BITSET_TEST((*bitsets)[PERSP_CENTROID], slot) &&
1446                 (BITSET_TEST((*bitsets)[PERSP_PIXEL], slot) ||
1447                  BITSET_TEST((*bitsets)[PERSP_SAMPLE], slot)))
1448                BITSET_CLEAR((*bitsets)[PERSP_CENTROID], slot);
1449 
1450             if (BITSET_TEST((*bitsets)[LINEAR_CENTROID], slot) &&
1451                 (BITSET_TEST((*bitsets)[LINEAR_PIXEL], slot) ||
1452                  BITSET_TEST((*bitsets)[LINEAR_SAMPLE], slot)))
1453                BITSET_CLEAR((*bitsets)[LINEAR_CENTROID], slot);
1454          }
1455       }
1456    } else {
1457       if (intr->def.bit_size == 32)
1458          BITSET_SET(linkage->flat32_mask, slot);
1459       else if (intr->def.bit_size == 16)
1460          BITSET_SET(linkage->flat16_mask, slot);
1461       else
1462          unreachable("invalid load_input type");
1463 
1464       if (linkage->consumer_stage == MESA_SHADER_TESS_CTRL &&
1465           intr->intrinsic == nir_intrinsic_load_per_vertex_input) {
1466          nir_src *vertex_index_src = nir_get_io_arrayed_index_src(intr);
1467          nir_instr *vertex_index_instr = vertex_index_src->ssa->parent_instr;
1468 
1469          if (!is_sysval(vertex_index_instr, SYSTEM_VALUE_INVOCATION_ID)) {
1470             if (intr->def.bit_size == 32)
1471                BITSET_SET(linkage->tcs_cross_invoc32_mask, slot);
1472             else if (intr->def.bit_size == 16)
1473                BITSET_SET(linkage->tcs_cross_invoc16_mask, slot);
1474             else
1475                unreachable("invalid load_input type");
1476          }
1477       }
1478    }
1479    return false;
1480 }
1481 
1482 static bool
1483 gather_outputs(struct nir_builder *builder, nir_intrinsic_instr *intr, void *cb_data)
1484 {
1485    struct linkage_info *linkage = (struct linkage_info *)cb_data;
1486 
1487    if (intr->intrinsic != nir_intrinsic_store_output &&
1488        intr->intrinsic != nir_intrinsic_load_output &&
1489        intr->intrinsic != nir_intrinsic_store_per_vertex_output &&
1490        intr->intrinsic != nir_intrinsic_store_per_view_output &&
1491        intr->intrinsic != nir_intrinsic_store_per_primitive_output &&
1492        intr->intrinsic != nir_intrinsic_load_per_vertex_output &&
1493        intr->intrinsic != nir_intrinsic_load_per_view_output &&
1494        intr->intrinsic != nir_intrinsic_load_per_primitive_output)
1495       return false;
1496 
1497    bool is_store =
1498       intr->intrinsic == nir_intrinsic_store_output ||
1499       intr->intrinsic == nir_intrinsic_store_per_vertex_output ||
1500       intr->intrinsic == nir_intrinsic_store_per_view_output ||
1501       intr->intrinsic == nir_intrinsic_store_per_primitive_output;
1502 
1503    if (is_store) {
1504       /* nir_lower_io_to_scalar is required before this */
1505       assert(intr->src[0].ssa->num_components == 1);
1506       /* nir_opt_undef is required before this. */
1507       assert(intr->src[0].ssa->parent_instr->type !=
1508             nir_instr_type_undef);
1509    } else {
1510       /* nir_lower_io_to_scalar is required before this */
1511       assert(intr->def.num_components == 1);
1512       /* Output loads are only allowed in TCS. */
1513       assert(linkage->producer_stage == MESA_SHADER_TESS_CTRL);
1514    }
1515 
1516    /* Non-zero constant offsets should have been folded by
1517     * nir_io_add_const_offset_to_base.
1518     */
1519    nir_src offset = *nir_get_io_offset_src(intr);
1520    assert(!nir_src_is_const(offset) || nir_src_as_uint(offset) == 0);
1521 
1522    nir_io_semantics sem = nir_intrinsic_io_semantics(intr);
1523 
1524    if (!can_remove_varying(linkage, sem.location))
1525       return false;
1526 
1527    /* For "xx -> FS", treat BFCn stores as COLn to make dead varying
1528     * elimination do the right thing automatically. The rules are:
1529     * - COLn inputs can be removed only if both COLn and BFCn are not
1530     *   written.
1531     * - Both COLn and BFCn outputs can be removed if COLn inputs
1532     *   aren't read.
1533     */
1534    if (linkage->consumer_stage == MESA_SHADER_FRAGMENT) {
1535       if (sem.location == VARYING_SLOT_BFC0)
1536          sem.location = VARYING_SLOT_COL0;
1537       else if (sem.location == VARYING_SLOT_BFC1)
1538          sem.location = VARYING_SLOT_COL1;
1539    }
1540 
1541    /* Insert the instruction into the list of stores or loads for this
1542     * scalar slot.
1543     */
1544    unsigned slot =
1545       get_scalar_16bit_slot(sem, nir_intrinsic_component(intr));
1546 
1547    struct scalar_slot *out = &linkage->slot[slot];
1548    struct list_node *node = linear_alloc_child(linkage->linear_mem_ctx,
1549                                                sizeof(struct list_node));
1550    node->instr = intr;
1551    out->num_slots = MAX2(out->num_slots, sem.num_slots);
1552 
1553    if (is_store) {
1554       list_addtail(&node->head, &out->producer.stores);
1555 
1556       if (has_xfb(intr)) {
1557          BITSET_SET(linkage->xfb_mask, slot);
1558 
1559          if (sem.no_varying &&
1560              !is_active_sysval_output(linkage, slot, intr)) {
1561             if (intr->src[0].ssa->bit_size == 32)
1562                BITSET_SET(linkage->xfb32_only_mask, slot);
1563             else if (intr->src[0].ssa->bit_size == 16)
1564                BITSET_SET(linkage->xfb16_only_mask, slot);
1565             else
1566                unreachable("invalid store_output type");
1567          }
1568       }
1569    } else {
1570       list_addtail(&node->head, &out->producer.loads);
1571    }
1572 
1573    BITSET_SET(linkage->removable_mask, slot);
1574 
1575    /* Indirect indexing. */
1576    if (!nir_src_is_const(offset)) {
1577       /* Only the indirectly-indexed component is marked as indirect. */
1578       for (unsigned i = 0; i < sem.num_slots; i++)
1579          BITSET_SET(linkage->indirect_mask, slot + i * 8);
1580 
1581       /* Set the same vec4 type as the first element in all slots. */
1582       if (linkage->consumer_stage == MESA_SHADER_FRAGMENT) {
1583          enum fs_vec4_type fs_vec4_type =
1584             linkage->fs_vec4_type[sem.location];
1585 
1586          for (unsigned i = 1; i < sem.num_slots; i++)
1587             linkage->fs_vec4_type[sem.location + i] = fs_vec4_type;
1588       }
1589       return false;
1590    }
1591 
1592    if (can_optimize_varying(linkage, sem.location).disable_all)
1593       return false;
1594 
1595    if (is_store) {
1596       nir_def *value = intr->src[0].ssa;
1597 
1598       const bool constant = value->parent_instr->type == nir_instr_type_load_const;
1599 
1600       /* If the store instruction is executed in a divergent block, the value
1601        * that's stored in the output becomes divergent.
1602        *
1603        * Mesh shaders get special treatment because we can't follow their topology,
1604        * so we only propagate constants.
1605        * TODO: revisit this when workgroup divergence analysis is merged.
1606        */
1607       const bool divergent = (!constant && linkage->producer_stage == MESA_SHADER_MESH) ||
1608                              intr->instr.block->divergent ||
1609                              nir_src_is_divergent(&intr->src[0]);
1610 
1611       if (!out->producer.value) {
1612          /* This is the first store to this output. */
1613          BITSET_SET(linkage->output_equal_mask, slot);
1614          out->producer.value = value->parent_instr;
1615 
1616          /* Set whether the value is convergent. Such varyings can be
1617           * promoted to flat regardless of their original interpolation
1618           * mode.
1619           */
1620          if (linkage->consumer_stage == MESA_SHADER_FRAGMENT && !divergent) {
1621             if (value->bit_size == 32)
1622                BITSET_SET(linkage->convergent32_mask, slot);
1623             else if (value->bit_size == 16)
1624                BITSET_SET(linkage->convergent16_mask, slot);
1625             else
1626                unreachable("invalid store_output type");
1627          }
1628       } else {
1629          /* There are multiple stores to the same output. If they store
1630           * different values, clear the mask.
1631           */
1632          if (out->producer.value != value->parent_instr)
1633             BITSET_CLEAR(linkage->output_equal_mask, slot);
1634 
1635          /* Update divergence information. */
1636          if (linkage->consumer_stage == MESA_SHADER_FRAGMENT && divergent) {
1637             if (value->bit_size == 32)
1638                BITSET_CLEAR(linkage->convergent32_mask, slot);
1639             else if (value->bit_size == 16)
1640                BITSET_CLEAR(linkage->convergent16_mask, slot);
1641             else
1642                unreachable("invalid store_output type");
1643          }
1644       }
1645    } else {
1646       /* Only TCS output loads can get here.
1647        *
1648        * We need to record output loads as flat32 or flat16, otherwise
1649        * compaction will think that the slot is free and will put some
1650        * other output in its place.
1651        */
1652       assert(linkage->producer_stage == MESA_SHADER_TESS_CTRL);
1653 
1654       if (!can_optimize_varying(linkage, sem.location).compact)
1655          return false;
1656 
1657       if (intr->def.bit_size == 32)
1658          BITSET_SET(linkage->flat32_mask, slot);
1659       else if (intr->def.bit_size == 16)
1660          BITSET_SET(linkage->flat16_mask, slot);
1661       else
1662          unreachable("invalid load_output type");
1663    }
1664    return false;
1665 }
1666 
1667 /******************************************************************
1668  * TIDYING UP INDIRECT VARYINGS (BEFORE DEAD VARYINGS REMOVAL)
1669  ******************************************************************/
1670 
1671 static void
1672 tidy_up_indirect_varyings(struct linkage_info *linkage)
1673 {
1674    unsigned i;
1675 
1676    /* Indirectly-indexed slots can have direct access too and thus set
1677     * various bitmasks, so clear those bitmasks to make sure such slots
1678     * are not optimized or compacted.
1679     */
1680    BITSET_FOREACH_SET(i, linkage->indirect_mask, NUM_SCALAR_SLOTS) {
1681       slot_disable_optimizations_and_compaction(linkage, i);
1682    }
1683 
1684    /* If some slots have both direct and indirect accesses, move instructions
1685     * of such slots to the slot representing the first array element, so that
1686     * we can remove all loads/stores of dead indirectly-indexed varyings
1687     * by only looking at the first element.
1688     */
1689    BITSET_FOREACH_SET(i, linkage->indirect_mask, NUM_SCALAR_SLOTS) {
1690       struct scalar_slot *first = &linkage->slot[i];
1691 
1692       /* Skip if this is not the first array element. The first element
1693        * always sets num_slots to at least 2.
1694        */
1695       if (first->num_slots <= 1)
1696          continue;
1697 
1698       /* Move instructions from other elements of the indirectly-accessed
1699        * array to the first element (by merging the linked lists).
1700        */
1701       for (unsigned elem = 1; elem < first->num_slots; elem++) {
1702          /* The component slots are at 16-bit granularity, so we need to
1703           * increment by 8 to get the same component in the next vec4 slot.
1704           */
1705          struct scalar_slot *other = &linkage->slot[i + elem * 8];
1706 
1707          list_splicetail(&other->producer.stores, &first->producer.stores);
1708          list_splicetail(&other->producer.loads, &first->producer.loads);
1709          list_splicetail(&other->consumer.loads, &first->consumer.loads);
1710          list_inithead(&other->producer.stores);
1711          list_inithead(&other->producer.loads);
1712          list_inithead(&other->consumer.loads);
1713       }
1714    }
1715 }
1716 
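/* Illustrative sketch (assumed slot numbering, consistent with the
 * "elem * 8" stride above and the "i % 8" component checks used elsewhere
 * in this file): each vec4 varying slot occupies 8 scalar 16-bit slots,
 * two per 32-bit component.
 */
#if 0
static unsigned
example_scalar_16bit_slot(gl_varying_slot location, unsigned component,
                          bool high_16bits)
{
   return location * 8 + component * 2 + (high_16bits ? 1 : 0);
}
#endif
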
1717 /******************************************************************
1718  * TIDYING UP CONVERGENT VARYINGS
1719  ******************************************************************/
1720 
1721 /**
1722  * Reorganize bitmasks for FS because they are initialized such that they can
1723  * intersect with the convergent bitmasks. We want them to be disjoint, so
1724  * that masks of interpolated, flat, and convergent varyings don't intersect.
1725  */
1726 static void
1727 tidy_up_convergent_varyings(struct linkage_info *linkage)
1728 {
1729    if (linkage->consumer_stage != MESA_SHADER_FRAGMENT)
1730       return;
1731 
1732    unsigned i;
1733    /* Whether to promote convergent interpolated slots to flat if it
1734     * doesn't lead to worse compaction.
1735     */
1736    bool optimize_convergent_slots = true; /* only turn off for debugging */
1737 
1738    if (optimize_convergent_slots) {
1739       /* If a slot is flat and convergent and the driver can't load as flat
1740        * from interpolated vec4 slots, keep the flat bit and remove
1741        * the convergent bit. If the driver can load as flat from interpolated
1742        * vec4 slots, keep the convergent bit.
1743        *
1744        * If a slot is interpolated and convergent, remove the interpolated
1745        * bit and keep the convergent bit, which means that it's interpolated,
1746        * but can be promoted to flat.
1747        *
1748        * Since the geometry shader is the only shader that can store values
1749        * in multiple vertices before FS, it's required that all stores are
1750        * equal to be considered convergent (output_equal_mask), otherwise
1751        * the promotion to flat would be incorrect.
1752        */
1753       BITSET_FOREACH_SET(i, linkage->convergent32_mask, NUM_SCALAR_SLOTS) {
1754          if (!BITSET_TEST(linkage->interp_fp32_mask, i) &&
1755              !BITSET_TEST(linkage->color32_mask, i) &&
1756              !BITSET_TEST(linkage->flat32_mask, i) &&
1757              !BITSET6_TEST_ANY(linkage->interp_fp32_qual_masks, i) &&
1758              !BITSET3_TEST_ANY(linkage->color32_qual_masks, i)) {
1759             /* Clear the flag - not used by FS. */
1760             BITSET_CLEAR(linkage->convergent32_mask, i);
1761          } else if ((!linkage->can_mix_convergent_flat_with_interpolated &&
1762                      BITSET_TEST(linkage->flat32_mask, i)) ||
1763                     (linkage->producer_stage == MESA_SHADER_GEOMETRY &&
1764                      !BITSET_TEST(linkage->output_equal_mask, i))) {
1765             /* Keep the original qualifier. */
1766             BITSET_CLEAR(linkage->convergent32_mask, i);
1767          } else {
1768             /* Keep it convergent. */
1769             BITSET_CLEAR(linkage->interp_fp32_mask, i);
1770             for (unsigned b = 0; b < NUM_INTERP_QUALIFIERS; b++)
1771                BITSET_CLEAR(linkage->interp_fp32_qual_masks[b], i);
1772             BITSET_CLEAR(linkage->color32_mask, i);
1773             for (unsigned b = 0; b < NUM_COLOR_QUALIFIERS; b++)
1774                BITSET_CLEAR(linkage->color32_qual_masks[b], i);
1775             BITSET_CLEAR(linkage->flat32_mask, i);
1776          }
1777       }
1778 
1779       BITSET_FOREACH_SET(i, linkage->convergent16_mask, NUM_SCALAR_SLOTS) {
1780          if (!BITSET_TEST(linkage->interp_fp16_mask, i) &&
1781              !BITSET_TEST(linkage->flat16_mask, i) &&
1782              !BITSET6_TEST_ANY(linkage->interp_fp16_qual_masks, i)) {
1783             /* Clear the flag - not used by FS. */
1784             BITSET_CLEAR(linkage->convergent16_mask, i);
1785          } else if ((!linkage->can_mix_convergent_flat_with_interpolated &&
1786                      BITSET_TEST(linkage->flat16_mask, i)) ||
1787                     (linkage->producer_stage == MESA_SHADER_GEOMETRY &&
1788                      !BITSET_TEST(linkage->output_equal_mask, i))) {
1789             /* Keep the original qualifier. */
1790             BITSET_CLEAR(linkage->convergent16_mask, i);
1791          } else {
1792             /* Keep it convergent. */
1793             BITSET_CLEAR(linkage->interp_fp16_mask, i);
1794             for (unsigned b = 0; b < NUM_INTERP_QUALIFIERS; b++)
1795                BITSET_CLEAR(linkage->interp_fp16_qual_masks[b], i);
1796             BITSET_CLEAR(linkage->flat16_mask, i);
1797          }
1798       }
1799    } else {
1800       /* Don't do anything with convergent slots. */
1801       BITSET_ZERO(linkage->convergent32_mask);
1802       BITSET_ZERO(linkage->convergent16_mask);
1803    }
1804 }
1805 
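/* Debug sketch (illustrative, hypothetical helper): after the tidy-up above,
 * any slot left in convergent32_mask has been removed from the interpolated,
 * color, and flat 32-bit masks, so the groups seen by compaction are
 * disjoint.
 */
#if 0
static void
example_assert_convergent32_disjoint(struct linkage_info *linkage)
{
   unsigned i;

   BITSET_FOREACH_SET(i, linkage->convergent32_mask, NUM_SCALAR_SLOTS) {
      assert(!BITSET_TEST(linkage->interp_fp32_mask, i));
      assert(!BITSET_TEST(linkage->color32_mask, i));
      assert(!BITSET_TEST(linkage->flat32_mask, i));
   }
}
#endif
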
1806 /******************************************************************
1807  * DETERMINING UNIFORM AND UBO MOVABILITY BASED ON DRIVER LIMITS
1808  ******************************************************************/
1809 
1810 static bool
1811 is_variable_present(nir_shader *nir, nir_variable *var,
1812                     nir_variable_mode mode, bool spirv)
1813 {
1814    nir_foreach_variable_with_modes(it, nir, mode) {
1815       if ((spirv && it->data.binding == var->data.binding) ||
1816           (!spirv && !strcmp(it->name, var->name)))
1817          return true;
1818    }
1819    return false;
1820 }
1821 
1822 /* TODO: this should be a helper in common code */
1823 static unsigned
1824 get_uniform_components(const struct glsl_type *type)
1825 {
1826    unsigned size = glsl_get_aoa_size(type);
1827    size = MAX2(size, 1);
1828    size *= glsl_get_matrix_columns(glsl_without_array(type));
1829 
1830    if (glsl_type_is_dual_slot(glsl_without_array(type)))
1831       size *= 2;
1832 
1833    /* Convert from vec4 to scalar. */
1834    return size * 4;
1835 }
1836 
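/* Worked example (illustrative): for "uniform mat4 m[2]",
 * glsl_get_aoa_size() returns 2, glsl_get_matrix_columns() returns 4, and
 * the type is not dual-slot, so the result is 2 * 4 = 8 vec4 slots,
 * i.e. 8 * 4 = 32 scalar components.
 */
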
1837 static unsigned
1838 get_ubo_slots(const nir_variable *var)
1839 {
1840    if (glsl_type_is_interface(glsl_without_array(var->type))) {
1841       unsigned slots = glsl_get_aoa_size(var->type);
1842       return MAX2(slots, 1);
1843    }
1844 
1845    return 1;
1846 }
1847 
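/* Worked example (illustrative): a UBO declared as an interface block array,
 * e.g. "uniform Block { ... } blocks[3]", counts as 3 slots, while a single
 * (non-array) block or a non-interface UBO variable counts as 1.
 */
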
1848 /**
1849  * Count uniforms and see if the combined uniform component count is over
1850  * the limit. If it is, don't move any uniforms. It's sufficient if drivers
1851  * declare a very high limit.
1852  */
1853 static void
1854 determine_uniform_movability(struct linkage_info *linkage,
1855                              unsigned max_uniform_components)
1856 {
1857    nir_shader *producer = linkage->producer_builder.shader;
1858    nir_shader *consumer = linkage->consumer_builder.shader;
1859    unsigned num_producer_uniforms = 0;
1860    unsigned num_consumer_uniforms = 0;
1861    unsigned num_shared_uniforms = 0;
1862 
1863    nir_foreach_variable_with_modes(var, producer, nir_var_uniform) {
1864       if (is_variable_present(consumer, var, nir_var_uniform, linkage->spirv))
1865          num_shared_uniforms += get_uniform_components(var->type);
1866       else
1867          num_producer_uniforms += get_uniform_components(var->type);
1868    }
1869 
1870    nir_foreach_variable_with_modes(var, consumer, nir_var_uniform) {
1871       if (!is_variable_present(producer, var, nir_var_uniform, linkage->spirv))
1872          num_consumer_uniforms += get_uniform_components(var->type);
1873    }
1874 
1875    linkage->can_move_uniforms =
1876       num_producer_uniforms + num_consumer_uniforms + num_shared_uniforms <=
1877       max_uniform_components;
1878 }
1879 
1880 /**
1881  * Count UBOs and see if the combined UBO count is over the limit. If it is,
1882  * don't move any UBOs. It's sufficient if drivers declare a very high limit.
1883  */
1884 static void
1885 determine_ubo_movability(struct linkage_info *linkage,
1886                          unsigned max_ubos_per_stage)
1887 {
1888    nir_shader *producer = linkage->producer_builder.shader;
1889    nir_shader *consumer = linkage->consumer_builder.shader;
1890    unsigned num_producer_ubos = 0;
1891    unsigned num_consumer_ubos = 0;
1892    unsigned num_shared_ubos = 0;
1893 
1894    nir_foreach_variable_with_modes(var, producer, nir_var_mem_ubo) {
1895       if (is_variable_present(consumer, var, nir_var_mem_ubo, linkage->spirv))
1896          num_shared_ubos += get_ubo_slots(var);
1897       else
1898          num_producer_ubos += get_ubo_slots(var);
1899    }
1900 
1901    nir_foreach_variable_with_modes(var, consumer, nir_var_mem_ubo) {
1902       if (!is_variable_present(producer, var, nir_var_mem_ubo,
1903                                linkage->spirv))
1904          num_consumer_ubos += get_ubo_slots(var);
1905    }
1906 
1907    linkage->can_move_ubos =
1908       num_producer_ubos + num_consumer_ubos + num_shared_ubos <=
1909       max_ubos_per_stage;
1910 }
1911 
1912 /******************************************************************
1913  * DEAD VARYINGS REMOVAL
1914  ******************************************************************/
1915 
1916 static void
1917 remove_all_stores(struct linkage_info *linkage, unsigned i,
1918                   bool *uses_xfb, nir_opt_varyings_progress *progress)
1919 {
1920    struct scalar_slot *slot = &linkage->slot[i];
1921 
1922    assert(!list_is_empty(&slot->producer.stores) &&
1923           list_is_empty(&slot->producer.loads) &&
1924           list_is_empty(&slot->consumer.loads));
1925 
1926    /* Remove all stores. */
1927    list_for_each_entry_safe(struct list_node, iter, &slot->producer.stores, head) {
1928       if (nir_remove_varying(iter->instr, linkage->consumer_stage)) {
1929          list_del(&iter->head);
1930          *progress |= nir_progress_producer;
1931       } else {
1932          if (has_xfb(iter->instr)) {
1933             *uses_xfb = true;
1934 
1935             if (!is_active_sysval_output(linkage, i, iter->instr)) {
1936                if (iter->instr->src[0].ssa->bit_size == 32)
1937                   BITSET_SET(linkage->xfb32_only_mask, i);
1938                else if (iter->instr->src[0].ssa->bit_size == 16)
1939                   BITSET_SET(linkage->xfb16_only_mask, i);
1940                else
1941                   unreachable("invalid store_output type");
1942             }
1943          }
1944       }
1945    }
1946 }
1947 
1948 static void
1949 remove_dead_varyings(struct linkage_info *linkage,
1950                      nir_opt_varyings_progress *progress)
1951 {
1952    unsigned i;
1953 
1954    /* Remove dead inputs and outputs. */
1955    BITSET_FOREACH_SET(i, linkage->removable_mask, NUM_SCALAR_SLOTS) {
1956       struct scalar_slot *slot = &linkage->slot[i];
1957 
1958       /* Only indirect access can have no loads and stores because we moved
1959        * them to the first element in tidy_up_indirect_varyings().
1960        */
1961       assert(!list_is_empty(&slot->producer.stores) ||
1962              !list_is_empty(&slot->producer.loads) ||
1963              !list_is_empty(&slot->consumer.loads) ||
1964              BITSET_TEST(linkage->indirect_mask, i));
1965 
1966       /* Nothing to do if there are no loads and stores. */
1967       if (list_is_empty(&slot->producer.stores) &&
1968           list_is_empty(&slot->producer.loads) &&
1969           list_is_empty(&slot->consumer.loads))
1970          continue;
1971 
1972       /* If there are producer loads (e.g. TCS) but no consumer loads
1973        * (e.g. TES), set the "no_varying" flag to indicate that the outputs
1974        * are not consumed by the next shader stage (e.g. TES).
1975        */
1976       if (!list_is_empty(&slot->producer.stores) &&
1977           !list_is_empty(&slot->producer.loads) &&
1978           list_is_empty(&slot->consumer.loads)) {
1979          for (unsigned list_index = 0; list_index < 2; list_index++) {
1980             struct list_head *list = list_index ? &slot->producer.stores :
1981                                                   &slot->producer.loads;
1982 
1983             list_for_each_entry(struct list_node, iter, list, head) {
1984                nir_io_semantics sem = nir_intrinsic_io_semantics(iter->instr);
1985                sem.no_varying = 1;
1986                nir_intrinsic_set_io_semantics(iter->instr, sem);
1987             }
1988          }
1989 
1990          /* This tells the compaction to move these varyings to the end. */
1991          if (BITSET_TEST(linkage->flat32_mask, i)) {
1992             assert(linkage->consumer_stage != MESA_SHADER_FRAGMENT);
1993             BITSET_CLEAR(linkage->flat32_mask, i);
1994             BITSET_SET(linkage->no_varying32_mask, i);
1995          }
1996          if (BITSET_TEST(linkage->flat16_mask, i)) {
1997             assert(linkage->consumer_stage != MESA_SHADER_FRAGMENT);
1998             BITSET_CLEAR(linkage->flat16_mask, i);
1999             BITSET_SET(linkage->no_varying16_mask, i);
2000          }
2001          continue;
2002       }
2003 
2004       /* The varyings aren't dead if both loads and stores are present. */
2005       if (!list_is_empty(&slot->producer.stores) &&
2006           (!list_is_empty(&slot->producer.loads) ||
2007            !list_is_empty(&slot->consumer.loads)))
2008          continue;
2009 
2010       bool uses_xfb = false;
2011 
2012       if (list_is_empty(&slot->producer.stores)) {
2013          /* There are no stores. */
2014          assert(!list_is_empty(&slot->producer.loads) ||
2015                 !list_is_empty(&slot->consumer.loads));
2016 
2017          /* TEXn.xy loads can't be removed in FS because of the coord
2018           * replace state, but TEXn outputs can be removed if they are
2019           * not read by FS.
2020           *
2021           * TEXn.zw loads can be eliminated and replaced by (0, 1), which
2022           * is equal to the coord replace value.
2023           */
2024          if (is_interpolated_texcoord(linkage, i)) {
2025             assert(i % 2 == 0); /* high 16-bit slots disallowed */
2026             /* Keep TEXn.xy. */
2027             if (i % 8 < 4)
2028                continue;
2029          }
2030 
2031          /* Replace all loads with undef. Do that for both input loads
2032           * in the consumer stage and output loads in the producer stage
2033           * because we also want to eliminate TCS loads that have no
2034           * corresponding TCS stores.
2035           */
2036          for (unsigned list_index = 0; list_index < 2; list_index++) {
2037             struct list_head *list = list_index ? &slot->producer.loads :
2038                                                   &slot->consumer.loads;
2039             nir_builder *b = list_index ? &linkage->producer_builder :
2040                                           &linkage->consumer_builder;
2041 
2042             list_for_each_entry(struct list_node, iter, list, head) {
2043                nir_intrinsic_instr *loadi = iter->instr;
2044                nir_def *replacement = NULL;
2045 
2046                b->cursor = nir_before_instr(&loadi->instr);
2047 
2048                /* LAYER and VIEWPORT FS inputs should be replaced by 0
2049                 * instead of undef.
2050                 */
2051                gl_varying_slot location = (gl_varying_slot)(vec4_slot(i));
2052 
2053                if (linkage->consumer_stage == MESA_SHADER_FRAGMENT &&
2054                    (location == VARYING_SLOT_LAYER ||
2055                     location == VARYING_SLOT_VIEWPORT ||
2056                     /* TEXn.z is replaced by 0 (matching coord replace) */
2057                     (is_interpolated_texcoord(linkage, i) && i % 8 == 4)))
2058                   replacement = nir_imm_intN_t(b, 0, loadi->def.bit_size);
2059                else if (linkage->consumer_stage == MESA_SHADER_FRAGMENT &&
2060                         /* TEXn.w is replaced by 1 (matching coord replace) */
2061                         is_interpolated_texcoord(linkage, i) && i % 8 == 6)
2062                   replacement = nir_imm_floatN_t(b, 1, loadi->def.bit_size);
2063                else
2064                   replacement = nir_undef(b, 1, loadi->def.bit_size);
2065 
2066                nir_def_replace(&loadi->def, replacement);
2067 
2068                *progress |= list_index ? nir_progress_producer :
2069                                          nir_progress_consumer;
2070             }
2071          }
2072 
2073          /* Clear the lists. */
2074          list_inithead(&slot->producer.loads);
2075          list_inithead(&slot->consumer.loads);
2076       } else {
2077          /* There are no loads. */
2078          remove_all_stores(linkage, i, &uses_xfb, progress);
2079       }
2080 
2081       /* Clear bitmasks associated with this varying slot or array. */
2082       for (unsigned elem = 0; elem < slot->num_slots; elem++)
2083          clear_slot_info_after_removal(linkage, i + elem, uses_xfb);
2084    }
2085 }
2086 
2087 /******************************************************************
2088  * SSA CLONING HELPERS
2089  ******************************************************************/
2090 
2091 /* Pass flags for inter-shader code motion. Also used by helpers. */
2092 #define FLAG_ALU_IS_TES_INTERP_LOAD    BITFIELD_BIT(0)
2093 #define FLAG_MOVABLE                   BITFIELD_BIT(1)
2094 #define FLAG_UNMOVABLE                 BITFIELD_BIT(2)
2095 #define FLAG_POST_DOMINATOR_PROCESSED  BITFIELD_BIT(3)
2096 #define FLAG_GATHER_LOADS_VISITED      BITFIELD_BIT(4)
2097 
2098 #define FLAG_INTERP_MASK               BITFIELD_RANGE(5, 3)
2099 #define FLAG_INTERP_CONVERGENT         (0 << 5)
2100 #define FLAG_INTERP_FLAT               (1 << 5)
2101 /* FS-only interpolation modes. */
2102 #define FLAG_INTERP_PERSP_PIXEL        (2 << 5)
2103 #define FLAG_INTERP_PERSP_CENTROID     (3 << 5)
2104 #define FLAG_INTERP_PERSP_SAMPLE       (4 << 5)
2105 #define FLAG_INTERP_LINEAR_PIXEL       (5 << 5)
2106 #define FLAG_INTERP_LINEAR_CENTROID    (6 << 5)
2107 #define FLAG_INTERP_LINEAR_SAMPLE      (7 << 5)
2108 /* TES-only interpolation modes. (these were found in shaders) */
2109 #define FLAG_INTERP_TES_TRIANGLE_UVW   (2 << 5) /* v0*u + v1*v + v2*w */
2110 #define FLAG_INTERP_TES_TRIANGLE_WUV   (3 << 5) /* v0*w + v1*u + v2*v */
2111 /* TODO: Feel free to insert more TES interpolation equations here. */
2112 
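/* Illustrative sketch (assumed helper names, not part of the pass): the
 * interpolation mode is a 3-bit field inside instr->pass_flags, so two
 * movable expressions can only be grouped when this field matches.
 */
#if 0
static unsigned
example_get_interp_flag(const nir_instr *instr)
{
   return instr->pass_flags & FLAG_INTERP_MASK;
}

static bool
example_same_interp_flag(const nir_instr *a, const nir_instr *b)
{
   return example_get_interp_flag(a) == example_get_interp_flag(b);
}
#endif
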
2113 static bool
2114 can_move_deref_between_shaders(struct linkage_info *linkage, nir_instr *instr)
2115 {
2116    nir_deref_instr *deref = nir_instr_as_deref(instr);
2117    unsigned allowed_modes =
2118       (linkage->can_move_uniforms ? nir_var_uniform : 0) |
2119       (linkage->can_move_ubos ? nir_var_mem_ubo : 0);
2120 
2121    if (!nir_deref_mode_is_one_of(deref, allowed_modes))
2122       return false;
2123 
2124    switch (deref->deref_type) {
2125    case nir_deref_type_var:
2126    case nir_deref_type_struct:
2127    case nir_deref_type_array:
2128       break;
2129    default:
2130       return false;
2131    }
2132 
2133    nir_variable *var = nir_deref_instr_get_variable(deref);
2134 
2135    /* Subroutine uniforms are not moved. Even though moving them works
2136     * correctly (subroutines have been inlined at this point), subroutine
2137     * functions aren't moved, and the linker doesn't like it when a shader
2138     * contains only a subroutine uniform but no subroutine functions.
2139     * This could be fixed in the linker, but for now, don't move
2140     * subroutine uniforms.
2141     */
2142    if (var->name && strstr(var->name, "__subu_") == var->name)
2143       return false;
2144 
2145    return true;
2146 }
2147 
2148 static nir_intrinsic_instr *
2149 find_per_vertex_load_for_tes_interp(nir_instr *instr)
2150 {
2151    switch (instr->type) {
2152    case nir_instr_type_alu: {
2153       nir_alu_instr *alu = nir_instr_as_alu(instr);
2154       unsigned num_srcs = nir_op_infos[alu->op].num_inputs;
2155 
2156       for (unsigned i = 0; i < num_srcs; i++) {
2157          nir_instr *src = alu->src[i].src.ssa->parent_instr;
2158          nir_intrinsic_instr *intr = find_per_vertex_load_for_tes_interp(src);
2159 
2160          if (intr)
2161             return intr;
2162       }
2163       return NULL;
2164    }
2165 
2166    case nir_instr_type_intrinsic: {
2167       nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
2168 
2169       return intr->intrinsic == nir_intrinsic_load_per_vertex_input ?
2170                intr : NULL;
2171    }
2172 
2173    default:
2174       unreachable("unexpected instruction type");
2175    }
2176 }
2177 
2178 static nir_def *
2179 get_stored_value_for_load(struct linkage_info *linkage, nir_instr *instr)
2180 {
2181    nir_intrinsic_instr *intr;
2182 
2183    if (instr->type == nir_instr_type_intrinsic) {
2184       intr = nir_instr_as_intrinsic(instr);
2185    } else {
2186       assert(instr->type == nir_instr_type_alu &&
2187              instr->pass_flags & FLAG_ALU_IS_TES_INTERP_LOAD);
2188       intr = find_per_vertex_load_for_tes_interp(instr);
2189    }
2190 
2191    unsigned slot_index = intr_get_scalar_16bit_slot(intr);
2192    assert(list_is_singular(&linkage->slot[slot_index].producer.stores));
2193 
2194    nir_def *stored_value =
2195       list_first_entry(&linkage->slot[slot_index].producer.stores,
2196                        struct list_node, head)->instr->src[0].ssa;
2197    assert(stored_value->num_components == 1);
2198    return stored_value;
2199 }
2200 
2201 /* Clone the SSA, which can be in a different shader. */
2202 static nir_def *
2203 clone_ssa_impl(struct linkage_info *linkage, nir_builder *b, nir_def *ssa)
2204 {
2205    struct hash_entry *entry = _mesa_hash_table_search(linkage->clones_ht,
2206                                                       ssa->parent_instr);
2207    if (entry)
2208       return entry->data;
2209 
2210    nir_def *clone = NULL;
2211 
2212    switch (ssa->parent_instr->type) {
2213    case nir_instr_type_load_const:
2214       clone = nir_build_imm(b, ssa->num_components, ssa->bit_size,
2215                             nir_instr_as_load_const(ssa->parent_instr)->value);
2216       break;
2217 
2218    case nir_instr_type_undef:
2219       clone = nir_undef(b, ssa->num_components, ssa->bit_size);
2220       break;
2221 
2222    case nir_instr_type_alu: {
2223       nir_alu_instr *alu = nir_instr_as_alu(ssa->parent_instr);
2224 
2225       if (alu->instr.pass_flags & FLAG_ALU_IS_TES_INTERP_LOAD) {
2226          /* We are cloning an interpolated TES load in the producer for
2227           * backward inter-shader code motion.
2228           */
2229          assert(&linkage->producer_builder == b);
2230          return get_stored_value_for_load(linkage, &alu->instr);
2231       }
2232 
2233       nir_def *src[4] = {0};
2234       unsigned num_srcs = nir_op_infos[alu->op].num_inputs;
2235       assert(num_srcs <= ARRAY_SIZE(src));
2236 
2237       for (unsigned i = 0; i < num_srcs; i++)
2238          src[i] = clone_ssa_impl(linkage, b, alu->src[i].src.ssa);
2239 
2240       clone = nir_build_alu(b, alu->op, src[0], src[1], src[2], src[3]);
2241       nir_alu_instr *alu_clone = nir_instr_as_alu(clone->parent_instr);
2242 
2243       alu_clone->exact = alu->exact;
2244       alu_clone->no_signed_wrap = alu->no_signed_wrap;
2245       alu_clone->no_unsigned_wrap = alu->no_unsigned_wrap;
2246       alu_clone->def.num_components = alu->def.num_components;
2247       alu_clone->def.bit_size = alu->def.bit_size;
2248 
2249       for (unsigned i = 0; i < num_srcs; i++) {
2250          memcpy(alu_clone->src[i].swizzle, alu->src[i].swizzle,
2251                 NIR_MAX_VEC_COMPONENTS);
2252       }
2253       break;
2254    }
2255 
2256    case nir_instr_type_intrinsic: {
2257       /* Clone load_deref of uniform or ubo. It's the only thing that can
2258        * occur here.
2259        */
2260       nir_intrinsic_instr *intr = nir_instr_as_intrinsic(ssa->parent_instr);
2261 
2262       switch (intr->intrinsic) {
2263       case nir_intrinsic_load_deref: {
2264          nir_def *ssa = clone_ssa_impl(linkage, b, intr->src[0].ssa);
2265          clone = nir_load_deref(b, nir_instr_as_deref(ssa->parent_instr));
2266          break;
2267       }
2268 
2269       case nir_intrinsic_load_input:
2270       case nir_intrinsic_load_per_primitive_input:
2271       case nir_intrinsic_load_interpolated_input: {
2272          /* We are cloning load_input in the producer for backward
2273           * inter-shader code motion. Replace the input load with the stored
2274           * output value. That way we can clone any expression using inputs
2275           * from the consumer in the producer.
2276           */
2277          assert(&linkage->producer_builder == b);
2278          clone = get_stored_value_for_load(linkage, &intr->instr);
2279          break;
2280       }
2281 
2282       default:
2283          unreachable("unexpected intrinsic");
2284       }
2285       break;
2286    }
2287 
2288    case nir_instr_type_deref: {
2289       nir_deref_instr *deref = nir_instr_as_deref(ssa->parent_instr);
2290       assert(nir_deref_mode_is_one_of(deref, nir_var_uniform | nir_var_mem_ubo));
2291 
2292       /* Get the uniform from the original shader. */
2293       nir_variable *var = nir_deref_instr_get_variable(deref);
2294       assert(!(var->data.mode & nir_var_mem_ubo) || linkage->can_move_ubos);
2295 
2296       /* Declare the uniform in the target shader. If it's the same shader
2297        * (in the case of replacing output loads with a uniform), this has
2298        * no effect. If the variable already exists in the target shader, this
2299        * just returns the existing one.
2300        */
2301       var = nir_clone_uniform_variable(b->shader, var, linkage->spirv);
2302 
2303       if (deref->deref_type == nir_deref_type_var) {
2304          clone = &nir_build_deref_var(b, var)->def;
2305       } else {
2306          nir_deref_instr *parent_orig = nir_deref_instr_parent(deref);
2307          nir_deref_instr *parent_clone =
2308             nir_instr_as_deref(clone_ssa_impl(linkage, b, &parent_orig->def)
2309                                ->parent_instr);
2310 
2311          switch (deref->deref_type) {
2312          case nir_deref_type_array: {
2313             nir_def *index = clone_ssa_impl(linkage, b, deref->arr.index.ssa);
2314             clone = &nir_build_deref_array(b, parent_clone, index)->def;
2315             break;
2316          }
2317          case nir_deref_type_struct:
2318             clone = &nir_build_deref_struct(b, parent_clone,
2319                                             deref->strct.index)->def;
2320             break;
2321          default:
2322             unreachable("invalid deref type");
2323          }
2324       }
2325       break;
2326    }
2327 
2328    default:
2329       unreachable("unexpected instruction type");
2330    }
2331 
2332    _mesa_hash_table_insert(linkage->clones_ht, ssa->parent_instr, clone);
2333    return clone;
2334 }
2335 
2336 static nir_def *
2337 clone_ssa(struct linkage_info *linkage, nir_builder *b, nir_def *ssa)
2338 {
2339    assert(!linkage->clones_ht);
2340    linkage->clones_ht = _mesa_pointer_hash_table_create(NULL);
2341 
2342    nir_def *clone = clone_ssa_impl(linkage, b, ssa);
2343 
2344    _mesa_hash_table_destroy(linkage->clones_ht, NULL);
2345    linkage->clones_ht = NULL;
2346    return clone;
2347 }
2348 
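/* Illustrative sketch (hypothetical caller): the typical pattern is to set
 * the builder cursor right before a load and then replace the load's def
 * with the clone, which is what uniform expression propagation does below.
 */
#if 0
static void
example_replace_load_with_clone(struct linkage_info *linkage,
                                nir_intrinsic_instr *load, nir_def *value)
{
   nir_builder *b = &linkage->consumer_builder;

   b->cursor = nir_before_instr(&load->instr);
   nir_def_replace(&load->def, clone_ssa(linkage, b, value));
}
#endif
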
2349 /******************************************************************
2350  * UNIFORM EXPRESSION PROPAGATION (CONSTANTS, UNIFORMS, UBO LOADS)
2351  ******************************************************************/
2352 
2353 static void
2354 remove_all_stores_and_clear_slot(struct linkage_info *linkage, unsigned slot,
2355                                  nir_opt_varyings_progress *progress)
2356 {
2357    bool uses_xfb = false;
2358    remove_all_stores(linkage, slot, &uses_xfb, progress);
2359    clear_slot_info_after_removal(linkage, slot, uses_xfb);
2360 }
2361 
2362 struct is_uniform_expr_state {
2363    struct linkage_info *linkage;
2364    unsigned cost;
2365 };
2366 
2367 static bool
2368 is_uniform_expression(nir_instr *instr, struct is_uniform_expr_state *state);
2369 
2370 static bool
2371 src_is_uniform_expression(nir_src *src, void *data)
2372 {
2373    return is_uniform_expression(src->ssa->parent_instr,
2374                                 (struct is_uniform_expr_state*)data);
2375 }
2376 
2377 /**
2378  * Return whether instr is a uniform expression that can be moved into
2379  * the next shader.
2380  */
2381 static bool
2382 is_uniform_expression(nir_instr *instr, struct is_uniform_expr_state *state)
2383 {
2384    switch (instr->type) {
2385    case nir_instr_type_load_const:
2386    case nir_instr_type_undef:
2387       return true;
2388 
2389    case nir_instr_type_alu:
2390       break;
2391 
2392    case nir_instr_type_intrinsic:
2393       if (nir_instr_as_intrinsic(instr)->intrinsic == nir_intrinsic_load_deref)
2394          break;
2395       return false;
2396 
2397    case nir_instr_type_deref:
2398       if (!can_move_deref_between_shaders(state->linkage, instr))
2399          return false;
2400       /* We need to iterate over the deref chain recursively. */
2401       break;
2402 
2403    default:
2404       return false;
2405    }
2406 
2407    if (!instr->pass_flags) {
2408       state->cost += state->linkage->varying_estimate_instr_cost ?
2409                         state->linkage->varying_estimate_instr_cost(instr) : 1;
2410       instr->pass_flags = 1;
2411       return nir_foreach_src(instr, src_is_uniform_expression, state);
2412    }
2413    return true;
2414 }
2415 
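/* Example (illustrative, hypothetical names): an output written as
 *
 *    value = u_scale * 2.0 + u_bias;    // only uniforms and constants
 *
 * is a uniform expression and can be recomputed in the consumer, whereas
 *
 *    value = u_scale * tex_result;      // sources a non-uniform value
 *
 * is rejected because one of its sources fails is_uniform_expression().
 */
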
2416 /**
2417  * Propagate constants, uniforms, UBO loads, and uniform expressions
2418  * in output components to inputs loads in the next shader and output
2419  * loads in the current stage, and remove the output components.
2420  *
2421  * Uniform expressions are ALU expressions only sourcing constants, uniforms,
2422  * and UBO loads.
2423  */
2424 static void
2425 propagate_uniform_expressions(struct linkage_info *linkage,
2426                               nir_opt_varyings_progress *progress)
2427 {
2428    unsigned i;
2429 
2430    /* Find uniform expressions. If there are multiple stores, they should all
2431     * store the same value. That's guaranteed by output_equal_mask.
2432     */
2433    BITSET_FOREACH_SET(i, linkage->output_equal_mask, NUM_SCALAR_SLOTS) {
2434       if (!can_optimize_varying(linkage, vec4_slot(i)).propagate_uniform_expr)
2435          continue;
2436 
2437       struct scalar_slot *slot = &linkage->slot[i];
2438       assert(!list_is_empty(&slot->producer.loads) ||
2439              !list_is_empty(&slot->consumer.loads));
2440 
2441       struct is_uniform_expr_state state = {
2442          .linkage = linkage,
2443          .cost = 0,
2444       };
2445 
2446       /* Clear pass_flags, which is used to prevent adding the cost of
2447        * the same instruction multiple times.
2448        */
2449       nir_shader_clear_pass_flags(linkage->producer_builder.shader);
2450 
2451       if (!is_uniform_expression(slot->producer.value, &state))
2452          continue;
2453 
2454       if (state.cost > linkage->max_varying_expression_cost)
2455          continue;
2456 
2457       /* Colors can be propagated only if they are constant between [0, 1]
2458        * because that's the only case when the clamp vertex color state has
2459        * no effect.
2460        */
2461       if (is_interpolated_color(linkage, i) &&
2462           (slot->producer.value->type != nir_instr_type_load_const ||
2463            nir_instr_as_load_const(slot->producer.value)->value[0].f32 < 0 ||
2464            nir_instr_as_load_const(slot->producer.value)->value[0].f32 > 1))
2465          continue;
2466 
2467       /* TEXn.zw can be propagated only if it's equal to (0, 1) because it's
2468        * the coord replace value.
2469        */
2470       if (is_interpolated_texcoord(linkage, i)) {
2471          assert(i % 2 == 0); /* high 16-bit slots disallowed */
2472 
2473          if (i % 8 == 0 || /* TEXn.x */
2474              i % 8 == 2 || /* TEXn.y */
2475              slot->producer.value->type != nir_instr_type_load_const)
2476             continue;
2477 
2478          float value =
2479             nir_instr_as_load_const(slot->producer.value)->value[0].f32;
2480 
2481          /* This ignores signed zeros, but those are destroyed by
2482           * interpolation, so it doesn't matter.
2483           */
2484          if ((i % 8 == 4 && value != 0) ||
2485              (i % 8 == 6 && value != 1))
2486             continue;
2487       }
2488 
2489       /* Clear pass_flags, which is used by clone_ssa. */
2490       nir_shader_clear_pass_flags(linkage->producer_builder.shader);
2491 
2492       /* Replace all loads. Do that for both input and output loads. */
2493       for (unsigned list_index = 0; list_index < 2; list_index++) {
2494          struct list_head *load = list_index ? &slot->producer.loads :
2495                                                &slot->consumer.loads;
2496          nir_builder *b = list_index ? &linkage->producer_builder :
2497                                        &linkage->consumer_builder;
2498 
2499          list_for_each_entry(struct list_node, node, load, head) {
2500             nir_intrinsic_instr *loadi = node->instr;
2501             b->cursor = nir_before_instr(&loadi->instr);
2502 
2503             /* Copy the uniform expression before the load. */
2504             nir_def *clone = clone_ssa(linkage, b,
2505                                        nir_instr_def(slot->producer.value));
2506 
2507             /* Interpolation converts Infs to NaNs. If we skip it, we need to
2508              * convert Infs to NaNs manually.
2509              */
2510             if (loadi->intrinsic == nir_intrinsic_load_interpolated_input &&
2511                 preserve_nans(b->shader, clone->bit_size))
2512                clone = build_convert_inf_to_nan(b, clone);
2513 
2514             /* Replace the original load. */
2515             nir_def_replace(&loadi->def, clone);
2516             *progress |= list_index ? nir_progress_producer :
2517                                       nir_progress_consumer;
2518          }
2519       }
2520 
2521       /* Clear the lists. */
2522       list_inithead(&slot->producer.loads);
2523       list_inithead(&slot->consumer.loads);
2524 
2525       /* Remove all stores now that loads have been replaced. */
2526       remove_all_stores_and_clear_slot(linkage, i, progress);
2527    }
2528 }
2529 
2530 /******************************************************************
2531  * OUTPUT DEDUPLICATION
2532  ******************************************************************/
2533 
2534 /* We can only deduplicate outputs that have the same qualifier, and color
2535  * components must be deduplicated separately because they are affected by GL
2536  * states.
2537  *
2538  * QUAL_*_INTERP_ANY means that the interpolation qualifier doesn't matter for
2539  * deduplication as long as it's not flat.
2540  *
2541  * QUAL_COLOR_SHADEMODEL_ANY is the same, but can be switched to flat
2542  * by the flatshade state, so it can't be deduplicated with
2543  * QUAL_COLOR_INTERP_ANY, which is never flat.
2544  */
2545 enum var_qualifier {
2546    QUAL_PATCH,
2547    QUAL_VAR_FLAT,
2548    QUAL_COLOR_FLAT,
2549    QUAL_EXPLICIT,
2550    QUAL_EXPLICIT_STRICT,
2551    QUAL_PER_PRIMITIVE,
2552    /* When nir_io_has_flexible_input_interpolation_except_flat is set: */
2553    QUAL_VAR_INTERP_ANY,
2554    QUAL_COLOR_INTERP_ANY,
2555    QUAL_COLOR_SHADEMODEL_ANY,
2556    /* When nir_io_has_flexible_input_interpolation_except_flat is not set: */
2557    QUAL_VAR_PERSP_PIXEL,
2558    QUAL_VAR_PERSP_CENTROID,
2559    QUAL_VAR_PERSP_SAMPLE,
2560    QUAL_VAR_LINEAR_PIXEL,
2561    QUAL_VAR_LINEAR_CENTROID,
2562    QUAL_VAR_LINEAR_SAMPLE,
2563    QUAL_COLOR_PERSP_PIXEL,
2564    QUAL_COLOR_PERSP_CENTROID,
2565    QUAL_COLOR_PERSP_SAMPLE,
2566    QUAL_COLOR_LINEAR_PIXEL,
2567    QUAL_COLOR_LINEAR_CENTROID,
2568    QUAL_COLOR_LINEAR_SAMPLE,
2569    QUAL_COLOR_SHADEMODEL_PIXEL,
2570    QUAL_COLOR_SHADEMODEL_CENTROID,
2571    QUAL_COLOR_SHADEMODEL_SAMPLE,
2572    NUM_DEDUP_QUALIFIERS,
2573 
2574    QUAL_SKIP,
2575    QUAL_UNKNOWN,
2576 };
2577 
2578 /* Return the input qualifier if all loads use the same one, else skip.
2579  * This is only used by output deduplication to determine input compatibility.
2580  */
2581 static enum var_qualifier
2582 get_input_qualifier(struct linkage_info *linkage, unsigned i)
2583 {
2584    assert(linkage->consumer_stage == MESA_SHADER_FRAGMENT);
2585    struct scalar_slot *slot = &linkage->slot[i];
2586    bool is_color = is_interpolated_color(linkage, i);
2587    nir_intrinsic_instr *load =
2588       list_first_entry(&slot->consumer.loads, struct list_node, head)->instr;
2589 
2590    if (load->intrinsic == nir_intrinsic_load_input)
2591       return is_color ? QUAL_COLOR_FLAT : QUAL_VAR_FLAT;
2592 
2593    if (load->intrinsic == nir_intrinsic_load_per_primitive_input)
2594       return QUAL_PER_PRIMITIVE;
2595 
2596    if (load->intrinsic == nir_intrinsic_load_input_vertex) {
2597       return nir_intrinsic_io_semantics(load).interp_explicit_strict ?
2598                QUAL_EXPLICIT_STRICT : QUAL_EXPLICIT;
2599    }
2600 
2601    assert(load->intrinsic == nir_intrinsic_load_interpolated_input);
2602 
2603    nir_instr *baryc_instr = load->src[0].ssa->parent_instr;
2604    nir_intrinsic_instr *baryc = baryc_instr->type == nir_instr_type_intrinsic ?
2605                                    nir_instr_as_intrinsic(baryc_instr) : NULL;
2606 
2607    if (linkage->has_flexible_interp) {
2608       if (is_color) {
2609          return nir_intrinsic_interp_mode(baryc) == INTERP_MODE_NONE ?
2610                    QUAL_COLOR_SHADEMODEL_ANY : QUAL_COLOR_INTERP_ANY;
2611       } else {
2612          return QUAL_VAR_INTERP_ANY;
2613       }
2614    }
2615 
2616    /* This is either lowered barycentric_at_offset/at_sample or user
2617     * barycentrics. Treat it like barycentric_at_offset.
2618     */
2619    if (!baryc)
2620       return QUAL_SKIP;
2621 
2622    /* If interpolateAt{Centroid,Offset,Sample} is used, see if there is
2623     * another load that doesn't use those, so that we get the real qualifier.
2624     */
2625    if (baryc->intrinsic == nir_intrinsic_load_barycentric_centroid ||
2626        baryc->intrinsic == nir_intrinsic_load_barycentric_at_offset ||
2627        baryc->intrinsic == nir_intrinsic_load_barycentric_at_sample) {
2628       list_for_each_entry(struct list_node, iter, &slot->consumer.loads, head) {
2629          nir_intrinsic_instr *bar =
2630             nir_instr_as_intrinsic(iter->instr->src[0].ssa->parent_instr);
2631 
2632          if (bar->intrinsic != nir_intrinsic_load_barycentric_centroid &&
2633              bar->intrinsic != nir_intrinsic_load_barycentric_at_offset &&
2634              bar->intrinsic != nir_intrinsic_load_barycentric_at_sample) {
2635             baryc = bar;
2636             break;
2637          }
2638       }
2639    }
2640 
2641    /* Get the exact interpolation qualifier. */
2642    unsigned pixel_location;
2643    enum var_qualifier qual;
2644 
2645    switch (baryc->intrinsic) {
2646    case nir_intrinsic_load_barycentric_pixel:
2647       pixel_location = 0;
2648       break;
2649    case nir_intrinsic_load_barycentric_centroid:
2650       pixel_location = 1;
2651       break;
2652    case nir_intrinsic_load_barycentric_sample:
2653       pixel_location = 2;
2654       break;
2655    case nir_intrinsic_load_barycentric_at_offset:
2656    case nir_intrinsic_load_barycentric_at_sample:
2657       /* Don't deduplicate outputs that are interpolated at offset/sample. */
2658       return QUAL_SKIP;
2659    default:
2660       unreachable("unexpected barycentric src");
2661    }
2662 
2663    switch (nir_intrinsic_interp_mode(baryc)) {
2664    case INTERP_MODE_NONE:
2665       qual = is_color ? QUAL_COLOR_SHADEMODEL_PIXEL :
2666                         QUAL_VAR_PERSP_PIXEL;
2667       break;
2668    case INTERP_MODE_SMOOTH:
2669       qual = is_color ? QUAL_COLOR_PERSP_PIXEL : QUAL_VAR_PERSP_PIXEL;
2670       break;
2671    case INTERP_MODE_NOPERSPECTIVE:
2672       qual = is_color ? QUAL_COLOR_LINEAR_PIXEL : QUAL_VAR_LINEAR_PIXEL;
2673       break;
2674    default:
2675       unreachable("unexpected interp mode");
2676    }
2677 
2678    /* The ordering of the "qual" enum was carefully chosen to make this
2679     * addition correct.
2680     */
2681    STATIC_ASSERT(QUAL_VAR_PERSP_PIXEL + 1 == QUAL_VAR_PERSP_CENTROID);
2682    STATIC_ASSERT(QUAL_VAR_PERSP_PIXEL + 2 == QUAL_VAR_PERSP_SAMPLE);
2683    STATIC_ASSERT(QUAL_VAR_LINEAR_PIXEL + 1 == QUAL_VAR_LINEAR_CENTROID);
2684    STATIC_ASSERT(QUAL_VAR_LINEAR_PIXEL + 2 == QUAL_VAR_LINEAR_SAMPLE);
2685    STATIC_ASSERT(QUAL_COLOR_PERSP_PIXEL + 1 == QUAL_COLOR_PERSP_CENTROID);
2686    STATIC_ASSERT(QUAL_COLOR_PERSP_PIXEL + 2 == QUAL_COLOR_PERSP_SAMPLE);
2687    STATIC_ASSERT(QUAL_COLOR_LINEAR_PIXEL + 1 == QUAL_COLOR_LINEAR_CENTROID);
2688    STATIC_ASSERT(QUAL_COLOR_LINEAR_PIXEL + 2 == QUAL_COLOR_LINEAR_SAMPLE);
2689    STATIC_ASSERT(QUAL_COLOR_SHADEMODEL_PIXEL + 1 ==
2690                  QUAL_COLOR_SHADEMODEL_CENTROID);
2691    STATIC_ASSERT(QUAL_COLOR_SHADEMODEL_PIXEL + 2 ==
2692                  QUAL_COLOR_SHADEMODEL_SAMPLE);
2693    return qual + pixel_location;
2694 }
2695 
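/* A hypothetical deduplication example (names made up): if two outputs store
 * the same SSA value and their FS inputs use compatible qualifiers:
 *
 *    out_a = v;   out_b = v;      ... = interp(in_a) ... interp(in_b) ...
 *
 * then the loads of in_b are redirected to in_a's slot and component, and
 * all stores of out_b are removed. Outputs whose inputs are read with flat
 * vs. non-flat qualifiers, or colors vs. non-colors, land in different hash
 * tables and are never merged with each other.
 */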
2696 static void
2697 deduplicate_outputs(struct linkage_info *linkage,
2698                     nir_opt_varyings_progress *progress)
2699 {
2700    struct hash_table *tables[NUM_DEDUP_QUALIFIERS] = {NULL};
2701    unsigned i;
2702 
2703    /* Find duplicated outputs. If there are multiple stores, they should all
2704     * store the same value as all stores of some other output. That's
2705     * guaranteed by output_equal_mask.
2706     */
2707    BITSET_FOREACH_SET(i, linkage->output_equal_mask, NUM_SCALAR_SLOTS) {
2708       if (!can_optimize_varying(linkage, vec4_slot(i)).deduplicate)
2709          continue;
2710 
2711       struct scalar_slot *slot = &linkage->slot[i];
2712       enum var_qualifier qualifier;
2713       gl_varying_slot var_slot = vec4_slot(i);
2714 
2715       /* Determine which qualifier this slot has. */
2716       if ((var_slot >= VARYING_SLOT_PATCH0 &&
2717            var_slot <= VARYING_SLOT_PATCH31) ||
2718           var_slot == VARYING_SLOT_TESS_LEVEL_INNER ||
2719           var_slot == VARYING_SLOT_TESS_LEVEL_OUTER)
2720          qualifier = QUAL_PATCH;
2721       else if (linkage->consumer_stage != MESA_SHADER_FRAGMENT)
2722          qualifier = QUAL_VAR_FLAT;
2723       else
2724          qualifier = get_input_qualifier(linkage, i);
2725 
2726       if (qualifier == QUAL_SKIP)
2727          continue;
2728 
2729       struct hash_table **table = &tables[qualifier];
2730       if (!*table)
2731          *table = _mesa_pointer_hash_table_create(NULL);
2732 
2733       nir_instr *value = slot->producer.value;
2734 
2735       struct hash_entry *entry = _mesa_hash_table_search(*table, value);
2736       if (!entry) {
2737          _mesa_hash_table_insert(*table, value, (void*)(uintptr_t)i);
2738          continue;
2739       }
2740 
2741       /* We've found a duplicate. Redirect loads and remove stores. */
2742       struct scalar_slot *found_slot = &linkage->slot[(uintptr_t)entry->data];
2743       nir_intrinsic_instr *store =
2744          list_first_entry(&found_slot->producer.stores,
2745                           struct list_node, head)->instr;
2746       nir_io_semantics sem = nir_intrinsic_io_semantics(store);
2747       unsigned component = nir_intrinsic_component(store);
2748 
2749       /* Redirect loads. */
2750       for (unsigned list_index = 0; list_index < 2; list_index++) {
2751          struct list_head *src_loads = list_index ? &slot->producer.loads :
2752                                                     &slot->consumer.loads;
2753          struct list_head *dst_loads = list_index ? &found_slot->producer.loads :
2754                                                     &found_slot->consumer.loads;
2755          bool has_progress = !list_is_empty(src_loads);
2756 
2757          list_for_each_entry(struct list_node, iter, src_loads, head) {
2758             nir_intrinsic_instr *loadi = iter->instr;
2759 
2760             nir_intrinsic_set_io_semantics(loadi, sem);
2761             nir_intrinsic_set_component(loadi, component);
2762 
2763             /* We also need to set the base to match the duplicate load, so
2764              * that CSE can eliminate it.
2765              */
2766             if (!list_is_empty(dst_loads)) {
2767                struct list_node *first =
2768                   list_first_entry(dst_loads, struct list_node, head);
2769                nir_intrinsic_set_base(loadi, nir_intrinsic_base(first->instr));
2770             } else {
2771                /* Use the base of the found store if there are no loads (it can
2772                 * only happen with TCS).
2773                 */
2774                assert(list_index == 0);
2775                nir_intrinsic_set_base(loadi, nir_intrinsic_base(store));
2776             }
2777          }
2778 
2779          if (has_progress) {
2780             /* Move the redirected loads to the found slot, so that compaction
2781              * can find them.
2782              */
2783             list_splicetail(src_loads, dst_loads);
2784             list_inithead(src_loads);
2785 
2786             *progress |= list_index ? nir_progress_producer :
2787                                       nir_progress_consumer;
2788          }
2789       }
2790 
2791       /* Remove all duplicated stores now that loads have been redirected. */
2792       remove_all_stores_and_clear_slot(linkage, i, progress);
2793    }
2794 
2795    for (unsigned i = 0; i < ARRAY_SIZE(tables); i++)
2796       _mesa_hash_table_destroy(tables[i], NULL);
2797 }
2798 
2799 /******************************************************************
2800  * FIND OPEN-CODED TES INPUT INTERPOLATION
2801  ******************************************************************/
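/* This looks for TES input interpolation written out as plain ALU ops, e.g.
 * the GLSL-level pattern (a sketch, variable names made up):
 *
 *    result = in_var[0] * gl_TessCoord.x +
 *             in_var[1] * gl_TessCoord.y +
 *             in_var[2] * gl_TessCoord.z;
 *
 * so that the whole expression can later be treated like a single
 * interpolated load by the inter-shader code motion below.
 */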
2802 
2803 static nir_alu_instr *
2804 get_single_use_as_alu(nir_def *def)
2805 {
2806    /* Only 1 use allowed. */
2807    if (!list_is_singular(&def->uses))
2808       return NULL;
2809 
2810    nir_instr *instr =
2811       nir_src_parent_instr(list_first_entry(&def->uses, nir_src, use_link));
2812    if (instr->type != nir_instr_type_alu)
2813       return NULL;
2814 
2815    return nir_instr_as_alu(instr);
2816 }
2817 
2818 static nir_alu_instr *
2819 check_tes_input_load_get_single_use_alu(nir_intrinsic_instr *load,
2820                                         unsigned *vertex_index,
2821                                         unsigned *vertices_used,
2822                                         unsigned max_vertices)
2823 {
2824    if (load->intrinsic != nir_intrinsic_load_per_vertex_input)
2825       return NULL;
2826 
2827    /* Check the vertex index. Each vertex can be loaded only once. */
2828    if (!nir_src_is_const(load->src[0]))
2829       return NULL;
2830 
2831    *vertex_index = nir_src_as_uint(load->src[0]);
2832    if (*vertex_index >= max_vertices ||
2833        *vertices_used & BITFIELD_BIT(*vertex_index))
2834       return NULL;
2835 
2836    *vertices_used |= BITFIELD_BIT(*vertex_index);
2837 
2838    return get_single_use_as_alu(&load->def);
2839 }
2840 
2841 static bool
2842 gather_fmul_tess_coord(nir_intrinsic_instr *load, nir_alu_instr *fmul,
2843                        unsigned vertex_index, unsigned *tess_coord_swizzle,
2844                        unsigned *tess_coord_used, nir_def **load_tess_coord)
2845 {
2846    unsigned other_src = fmul->src[0].src.ssa == &load->def;
2847    nir_instr *other_instr = fmul->src[other_src].src.ssa->parent_instr;
2848 
2849    assert(fmul->src[!other_src].swizzle[0] == 0);
2850 
2851    if (!is_sysval(other_instr, SYSTEM_VALUE_TESS_COORD))
2852       return false;
2853 
2854    unsigned tess_coord_component = fmul->src[other_src].swizzle[0];
2855    /* Each tesscoord component can be used only once. */
2856    if (*tess_coord_used & BITFIELD_BIT(tess_coord_component))
2857       return false;
2858 
2859    *tess_coord_swizzle |= tess_coord_component << (4 * vertex_index);
2860    *tess_coord_used |= BITFIELD_BIT(tess_coord_component);
2861    *load_tess_coord = &nir_instr_as_intrinsic(other_instr)->def;
2862    return true;
2863 }
2864 
2865 /**
2866  * Find interpolation of the form:
2867  *    input[0].slot * TessCoord.a +
2868  *    input[1].slot * TessCoord.b +
2869  *    input[2].slot * TessCoord.c;
2870  *
2871  * a,b,c can be any of x,y,z, but each can occur only once.
2872  */
2873 static bool
2874 find_tes_triangle_interp_3fmul_2fadd(struct linkage_info *linkage, unsigned i)
2875 {
2876    struct scalar_slot *slot = &linkage->slot[i];
2877    unsigned vertices_used = 0;
2878    unsigned tess_coord_used = 0;
2879    unsigned tess_coord_swizzle = 0;
2880    unsigned num_fmuls = 0, num_fadds = 0;
2881    nir_alu_instr *fadds[2];
2882    nir_def *load_tess_coord = NULL;
2883 
2884    /* Find 3 multiplications by TessCoord and their uses, which must be
2885     * fadds.
2886     */
2887    list_for_each_entry(struct list_node, iter, &slot->consumer.loads, head) {
2888       unsigned vertex_index;
2889       nir_alu_instr *fmul =
2890          check_tes_input_load_get_single_use_alu(iter->instr, &vertex_index,
2891                                                  &vertices_used, 3);
2892       /* At most 3 loads are expected. Also reject exact ops because we
2893        * are going to apply an inexact transformation to them.
2894        */
2895       if (!fmul || fmul->op != nir_op_fmul || fmul->exact || num_fmuls == 3 ||
2896           !gather_fmul_tess_coord(iter->instr, fmul, vertex_index,
2897                                   &tess_coord_swizzle, &tess_coord_used,
2898                                   &load_tess_coord))
2899          return false;
2900 
2901       num_fmuls++;
2902 
2903       /* The multiplication must only be used by fadd. Also reject exact ops.
2904        */
2905       nir_alu_instr *fadd = get_single_use_as_alu(&fmul->def);
2906       if (!fadd || fadd->op != nir_op_fadd || fadd->exact)
2907          return false;
2908 
2909       /* The 3 fmuls must only be used by 2 fadds. */
2910       unsigned i;
2911       for (i = 0; i < num_fadds; i++) {
2912          if (fadds[i] == fadd)
2913             break;
2914       }
2915       if (i == num_fadds) {
2916          if (num_fadds == 2)
2917             return false;
2918 
2919          fadds[num_fadds++] = fadd;
2920       }
2921    }
2922 
2923    if (num_fmuls != 3 || num_fadds != 2)
2924       return false;
2925 
2926    assert(tess_coord_used == 0x7);
2927 
2928    /* We have found that the only uses of the 3 fmuls are 2 fadds, which
2929     * implies that at least 2 fmuls are used by the same fadd.
2930     *
2931     * Check that 1 fadd is used by the other fadd, which can only be
2932     * the result of the TessCoord interpolation.
2933     */
2934    for (unsigned i = 0; i < 2; i++) {
2935       if (get_single_use_as_alu(&fadds[i]->def) == fadds[!i]) {
2936          switch (tess_coord_swizzle) {
2937          case 0x210:
2938             slot->consumer.tes_interp_load = fadds[!i];
2939             slot->consumer.tes_interp_mode = FLAG_INTERP_TES_TRIANGLE_UVW;
2940             slot->consumer.tes_load_tess_coord = load_tess_coord;
2941             return true;
2942 
2943          case 0x102:
2944             slot->consumer.tes_interp_load = fadds[!i];
2945             slot->consumer.tes_interp_mode = FLAG_INTERP_TES_TRIANGLE_WUV;
2946             slot->consumer.tes_load_tess_coord = load_tess_coord;
2947             return true;
2948 
2949          default:
2950             return false;
2951          }
2952       }
2953    }
2954 
2955    return false;
2956 }
2957 
2958 /**
2959  * Find interpolation of the form:
2960  *    fma(input[0].slot, TessCoord.a,
2961  *        fma(input[1].slot, TessCoord.b,
2962  *            input[2].slot * TessCoord.c))
2963  *
2964  * a,b,c can be any of x,y,z, but each can occur only once.
2965  */
2966 static bool
2967 find_tes_triangle_interp_1fmul_2ffma(struct linkage_info *linkage, unsigned i)
2968 {
2969    struct scalar_slot *slot = &linkage->slot[i];
2970    unsigned vertices_used = 0;
2971    unsigned tess_coord_used = 0;
2972    unsigned tess_coord_swizzle = 0;
2973    unsigned num_fmuls = 0, num_ffmas = 0;
2974    nir_alu_instr *ffmas[2], *fmul = NULL;
2975    nir_def *load_tess_coord = NULL;
2976 
2977    list_for_each_entry(struct list_node, iter, &slot->consumer.loads, head) {
2978       unsigned vertex_index;
2979       nir_alu_instr *alu =
2980          check_tes_input_load_get_single_use_alu(iter->instr, &vertex_index,
2981                                                  &vertices_used, 3);
2982 
2983       /* Reject exact ops because we are going to apply an inexact
2984        * transformation to them.
2985        */
2986       if (!alu || (alu->op != nir_op_fmul && alu->op != nir_op_ffma) ||
2987           alu->exact ||
2988           !gather_fmul_tess_coord(iter->instr, alu, vertex_index,
2989                                   &tess_coord_swizzle, &tess_coord_used,
2990                                   &load_tess_coord))
2991          return false;
2992 
2993       /* The multiplication must only be used by ffma. */
2994       if (alu->op == nir_op_fmul) {
2995          nir_alu_instr *ffma = get_single_use_as_alu(&alu->def);
2996          if (!ffma || ffma->op != nir_op_ffma)
2997             return false;
2998 
2999          if (num_fmuls == 1)
3000             return false;
3001 
3002          fmul = alu;
3003          num_fmuls++;
3004       } else {
3005          if (num_ffmas == 2)
3006             return false;
3007 
3008          ffmas[num_ffmas++] = alu;
3009       }
3010    }
3011 
3012    if (num_fmuls != 1 || num_ffmas != 2)
3013       return false;
3014 
3015    assert(tess_coord_used == 0x7);
3016 
3017    /* We have found that fmul has only 1 use and it's ffma, and there are 2
3018     * ffmas. Fail if neither ffma is using fmul.
3019     */
3020    if (ffmas[0]->src[2].src.ssa != &fmul->def &&
3021        ffmas[1]->src[2].src.ssa != &fmul->def)
3022       return false;
3023 
3024    /* If one ffma is using the other ffma, it's guaranteed to be src[2]. */
3025    for (unsigned i = 0; i < 2; i++) {
3026       if (get_single_use_as_alu(&ffmas[i]->def) == ffmas[!i]) {
3027          switch (tess_coord_swizzle) {
3028          case 0x210:
3029             slot->consumer.tes_interp_load = ffmas[!i];
3030             slot->consumer.tes_interp_mode = FLAG_INTERP_TES_TRIANGLE_UVW;
3031             slot->consumer.tes_load_tess_coord = load_tess_coord;
3032             return true;
3033 
3034          case 0x102:
3035             slot->consumer.tes_interp_load = ffmas[!i];
3036             slot->consumer.tes_interp_mode = FLAG_INTERP_TES_TRIANGLE_WUV;
3037             slot->consumer.tes_load_tess_coord = load_tess_coord;
3038             return true;
3039 
3040          default:
3041             return false;
3042          }
3043       }
3044    }
3045 
3046    return false;
3047 }
3048 
3049 static void
3050 find_open_coded_tes_input_interpolation(struct linkage_info *linkage)
3051 {
3052    if (linkage->consumer_stage != MESA_SHADER_TESS_EVAL)
3053       return;
3054 
3055    unsigned i;
3056    BITSET_FOREACH_SET(i, linkage->flat32_mask, NUM_SCALAR_SLOTS) {
3057       if (vec4_slot(i) >= VARYING_SLOT_PATCH0 &&
3058           vec4_slot(i) <= VARYING_SLOT_PATCH31)
3059          continue;
3060       if (find_tes_triangle_interp_3fmul_2fadd(linkage, i))
3061          continue;
3062       if (find_tes_triangle_interp_1fmul_2ffma(linkage, i))
3063          continue;
3064    }
3065 
3066    BITSET_FOREACH_SET(i, linkage->flat16_mask, NUM_SCALAR_SLOTS) {
3067       if (vec4_slot(i) >= VARYING_SLOT_PATCH0 &&
3068           vec4_slot(i) <= VARYING_SLOT_PATCH31)
3069          continue;
3070       if (find_tes_triangle_interp_3fmul_2fadd(linkage, i))
3071          continue;
3072       if (find_tes_triangle_interp_1fmul_2ffma(linkage, i))
3073          continue;
3074    }
3075 }
3076 
3077 /******************************************************************
3078  * BACKWARD INTER-SHADER CODE MOTION
3079  ******************************************************************/
3080 
3081 #define NEED_UPDATE_MOVABLE_FLAGS(instr) \
3082    (!((instr)->pass_flags & (FLAG_MOVABLE | FLAG_UNMOVABLE)))
3083 
3084 #define GET_SRC_INTERP(alu, i) \
3085    ((alu)->src[i].src.ssa->parent_instr->pass_flags & FLAG_INTERP_MASK)
3086 
3087 static bool
3088 can_move_alu_across_interp(struct linkage_info *linkage, nir_alu_instr *alu)
3089 {
3090    /* Exact ALUs can't be moved across interpolation. */
3091    if (alu->exact)
3092       return false;
3093 
3094    /* Interpolation converts Infs to NaNs. If we turn a result of an ALU
3095     * instruction into a new interpolated input, it converts Infs to NaNs for
3096     * that instruction, while removing the Infs to NaNs conversion for sourced
3097     * interpolated values. We can't do that if Infs and NaNs must be preserved.
3098     */
3099    if (preserve_infs_nans(linkage->consumer_builder.shader, alu->def.bit_size))
3100       return false;
3101 
3102    switch (alu->op) {
3103    /* Always legal if the sources are interpolated identically because:
3104     *    interp(x, i, j) + interp(y, i, j) = interp(x + y, i, j)
3105     *    interp(x, i, j) + convergent_expr = interp(x + convergent_expr, i, j)
3106     */
3107    case nir_op_fadd:
3108    case nir_op_fsub:
3109    /* This is the same as multiplying by -1, which is always legal, see fmul.
3110     */
3111    case nir_op_fneg:
3112    case nir_op_mov:
3113       return true;
3114 
3115    /* At least one side of the multiplication must be convergent because this
3116     * is the only equation with multiplication that is true:
3117     *    interp(x, i, j) * convergent_expr = interp(x * convergent_expr, i, j)
3118     */
3119    case nir_op_fmul:
3120    case nir_op_fmulz:
3121    case nir_op_ffma:
3122    case nir_op_ffmaz:
3123       return GET_SRC_INTERP(alu, 0) == FLAG_INTERP_CONVERGENT ||
3124              GET_SRC_INTERP(alu, 1) == FLAG_INTERP_CONVERGENT;
3125 
3126    case nir_op_fdiv:
3127       /* The right side must be convergent, which then follows the fmul rule.
3128        */
3129       return GET_SRC_INTERP(alu, 1) == FLAG_INTERP_CONVERGENT;
3130 
3131    case nir_op_flrp:
3132       /* Using the same rule as fmul. */
3133       return (GET_SRC_INTERP(alu, 0) == FLAG_INTERP_CONVERGENT &&
3134               GET_SRC_INTERP(alu, 1) == FLAG_INTERP_CONVERGENT) ||
3135              GET_SRC_INTERP(alu, 2) == FLAG_INTERP_CONVERGENT;
3136 
3137    default:
3138       /* Moving other ALU instructions across interpolation is illegal. */
3139       return false;
3140    }
3141 }
3142 
3143 /* Determine whether an instruction is movable from the consumer to
3144  * the producer. Also determine which interpolation modes each ALU instruction
3145  * should use if its value was promoted to a new input.
3146  */
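/* Rough examples of the resulting flags (hypothetical scalar inputs):
 *
 *    interp_input + 1.0             -> movable, keeps the interp flags
 *    flat_input * convergent_input  -> movable, flat
 *    interp_input + flat_input      -> unmovable (conflicting interpolation)
 *    interp_a * interp_b            -> unmovable unless one side is
 *                                      convergent (see
 *                                      can_move_alu_across_interp)
 */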
3147 static void
3148 update_movable_flags(struct linkage_info *linkage, nir_instr *instr)
3149 {
3150    /* This function shouldn't be called more than once for each instruction
3151     * to minimize recursive calling.
3152     */
3153    assert(NEED_UPDATE_MOVABLE_FLAGS(instr));
3154 
3155    switch (instr->type) {
3156    case nir_instr_type_undef:
3157    case nir_instr_type_load_const:
3158       /* Treat constants as convergent, which means compatible with both flat
3159        * and non-flat inputs.
3160        */
3161       instr->pass_flags |= FLAG_MOVABLE | FLAG_INTERP_CONVERGENT;
3162       return;
3163 
3164    case nir_instr_type_alu: {
3165       nir_alu_instr *alu = nir_instr_as_alu(instr);
3166       unsigned num_srcs = nir_op_infos[alu->op].num_inputs;
3167       unsigned alu_interp;
3168 
3169       /* Make vector ops unmovable. They are technically movable but more
3170        * complicated, and NIR should be scalarized for this pass anyway.
3171        * The only remaining vector ops should be vecN for intrinsic sources.
3172        */
3173       if (alu->def.num_components > 1) {
3174          instr->pass_flags |= FLAG_UNMOVABLE;
3175          return;
3176       }
3177 
3178       alu_interp = FLAG_INTERP_CONVERGENT;
3179 
3180       for (unsigned i = 0; i < num_srcs; i++) {
3181          nir_instr *src_instr = alu->src[i].src.ssa->parent_instr;
3182 
3183          if (NEED_UPDATE_MOVABLE_FLAGS(src_instr))
3184             update_movable_flags(linkage, src_instr);
3185 
3186          if (src_instr->pass_flags & FLAG_UNMOVABLE) {
3187             instr->pass_flags |= FLAG_UNMOVABLE;
3188             return;
3189          }
3190 
3191          /* Determine which interpolation mode this ALU instruction should
3192           * use if it was promoted to a new input.
3193           */
3194          unsigned src_interp = src_instr->pass_flags & FLAG_INTERP_MASK;
3195 
3196          if (alu_interp == src_interp ||
3197              src_interp == FLAG_INTERP_CONVERGENT) {
3198             /* Nothing to do. */
3199          } else if (alu_interp == FLAG_INTERP_CONVERGENT) {
3200             alu_interp = src_interp;
3201          } else {
3202             assert(alu_interp != FLAG_INTERP_CONVERGENT &&
3203                    src_interp != FLAG_INTERP_CONVERGENT &&
3204                    alu_interp != src_interp);
3205             /* The ALU instruction sources conflicting interpolation flags.
3206              * It can never become a new input.
3207              */
3208             instr->pass_flags |= FLAG_UNMOVABLE;
3209             return;
3210          }
3211       }
3212 
3213       /* Check if we can move the ALU instruction across an interpolated
3214        * load into the previous shader.
3215        */
3216       if (alu_interp > FLAG_INTERP_FLAT &&
3217           !can_move_alu_across_interp(linkage, alu)) {
3218          instr->pass_flags |= FLAG_UNMOVABLE;
3219          return;
3220       }
3221 
3222       instr->pass_flags |= FLAG_MOVABLE | alu_interp;
3223       return;
3224    }
3225 
3226    case nir_instr_type_intrinsic: {
3227       /* Movable input loads already have FLAG_MOVABLE on them.
3228        * Unmovable input loads skipped by initialization get UNMOVABLE here.
3229        * (e.g. colors, texcoords)
3230        *
3231        * The only other movable intrinsic is load_deref for uniforms and UBOs.
3232        * Other intrinsics are not movable.
3233        */
3234       nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
3235 
3236       if (intr->intrinsic == nir_intrinsic_load_deref) {
3237          nir_instr *deref = intr->src[0].ssa->parent_instr;
3238 
3239          if (NEED_UPDATE_MOVABLE_FLAGS(deref))
3240             update_movable_flags(linkage, deref);
3241 
3242          instr->pass_flags |= deref->pass_flags;
3243          return;
3244       }
3245 
3246       instr->pass_flags |= FLAG_UNMOVABLE;
3247       return;
3248    }
3249 
3250    case nir_instr_type_deref: {
3251       if (!can_move_deref_between_shaders(linkage, instr)) {
3252          instr->pass_flags |= FLAG_UNMOVABLE;
3253          return;
3254       }
3255 
3256       nir_deref_instr *deref = nir_instr_as_deref(instr);
3257       nir_deref_instr *parent = nir_deref_instr_parent(deref);
3258 
3259       if (parent) {
3260          if (NEED_UPDATE_MOVABLE_FLAGS(&parent->instr))
3261             update_movable_flags(linkage, &parent->instr);
3262 
3263          if (parent->instr.pass_flags & FLAG_UNMOVABLE) {
3264             instr->pass_flags |= FLAG_UNMOVABLE;
3265             return;
3266          }
3267       }
3268 
3269       switch (deref->deref_type) {
3270       case nir_deref_type_var:
3271          instr->pass_flags |= FLAG_MOVABLE;
3272          return;
3273 
3274       case nir_deref_type_struct:
3275          assert(parent->instr.pass_flags & FLAG_MOVABLE);
3276          instr->pass_flags |= parent->instr.pass_flags;
3277          return;
3278 
3279       case nir_deref_type_array: {
3280          nir_instr *index = deref->arr.index.ssa->parent_instr;
3281 
3282          if (NEED_UPDATE_MOVABLE_FLAGS(index))
3283             update_movable_flags(linkage, index);
3284 
3285          /* Integer array indices should be movable only if they are
3286           * convergent or flat.
3287           */
3288          ASSERTED unsigned index_interp = index->pass_flags & FLAG_INTERP_MASK;
3289          assert(index->pass_flags & FLAG_UNMOVABLE ||
3290                 (index_interp == FLAG_INTERP_CONVERGENT ||
3291                  index_interp == FLAG_INTERP_FLAT));
3292 
3293          if (parent) {
3294             unsigned parent_interp = parent->instr.pass_flags & FLAG_INTERP_MASK;
3295 
3296             /* Check if the interpolation flags are compatible. */
3297             if (parent_interp != FLAG_INTERP_CONVERGENT &&
3298                 index_interp != FLAG_INTERP_CONVERGENT &&
3299                 parent_interp != index_interp) {
3300                instr->pass_flags |= FLAG_UNMOVABLE;
3301                return;
3302             }
3303 
3304             /* Pick the one that isn't convergent because convergent inputs
3305              * can be in expressions with any other qualifier.
3306              */
3307             if (parent_interp == FLAG_INTERP_CONVERGENT)
3308                instr->pass_flags |= index->pass_flags;
3309             else
3310                instr->pass_flags |= parent->instr.pass_flags;
3311          } else {
3312             instr->pass_flags |= index->pass_flags;
3313          }
3314          return;
3315       }
3316 
3317       default:
3318          instr->pass_flags |= FLAG_UNMOVABLE;
3319          return;
3320       }
3321    }
3322 
3323    default:
3324       instr->pass_flags |= FLAG_UNMOVABLE;
3325       return;
3326    }
3327 }
3328 
3329 /* Gather the input loads used by the post-dominator using DFS. */
3330 static void
3331 gather_used_input_loads(nir_instr *instr,
3332                         nir_intrinsic_instr *loads[NUM_SCALAR_SLOTS],
3333                         unsigned *num_loads)
3334 {
3335    switch (instr->type) {
3336    case nir_instr_type_undef:
3337    case nir_instr_type_load_const:
3338       return;
3339 
3340    case nir_instr_type_alu: {
3341       nir_alu_instr *alu = nir_instr_as_alu(instr);
3342       unsigned num_srcs = nir_op_infos[alu->op].num_inputs;
3343 
3344       for (unsigned i = 0; i < num_srcs; i++) {
3345          gather_used_input_loads(alu->src[i].src.ssa->parent_instr,
3346                                  loads, num_loads);
3347       }
3348       return;
3349    }
3350 
3351    case nir_instr_type_intrinsic: {
3352       nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
3353 
3354       switch (intr->intrinsic) {
3355       case nir_intrinsic_load_tess_coord:
3356          return;
3357 
3358       case nir_intrinsic_load_deref:
3359          gather_used_input_loads(intr->src[0].ssa->parent_instr,
3360                                  loads, num_loads);
3361          return;
3362 
3363       case nir_intrinsic_load_input:
3364       case nir_intrinsic_load_per_vertex_input:
3365       case nir_intrinsic_load_interpolated_input:
3366          if (!(intr->instr.pass_flags & FLAG_GATHER_LOADS_VISITED)) {
3367             assert(*num_loads < NUM_SCALAR_SLOTS*8);
3368             loads[(*num_loads)++] = intr;
3369             intr->instr.pass_flags |= FLAG_GATHER_LOADS_VISITED;
3370          }
3371          return;
3372 
3373       default:
3374          printf("%u\n", intr->intrinsic);
3375          unreachable("unexpected intrinsic");
3376       }
3377    }
3378 
3379    case nir_instr_type_deref: {
3380       nir_deref_instr *deref = nir_instr_as_deref(instr);
3381       nir_deref_instr *parent = nir_deref_instr_parent(deref);
3382 
3383       if (parent)
3384          gather_used_input_loads(&parent->instr, loads, num_loads);
3385 
3386       switch (deref->deref_type) {
3387       case nir_deref_type_var:
3388       case nir_deref_type_struct:
3389          return;
3390 
3391       case nir_deref_type_array:
3392          gather_used_input_loads(deref->arr.index.ssa->parent_instr,
3393                                  loads, num_loads);
3394          return;
3395 
3396       default:
3397          unreachable("unexpected deref type");
3398       }
3399    }
3400 
3401    default:
3402       unreachable("unexpected instr type");
3403    }
3404 }
3405 
3406 /* Move a post-dominator, which is an ALU opcode, into the previous shader,
3407  * and replace the post-dominator with a new input load.
3408  */
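/* A hypothetical GLSL-level sketch of the motion (names made up):
 *
 *    VS: out1 = a;  out2 = b;         FS: x = interp(in1) + interp(in2);
 *
 * becomes:
 *
 *    VS: out1 = a + b;                FS: x = interp(in1);
 *
 * i.e. the ALU post-dominator "in1 + in2" is cloned at the end of the
 * producer block containing the stores, the store of the reused slot is
 * rewritten to source the clone, and the now-dead second varying is removed.
 */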
3409 static bool
3410 try_move_postdominator(struct linkage_info *linkage,
3411                        struct nir_use_dominance_state *postdom_state,
3412                        nir_instr *postdom,
3413                        nir_def *load_def,
3414                        nir_intrinsic_instr *first_load,
3415                        nir_opt_varyings_progress *progress)
3416 {
3417 #define PRINT 0
3418 #if PRINT
3419    printf("Trying to move post-dom: ");
3420    nir_print_instr(postdom, stdout);
3421    puts("");
3422 #endif
3423 
3424    /* Gather the input loads used by the post-dominator using DFS. */
3425    nir_intrinsic_instr *loads[NUM_SCALAR_SLOTS*8];
3426    unsigned num_loads = 0;
3427    gather_used_input_loads(postdom, loads, &num_loads);
3428    assert(num_loads && "no loads were gathered");
3429 
3430    /* Clear the flag set by gather_used_input_loads. */
3431    for (unsigned i = 0; i < num_loads; i++)
3432       loads[i]->instr.pass_flags &= ~FLAG_GATHER_LOADS_VISITED;
3433 
3434    /* For all the loads, the previous shader must have the corresponding
3435     * output stores in the same basic block because we are going to replace
3436     * them with 1 store. Only TCS and GS can have stores of different outputs
3437     * in different blocks.
3438     */
3439    nir_block *block = NULL;
3440 
3441    for (unsigned i = 0; i < num_loads; i++) {
3442       unsigned slot_index = intr_get_scalar_16bit_slot(loads[i]);
3443       struct scalar_slot *slot = &linkage->slot[slot_index];
3444 
3445       assert(list_is_singular(&slot->producer.stores));
3446       nir_intrinsic_instr *store =
3447          list_first_entry(&slot->producer.stores, struct list_node,
3448                           head)->instr;
3449 
3450       if (!block) {
3451          block = store->instr.block;
3452          continue;
3453       }
3454       if (block != store->instr.block)
3455          return false;
3456    }
3457 
3458    assert(block);
3459 
3460 #if PRINT
3461    printf("Post-dom accepted: ");
3462    nir_print_instr(postdom, stdout);
3463    puts("\n");
3464 #endif
3465 
3466    /* Determine the scalar slot index of the new varying. It will reuse
3467     * the slot of the load we started from because the load will be
3468     * removed.
3469     */
3470    unsigned final_slot = intr_get_scalar_16bit_slot(first_load);
3471 
3472    /* Replace the post-dominator in the consumer with a new input load.
3473     * Since we are reusing the same slot as the first load and it has
3474     * the right interpolation qualifiers, use it as the new load by using
3475     * it in place of the post-dominator.
3476     *
3477     * Boolean post-dominators are upcast in the producer and then downcast
3478     * in the consumer.
3479     */
3480    unsigned slot_index = final_slot;
3481    struct scalar_slot *slot = &linkage->slot[slot_index];
3482    nir_builder *b = &linkage->consumer_builder;
3483    b->cursor = nir_after_instr(load_def->parent_instr);
3484    nir_def *postdom_def = nir_instr_def(postdom);
3485    unsigned alu_interp = postdom->pass_flags & FLAG_INTERP_MASK;
3486    nir_def *new_input, *new_tes_loads[3];
3487    BITSET_WORD *mask;
3488 
3489    /* Convergent instruction results that are not interpolatable (integer or
3490     * FP64) should not be moved, because compaction can relocate convergent
3491     * varyings to interpolated vec4 slots; the definition of convergent
3492     * varyings implies that they can be interpolated, which doesn't work with
3493     * integer and FP64 values.
3494     *
3495     * Check the result type and if it's not float and the driver doesn't
3496     * support convergent flat loads from interpolated vec4 slots, don't move
3497     * it.
3498     */
3499    if (linkage->consumer_stage == MESA_SHADER_FRAGMENT &&
3500        alu_interp == FLAG_INTERP_CONVERGENT &&
3501        !linkage->can_mix_convergent_flat_with_interpolated &&
3502        (postdom->type != nir_instr_type_alu ||
3503         (postdom_def->bit_size != 16 && postdom_def->bit_size != 32) ||
3504         !(nir_op_infos[nir_instr_as_alu(postdom)->op].output_type & nir_type_float)))
3505       return false;
3506 
3507    /* NIR can't do 1-bit inputs. Convert them to a bigger size. */
3508    assert(postdom_def->bit_size & (1 | 16 | 32));
3509    unsigned new_bit_size = postdom_def->bit_size;
3510 
3511    if (new_bit_size == 1) {
3512       assert(alu_interp == FLAG_INTERP_CONVERGENT ||
3513              alu_interp == FLAG_INTERP_FLAT);
3514       /* TODO: We could use 16 bits instead, but that currently fails on AMD.
3515        */
3516       new_bit_size = 32;
3517    }
3518 
3519    bool rewrite_convergent_to_flat =
3520       alu_interp == FLAG_INTERP_CONVERGENT &&
3521       linkage->can_mix_convergent_flat_with_interpolated;
3522 
3523    /* Create the new input load. This creates a new load (or a series of
3524     * loads in case of open-coded TES interpolation) that's identical to
3525     * the original load(s).
3526     */
3527    if (linkage->consumer_stage == MESA_SHADER_FRAGMENT &&
3528        alu_interp != FLAG_INTERP_FLAT && !rewrite_convergent_to_flat) {
3529       nir_def *baryc = NULL;
3530 
3531       /* Determine the barycentric coordinates. */
3532       switch (alu_interp) {
3533       case FLAG_INTERP_PERSP_PIXEL:
3534       case FLAG_INTERP_LINEAR_PIXEL:
3535          baryc = nir_load_barycentric_pixel(b, 32);
3536          break;
3537       case FLAG_INTERP_PERSP_CENTROID:
3538       case FLAG_INTERP_LINEAR_CENTROID:
3539          baryc = nir_load_barycentric_centroid(b, 32);
3540          break;
3541       case FLAG_INTERP_PERSP_SAMPLE:
3542       case FLAG_INTERP_LINEAR_SAMPLE:
3543          baryc = nir_load_barycentric_sample(b, 32);
3544          break;
3545       default:
3546          baryc = first_load->src[0].ssa;
3547          break;
3548       }
3549 
3550       if (baryc != first_load->src[0].ssa) {
3551          nir_intrinsic_instr *baryc_i =
3552             nir_instr_as_intrinsic(baryc->parent_instr);
3553 
3554          if (alu_interp == FLAG_INTERP_LINEAR_PIXEL ||
3555             alu_interp == FLAG_INTERP_LINEAR_CENTROID ||
3556             alu_interp == FLAG_INTERP_LINEAR_SAMPLE)
3557             nir_intrinsic_set_interp_mode(baryc_i, INTERP_MODE_NOPERSPECTIVE);
3558          else
3559             nir_intrinsic_set_interp_mode(baryc_i, INTERP_MODE_SMOOTH);
3560       }
3561 
3562       new_input = nir_load_interpolated_input(
3563                      b, 1, new_bit_size, baryc, nir_imm_int(b, 0),
3564                      .base = nir_intrinsic_base(first_load),
3565                      .component = nir_intrinsic_component(first_load),
3566                      .dest_type = nir_alu_type_get_base_type(nir_intrinsic_dest_type(first_load)) |
3567                                   new_bit_size,
3568                      .io_semantics = nir_intrinsic_io_semantics(first_load));
3569 
3570       if (alu_interp == FLAG_INTERP_CONVERGENT) {
3571          mask = new_bit_size == 16 ? linkage->convergent16_mask
3572                                    : linkage->convergent32_mask;
3573       } else if (linkage->has_flexible_interp) {
3574          mask = new_bit_size == 16 ? linkage->interp_fp16_mask
3575                                    : linkage->interp_fp32_mask;
3576       } else {
3577          /* The index of the qualifier is encoded in alu_interp, so extract it. */
3578          unsigned i = (alu_interp - FLAG_INTERP_PERSP_PIXEL) >> 5;
3579          mask = new_bit_size == 16 ? linkage->interp_fp16_qual_masks[i]
3580                                    : linkage->interp_fp32_qual_masks[i];
3581       }
3582    } else if (linkage->consumer_stage == MESA_SHADER_TESS_EVAL &&
3583               alu_interp > FLAG_INTERP_FLAT) {
3584       nir_def *zero = nir_imm_int(b, 0);
3585 
3586       for (unsigned i = 0; i < 3; i++) {
3587          new_tes_loads[i] =
3588             nir_load_per_vertex_input(b, 1, new_bit_size,
3589                   i ? nir_imm_int(b, i) : zero, zero,
3590                   .base = nir_intrinsic_base(first_load),
3591                   .component = nir_intrinsic_component(first_load),
3592                      .dest_type = nir_alu_type_get_base_type(nir_intrinsic_dest_type(first_load)) |
3593                                   new_bit_size,
3594                   .io_semantics = nir_intrinsic_io_semantics(first_load));
3595       }
3596 
3597       int remap_uvw[3] = {0, 1, 2};
3598       int remap_wuv[3] = {2, 0, 1};
3599       int *remap;
3600 
3601       switch (alu_interp) {
3602       case FLAG_INTERP_TES_TRIANGLE_UVW:
3603          remap = remap_uvw;
3604          break;
3605       case FLAG_INTERP_TES_TRIANGLE_WUV:
3606          remap = remap_wuv;
3607          break;
3608       default:
3609          unreachable("invalid TES interpolation mode");
3610       }
3611 
3612       nir_def *tesscoord = slot->consumer.tes_load_tess_coord;
3613       nir_def *defs[3];
3614 
3615       for (unsigned i = 0; i < 3; i++) {
3616          if (i == 0) {
3617             defs[i] = nir_fmul(b, new_tes_loads[i],
3618                                nir_channel(b, tesscoord, remap[i]));
3619          } else {
3620             defs[i] = nir_ffma(b, new_tes_loads[i],
3621                                nir_channel(b, tesscoord, remap[i]),
3622                                defs[i - 1]);
3623          }
3624       }
3625       new_input = defs[2];
3626 
3627       mask = new_bit_size == 16 ? linkage->flat16_mask
3628                                 : linkage->flat32_mask;
3629    } else {
3630       /* We have to rewrite convergent to flat here and not during compaction
3631        * because compaction adds code to convert Infs to NaNs for
3632        * "load_interpolated_input -> load_input" replacements, which corrupts
3633        * integer data.
3634        */
3635       assert(linkage->consumer_stage != MESA_SHADER_FRAGMENT ||
3636              alu_interp == FLAG_INTERP_FLAT || rewrite_convergent_to_flat);
3637 
3638       new_input =
3639          nir_load_input(b, 1, new_bit_size, nir_imm_int(b, 0),
3640                         .base = nir_intrinsic_base(first_load),
3641                         .component = nir_intrinsic_component(first_load),
3642                         .dest_type = nir_alu_type_get_base_type(nir_intrinsic_dest_type(first_load)) |
3643                                     new_bit_size,
3644                         .io_semantics = nir_intrinsic_io_semantics(first_load));
3645 
3646       mask = new_bit_size == 16 ? linkage->flat16_mask
3647                                 : linkage->flat32_mask;
3648 
3649       if (rewrite_convergent_to_flat) {
3650          mask = new_bit_size == 16 ? linkage->convergent16_mask
3651                                    : linkage->convergent32_mask;
3652       }
3653    }
3654 
3655    assert(!BITSET_TEST(linkage->no_varying32_mask, slot_index));
3656    assert(!BITSET_TEST(linkage->no_varying16_mask, slot_index));
3657 
3658    /* Re-set the category of the new scalar input. This will cause
3659     * the compaction to treat it as a different type, so that it will be moved
3660     * into the vec4 that has compatible interpolation qualifiers.
3661     *
3662     * This shouldn't be done if none of the masks has this slot set, which
3663     * indicates that compaction is disallowed for it.
3664     */
3665    if (BITSET_TEST(linkage->interp_fp32_mask, slot_index) ||
3666        BITSET_TEST(linkage->interp_fp16_mask, slot_index) ||
3667        BITSET6_TEST_ANY(linkage->interp_fp32_qual_masks, slot_index) ||
3668        BITSET6_TEST_ANY(linkage->interp_fp16_qual_masks, slot_index) ||
3669        BITSET_TEST(linkage->flat32_mask, slot_index) ||
3670        BITSET_TEST(linkage->flat16_mask, slot_index) ||
3671        BITSET_TEST(linkage->convergent32_mask, slot_index) ||
3672        BITSET_TEST(linkage->convergent16_mask, slot_index)) {
3673       BITSET_CLEAR(linkage->interp_fp32_mask, slot_index);
3674       for (unsigned i = 0; i < NUM_INTERP_QUALIFIERS; i++)
3675          BITSET_CLEAR(linkage->interp_fp32_qual_masks[i], slot_index);
3676       BITSET_CLEAR(linkage->interp_fp16_mask, slot_index);
3677       for (unsigned i = 0; i < NUM_INTERP_QUALIFIERS; i++)
3678          BITSET_CLEAR(linkage->interp_fp16_qual_masks[i], slot_index);
3679       BITSET_CLEAR(linkage->flat16_mask, slot_index);
3680       BITSET_CLEAR(linkage->flat32_mask, slot_index);
3681       BITSET_CLEAR(linkage->convergent16_mask, slot_index);
3682       BITSET_CLEAR(linkage->convergent32_mask, slot_index);
3683       BITSET_SET(mask, slot_index);
3684    }
3685 
3686    /* Replace the existing load with the new load in the slot. */
3687    if (linkage->consumer_stage == MESA_SHADER_TESS_EVAL &&
3688        alu_interp >= FLAG_INTERP_TES_TRIANGLE_UVW) {
3689       /* For TES, replace all 3 loads. */
3690       unsigned i = 0;
3691       list_for_each_entry(struct list_node, iter, &slot->consumer.loads,
3692                           head) {
3693          assert(i < 3);
3694          iter->instr = nir_instr_as_intrinsic(new_tes_loads[i]->parent_instr);
3695          i++;
3696       }
3697 
3698       assert(i == 3);
3699       assert(postdom_def->bit_size != 1);
3700 
3701       slot->consumer.tes_interp_load =
3702          nir_instr_as_alu(new_input->parent_instr);
3703    } else {
3704       assert(list_is_singular(&slot->consumer.loads));
3705       list_first_entry(&slot->consumer.loads, struct list_node, head)->instr =
3706          nir_instr_as_intrinsic(new_input->parent_instr);
3707 
3708       /* The input is a bigger type even if the post-dominator is boolean. */
3709       if (postdom_def->bit_size == 1)
3710          new_input = nir_ine_imm(b, new_input, 0);
3711    }
3712 
3713    nir_def_rewrite_uses(postdom_def, new_input);
3714 
3715    /* Clone the post-dominator at the end of the block in the producer
3716     * where the output stores are.
3717     */
3718    b = &linkage->producer_builder;
3719    b->cursor = nir_after_block_before_jump(block);
3720    nir_def *producer_clone = clone_ssa(linkage, b, postdom_def);
3721 
3722    /* Boolean post-dominators are upcast in the producer because we can't
3723     * use 1-bit outputs.
3724     */
3725    if (producer_clone->bit_size == 1)
3726       producer_clone = nir_b2bN(b, producer_clone, new_bit_size);
3727 
3728    /* Move the existing store to the end of the block and rewrite it to use
3729     * the post-dominator result.
3730     */
3731    nir_intrinsic_instr *store =
3732       list_first_entry(&linkage->slot[final_slot].producer.stores,
3733                        struct list_node, head)->instr;
3734    nir_instr_move(b->cursor, &store->instr);
3735    if (nir_src_bit_size(store->src[0]) != producer_clone->bit_size)
3736       nir_intrinsic_set_src_type(store, nir_alu_type_get_base_type(nir_intrinsic_src_type(store)) |
3737                                         producer_clone->bit_size);
3738    nir_src_rewrite(&store->src[0], producer_clone);
3739 
3740    /* Remove all loads and stores that we are replacing from the producer
3741     * and consumer.
3742     */
3743    for (unsigned i = 0; i < num_loads; i++) {
3744       unsigned slot_index = intr_get_scalar_16bit_slot(loads[i]);
3745 
3746       if (slot_index == final_slot) {
3747          /* Keep the load and store that we reused. */
3748          continue;
3749       }
3750 
3751       /* Remove loads and stores that are dead after the code motion. Only
3752        * those loads that are post-dominated by the post-dominator are dead.
3753        */
3754       struct scalar_slot *slot = &linkage->slot[slot_index];
3755       nir_instr *load;
3756 
3757       if (slot->consumer.tes_interp_load) {
3758          load = &slot->consumer.tes_interp_load->instr;
3759 
3760          /* With interpolated TES loads, we get here 3 times, once for each
3761           * per-vertex load. Skip this if we've been here before.
3762           */
3763          if (list_is_empty(&slot->producer.stores)) {
3764             assert(list_is_empty(&slot->consumer.loads));
3765             continue;
3766          }
3767       } else {
3768          assert(list_is_singular(&slot->consumer.loads));
3769          load = &list_first_entry(&slot->consumer.loads,
3770                                   struct list_node, head)->instr->instr;
3771       }
3772 
3773       if (nir_instr_dominates_use(postdom_state, postdom, load)) {
3774          list_inithead(&slot->consumer.loads);
3775 
3776          /* Remove stores. (transform feedback is allowed here, just not
3777           * in final_slot)
3778           */
3779          remove_all_stores_and_clear_slot(linkage, slot_index, progress);
3780       } else {
3781          /* If a load has 2 uses and one of those uses is moved into the previous
3782           * shader, making that "use" dead, the load and its associated store
3783           * can't be removed because there is still one use remaining. However,
3784           * there are actually 2 uses remaining because the use that is dead isn't
3785           * removed from NIR, but is left dangling there.
3786           *
3787           * When we run this optimization again and make the second use dead,
3788           * which makes the load dead, the output store in the producer isn't removed
3789           * because the post-dominator of the second use doesn't post-dominate
3790           * the load because we left the first use dangling there.
3791           *
3792           * To fix that, we could run DCE, but that would be costly because we would
3793           * need to re-gather all IO. Instead, remove dead uses by replacing them
3794           * with undef here, so that when this code motion pass is entered again,
3795           * the load has its number of uses reduced and the corresponding output store
3796           * will be removed by the code above.
3797           */
3798          nir_foreach_use_safe(src, nir_instr_def(load)) {
3799             if (nir_instr_dominates_use(postdom_state, postdom,
3800                                         nir_src_parent_instr(src))) {
3801                nir_src_rewrite(src, nir_undef(&linkage->consumer_builder,
3802                                               src->ssa->num_components,
3803                                               src->ssa->bit_size));
3804             }
3805          }
3806       }
3807    }
3808 
3809    *progress |= nir_progress_producer | nir_progress_consumer;
3810    return true;
3811 }
3812 
3813 static bool
3814 backward_inter_shader_code_motion(struct linkage_info *linkage,
3815                                   nir_opt_varyings_progress *progress)
3816 {
3817    /* These producers are not supported. The description at the beginning
3818     * suggests a possible workaround.
3819     */
3820    if (linkage->producer_stage == MESA_SHADER_GEOMETRY ||
3821        linkage->producer_stage == MESA_SHADER_MESH ||
3822        linkage->producer_stage == MESA_SHADER_TASK)
3823       return false;
3824 
3825    /* Clear pass_flags. */
3826    nir_shader_clear_pass_flags(linkage->consumer_builder.shader);
3827 
3828    /* Gather inputs that can be moved into the previous shader. These are
3829     * only checked against the basic movability constraints.
3830     */
3831    struct {
3832       nir_def *def;
3833       nir_intrinsic_instr *first_load;
3834    } movable_loads[NUM_SCALAR_SLOTS];
3835    unsigned num_movable_loads = 0;
3836    unsigned i;
3837 
3838    BITSET_FOREACH_SET(i, linkage->output_equal_mask, NUM_SCALAR_SLOTS) {
3839       if (!can_optimize_varying(linkage,
3840                                 vec4_slot(i)).inter_shader_code_motion)
3841          continue;
3842 
3843       struct scalar_slot *slot = &linkage->slot[i];
3844 
3845       assert(!list_is_empty(&slot->producer.stores));
3846       assert(!is_interpolated_texcoord(linkage, i));
3847       assert(!is_interpolated_color(linkage, i));
3848 
3849       /* Disallow producer loads. */
3850       if (!list_is_empty(&slot->producer.loads))
3851          continue;
3852 
3853       /* There should be only 1 store per output. */
3854       if (!list_is_singular(&slot->producer.stores))
3855          continue;
3856 
3857       nir_def *load_def = NULL;
3858       nir_intrinsic_instr *load =
3859          list_first_entry(&slot->consumer.loads, struct list_node,
3860                           head)->instr;
3861 
3862       nir_intrinsic_instr *store =
3863         list_first_entry(&slot->producer.stores, struct list_node,
3864                          head)->instr;
3865 
3866       /* Set interpolation flags.
3867        * Handle interpolated TES loads first because they are special.
3868        */
3869       if (linkage->consumer_stage == MESA_SHADER_TESS_EVAL &&
3870           slot->consumer.tes_interp_load) {
3871          if (linkage->producer_stage == MESA_SHADER_VERTEX) {
3872             /* VS -> TES has no constraints on VS stores. */
3873             load_def = &slot->consumer.tes_interp_load->def;
3874             load_def->parent_instr->pass_flags |= FLAG_ALU_IS_TES_INTERP_LOAD |
3875                                                   slot->consumer.tes_interp_mode;
3876          } else {
3877             assert(linkage->producer_stage == MESA_SHADER_TESS_CTRL);
3878             assert(store->intrinsic == nir_intrinsic_store_per_vertex_output);
3879 
3880             /* The vertex index of the store must be InvocationID. */
3881             if (is_sysval(store->src[1].ssa->parent_instr,
3882                           SYSTEM_VALUE_INVOCATION_ID)) {
3883                load_def = &slot->consumer.tes_interp_load->def;
3884                load_def->parent_instr->pass_flags |= FLAG_ALU_IS_TES_INTERP_LOAD |
3885                                                      slot->consumer.tes_interp_mode;
3886             } else {
3887                continue;
3888             }
3889          }
3890       } else {
3891          /* Allow only 1 load per input. CSE should be run before this. */
3892          if (!list_is_singular(&slot->consumer.loads))
3893             continue;
3894 
3895          /* This can only be TCS -> TES, which is handled above and rejected
3896           * otherwise.
3897           */
3898          if (store->intrinsic == nir_intrinsic_store_per_vertex_output) {
3899             assert(linkage->producer_stage == MESA_SHADER_TESS_CTRL);
3900             continue;
3901          }
3902 
3903          /* TODO: handle load_per_vertex_input for TCS and GS.
3904           * TES can also occur here if tes_interp_load is NULL.
3905           */
3906          if (load->intrinsic == nir_intrinsic_load_per_vertex_input)
3907             continue;
3908 
3909          load_def = &load->def;
3910 
3911          switch (load->intrinsic) {
3912          case nir_intrinsic_load_interpolated_input: {
3913             assert(linkage->consumer_stage == MESA_SHADER_FRAGMENT);
3914             nir_instr *baryc_instr = load->src[0].ssa->parent_instr;
3915 
3916             /* This is either lowered barycentric_at_offset/at_sample or user
3917              * barycentrics. Treat it like barycentric_at_offset.
3918              */
3919             if (baryc_instr->type != nir_instr_type_intrinsic)
3920                continue;
3921 
3922             nir_intrinsic_instr *baryc = nir_instr_as_intrinsic(baryc_instr);
3923             nir_intrinsic_op op = baryc->intrinsic;
3924             enum glsl_interp_mode interp = nir_intrinsic_interp_mode(baryc);
3925             bool linear = interp == INTERP_MODE_NOPERSPECTIVE;
3926             bool convergent = BITSET_TEST(linkage->convergent32_mask, i) ||
3927                               BITSET_TEST(linkage->convergent16_mask, i);
3928 
3929             assert(interp == INTERP_MODE_NONE ||
3930                    interp == INTERP_MODE_SMOOTH ||
3931                    interp == INTERP_MODE_NOPERSPECTIVE);
3932 
3933             if (convergent) {
3934                load->instr.pass_flags |= FLAG_INTERP_CONVERGENT;
3935             } else if (op == nir_intrinsic_load_barycentric_pixel) {
3936                load->instr.pass_flags |= linear ? FLAG_INTERP_LINEAR_PIXEL
3937                                                 : FLAG_INTERP_PERSP_PIXEL;
3938             } else if (op == nir_intrinsic_load_barycentric_centroid) {
3939                load->instr.pass_flags |= linear ? FLAG_INTERP_LINEAR_CENTROID
3940                                                 : FLAG_INTERP_PERSP_CENTROID;
3941             } else if (op == nir_intrinsic_load_barycentric_sample) {
3942                load->instr.pass_flags |= linear ? FLAG_INTERP_LINEAR_SAMPLE
3943                                                 : FLAG_INTERP_PERSP_SAMPLE;
3944             } else {
3945                /* Optimizing at_offset and at_sample would be possible,
3946                 * but probably not worth it if they are not convergent.
3947                 * Convergent inputs can trivially switch to different
3948                 * barycentric coordinates or to flat.
3949                 */
3950                continue;
3951             }
3952             break;
3953          }
3954          case nir_intrinsic_load_input:
3955             if (linkage->consumer_stage == MESA_SHADER_FRAGMENT) {
3956                if (BITSET_TEST(linkage->convergent32_mask, i) ||
3957                    BITSET_TEST(linkage->convergent16_mask, i))
3958                   load->instr.pass_flags |= FLAG_INTERP_CONVERGENT;
3959                else
3960                   load->instr.pass_flags |= FLAG_INTERP_FLAT;
3961             } else if (linkage->consumer_stage == MESA_SHADER_TESS_EVAL) {
3962                assert(vec4_slot(i) >= VARYING_SLOT_PATCH0 &&
3963                       vec4_slot(i) <= VARYING_SLOT_PATCH31);
3964                /* Patch inputs are always convergent. */
3965                load->instr.pass_flags |= FLAG_INTERP_CONVERGENT;
3966             } else {
3967                /* It's not a fragment shader. We still need to set this. */
3968                load->instr.pass_flags |= FLAG_INTERP_FLAT;
3969             }
3970             break;
3971          case nir_intrinsic_load_per_primitive_input:
3972          case nir_intrinsic_load_input_vertex:
3973             /* Inter-shader code motion is unimplemented for these. */
3974             continue;
3975          default:
3976             unreachable("unexpected load intrinsic");
3977          }
3978       }
3979 
3980       load_def->parent_instr->pass_flags |= FLAG_MOVABLE;
3981 
3982       /* Disallow transform feedback. The load is "movable" for the purpose
3983        * of finding a movable post-dominator; we just can't rewrite the store
3984        * because we need to keep it for xfb, so the post-dominator search has
3985        * to start from a different load (only that varying will have its
3986        * value rewritten).
3987        */
3988       if (BITSET_TEST(linkage->xfb_mask, i))
3989          continue;
3990 
3991       assert(num_movable_loads < ARRAY_SIZE(movable_loads));
3992       movable_loads[num_movable_loads].def = load_def;
3993       movable_loads[num_movable_loads].first_load = load;
3994       num_movable_loads++;
3995    }
3996 
3997    if (!num_movable_loads)
3998       return false;
3999 
4000    /* Inter-shader code motion turns ALU results into outputs, but not all
4001     * bit sizes are supported by outputs.
4002     *
4003     * The 1-bit type is allowed because the pass always promotes 1-bit
4004     * outputs to 16 or 32 bits, whichever is supported.
4005     *
4006     * TODO: We could support replacing 2 32-bit inputs with one 64-bit
4007     * post-dominator by supporting 64 bits here, but the likelihood of that
4008     * occurring seems low.
4009     */
4010    unsigned supported_io_types = 32 | 1;
4011 
4012    if (linkage->producer_builder.shader->options->io_options &
4013        linkage->consumer_builder.shader->options->io_options &
4014        nir_io_16bit_input_output_support)
4015       supported_io_types |= 16;
4016 
4017    struct nir_use_dominance_state *postdom_state =
4018       nir_calc_use_dominance_impl(linkage->consumer_builder.impl, true);
4019 
4020    for (unsigned i = 0; i < num_movable_loads; i++) {
4021       nir_def *load_def = movable_loads[i].def;
4022       nir_instr *iter = load_def->parent_instr;
4023       nir_instr *movable_postdom = NULL;
4024 
4025       /* Find the farthest post-dominator that is movable. */
4026       while (iter) {
4027          iter = nir_get_immediate_use_dominator(postdom_state, iter);
4028          if (iter) {
4029             if (NEED_UPDATE_MOVABLE_FLAGS(iter))
4030                update_movable_flags(linkage, iter);
4031 
4032             if (iter->pass_flags & FLAG_UNMOVABLE)
4033                break;
4034 
4035             /* We can't move derefs into the previous shader, but we can move
4036              * instructions that use derefs.
4037              */
4038             if (iter->type == nir_instr_type_deref)
4039                continue;
4040 
4041             unsigned bit_size;
4042 
4043             if (iter->type == nir_instr_type_alu) {
4044                nir_alu_instr *alu = nir_instr_as_alu(iter);
4045 
4046                /* Skip comparison opcodes that directly source the first load
4047                 * and a constant because any 1-bit values would have to be
4048                 * converted to 32 bits in the producer and then converted back
4049                 * to 1 bit using nir_op_ine in the consumer, achieving nothing.
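                 * For example, moving a lone flt(input, 0.5) would only
                 * replace the comparison in the consumer with the nir_op_ine
                 * reconstructing the boolean, so the consumer would still
                 * have 1 load and 1 ALU instruction.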
4050                 */
4051                if (alu->def.bit_size == 1 &&
4052                    ((nir_op_infos[alu->op].num_inputs == 1 &&
4053                      alu->src[0].src.ssa == load_def) ||
4054                     (nir_op_infos[alu->op].num_inputs == 2 &&
4055                      ((alu->src[0].src.ssa == load_def &&
4056                        alu->src[1].src.ssa->parent_instr->type ==
4057                        nir_instr_type_load_const) ||
4058                       (alu->src[0].src.ssa->parent_instr->type ==
4059                        nir_instr_type_load_const &&
4060                        alu->src[1].src.ssa == load_def)))))
4061                   continue;
4062 
4063                bit_size = alu->def.bit_size;
4064             } else if (iter->type == nir_instr_type_intrinsic) {
4065                nir_intrinsic_instr *intr = nir_instr_as_intrinsic(iter);
4066 
4067                /* This is a uniform load with a non-constant index because
4068                 * only a non-constant index can be post-dominated by a load.
4069                 */
4070                assert(intr->intrinsic == nir_intrinsic_load_deref);
4071 
4072                /* Uniform loads must be scalar if their result is immediately
4073                 * stored into an output because this pass only works with
4074                 * scalar outputs.
4075                 */
4076                if (intr->num_components > 1)
4077                   continue;
4078 
4079                bit_size = intr->def.bit_size;
4080             } else {
4081                unreachable("unexpected instr type");
4082             }
4083 
4084             /* Skip unsupported bit sizes and keep searching. */
4085             if (!(bit_size & supported_io_types))
4086                continue;
4087 
4088             movable_postdom = iter;
4089          }
4090       }
4091 
4092       /* Try to move the post-dominator unless it's been processed already. */
4093       if (movable_postdom &&
4094           !(movable_postdom->pass_flags & FLAG_POST_DOMINATOR_PROCESSED)) {
4095          if (try_move_postdominator(linkage, postdom_state, movable_postdom,
4096                                     load_def, movable_loads[i].first_load,
4097                                     progress)) {
4098             /* Moving only one postdominator can change the IR enough that
4099              * we should start from scratch.
4100              */
4101             ralloc_free(postdom_state);
4102             return true;
4103          }
4104 
4105          movable_postdom->pass_flags |= FLAG_POST_DOMINATOR_PROCESSED;
4106       }
4107    }
4108 
4109    ralloc_free(postdom_state);
4110    return false;
4111 }
4112 
4113 /******************************************************************
4114  * COMPACTION
4115  ******************************************************************/
4116 
4117 /* Relocate a slot to a new index. Used by compaction. new_index is
4118  * the component index at 16-bit granularity, so the size of vec4 is 8
4119  * in that representation.
4120  */
4121 static void
4122 relocate_slot(struct linkage_info *linkage, struct scalar_slot *slot,
4123               unsigned i, unsigned new_index, enum fs_vec4_type fs_vec4_type,
4124               bool convergent, nir_opt_varyings_progress *progress)
4125 {
4126    assert(!list_is_empty(&slot->producer.stores));
4127 
4128    list_for_each_entry(struct list_node, iter, &slot->producer.stores, head) {
4129       assert(!nir_intrinsic_io_semantics(iter->instr).no_varying ||
4130              has_xfb(iter->instr) ||
4131              linkage->producer_stage == MESA_SHADER_TESS_CTRL);
4132       assert(!is_active_sysval_output(linkage, i, iter->instr));
4133    }
4134 
4135    /* Relocate the slot in all loads and stores. */
4136    struct list_head *instruction_lists[3] = {
4137       &slot->producer.stores,
4138       &slot->producer.loads,
4139       &slot->consumer.loads,
4140    };
4141 
4142    for (unsigned i = 0; i < ARRAY_SIZE(instruction_lists); i++) {
4143       list_for_each_entry(struct list_node, iter, instruction_lists[i], head) {
4144          nir_intrinsic_instr *intr = iter->instr;
4145 
4146          gl_varying_slot new_semantic = vec4_slot(new_index);
4147          unsigned new_component = (new_index % 8) / 2;
4148          bool new_high_16bits = new_index % 2;
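         /* Illustrative example (made-up value): new_index ==
          * VARYING_SLOT_VAR0 * 8 + 5 selects VAR0, 32-bit component z
          * ((5 % 8) / 2 == 2) and its high 16-bit half (5 % 2 == 1).
          */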
4149 
4150          /* We also need to relocate xfb info because it's always relative
4151           * to component 0. This just moves it into the correct xfb slot.
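          * For example (hypothetical values): moving xfb info from
          * component 3 to component 0 copies io_xfb2.out[1] into
          * io_xfb.out[0].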
4152           */
4153          if (has_xfb(intr)) {
4154             unsigned old_component = nir_intrinsic_component(intr);
4155             static const nir_io_xfb clear_xfb;
4156             nir_io_xfb xfb;
4157             bool new_is_odd = new_component % 2 == 1;
4158 
4159             memset(&xfb, 0, sizeof(xfb));
4160 
4161             if (old_component >= 2) {
4162                xfb.out[new_is_odd] = nir_intrinsic_io_xfb2(intr).out[old_component - 2];
4163                nir_intrinsic_set_io_xfb2(intr, clear_xfb);
4164             } else {
4165                xfb.out[new_is_odd] = nir_intrinsic_io_xfb(intr).out[old_component];
4166                nir_intrinsic_set_io_xfb(intr, clear_xfb);
4167             }
4168 
4169             if (new_component >= 2)
4170                nir_intrinsic_set_io_xfb2(intr, xfb);
4171             else
4172                nir_intrinsic_set_io_xfb(intr, xfb);
4173          }
4174 
4175          nir_io_semantics sem = nir_intrinsic_io_semantics(intr);
4176          unsigned bit_size = nir_intrinsic_infos[intr->intrinsic].has_dest ?
4177                                 intr->def.bit_size : intr->src[0].ssa->bit_size;
4178 
4179          /* Set all types to float to facilitate full IO vectorization.
4180           * This is skipped only if mediump is not lowered to 16 bits.
4181           *
4182           * Set nir_io_mediump_is_32bit if you never lower mediump IO to 16
4183           * bits, which sets nir_io_semantics::mediump_precision = 0 during
4184           * nir_lower_io.
4185           *
4186           * Set nir_shader_compiler_options::lower_mediump_io if you want to
4187           * lower mediump to 16 bits in the GLSL linker before this pass.
4188           */
4189          if (bit_size != 32 || !sem.medium_precision) {
4190             nir_alu_type type = nir_intrinsic_has_src_type(intr) ?
4191                                    nir_intrinsic_src_type(intr) :
4192                                    nir_intrinsic_dest_type(intr);
4193             type = nir_alu_type_get_type_size(type) | nir_type_float;
4194 
4195             if (nir_intrinsic_has_src_type(intr))
4196                nir_intrinsic_set_src_type(intr, type);
4197             else
4198                nir_intrinsic_set_dest_type(intr, type);
4199          }
4200 
4201          /* When relocating a back color store, don't change it to a front
4202           * color as that would be incorrect. Keep it as back color and only
4203           * relocate it between BFC0 and BFC1.
4204           */
4205          if (linkage->consumer_stage == MESA_SHADER_FRAGMENT &&
4206              (sem.location == VARYING_SLOT_BFC0 ||
4207               sem.location == VARYING_SLOT_BFC1)) {
4208             assert(new_semantic == VARYING_SLOT_COL0 ||
4209                    new_semantic == VARYING_SLOT_COL1);
4210             new_semantic = VARYING_SLOT_BFC0 +
4211                            (new_semantic - VARYING_SLOT_COL0);
4212          }
4213 
4214 #if PRINT_RELOCATE_SLOT
4215          assert(bit_size == 16 || bit_size == 32);
4216 
4217          fprintf(stderr, "--- relocating: %s.%c%s%s -> %s.%c%s%s FS_VEC4_TYPE_%s\n",
4218                  gl_varying_slot_name_for_stage(sem.location, linkage->producer_stage) + 13,
4219                  "xyzw"[nir_intrinsic_component(intr) % 4],
4220                  (bit_size == 16 && !sem.high_16bits) ? ".lo" : "",
4221                  (bit_size == 16 && sem.high_16bits) ? ".hi" : "",
4222                  gl_varying_slot_name_for_stage(new_semantic, linkage->producer_stage) + 13,
4223                  "xyzw"[new_component % 4],
4224                  (bit_size == 16 && !new_high_16bits) ? ".lo" : "",
4225                  (bit_size == 16 && new_high_16bits) ? ".hi" : "",
4226                  fs_vec4_type_strings[fs_vec4_type]);
4227 #endif /* PRINT_RELOCATE_SLOT */
4228 
4229          sem.location = new_semantic;
4230          sem.high_16bits = new_high_16bits;
4231 
4232          /* This is never indirectly indexed. Simplify num_slots. */
4233          sem.num_slots = 1;
4234 
4235          nir_intrinsic_set_io_semantics(intr, sem);
4236          nir_intrinsic_set_component(intr, new_component);
4237 
4238          if (fs_vec4_type == FS_VEC4_TYPE_PER_PRIMITIVE) {
4239             assert(intr->intrinsic == nir_intrinsic_store_per_primitive_output ||
4240                    intr->intrinsic == nir_intrinsic_load_per_primitive_output ||
4241                    intr->intrinsic == nir_intrinsic_load_per_primitive_input);
4242          } else {
4243             assert(intr->intrinsic != nir_intrinsic_store_per_primitive_output &&
4244                    intr->intrinsic != nir_intrinsic_load_per_primitive_output &&
4245                    intr->intrinsic != nir_intrinsic_load_per_primitive_input);
4246          }
4247 
4248          if (intr->intrinsic != nir_intrinsic_load_interpolated_input)
4249             continue;
4250 
4251          /* This path is used when promoting convergent interpolated
4252           * inputs to flat. Replace load_interpolated_input with load_input.
4253           */
4254          if (fs_vec4_type == FS_VEC4_TYPE_FLAT ||
4255              /* Promote all convergent loads to flat if the driver supports it. */
4256              (convergent &&
4257               linkage->can_mix_convergent_flat_with_interpolated)) {
4258             assert(instruction_lists[i] == &slot->consumer.loads);
4259             nir_builder *b = &linkage->consumer_builder;
4260 
4261             b->cursor = nir_before_instr(&intr->instr);
4262             nir_def *load =
4263                nir_load_input(b, 1, intr->def.bit_size,
4264                               nir_get_io_offset_src(intr)->ssa,
4265                               .io_semantics = sem,
4266                               .component = new_component,
4267                               .dest_type = nir_intrinsic_dest_type(intr));
4268 
4269             nir_def_rewrite_uses(&intr->def, load);
4270             iter->instr = nir_instr_as_intrinsic(load->parent_instr);
4271             nir_instr_remove(&intr->instr);
4272             *progress |= nir_progress_consumer;
4273 
4274             /* Interpolation converts Infs to NaNs. If we change it to flat,
4275              * we need to convert Infs to NaNs manually in the producer to
4276              * preserve that.
4277              */
4278             if (preserve_nans(linkage->consumer_builder.shader,
4279                               load->bit_size)) {
4280                list_for_each_entry(struct list_node, iter,
4281                                    &slot->producer.stores, head) {
4282                   nir_intrinsic_instr *store = iter->instr;
4283 
4284                   nir_builder *b = &linkage->producer_builder;
4285                   b->cursor = nir_before_instr(&store->instr);
4286                   nir_def *repl =
4287                      build_convert_inf_to_nan(b, store->src[0].ssa);
4288                   nir_src_rewrite(&store->src[0], repl);
4289                }
4290             }
4291             continue;
4292          }
4293 
4294          /* We are packing convergent inputs with any other interpolated
4295           * inputs in the same vec4, but the interpolation qualifier might not
4296           * be the same between the two. Set the qualifier of the convergent
4297           * input to match the input it's being packed with.
4298           */
4299          if (!linkage->has_flexible_interp && convergent) {
4300             enum fs_vec4_type current_vec4_type =
4301                get_interp_vec4_type(linkage, i, intr);
4302 
4303             /* Make the interpolation qualifier match the slot where we are
4304              * moving this input.
4305              */
4306             if (current_vec4_type != fs_vec4_type) {
4307                nir_builder *b = &linkage->consumer_builder;
4308                nir_def *baryc;
4309 
4310                b->cursor = nir_before_instr(&intr->instr);
4311 
4312                switch (fs_vec4_type) {
4313                case FS_VEC4_TYPE_INTERP_FP32_PERSP_PIXEL:
4314                case FS_VEC4_TYPE_INTERP_FP16_PERSP_PIXEL:
4315                   baryc = nir_load_barycentric_pixel(b, 32,
4316                              .interp_mode = INTERP_MODE_SMOOTH);
4317                   break;
4318                case FS_VEC4_TYPE_INTERP_FP32_PERSP_CENTROID:
4319                case FS_VEC4_TYPE_INTERP_FP16_PERSP_CENTROID:
4320                   baryc = nir_load_barycentric_centroid(b, 32,
4321                              .interp_mode = INTERP_MODE_SMOOTH);
4322                   break;
4323                case FS_VEC4_TYPE_INTERP_FP32_PERSP_SAMPLE:
4324                case FS_VEC4_TYPE_INTERP_FP16_PERSP_SAMPLE:
4325                   baryc = nir_load_barycentric_sample(b, 32,
4326                              .interp_mode = INTERP_MODE_SMOOTH);
4327                   break;
4328                case FS_VEC4_TYPE_INTERP_FP32_LINEAR_PIXEL:
4329                case FS_VEC4_TYPE_INTERP_FP16_LINEAR_PIXEL:
4330                   baryc = nir_load_barycentric_pixel(b, 32,
4331                              .interp_mode = INTERP_MODE_NOPERSPECTIVE);
4332                   break;
4333                case FS_VEC4_TYPE_INTERP_FP32_LINEAR_CENTROID:
4334                case FS_VEC4_TYPE_INTERP_FP16_LINEAR_CENTROID:
4335                   baryc = nir_load_barycentric_centroid(b, 32,
4336                              .interp_mode = INTERP_MODE_NOPERSPECTIVE);
4337                   break;
4338                case FS_VEC4_TYPE_INTERP_FP32_LINEAR_SAMPLE:
4339                case FS_VEC4_TYPE_INTERP_FP16_LINEAR_SAMPLE:
4340                   baryc = nir_load_barycentric_sample(b, 32,
4341                              .interp_mode = INTERP_MODE_NOPERSPECTIVE);
4342                   break;
4343                case FS_VEC4_TYPE_INTERP_COLOR_PIXEL:
4344                   baryc = nir_load_barycentric_pixel(b, 32,
4345                              .interp_mode = INTERP_MODE_NONE);
4346                   break;
4347                case FS_VEC4_TYPE_INTERP_COLOR_CENTROID:
4348                   baryc = nir_load_barycentric_centroid(b, 32,
4349                              .interp_mode = INTERP_MODE_NONE);
4350                   break;
4351                case FS_VEC4_TYPE_INTERP_COLOR_SAMPLE:
4352                   baryc = nir_load_barycentric_sample(b, 32,
4353                              .interp_mode = INTERP_MODE_NONE);
4354                   break;
4355                default:
4356                   unreachable("invalid qualifier");
4357                }
4358 
4359                nir_src_rewrite(&intr->src[0], baryc);
4360             }
4361          }
4362       }
4363    }
4364 }
4365 
4366 /**
4367  * A helper function for compact_varyings(). Assign new slot indices for
4368  * existing slots of a certain vec4 type (FLAT, FP16, or FP32). Skip already-
4369  * assigned scalar slots (determined by assigned_mask) and don't assign to
4370  * vec4 slots that have an incompatible vec4 type (determined by
4371  * assigned_fs_vec4_type). This works with both 32-bit and 16-bit types.
4372  * slot_size is the component size in the units of 16 bits (2 means 32 bits).
4373  *
4374  * The number of slots to assign can optionally be limited by
4375  * max_assigned_slots.
4376  *
4377  * Return how many 16-bit slots are left unused in the last vec4 (up to 8
4378  * slots).
4379  */
4380 static unsigned
4381 fs_assign_slots(struct linkage_info *linkage,
4382                 BITSET_WORD *assigned_mask,
4383                 uint8_t assigned_fs_vec4_type[NUM_TOTAL_VARYING_SLOTS],
4384                 BITSET_WORD *input_mask,
4385                 enum fs_vec4_type fs_vec4_type,
4386                 unsigned slot_size,
4387                 unsigned max_assigned_slots,
4388                 bool convergent,
4389                 bool assign_colors,
4390                 unsigned color_channel_rotate,
4391                 nir_opt_varyings_progress *progress)
4392 {
4393    unsigned i, slot_index, max_slot;
4394    unsigned num_assigned_slots = 0;
4395 
4396    if (assign_colors) {
4397       slot_index = VARYING_SLOT_COL0 * 8; /* starting slot */
4398       max_slot = VARYING_SLOT_COL1 + 1;
4399    } else {
4400       slot_index = VARYING_SLOT_VAR0 * 8; /* starting slot */
4401       max_slot = VARYING_SLOT_MAX;
4402    }
4403 
4404    /* Assign new slot indices for scalar slots. */
4405    BITSET_FOREACH_SET(i, input_mask, NUM_SCALAR_SLOTS) {
4406       if (is_interpolated_color(linkage, i) != assign_colors)
4407          continue;
4408 
4409       /* Skip indirectly-indexed scalar slots and slots incompatible
4410        * with the FS vec4 type.
4411        */
4412       while (1) {
4413          /* If the FS vec4 type is incompatible, move to the next vec4. */
4414          if (fs_vec4_type != FS_VEC4_TYPE_NONE &&
4415              assigned_fs_vec4_type[vec4_slot(slot_index)] !=
4416              FS_VEC4_TYPE_NONE &&
4417              assigned_fs_vec4_type[vec4_slot(slot_index)] != fs_vec4_type) {
4418             slot_index = align(slot_index + slot_size, 8); /* move to next vec4 */
4419             continue;
4420          }
4421 
4422          /* This slot is already assigned (assigned_mask is set). Move to
4423           * the next one.
4424           */
4425          if (BITSET_TEST(assigned_mask, slot_index)) {
4426             slot_index += slot_size;
4427             continue;
4428          }
4429          break;
4430       }
4431 
4432       /* Assign color channels in this order, starting
4433        * at the color_channel_rotate component first. Cases:
4434        *    color_channel_rotate = 0: xyzw
4435        *    color_channel_rotate = 1: yzwx
4436        *    color_channel_rotate = 2: zwxy
4437        *    color_channel_rotate = 3: wxyz
4438        *
4439        * This has no effect on behavior per se, but some drivers merge VARn
4440        * and COLn into one output if each defines different components.
4441        * For example, if we store VAR0.xy and COL0.z, a driver can merge them
4442        * by mapping the same output to 2 different inputs (VAR0 and COL0) if
4443        * color-specific behavior is per component, but it can't merge VAR0.xy
4444        * and COL0.x because they both define x.
4445        */
4446       unsigned new_slot_index = slot_index;
4447       if (assign_colors && color_channel_rotate) {
4448          new_slot_index = (vec4_slot(new_slot_index)) * 8 +
4449                           (new_slot_index + color_channel_rotate * 2) % 8;
4450       }
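      /* Example with hypothetical values: slot_index == VARYING_SLOT_COL0 * 8
       * and color_channel_rotate == 2 give new_slot_index ==
       * VARYING_SLOT_COL0 * 8 + 4, i.e. color assignment starts at COL0.z.
       */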
4451 
4452       /* Relocate the slot. */
4453       assert(slot_index < max_slot * 8);
4454       relocate_slot(linkage, &linkage->slot[i], i, new_slot_index,
4455                     fs_vec4_type, convergent, progress);
4456 
4457       for (unsigned i = 0; i < slot_size; ++i)
4458          BITSET_SET(assigned_mask, slot_index + i);
4459 
4460       if (assigned_fs_vec4_type)
4461          assigned_fs_vec4_type[vec4_slot(slot_index)] = fs_vec4_type;
4462       slot_index += slot_size; /* move to the next slot */
4463       num_assigned_slots += slot_size;
4464 
4465       /* Remove the slot from the input (unassigned) mask. */
4466       BITSET_CLEAR(input_mask, i);
4467 
4468       /* The number of slots to assign can optionally be limited. */
4469       assert(num_assigned_slots <= max_assigned_slots);
4470       if (num_assigned_slots == max_assigned_slots)
4471          break;
4472    }
4473 
4474    assert(slot_index <= max_slot * 8);
4475 
4476    if (!convergent && fs_vec4_type != FS_VEC4_TYPE_NONE) {
4477       /* Count the number of unused 16-bit components. There can be holes
4478        * because indirect inputs are not moved from their original locations.
4479        * The result is used to determine which components should be filled
4480        * with convergent inputs.
4481        */
4482       unsigned unused_slots = 0;
4483 
4484       for (unsigned i = assign_colors ? VARYING_SLOT_COL0 : VARYING_SLOT_VAR0;
4485            i < max_slot; i++) {
4486          if (assigned_fs_vec4_type[i] != fs_vec4_type)
4487             continue;
4488 
4489          unsigned comp_mask =
4490             BITSET_GET_RANGE_INSIDE_WORD(assigned_mask, i * 8, i * 8 + 7);
4491          assert(comp_mask);
4492          assert(comp_mask <= 0xff);
4493 
4494          if (comp_mask == 0xff)
4495             continue;
4496 
4497          /* Only count full unused 32-bit slots, so that 2 disjoint unused
4498           * 16-bit slots don't give the misleading impression that there is
4499         * a full unused 32-bit slot.
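         * E.g. comp_mask == 0x0f (a made-up value) means x and y are fully
         * assigned while z and w are completely unused, which adds 4 unused
         * 16-bit slots.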
4500           */
4501          for (unsigned i = 0; i < 4; i++) {
4502             if (!(comp_mask & BITFIELD_RANGE(i * 2, 2)))
4503                unused_slots += 2;
4504          }
4505       }
4506       return unused_slots;
4507    }
4508 
4509    return 0;
4510 }
4511 
4512 /**
4513  * This is called once for 32-bit inputs and once for 16-bit inputs.
4514  * It assigns new slot indices to all scalar slots specified in the masks.
4515  *
4516  * \param linkage             Linkage info
4517  * \param assigned_mask       Which scalar (16-bit) slots are already taken.
4518  * \param assigned_fs_vec4_type Which vec4 slots have an assigned qualifier
4519  *                              and can only be filled with compatible slots.
4520  * \param interp_mask         The list of interp slots to assign locations for.
4521  * \param flat_mask           The list of flat slots to assign locations for.
4522  * \param convergent_mask     The list of slots that have convergent output
4523  *                            stores.
4524  * \param sized_interp_type   One of FS_VEC4_TYPE_INTERP_{FP32, FP16, COLOR}*.
4525  * \param slot_size           1 for 16 bits, 2 for 32 bits
4526  * \param color_channel_rotate Assign color channels starting with this index,
4527  *                            e.g. 2 assigns channels in the zwxy order.
4528  * \param assign_colors       Whether to assign only color varyings or only
4529  *                            non-color varyings.
4530  */
4531 static void
4532 fs_assign_slot_groups(struct linkage_info *linkage,
4533                       BITSET_WORD *assigned_mask,
4534                       uint8_t assigned_fs_vec4_type[NUM_TOTAL_VARYING_SLOTS],
4535                       BITSET_WORD *interp_mask,
4536                       BITSET_WORD *flat_mask,
4537                       BITSET_WORD *convergent_mask,
4538                       BITSET_WORD *color_interp_mask,
4539                       enum fs_vec4_type sized_interp_type,
4540                       unsigned slot_size,
4541                       bool assign_colors,
4542                       unsigned color_channel_rotate,
4543                       nir_opt_varyings_progress *progress)
4544 {
4545    /* Put interpolated slots first. */
4546    unsigned unused_interp_slots =
4547       fs_assign_slots(linkage, assigned_mask, assigned_fs_vec4_type,
4548                       interp_mask, sized_interp_type,
4549                       slot_size, NUM_SCALAR_SLOTS, false, assign_colors,
4550                       color_channel_rotate, progress);
4551 
4552    unsigned unused_color_interp_slots = 0;
4553    if (color_interp_mask) {
4554       unused_color_interp_slots =
4555          fs_assign_slots(linkage, assigned_mask, assigned_fs_vec4_type,
4556                          color_interp_mask, FS_VEC4_TYPE_INTERP_COLOR,
4557                          slot_size, NUM_SCALAR_SLOTS, false, assign_colors,
4558                          color_channel_rotate, progress);
4559    }
4560 
4561    /* Put flat slots next.
4562     * Note that only flat vec4 slots can have both 32-bit and 16-bit types
4563     * packed in the same vec4. 32-bit flat inputs are packed first, followed
4564     * by 16-bit flat inputs.
4565     */
4566    unsigned unused_flat_slots =
4567       fs_assign_slots(linkage, assigned_mask, assigned_fs_vec4_type,
4568                       flat_mask, FS_VEC4_TYPE_FLAT,
4569                       slot_size, NUM_SCALAR_SLOTS, false, assign_colors,
4570                       color_channel_rotate, progress);
4571 
4572    /* Take the inputs with convergent values and assign them as follows.
4573     * Since they can be assigned as both interpolated and flat, we can
4574     * choose. We prefer them to be flat, but if interpolated vec4s have
4575     * unused components, try to fill those before starting a new flat vec4.
4576     *
4577     * First, fill the unused components of flat (if any), then fill
4578     * the unused components of interpolated (if any), and then make
4579     * the remaining convergent inputs flat.
4580     */
4581    if (!linkage->always_interpolate_convergent_fs_inputs &&
4582        unused_flat_slots) {
4583       fs_assign_slots(linkage, assigned_mask, assigned_fs_vec4_type,
4584                       convergent_mask, FS_VEC4_TYPE_FLAT,
4585                       slot_size, unused_flat_slots, true, assign_colors,
4586                       color_channel_rotate, progress);
4587    }
4588    if (unused_interp_slots) {
4589       fs_assign_slots(linkage, assigned_mask, assigned_fs_vec4_type,
4590                       convergent_mask, sized_interp_type,
4591                       slot_size, unused_interp_slots, true, assign_colors,
4592                       color_channel_rotate, progress);
4593    }
4594    if (unused_color_interp_slots) {
4595       fs_assign_slots(linkage, assigned_mask, assigned_fs_vec4_type,
4596                       convergent_mask, FS_VEC4_TYPE_INTERP_COLOR,
4597                       slot_size, unused_color_interp_slots, true, assign_colors,
4598                       color_channel_rotate, progress);
4599    }
4600    fs_assign_slots(linkage, assigned_mask, assigned_fs_vec4_type,
4601                    convergent_mask,
4602                    linkage->always_interpolate_convergent_fs_inputs ?
4603                       (slot_size == 2 ? FS_VEC4_TYPE_INTERP_FP32 :
4604                                         FS_VEC4_TYPE_INTERP_FP16) :
4605                       FS_VEC4_TYPE_FLAT,
4606                    slot_size, NUM_SCALAR_SLOTS, true, assign_colors,
4607                    color_channel_rotate, progress);
4608 }
4609 
4610 /**
4611  * Same as fs_assign_slot_groups, but don't mix different interpolation
4612  * qualifiers in the same vec4.
4613  */
4614 static void
4615 fs_assign_slot_groups_separate_qual(struct linkage_info *linkage,
4616                                     BITSET_WORD *assigned_mask,
4617                                     uint8_t assigned_fs_vec4_type[NUM_TOTAL_VARYING_SLOTS],
4618                                     INTERP_QUAL_BITSET *interp_masks,
4619                                     BITSET_WORD *flat_mask,
4620                                     BITSET_WORD *convergent_mask,
4621                                     COLOR_QUAL_BITSET *color_interp_masks,
4622                                     enum fs_vec4_type sized_interp_type_base,
4623                                     unsigned slot_size,
4624                                     bool assign_colors,
4625                                     unsigned color_channel_rotate,
4626                                     nir_opt_varyings_progress *progress)
4627 {
4628    unsigned unused_interp_slots[NUM_INTERP_QUALIFIERS] = {0};
4629    unsigned unused_color_slots[NUM_COLOR_QUALIFIERS] = {0};
4630 
4631    /* Put interpolated slots first. */
4632    for (unsigned i = 0; i < NUM_INTERP_QUALIFIERS; i++) {
4633       unused_interp_slots[i] =
4634          fs_assign_slots(linkage, assigned_mask, assigned_fs_vec4_type,
4635                          (*interp_masks)[i], sized_interp_type_base + i,
4636                          slot_size, NUM_SCALAR_SLOTS, false, assign_colors,
4637                          color_channel_rotate, progress);
4638    }
4639 
4640    if (color_interp_masks) {
4641       for (unsigned i = 0; i < NUM_COLOR_QUALIFIERS; i++) {
4642          unused_color_slots[i] =
4643             fs_assign_slots(linkage, assigned_mask, assigned_fs_vec4_type,
4644                             (*color_interp_masks)[i],
4645                             FS_VEC4_TYPE_INTERP_COLOR_PIXEL + i,
4646                             slot_size, NUM_SCALAR_SLOTS, false, assign_colors,
4647                             color_channel_rotate, progress);
4648       }
4649    }
4650 
4651    /* Put flat slots next.
4652     * Note that only flat vec4 slots can have both 32-bit and 16-bit types
4653     * packed in the same vec4. 32-bit flat inputs are packed first, followed
4654     * by 16-bit flat inputs.
4655     */
4656    unsigned unused_flat_slots =
4657       fs_assign_slots(linkage, assigned_mask, assigned_fs_vec4_type,
4658                       flat_mask, FS_VEC4_TYPE_FLAT,
4659                       slot_size, NUM_SCALAR_SLOTS, false, assign_colors,
4660                       color_channel_rotate, progress);
4661 
4662    /* Take the inputs with convergent values and assign them as follows.
4663     * Since they can be assigned as both interpolated and flat, we can
4664     * choose. We prefer them to be flat, but if interpolated vec4s have
4665     * unused components, try to fill those before starting a new flat vec4.
4666     *
4667     * First, fill the unused components of flat (if any) with convergent
4668     * inputs.
4669     */
4670    if (!linkage->always_interpolate_convergent_fs_inputs &&
4671        unused_flat_slots) {
4672       fs_assign_slots(linkage, assigned_mask, assigned_fs_vec4_type,
4673                       convergent_mask, FS_VEC4_TYPE_FLAT,
4674                       slot_size, unused_flat_slots, true, assign_colors,
4675                       color_channel_rotate, progress);
4676    }
4677 
4678    /* Then fill the unused components of interpolated slots (if any) with
4679     * convergent inputs.
4680     */
4681    for (unsigned i = 0; i < NUM_INTERP_QUALIFIERS; i++) {
4682       if (unused_interp_slots[i]) {
4683          fs_assign_slots(linkage, assigned_mask, assigned_fs_vec4_type,
4684                          convergent_mask, sized_interp_type_base + i,
4685                          slot_size, unused_interp_slots[i], true,
4686                          assign_colors, color_channel_rotate, progress);
4687       }
4688    }
4689 
4690    for (unsigned i = 0; i < NUM_COLOR_QUALIFIERS; i++) {
4691       if (unused_color_slots[i]) {
4692          fs_assign_slots(linkage, assigned_mask, assigned_fs_vec4_type,
4693                          convergent_mask, FS_VEC4_TYPE_INTERP_COLOR_PIXEL + i,
4694                          slot_size, unused_color_slots[i], true, assign_colors,
4695                          color_channel_rotate, progress);
4696       }
4697    }
4698 
4699    /* Then make the remaining convergent inputs flat. */
4700    fs_assign_slots(linkage, assigned_mask, assigned_fs_vec4_type,
4701                    convergent_mask,
4702                    linkage->always_interpolate_convergent_fs_inputs ?
4703                       (slot_size == 2 ? FS_VEC4_TYPE_INTERP_FP32_LINEAR_PIXEL :
4704                                         FS_VEC4_TYPE_INTERP_FP16_LINEAR_PIXEL) :
4705                       FS_VEC4_TYPE_FLAT,
4706                    slot_size, NUM_SCALAR_SLOTS, true, assign_colors,
4707                    color_channel_rotate, progress);
4708 }
4709 
4710 static void
4711 vs_tcs_tes_gs_assign_slots(struct linkage_info *linkage,
4712                            BITSET_WORD *input_mask,
4713                            unsigned *slot_index,
4714                            unsigned *patch_slot_index,
4715                            unsigned slot_size,
4716                            nir_opt_varyings_progress *progress)
4717 {
4718    unsigned i;
4719 
4720    BITSET_FOREACH_SET(i, input_mask, NUM_SCALAR_SLOTS) {
4721       if (i >= VARYING_SLOT_PATCH0 * 8 && i < VARYING_SLOT_TESS_MAX * 8) {
4722          /* Skip indirectly-indexed scalar slots at 32-bit granularity.
4723           * We have to do it at this granularity because the low 16-bit
4724           * slot is set to 1 for 32-bit inputs but not the high 16-bit slot.
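          * E.g. (hypothetical) if only slot 6 is marked indirect, the loop
          * below computes align(6 + 1, 2) == 8, skipping both 16-bit halves
          * (6 and 7) and continuing at slot 8.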
4725           */
4726          while (BITSET_TEST32(linkage->indirect_mask, *patch_slot_index))
4727             *patch_slot_index = align(*patch_slot_index + 1, 2);
4728 
4729          assert(*patch_slot_index < VARYING_SLOT_TESS_MAX * 8);
4730          relocate_slot(linkage, &linkage->slot[i], i, *patch_slot_index,
4731                        FS_VEC4_TYPE_NONE, false, progress);
4732          *patch_slot_index += slot_size; /* increment by 16 or 32 bits */
4733       } else {
4734          /* If the driver wants to use POS and we've already used it, move
4735           * to VARn.
4736           */
4737          if (*slot_index < VARYING_SLOT_VAR0 &&
4738              *slot_index >= VARYING_SLOT_POS + 8)
4739             *slot_index = VARYING_SLOT_VAR0 * 8;
4740 
4741          /* Skip indirectly-indexed scalar slots at 32-bit granularity. */
4742          while (BITSET_TEST32(linkage->indirect_mask, *slot_index))
4743             *slot_index = align(*slot_index + 1, 2);
4744 
4745          assert(*slot_index < VARYING_SLOT_MAX * 8);
4746          relocate_slot(linkage, &linkage->slot[i], i, *slot_index,
4747                        FS_VEC4_TYPE_NONE, false, progress);
4748          *slot_index += slot_size; /* increment by 16 or 32 bits */
4749       }
4750    }
4751 }
4752 
4753 static void
4754 vs_tcs_tes_gs_assign_slots_2sets(struct linkage_info *linkage,
4755                                  BITSET_WORD *input32_mask,
4756                                  BITSET_WORD *input16_mask,
4757                                  unsigned *slot_index,
4758                                  unsigned *patch_slot_index,
4759                                  nir_opt_varyings_progress *progress)
4760 {
4761    /* Compact 32-bit inputs, followed by 16-bit inputs allowing them to
4762     * share vec4 slots with 32-bit inputs.
4763     */
4764    vs_tcs_tes_gs_assign_slots(linkage, input32_mask, slot_index,
4765                               patch_slot_index, 2, progress);
4766    vs_tcs_tes_gs_assign_slots(linkage, input16_mask, slot_index,
4767                               patch_slot_index, 1, progress);
4768 
4769    assert(*slot_index <= VARYING_SLOT_MAX * 8);
4770    assert(!patch_slot_index || *patch_slot_index <= VARYING_SLOT_TESS_MAX * 8);
4771 }
4772 
4773 /**
4774  * Compaction means scalarizing and then packing scalar components into full
4775  * vec4s, so that we minimize the number of unused components in vec4 slots.
4776  *
4777  * Compaction is as simple as moving a scalar input from one scalar slot
4778  * to another. Indirectly-indexed slots are not touched, so the compaction
4779  * has to compact around them. Unused 32-bit components of indirectly-indexed
4780  * slots are still filled, so no space is wasted there, but if indirectly-
4781  * indexed 16-bit components have the other 16-bit half unused, that half is
4782  * wasted.
4783  */
4784 static void
4785 compact_varyings(struct linkage_info *linkage,
4786                  nir_opt_varyings_progress *progress)
4787 {
4788    if (linkage->consumer_stage == MESA_SHADER_FRAGMENT) {
4789       /* These arrays are used to track which scalar slots we've already
4790        * assigned. We can fill unused components of indirectly-indexed slots,
4791        * but only if the vec4 slot type (FLAT, FP16, or FP32) is the same.
4792        * Assign vec4 slot type separately, skipping over already assigned
4793        * scalar slots.
4794        */
4795       uint8_t assigned_fs_vec4_type[NUM_TOTAL_VARYING_SLOTS] = {0};
4796       BITSET_DECLARE(assigned_mask, NUM_SCALAR_SLOTS);
4797       BITSET_ZERO(assigned_mask);
4798 
4799       /* Iterate over all indirectly accessed inputs and set the assigned vec4
4800        * type of each occupied slot to the vec4 type of indirect inputs, so
4801        * that compaction doesn't put inputs of a different vec4 type in
4802        * the same vec4.
4803        *
4804        * We don't try to compact indirect input arrays, though we could.
4805        */
4806       unsigned i;
4807       BITSET_FOREACH_SET(i, linkage->indirect_mask, NUM_SCALAR_SLOTS) {
4808          struct scalar_slot *slot = &linkage->slot[i];
4809 
4810          /* The slot of the first array element contains all loads for all
4811           * elements, including all direct accesses, while all other array
4812           * elements are empty (on purpose).
4813           */
4814          if (list_is_empty(&linkage->slot[i].consumer.loads))
4815             continue;
4816 
4817          assert(slot->num_slots >= 2);
4818 
4819          for (unsigned array_index = 0; array_index < slot->num_slots;
4820               array_index++) {
4821             unsigned vec4_index = vec4_slot(i) + array_index;
4822             unsigned scalar_index = i + array_index * 8;
4823             assigned_fs_vec4_type[vec4_index] = linkage->fs_vec4_type[vec4_index];
4824             /* Indirectly-indexed slots are marked to always occupy 32 bits
4825              * (2 16-bit slots), though we waste the high 16 bits if they are unused.
4826              */
4827             BITSET_SET_RANGE_INSIDE_WORD(assigned_mask, scalar_index, scalar_index + 1);
4828          }
4829       }
4830 
4831       if (linkage->has_flexible_interp) {
4832          /* This codepath packs convergent varyings with both interpolated and
4833           * flat, whichever has free space.
4834           */
4835          fs_assign_slot_groups(linkage, assigned_mask, assigned_fs_vec4_type,
4836                                linkage->interp_fp32_mask, linkage->flat32_mask,
4837                                linkage->convergent32_mask, NULL,
4838                                FS_VEC4_TYPE_INTERP_FP32, 2, false, 0, progress);
4839 
4840          /* Now do the same thing, but for 16-bit inputs. */
4841          fs_assign_slot_groups(linkage, assigned_mask, assigned_fs_vec4_type,
4842                                linkage->interp_fp16_mask, linkage->flat16_mask,
4843                                linkage->convergent16_mask, NULL,
4844                                FS_VEC4_TYPE_INTERP_FP16, 1, false, 0, progress);
4845       } else {
4846          /* Basically the same as above. */
4847          fs_assign_slot_groups_separate_qual(
4848                   linkage, assigned_mask, assigned_fs_vec4_type,
4849                   &linkage->interp_fp32_qual_masks, linkage->flat32_mask,
4850                   linkage->convergent32_mask, NULL,
4851                   FS_VEC4_TYPE_INTERP_FP32_PERSP_PIXEL, 2, false, 0, progress);
4852 
4853          fs_assign_slot_groups_separate_qual(
4854                   linkage, assigned_mask, assigned_fs_vec4_type,
4855                   &linkage->interp_fp16_qual_masks, linkage->flat16_mask,
4856                   linkage->convergent16_mask, NULL,
4857                   FS_VEC4_TYPE_INTERP_FP16_PERSP_PIXEL, 1, false, 0, progress);
4858       }
4859 
4860       /* Assign INTERP_MODE_EXPLICIT. Both FP32 and FP16 can occupy the same
4861        * slot because the vertex data is passed to FS as-is.
4862        */
4863       fs_assign_slots(linkage, assigned_mask, assigned_fs_vec4_type,
4864                       linkage->interp_explicit32_mask, FS_VEC4_TYPE_INTERP_EXPLICIT,
4865                       2, NUM_SCALAR_SLOTS, false, false, 0, progress);
4866 
4867       fs_assign_slots(linkage, assigned_mask, assigned_fs_vec4_type,
4868                       linkage->interp_explicit16_mask, FS_VEC4_TYPE_INTERP_EXPLICIT,
4869                       1, NUM_SCALAR_SLOTS, false, false, 0, progress);
4870 
4871       /* Same for strict vertex ordering. */
4872       fs_assign_slots(linkage, assigned_mask, assigned_fs_vec4_type,
4873                       linkage->interp_explicit_strict32_mask, FS_VEC4_TYPE_INTERP_EXPLICIT_STRICT,
4874                       2, NUM_SCALAR_SLOTS, false, false, 0, progress);
4875 
4876       fs_assign_slots(linkage, assigned_mask, assigned_fs_vec4_type,
4877                       linkage->interp_explicit_strict16_mask, FS_VEC4_TYPE_INTERP_EXPLICIT_STRICT,
4878                       1, NUM_SCALAR_SLOTS, false, false, 0, progress);
4879 
4880       /* Same for per-primitive. */
4881       fs_assign_slots(linkage, assigned_mask, assigned_fs_vec4_type,
4882                       linkage->per_primitive32_mask, FS_VEC4_TYPE_PER_PRIMITIVE,
4883                       2, NUM_SCALAR_SLOTS, false, false, 0, progress);
4884 
4885       fs_assign_slots(linkage, assigned_mask, assigned_fs_vec4_type,
4886                       linkage->per_primitive16_mask, FS_VEC4_TYPE_PER_PRIMITIVE,
4887                       1, NUM_SCALAR_SLOTS, false, false, 0, progress);
4888 
4889       /* Put transform-feedback-only outputs last. */
4890       fs_assign_slots(linkage, assigned_mask, NULL,
4891                       linkage->xfb32_only_mask, FS_VEC4_TYPE_NONE, 2,
4892                       NUM_SCALAR_SLOTS, false, false, 0, progress);
4893 
4894       fs_assign_slots(linkage, assigned_mask, NULL,
4895                       linkage->xfb16_only_mask, FS_VEC4_TYPE_NONE, 1,
4896                       NUM_SCALAR_SLOTS, false, false, 0, progress);
4897 
4898       /* Color varyings are only compacted among themselves. */
4899       /* Set whether the shader contains any color varyings. */
4900       unsigned col0 = VARYING_SLOT_COL0 * 8;
4901       bool has_colors =
4902          !BITSET_TEST_RANGE_INSIDE_WORD(linkage->interp_fp32_mask, col0,
4903                                         col0 + 15, 0) ||
4904          !BITSET_TEST_RANGE_INSIDE_WORD(linkage->convergent32_mask, col0,
4905                                         col0 + 15, 0) ||
4906          !BITSET_TEST_RANGE_INSIDE_WORD(linkage->color32_mask, col0,
4907                                         col0 + 15, 0) ||
4908          !BITSET_TEST_RANGE_INSIDE_WORD(linkage->flat32_mask, col0,
4909                                         col0 + 15, 0) ||
4910          !BITSET_TEST_RANGE_INSIDE_WORD(linkage->xfb32_only_mask, col0,
4911                                         col0 + 15, 0);
4912 
4913       for (unsigned i = 0; i < NUM_INTERP_QUALIFIERS; i++) {
4914          has_colors |=
4915             !BITSET_TEST_RANGE_INSIDE_WORD(linkage->interp_fp32_qual_masks[i],
4916                                            col0, col0 + 15, 0);
4917       }
4918       for (unsigned i = 0; i < NUM_COLOR_QUALIFIERS; i++) {
4919          has_colors |=
4920             !BITSET_TEST_RANGE_INSIDE_WORD(linkage->color32_qual_masks[i],
4921                                            col0, col0 + 15, 0);
4922       }
4923 
4924       if (has_colors) {
4925          unsigned color_channel_rotate = 0;
4926 
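         /* BITSET_LAST_BIT() gives one past the highest 16-bit scalar slot
          * assigned so far; DIV_ROUND_UP(..., 2) converts that to 32-bit
          * channels and % 4 picks the channel within a vec4, so color
          * components are rotated to start right after the varyings
          * assigned above.
          */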
4927          if (linkage->consumer_builder.shader->options->io_options &
4928              nir_io_compaction_rotates_color_channels) {
4929             color_channel_rotate =
4930                DIV_ROUND_UP(BITSET_LAST_BIT(assigned_mask), 2) % 4;
4931          }
4932 
4933          if (linkage->has_flexible_interp) {
4934             fs_assign_slot_groups(linkage, assigned_mask, assigned_fs_vec4_type,
4935                                   linkage->interp_fp32_mask, linkage->flat32_mask,
4936                                   linkage->convergent32_mask, linkage->color32_mask,
4937                                   FS_VEC4_TYPE_INTERP_FP32, 2, true,
4938                                   color_channel_rotate, progress);
4939          } else {
4940             fs_assign_slot_groups_separate_qual(
4941                      linkage, assigned_mask, assigned_fs_vec4_type,
4942                      &linkage->interp_fp32_qual_masks, linkage->flat32_mask,
4943                      linkage->convergent32_mask, &linkage->color32_qual_masks,
4944                      FS_VEC4_TYPE_INTERP_FP32_PERSP_PIXEL, 2, true,
4945                      color_channel_rotate, progress);
4946          }
4947 
4948          /* Put transform-feedback-only outputs last. */
4949          fs_assign_slots(linkage, assigned_mask, NULL,
4950                          linkage->xfb32_only_mask, FS_VEC4_TYPE_NONE, 2,
4951                          NUM_SCALAR_SLOTS, false, true, color_channel_rotate,
4952                          progress);
4953       }
4954       return;
4955    }
4956 
4957    /* If we get here, the consumer can only be TCS, TES, or GS.
4958     *
4959     * "use_pos" says whether the driver prefers that compaction with non-FS
4960     * consumers puts varyings into POS first before using any VARn.
4961     */
4962    bool use_pos = !(linkage->producer_builder.shader->options->io_options &
4963                     nir_io_dont_use_pos_for_non_fs_varyings);
4964    unsigned slot_index = (use_pos ? VARYING_SLOT_POS
4965                                   : VARYING_SLOT_VAR0) * 8;
4966 
4967    if (linkage->consumer_stage == MESA_SHADER_TESS_CTRL) {
4968       /* Make tcs_cross_invoc*_mask bits disjoint with flat*_mask bits
4969        * because tcs_cross_invoc*_mask is initially a subset of flat*_mask,
4970        * but we must assign each scalar slot only once.
4971        */
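      /* (BITSET_ANDNOT(dst, a, b) computes dst = a & ~b.) */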
4972       BITSET_ANDNOT(linkage->flat32_mask, linkage->flat32_mask,
4973                     linkage->tcs_cross_invoc32_mask);
4974       BITSET_ANDNOT(linkage->flat16_mask, linkage->flat16_mask,
4975                     linkage->tcs_cross_invoc16_mask);
4976 
4977       /* Put cross-invocation-accessed TCS inputs first. */
4978       vs_tcs_tes_gs_assign_slots_2sets(linkage, linkage->tcs_cross_invoc32_mask,
4979                                        linkage->tcs_cross_invoc16_mask,
4980                                        &slot_index, NULL, progress);
4981       /* Remaining TCS inputs. */
4982       vs_tcs_tes_gs_assign_slots_2sets(linkage, linkage->flat32_mask,
4983                                        linkage->flat16_mask, &slot_index,
4984                                        NULL, progress);
4985       return;
4986    }
4987 
4988    if (linkage->consumer_stage == MESA_SHADER_TESS_EVAL) {
4989       unsigned patch_slot_index = VARYING_SLOT_PATCH0 * 8;
4990 
4991       vs_tcs_tes_gs_assign_slots_2sets(linkage, linkage->flat32_mask,
4992                                        linkage->flat16_mask, &slot_index,
4993                                        &patch_slot_index, progress);
4994 
4995       /* Put no-varying slots last. These are TCS outputs read by TCS but
4996        * not TES.
4997        */
4998       vs_tcs_tes_gs_assign_slots_2sets(linkage, linkage->no_varying32_mask,
4999                                        linkage->no_varying16_mask, &slot_index,
5000                                        &patch_slot_index, progress);
5001       return;
5002    }
5003 
5004    assert(linkage->consumer_stage == MESA_SHADER_GEOMETRY);
5005    vs_tcs_tes_gs_assign_slots_2sets(linkage, linkage->flat32_mask,
5006                                     linkage->flat16_mask, &slot_index,
5007                                     NULL, progress);
5008 }
5009 
5010 /******************************************************************
5011  * PUTTING IT ALL TOGETHER
5012  ******************************************************************/
5013 
5014 /* A costing function that estimates the cost of a uniform expression to
5015  * decide whether it's worth propagating from output stores to the next
5016  * shader stage. It tries to model the instruction cost of a scalar desktop GPU.
5017  *
5018  * It's used by uniform expression propagation when a driver provides a cost
5019  * limit for such an optimization but doesn't provide its own costing
5020  * function, which is the case for most drivers.
5021  */
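/* A minimal sketch of how a driver opts in, based on the option fields that
 * init_linkage() reads below (the exact value and callback names are only
 * illustrative):
 *
 *    options.max_varying_expression_cost = 4;
 *
 * or, for finer control:
 *
 *    options.varying_expression_max_cost = my_cost_limit_cb;
 *    options.varying_estimate_instr_cost = my_instr_cost_cb;
 */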
5022 static unsigned
5023 default_varying_estimate_instr_cost(nir_instr *instr)
5024 {
5025    unsigned dst_bit_size, src_bit_size, num_dst_dwords;
5026    nir_op alu_op;
5027 
5028    switch (instr->type) {
5029    case nir_instr_type_alu:
5030       dst_bit_size = nir_instr_as_alu(instr)->def.bit_size;
5031       src_bit_size = nir_instr_as_alu(instr)->src[0].src.ssa->bit_size;
5032       alu_op = nir_instr_as_alu(instr)->op;
5033       num_dst_dwords = DIV_ROUND_UP(dst_bit_size, 32);
5034 
5035       switch (alu_op) {
5036       /* Moves are free. */
5037       case nir_op_mov:
5038       case nir_op_vec2:
5039       case nir_op_vec3:
5040       case nir_op_vec4:
5041       case nir_op_vec5:
5042       case nir_op_vec8:
5043       case nir_op_vec16:
5044       /* These are usually folded into FP instructions as src or dst
5045        * modifiers.
5046        */
5047       case nir_op_fabs:
5048       case nir_op_fneg:
5049       case nir_op_fsat:
5050          return 0;
5051 
5052       /* 16-bit multiplication should be cheap. Greater sizes not so much. */
5053       case nir_op_imul:
5054       case nir_op_umul_low:
5055       case nir_op_imul_2x32_64:
5056       case nir_op_umul_2x32_64:
5057          return dst_bit_size <= 16 ? 1 : 4 * num_dst_dwords;
5058 
5059       /* High bits of 64-bit multiplications. */
5060       case nir_op_imul_high:
5061       case nir_op_umul_high:
5062       /* Lowered into multiple instructions typically. */
5063       case nir_op_fsign:
5064          return 4;
5065 
5066       /* Transcendental opcodes typically run at 1/4 rate of FMA. */
5067       case nir_op_fexp2:
5068       case nir_op_flog2:
5069       case nir_op_frcp:
5070       case nir_op_frsq:
5071       case nir_op_fsqrt:
5072       case nir_op_fsin:
5073       case nir_op_fcos:
5074       case nir_op_fsin_amd:
5075       case nir_op_fcos_amd:
5076          /* FP64 is usually much slower. */
5077          return dst_bit_size == 64 ? 32 : 4;
5078 
5079       case nir_op_fpow:
5080          return 4 + 1 + 4; /* log2 + mul + exp2 */
5081 
5082       /* Integer division is slow. */
5083       case nir_op_idiv:
5084       case nir_op_udiv:
5085       case nir_op_imod:
5086       case nir_op_umod:
5087       case nir_op_irem:
5088          return dst_bit_size == 64 ? 80 : 40;
5089 
5090       case nir_op_fdiv:
5091          return dst_bit_size == 64 ? 80 : 5; /* FP16 & FP32: rcp + mul */
5092 
5093       case nir_op_fmod:
5094       case nir_op_frem:
5095          return dst_bit_size == 64 ? 80 : 8;
5096 
5097       default:
5098          /* FP64 is usually much slower. */
5099          if ((dst_bit_size == 64 &&
5100               nir_op_infos[alu_op].output_type & nir_type_float) ||
5101              (src_bit_size == 64 &&
5102               nir_op_infos[alu_op].input_types[0] & nir_type_float))
5103             return 16;
5104 
5105          /* 1 per 32-bit result. */
5106          return DIV_ROUND_UP(MAX2(dst_bit_size, src_bit_size), 32);
5107       }
5108 
5109    case nir_instr_type_intrinsic:
5110       dst_bit_size = nir_instr_as_intrinsic(instr)->def.bit_size;
5111       num_dst_dwords = DIV_ROUND_UP(dst_bit_size, 32);
5112 
5113       /* This can only be a uniform load. Other intrinsics and variables are
5114        * rejected before this is called.
5115        */
5116       switch (nir_instr_as_intrinsic(instr)->intrinsic) {
5117       case nir_intrinsic_load_deref:
5118          /* Uniform loads can appear fast if latency hiding is effective. */
5119          return 2 * num_dst_dwords;
5120 
5121       default:
5122          unreachable("unexpected intrinsic");
5123       }
5124 
5125    case nir_instr_type_deref: {
5126       nir_deref_instr *deref = nir_instr_as_deref(instr);
5127 
5128       switch (deref->deref_type) {
5129       case nir_deref_type_var:
5130       case nir_deref_type_struct:
5131          return 0;
5132       case nir_deref_type_array:
5133          /* Indexing uniforms with a divergent index has a high cost. This cost
5134           * is likely only going to be accepted by the driver if the next
5135           * shader doesn't run after amplification (e.g. VS->TCS, TES->GS).
5136           */
5137          return nir_src_is_const(deref->arr.index) ? 0 : 128;
5138 
5139       default:
5140          unreachable("unexpected deref type");
5141       }
5142    }
5143 
5144    default:
5145       unreachable("unexpected instr type");
5146    }
5147 }
5148 
5149 static void
5150 init_linkage(nir_shader *producer, nir_shader *consumer, bool spirv,
5151              unsigned max_uniform_components, unsigned max_ubos_per_stage,
5152              struct linkage_info *linkage, nir_opt_varyings_progress *progress)
5153 {
5154    *linkage = (struct linkage_info){
5155       .spirv = spirv,
5156       .can_mix_convergent_flat_with_interpolated =
5157          consumer->info.stage == MESA_SHADER_FRAGMENT &&
5158          consumer->options->io_options &
5159          nir_io_mix_convergent_flat_with_interpolated,
5160       .has_flexible_interp =
5161          consumer->info.stage == MESA_SHADER_FRAGMENT &&
5162          consumer->options->io_options &
5163          nir_io_has_flexible_input_interpolation_except_flat,
5164       .always_interpolate_convergent_fs_inputs =
5165          consumer->info.stage == MESA_SHADER_FRAGMENT &&
5166          consumer->options->io_options &
5167          nir_io_always_interpolate_convergent_fs_inputs,
5168       .producer_stage = producer->info.stage,
5169       .consumer_stage = consumer->info.stage,
5170       .producer_builder =
5171          nir_builder_create(nir_shader_get_entrypoint(producer)),
5172       .consumer_builder =
5173          nir_builder_create(nir_shader_get_entrypoint(consumer)),
5174 
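      /* Prefer the driver's cost-limit callback when provided, otherwise use
       * its constant limit; likewise use the driver's instruction cost model
       * if present, else the default one above.
       */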
5175       .max_varying_expression_cost =
5176          producer->options->varying_expression_max_cost ?
5177             producer->options->varying_expression_max_cost(producer, consumer) :
5178             producer->options->max_varying_expression_cost,
5179       .varying_estimate_instr_cost =
5180          producer->options->varying_estimate_instr_cost ?
5181             producer->options->varying_estimate_instr_cost :
5182             default_varying_estimate_instr_cost,
5183 
5184       .linear_mem_ctx = linear_context(ralloc_context(NULL)),
5185    };
5186 
5187    for (unsigned i = 0; i < ARRAY_SIZE(linkage->slot); i++) {
5188       list_inithead(&linkage->slot[i].producer.loads);
5189       list_inithead(&linkage->slot[i].producer.stores);
5190       list_inithead(&linkage->slot[i].consumer.loads);
5191    }
5192 
5193    /* Preparation. */
5194    nir_shader_intrinsics_pass(consumer, gather_inputs, 0, linkage);
5195    nir_shader_intrinsics_pass(producer, gather_outputs, 0, linkage);
5196    tidy_up_indirect_varyings(linkage);
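   /* Apply the driver-provided limits to decide which uniform and UBO loads
    * are movable into the consumer.
    */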
5197    determine_uniform_movability(linkage, max_uniform_components);
5198    determine_ubo_movability(linkage, max_ubos_per_stage);
5199    /* This must always be done because it also cleans up bitmasks. */
5200    remove_dead_varyings(linkage, progress);
5201 }
5202 
5203 static void
5204 free_linkage(struct linkage_info *linkage)
5205 {
5206    ralloc_free(ralloc_parent_of_linear_context(linkage->linear_mem_ctx));
5207 }
5208 
5209 static void
5210 print_shader_linkage(nir_shader *producer, nir_shader *consumer)
5211 {
5212    struct linkage_info *linkage = MALLOC_STRUCT(linkage_info);
5213    nir_opt_varyings_progress progress = 0;
5214 
5215    init_linkage(producer, consumer, false, 0, 0, linkage, &progress);
5216    print_linkage(linkage);
5217    free_linkage(linkage);
5218    FREE(linkage);
5219 }
5220 
5221 /**
5222  * Run lots of optimizations on varyings. See the description at the beginning
5223  * of this file.
5224  */
5225 nir_opt_varyings_progress
5226 nir_opt_varyings(nir_shader *producer, nir_shader *consumer, bool spirv,
5227                  unsigned max_uniform_components, unsigned max_ubos_per_stage)
5228 {
5229    /* Task -> Mesh I/O uses payload variables and not varying slots,
5230     * so this pass can't do anything about it.
5231     */
5232    if (producer->info.stage == MESA_SHADER_TASK)
5233       return 0;
5234 
5235    nir_opt_varyings_progress progress = 0;
5236    struct linkage_info *linkage = MALLOC_STRUCT(linkage_info);
5237    if (linkage == NULL)
5238       return 0;
5239 
5240    /* Producers before a fragment shader must have up-to-date vertex
5241     * divergence information.
5242     */
5243    if (consumer->info.stage == MESA_SHADER_FRAGMENT) {
5244       nir_vertex_divergence_analysis(producer);
5245    }
5246 
5247    /* This also removes dead varyings. */
5248    init_linkage(producer, consumer, spirv, max_uniform_components,
5249                 max_ubos_per_stage, linkage, &progress);
5250 
5251    /* Part 1: Run optimizations that only remove varyings (though they may
5252     * move instructions between shaders in the process).
5253     */
5254    propagate_uniform_expressions(linkage, &progress);
5255 
5256    /* Part 2: Deduplicate outputs. */
5257    deduplicate_outputs(linkage, &progress);
5258 
5259    /* Run CSE on the consumer after output deduplication because duplicated
5260     * loads can prevent finding the post-dominator for inter-shader code
5261     * motion.
5262     */
5263    NIR_PASS(_, consumer, nir_opt_cse);
5264 
5265    /* Re-gather linkage info after CSE. */
5266    free_linkage(linkage);
5267    init_linkage(producer, consumer, spirv, max_uniform_components,
5268                 max_ubos_per_stage, linkage, &progress);
5269 
5270    /* This must be done after deduplication and before inter-shader code
5271     * motion.
5272     */
5273    tidy_up_convergent_varyings(linkage);
5274    find_open_coded_tes_input_interpolation(linkage);
5275 
5276    /* Part 3: Run optimizations that completely change varyings. */
5277 #if PRINT
5278    int i = 0;
5279    puts("Before:");
5280    nir_print_shader(linkage->producer_builder.shader, stdout);
5281    nir_print_shader(linkage->consumer_builder.shader, stdout);
5282    print_linkage(linkage);
5283    puts("");
5284 #endif
5285 
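   /* Repeat until no more instructions can be moved into the producer. */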
5286    while (backward_inter_shader_code_motion(linkage, &progress)) {
5287 #if PRINT
5288       i++;
5289       printf("Finished: %i\n", i);
5290       nir_print_shader(linkage->producer_builder.shader, stdout);
5291       nir_print_shader(linkage->consumer_builder.shader, stdout);
5292       print_linkage(linkage);
5293       puts("");
5294 #endif
5295    }
5296 
5297    /* Part 4: Do compaction. */
5298    compact_varyings(linkage, &progress);
5299 
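   /* This pass doesn't change control flow, so block-index and dominance
    * metadata stay valid for a shader that was modified; a shader that wasn't
    * touched keeps all of its metadata.
    */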
5300    nir_metadata_preserve(linkage->producer_builder.impl,
5301                          progress & nir_progress_producer ?
5302                             (nir_metadata_control_flow) :
5303                             nir_metadata_all);
5304    nir_metadata_preserve(linkage->consumer_builder.impl,
5305                          progress & nir_progress_consumer ?
5306                             (nir_metadata_control_flow) :
5307                             nir_metadata_all);
5308    free_linkage(linkage);
5309    FREE(linkage);
5310 
5311    /* Compaction moves CLIP_DIST and CULL_DIST outputs to VARn if the next
5312     * shader is not FS. Clear those fields in shader_info.
5313     */
5314    if (consumer->info.stage <= MESA_SHADER_GEOMETRY) {
5315       producer->info.clip_distance_array_size = 0;
5316       producer->info.cull_distance_array_size = 0;
5317    }
5318 
5319    if (progress & nir_progress_producer)
5320       nir_validate_shader(producer, "nir_opt_varyings");
5321    if (progress & nir_progress_consumer)
5322       nir_validate_shader(consumer, "nir_opt_varyings");
5323 
5324    return progress;
5325 }
5326