• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright 2012 Advanced Micro Devices, Inc.
3  *
4  * SPDX-License-Identifier: MIT
5  */
6 
7 /* The compiler middle-end architecture: Explaining (non-)monolithic shaders
8  * -------------------------------------------------------------------------
9  *
10  * Typically, there is a one-to-one correspondence between API and HW shaders,
11  * that is, for every API shader, there is exactly one shader binary in
12  * the driver.
13  *
14  * The problem with that is that we also have to emulate some API states
15  * (e.g. alpha-test, and many others) in shaders too. The two obvious ways
16  * to deal with it are:
17  * - each shader has multiple variants for each combination of emulated states,
18  *   and the variants are compiled on demand, possibly relying on a shader
19  *   cache for good performance
20  * - patch shaders at the binary level
21  *
22  * This driver uses something completely different. The emulated states are
23  * usually implemented at the beginning or end of shaders. Therefore, we can
24  * split the shader into 3 parts:
25  * - prolog part (shader code dependent on states)
26  * - main part (the API shader)
27  * - epilog part (shader code dependent on states)
28  *
29  * Each part is compiled as a separate shader and the final binaries are
30  * concatenated. This type of shader is called non-monolithic, because it
31  * consists of multiple independent binaries. Creating a new shader variant
32  * is therefore only a concatenation of shader parts (binaries) and doesn't
33  * involve any compilation. The main shader parts are the only parts that are
34  * compiled when applications create shader objects. The prolog and epilog
35  * parts are compiled on the first use and saved, so that their binaries can
36  * be reused by many other shaders.
37  *
38  * One of the roles of the prolog part is to compute vertex buffer addresses
39  * for vertex shaders. A few of the roles of the epilog part are color buffer
40  * format conversions in pixel shaders that we have to do manually, and write
41  * tessellation factors in tessellation control shaders. The prolog and epilog
42  * have many other important responsibilities in various shader stages.
43  * They don't just "emulate legacy stuff".
44  *
45  * Monolithic shaders are shaders where the parts are combined before LLVM
46  * compilation, and the whole thing is compiled and optimized as one unit with
47  * one binary on the output. The result is the same as the non-monolithic
48  * shader, but the final code can be better, because LLVM can optimize across
49  * all shader parts. Monolithic shaders aren't usually used except for these
50  * special cases:
51  *
52  * 1) Some rarely-used states require modification of the main shader part
53  *    itself, and in such cases, only the monolithic shader variant is
54  *    compiled, and that's always done on the first use.
55  *
56  * 2) When we do cross-stage optimizations for separate shader objects and
57  *    e.g. eliminate unused shader varyings, the resulting optimized shader
58  *    variants are always compiled as monolithic shaders, and always
59  *    asynchronously (i.e. not stalling ongoing rendering). We call them
60  *    "optimized monolithic" shaders. The important property here is that
61  *    the non-monolithic unoptimized shader variant is always available for use
62  *    when the asynchronous compilation of the optimized shader is not done
63  *    yet.
64  *
65  * Starting with GFX9 chips, some shader stages are merged, and the number of
66  * shader parts per shader increased. The complete new list of shader parts is:
67  * - 1st shader: prolog part
68  * - 1st shader: main part
69  * - 2nd shader: main part
70  * - 2nd shader: epilog part
71  */
72 
73 /* How linking shader inputs and outputs between vertex, tessellation, and
74  * geometry shaders works.
75  *
76  * Inputs and outputs between shaders are stored in a buffer. This buffer
77  * lives in LDS (typical case for tessellation), but it can also live
78  * in memory (ESGS). Each input or output has a fixed location within a vertex.
79  * The highest used input or output determines the stride between vertices.
80  *
81  * Since GS and tessellation are only possible in the OpenGL core profile,
82  * only these semantics are valid for per-vertex data:
83  *
84  *   Name             Location
85  *
86  *   POSITION         0
87  *   VAR0..31         1..32
88  *   CLIP_DIST0..1    49..50
89  *   PSIZ             51
90  *
91  * For example, a shader only writing VAR0 (location 1) has the output stride of 2.
92  *
93  * Only these semantics are valid for per-patch data:
94  *
95  *   Name             Location
96  *
97  *   TESSOUTER        0
98  *   TESSINNER        1
99  *   PATCH0..29       2..31
100  *
101  * That's how independent shaders agree on input and output locations.
102  * The si_shader_io_get_unique_index function assigns the locations.
103  *
104  * For tessellation, other required information for calculating the input and
105  * output addresses like the vertex stride, the patch stride, and the offsets
106  * where per-vertex and per-patch data start, is passed to the shader via
107  * user data SGPRs. The offsets and strides are calculated at draw time and
108  * aren't available at compile time.
109  */
110 
111 #ifndef SI_SHADER_H
112 #define SI_SHADER_H
113 
114 #include "shader_info.h"
115 #include "ac_binary.h"
116 #include "ac_gpu_info.h"
117 #include "util/mesa-sha1.h"
118 #include "util/u_live_shader_cache.h"
119 #include "util/u_queue.h"
120 #include "si_pm4.h"
121 
122 #ifdef __cplusplus
123 extern "C" {
124 #endif
125 
126 struct nir_shader;
127 struct nir_instr;
128 struct nir_lower_subgroups_options;
129 
130 #define SI_NUM_INTERP     32 /* array length of si_shader_binary_info::ps_inputs */
131 #define SI_MAX_ATTRIBS    16 /* array length of si_shader_key_ge::mono.vs_fix_fetch */
132 #define SI_MAX_VS_OUTPUTS 40
133 #define SI_USER_CLIP_PLANE_MASK  0x3F /* the low 6 bits */
134 
135 #define INTERP_MODE_COLOR  INTERP_MODE_COUNT /* alias for INTERP_MODE_COUNT */
136 
137 #define SI_PS_INPUT_CNTL_0000          (S_028644_OFFSET(0x20) | S_028644_DEFAULT_VAL(0))
138 #define SI_PS_INPUT_CNTL_0001          (S_028644_OFFSET(0x20) | S_028644_DEFAULT_VAL(3))
139 #define SI_PS_INPUT_CNTL_UNUSED        SI_PS_INPUT_CNTL_0000 /* unused inputs use the 0000 setting */
140 /* D3D9 behaviour for COLOR0 requires 0001. GL is undefined. */
141 #define SI_PS_INPUT_CNTL_UNUSED_COLOR0 SI_PS_INPUT_CNTL_0001
142 
143 #define SI_VECTOR_ARG_IS_COLOR               BITFIELD_BIT(0)
144 #define SI_VECTOR_ARG_COLOR_COMPONENT(x)     (((x) & 0x7) << 1) /* packs a 3-bit value into bits 1..3 */
145 #define SI_GET_VECTOR_ARG_COLOR_COMPONENT(x) (((x) >> 1) & 0x7) /* inverse of SI_VECTOR_ARG_COLOR_COMPONENT */
146 
147 /* SGPR user data indices. The per-stage groups below restart from shared bases, so enumerators of different groups intentionally share values. */
148 enum
149 {
150    SI_SGPR_INTERNAL_BINDINGS, /* = 0 */
151    SI_SGPR_BINDLESS_SAMPLERS_AND_IMAGES, /* = 1 */
152    SI_SGPR_CONST_AND_SHADER_BUFFERS, /* = 2; or just a constant buffer 0 pointer */
153    SI_SGPR_SAMPLERS_AND_IMAGES, /* = 3 */
154    SI_NUM_RESOURCE_SGPRS, /* = 4 */
155 
156    /* API VS, TES without GS, GS copy shader */
157    SI_SGPR_VS_STATE_BITS = SI_NUM_RESOURCE_SGPRS, /* = 4 */
158    SI_NUM_VS_STATE_RESOURCE_SGPRS, /* = 5 */
159 
160    /* all VS variants */
161    SI_SGPR_BASE_VERTEX = SI_NUM_VS_STATE_RESOURCE_SGPRS, /* = 5 */
162    SI_SGPR_DRAWID, /* = 6 */
163    SI_SGPR_START_INSTANCE, /* = 7 */
164    SI_VS_NUM_USER_SGPR, /* = 8 */
165 
166    SI_SGPR_VS_BLIT_DATA = SI_SGPR_CONST_AND_SHADER_BUFFERS, /* = 2 */
167 
168    /* TES */
169    SI_SGPR_TES_OFFCHIP_LAYOUT = SI_NUM_VS_STATE_RESOURCE_SGPRS, /* = 5 */
170    SI_SGPR_TES_OFFCHIP_ADDR, /* = 6 */
171    SI_TES_NUM_USER_SGPR, /* = 7 */
172 
173    /* GFX6-8: TCS only */
174    GFX6_SGPR_TCS_OFFCHIP_LAYOUT = SI_NUM_RESOURCE_SGPRS, /* = 4 */
175    GFX6_SGPR_TCS_OFFCHIP_ADDR, /* = 5 */
176    GFX6_SGPR_TCS_IN_LAYOUT, /* = 6 */
177    GFX6_TCS_NUM_USER_SGPR, /* = 7 */
178 
179    /* GFX9: Merged LS-HS (VS-TCS) only. */
180    GFX9_SGPR_TCS_OFFCHIP_LAYOUT = SI_VS_NUM_USER_SGPR, /* = 8 */
181    GFX9_SGPR_TCS_OFFCHIP_ADDR, /* = 9 */
182    GFX9_TCS_NUM_USER_SGPR, /* = 10 */
183 
184    /* GS limits */
185    GFX6_GS_NUM_USER_SGPR = SI_NUM_RESOURCE_SGPRS, /* = 4 */
186    SI_GSCOPY_NUM_USER_SGPR = SI_NUM_VS_STATE_RESOURCE_SGPRS, /* = 5 */
187 
188    GFX9_SGPR_SMALL_PRIM_CULL_INFO = MAX2(SI_VS_NUM_USER_SGPR, SI_TES_NUM_USER_SGPR), /* = 8, assuming MAX2 yields the larger argument */
189    GFX9_SGPR_ATTRIBUTE_RING_ADDR, /* = 9 under the same assumption */
190    GFX9_GS_NUM_USER_SGPR, /* = 10 under the same assumption */
191 
192    /* PS only */
193    SI_SGPR_ALPHA_REF = SI_NUM_RESOURCE_SGPRS, /* = 4 */
194    SI_PS_NUM_USER_SGPR, /* = 5 */
195 
196    /* The value has to be 12, because the hw requires that descriptors
197     * are aligned to 4 SGPRs.
198     */
199    SI_SGPR_VS_VB_DESCRIPTOR_FIRST = 12,
200 };
201 
202 /* LLVM function parameter indices */
203 enum
204 {
205    SI_NUM_RESOURCE_PARAMS = 4,
206 
207    /* PS only parameters */
208    SI_PARAM_ALPHA_REF = SI_NUM_RESOURCE_PARAMS, /* = 4 */
209    SI_PARAM_PRIM_MASK, /* = 5 */
210    SI_PARAM_PERSP_SAMPLE, /* = 6 */
211    SI_PARAM_PERSP_CENTER, /* = 7 */
212    SI_PARAM_PERSP_CENTROID, /* = 8 */
213    SI_PARAM_PERSP_PULL_MODEL, /* = 9 */
214    SI_PARAM_LINEAR_SAMPLE, /* = 10 */
215    SI_PARAM_LINEAR_CENTER, /* = 11 */
216    SI_PARAM_LINEAR_CENTROID, /* = 12 */
217    SI_PARAM_LINE_STIPPLE_TEX, /* = 13 */
218    SI_PARAM_POS_X_FLOAT, /* = 14 */
219    SI_PARAM_POS_Y_FLOAT, /* = 15 */
220    SI_PARAM_POS_Z_FLOAT, /* = 16 */
221    SI_PARAM_POS_W_FLOAT, /* = 17 */
222    SI_PARAM_FRONT_FACE, /* = 18 */
223    SI_PARAM_ANCILLARY, /* = 19 */
224    SI_PARAM_SAMPLE_COVERAGE, /* = 20 */
225    SI_PARAM_POS_FIXED_PT, /* = 21 */
226 
227    SI_NUM_PARAMS = SI_PARAM_POS_FIXED_PT + 9, /* = 30: +1 turns the last index into a count, +8 for COLOR[0..1] */
228 };
229 
230 /* These fields are only set in current_vs_state (except INDEXED) in si_context, and they are
231  * accessible in the shader via vs_state_bits in VS, TES, and GS.
232  */
233 #define VS_STATE_CLAMP_VERTEX_COLOR__SHIFT   0 /* bit 0 */
234 #define VS_STATE_CLAMP_VERTEX_COLOR__MASK    0x1 /* Shared by VS and GS */
235 #define VS_STATE_INDEXED__SHIFT              1 /* bit 1 */
236 #define VS_STATE_INDEXED__MASK               0x1 /* Shared by VS and GS */
237 
238 /* These fields are only set in current_vs_state in si_context, and they are accessible
239  * in the shader via vs_state_bits in LS/HS.
240  */
241 /* bit gap (bits 2..9 are not assigned by the defines in this group) */
242 /* TCS output patch0 offset for per-patch outputs / 4
243  * - 64 outputs are implied by SI_UNIQUE_SLOT_* values.
244  * - max = 32(CPs) * 64(outputs) * 16(vec4) * 64(num_patches) * 2(inputs + outputs) / 4
245  *       = 1M, clamped to 32K(LDS limit) / 4 = 8K
246  * - only used by si_llvm_tcs_build_end, it can be removed after NIR lowering replaces it
247  */
248 #define VS_STATE_TCS_OUT_PATCH0_OFFSET__SHIFT   10 /* bits 10..23 */
249 #define VS_STATE_TCS_OUT_PATCH0_OFFSET__MASK    0x3fff /* 14 bits */
250 #define VS_STATE_LS_OUT_VERTEX_SIZE__SHIFT      24 /* bits 24..31 */
251 #define VS_STATE_LS_OUT_VERTEX_SIZE__MASK       0xff /* max 32 * 4 + 1 (to reduce LDS bank conflicts) */
252 
253 /* These fields are only set in current_gs_state in si_context, and they are accessible
254  * in the shader via vs_state_bits in legacy GS, the GS copy shader, and any NGG shader.
255  */
256 /* bit gap */
257 /* The number of ES outputs is derived from the last output index of SI_UNIQUE_SLOT_* + 1, which
258  * can be 55 at most. The ESGS vertex stride in dwords is: NUM_ES_OUTPUTS * 4 + 1
259  * Only used by GFX9+ to compute LDS addresses of GS inputs.
260  */
261 #define GS_STATE_NUM_ES_OUTPUTS__SHIFT          13 /* bits 13..18 */
262 #define GS_STATE_NUM_ES_OUTPUTS__MASK           0x3f /* 6 bits, enough for the maximum of 55 */
263 /* Small prim filter precision = num_samples / quant_mode, which can only be equal to 1/2^n
264  * where n is between 4 and 12. Knowing that, we only need to store 4 bits of the FP32 exponent.
265  * Set it like this: value = (fui(num_samples / quant_mode) >> 23) & 0xf;
266  * Expand to FP32 like this: ((0x70 | value) << 23);
267  * With 0x70 = 112, we get 2^(112 + value - 127) = 2^(value - 15), which is always a negative
268  * exponent and it's equal to 1/2^(15 - value).
269  */
270 #define GS_STATE_SMALL_PRIM_PRECISION_NO_AA__SHIFT 19 /* bits 19..22 */
271 #define GS_STATE_SMALL_PRIM_PRECISION_NO_AA__MASK  0xf
272 #define GS_STATE_SMALL_PRIM_PRECISION__SHIFT    23 /* bits 23..26 */
273 #define GS_STATE_SMALL_PRIM_PRECISION__MASK     0xf
274 #define GS_STATE_STREAMOUT_QUERY_ENABLED__SHIFT 27 /* bit 27 */
275 #define GS_STATE_STREAMOUT_QUERY_ENABLED__MASK  0x1
276 #define GS_STATE_PROVOKING_VTX_FIRST__SHIFT     28 /* bit 28 */
277 #define GS_STATE_PROVOKING_VTX_FIRST__MASK      0x1
278 #define GS_STATE_OUTPRIM__SHIFT                 29 /* bits 29..30 */
279 #define GS_STATE_OUTPRIM__MASK                  0x3
280 #define GS_STATE_PIPELINE_STATS_EMU__SHIFT      31 /* bit 31 */
281 #define GS_STATE_PIPELINE_STATS_EMU__MASK       0x1
282 
283 #define ENCODE_FIELD(field, value) (((unsigned)(value) & field##__MASK) << field##__SHIFT) /* masks value to the field width, then shifts it into place */
284 #define CLEAR_FIELD(field) (~((unsigned)field##__MASK << field##__SHIFT)) /* AND-mask with the field's bits cleared */
285 
286 /* This is called by functions that change states. It asserts that value fits in the field, then replaces the field's bits in var. Both var and value are expanded more than once, so pass expressions without side effects. */
287 #define SET_FIELD(var, field, value) do { \
288    assert((value) == ((unsigned)(value) & field##__MASK)); \
289    (var) &= CLEAR_FIELD(field); \
290    (var) |= ENCODE_FIELD(field, value); \
291 } while (0)
292 
293 /* This is called during shader compilation and returns LLVMValueRef. The field is read from (ctx)->args->vs_state_bits; its bit width is taken as the population count of field##__MASK. */
294 #define GET_FIELD(ctx, field) si_unpack_param((ctx), (ctx)->args->vs_state_bits, field##__SHIFT, \
295                                              util_bitcount(field##__MASK))
296 
297 enum
298 {
299    /* These represent the number of SGPRs the shader uses. */
300    SI_VS_BLIT_SGPRS_POS = 3,
301    SI_VS_BLIT_SGPRS_POS_COLOR = 7,
302    SI_VS_BLIT_SGPRS_POS_TEXCOORD = 9,
303 
304    MAX_SI_VS_BLIT_SGPRS = 10, /* largest count above (9) +1 for the attribute ring address */
305 };
306 
307 #define SI_NGG_CULL_TRIANGLES                (1 << 0)   /* this implies W, view.xy, and small prim culling */
308 #define SI_NGG_CULL_BACK_FACE                (1 << 1)   /* back faces */
309 #define SI_NGG_CULL_FRONT_FACE               (1 << 2)   /* front faces */
310 #define SI_NGG_CULL_LINES                    (1 << 3)   /* the primitive type is lines */
311 #define SI_NGG_CULL_SMALL_LINES_DIAMOND_EXIT (1 << 4)   /* cull small lines according to the diamond exit rule */
312 #define SI_NGG_CULL_CLIP_PLANE_ENABLE(enable) (((enable) & 0xff) << 5) /* 8-bit mask stored in bits 5..12 */
313 #define SI_NGG_CULL_GET_CLIP_PLANE_ENABLE(x)  (((x) >> 5) & 0xff) /* inverse of SI_NGG_CULL_CLIP_PLANE_ENABLE */
314 
315 struct si_shader_profile {
316    uint32_t sha1[SHA1_DIGEST_LENGTH32]; /* SHA-1 digest as 32-bit words; NOTE(review): presumably identifies the shader the profile applies to - confirm */
317    uint32_t options; /* NOTE(review): presumably a bitmask of SI_PROFILE_*, like si_shader_info::options - confirm */
318 };
319 
320 extern struct si_shader_profile si_shader_profiles[]; /* defined elsewhere; length is not visible here */
321 unsigned si_get_num_shader_profiles(void); /* NOTE(review): presumably the element count of si_shader_profiles - confirm */
322 
323 #define SI_PROFILE_WAVE32                    (1 << 0)
324 #define SI_PROFILE_GFX10_WAVE64              (1 << 1)
325 /* bit gap (bit 2 is unused) */
326 #define SI_PROFILE_VS_NO_BINNING             (1 << 3)
327 #define SI_PROFILE_GFX9_GFX10_PS_NO_BINNING  (1 << 4)
328 #define SI_PROFILE_CLAMP_DIV_BY_ZERO         (1 << 5)
329 
330 enum si_shader_dump_type { /* selects which kind of shader debug output is meant */
331    SI_DUMP_SHADER_KEY,
332    SI_DUMP_INIT_NIR,       /* initial input NIR when shaders are created (before lowering) */
333    SI_DUMP_NIR,            /* final NIR after lowering when shader variants are created */
334    SI_DUMP_INIT_LLVM_IR,   /* initial LLVM IR before optimizations */
335    SI_DUMP_LLVM_IR,        /* final LLVM IR */
336    SI_DUMP_INIT_ACO_IR,    /* initial ACO IR before optimizations */
337    SI_DUMP_ACO_IR,         /* final ACO IR */
338    SI_DUMP_ASM,            /* final asm shaders */
339    SI_DUMP_STATS,          /* print statistics as shader-db */
340    SI_DUMP_ALWAYS,         /* NOTE(review): presumably dumps regardless of the enabled dump options - confirm */
341 };
342 
343 enum {
344    SI_UNIQUE_SLOT_POS = 0,
345 
346    /* Some shader stages use the highest used IO index
347     * to determine the size to allocate for inputs/outputs
348     * (in LDS, tess and GS rings), so VARn should be placed right
349     * after POSITION to make that size as small as possible.
350     */
351    SI_UNIQUE_SLOT_VAR0 = 1, /* 0..31 */
352 
353    /* Put 16-bit GLES varyings after 32-bit varyings. They can use the same indices as
354     * legacy desktop GL varyings because they are mutually exclusive.
355     */
356    SI_UNIQUE_SLOT_VAR0_16BIT = 33, /* 0..15 */
357 
358    /* Legacy GL-only varyings can alias GLES-only 16-bit varyings. */
359    SI_UNIQUE_SLOT_FOGC = 33,
360    SI_UNIQUE_SLOT_COL0, /* = 34 */
361    SI_UNIQUE_SLOT_COL1, /* = 35 */
362    SI_UNIQUE_SLOT_BFC0, /* = 36 */
363    SI_UNIQUE_SLOT_BFC1, /* = 37 */
364    SI_UNIQUE_SLOT_TEX0, /* = 38 */
365    SI_UNIQUE_SLOT_TEX1, /* = 39 */
366    SI_UNIQUE_SLOT_TEX2, /* = 40 */
367    SI_UNIQUE_SLOT_TEX3, /* = 41 */
368    SI_UNIQUE_SLOT_TEX4, /* = 42 */
369    SI_UNIQUE_SLOT_TEX5, /* = 43 */
370    SI_UNIQUE_SLOT_TEX6, /* = 44 */
371    SI_UNIQUE_SLOT_TEX7, /* = 45 */
372    SI_UNIQUE_SLOT_CLIP_VERTEX, /* = 46 */
373 
374    /* Varyings present in both GLES and desktop GL must start at 49 after 16-bit varyings. */
375    SI_UNIQUE_SLOT_CLIP_DIST0 = 49,
376    SI_UNIQUE_SLOT_CLIP_DIST1, /* = 50 */
377    SI_UNIQUE_SLOT_PSIZ, /* = 51 */
378    /* These can't be written by LS, HS, and ES. */
379    SI_UNIQUE_SLOT_LAYER, /* = 52 */
380    SI_UNIQUE_SLOT_VIEWPORT, /* = 53 */
381    SI_UNIQUE_SLOT_PRIMITIVE_ID, /* = 54 */
382 };
383 
384 /**
385  * For VS shader keys, describe any fixups required for vertex fetch.
386  *
387  * \ref log_size, \ref format, and the number of channels are interpreted as
388  * by \ref ac_build_opencoded_load_format.
389  *
390  * Note: all bits 0 (size = 1 byte, num channels = 1, format = float) is an
391  * impossible format and indicates that no fixup is needed (just use
392  * buffer_load_format_xyzw).
393  */
394 union si_vs_fix_fetch {
395    struct {
396       uint8_t log_size : 2;        /* log2 of bytes per channel: 1, 2, 4, or 8 bytes */
397       uint8_t num_channels_m1 : 2; /* number of channels minus 1 */
398       uint8_t format : 3;          /* AC_FETCH_FORMAT_xxx */
399       uint8_t reverse : 1;         /* reverse XYZ channels */
400    } u;
401    uint8_t bits; /* all four fields viewed as one byte; 0 means no fixup (see above) */
402 };
403 
404 struct si_shader;
405 
406 /* State of the context creating the shader object. Embedded in si_shader_selector as compiler_ctx_state. */
407 struct si_compiler_ctx_state {
408    /* Should only be used by si_init_shader_selector_async and
409     * si_build_shader_variant if thread_index == -1 (non-threaded). */
410    struct ac_llvm_compiler *compiler;
411 
412    /* Used if thread_index == -1 or if debug.async is true. */
413    struct util_debug_callback debug;
414 
415    /* Used for creating the log string for gallium/ddebug. */
416    bool is_debug_context;
417 };
418 
419 enum si_color_output_type { /* stored as bit pairs in si_shader_info::output_color_types */
420    SI_TYPE_ANY32, /* = 0 */
421    SI_TYPE_FLOAT16, /* = 1 */
422    SI_TYPE_INT16, /* = 2 */
423    SI_TYPE_UINT16, /* = 3 */
424 };
425 
426 union si_input_info { /* element type of si_shader_info::input[] and si_shader_binary_info::ps_inputs[] */
427    struct {
428       uint8_t semantic;
429       uint8_t interpolate;
430       uint8_t fp16_lo_hi_valid;
431       uint8_t usage_mask;
432    };
433    uint32_t _unused; /* this just forces 4-byte alignment */
434 };
435 
436 struct si_shader_info { /* per-shader information; embedded in si_shader_selector as "info" */
437    shader_info base;
438 
439    uint32_t options; /* bitmask of SI_PROFILE_* */
440 
441    uint8_t num_inputs;
442    uint8_t num_outputs;
443    union si_input_info input[PIPE_MAX_SHADER_INPUTS];
444    uint8_t output_semantic[PIPE_MAX_SHADER_OUTPUTS];
445    uint8_t output_usagemask[PIPE_MAX_SHADER_OUTPUTS];
446    uint8_t output_readmask[PIPE_MAX_SHADER_OUTPUTS];
447    uint8_t output_streams[PIPE_MAX_SHADER_OUTPUTS];
448    uint8_t output_type[PIPE_MAX_SHADER_OUTPUTS]; /* enum nir_alu_type */
449 
450    uint8_t num_vs_inputs;
451    uint8_t num_vbos_in_user_sgprs;
452    uint8_t num_stream_output_components[4];
453    uint16_t enabled_streamout_buffer_mask;
454 
455    uint64_t inputs_read; /* "get_unique_index" bits */
456    uint64_t tcs_vgpr_only_inputs; /* TCS inputs that are only in VGPRs, not LDS. */
457 
458    uint64_t outputs_written_before_tes_gs; /* "get_unique_index" bits */
459    uint64_t outputs_written_before_ps; /* "get_unique_index" bits */
460    uint32_t patch_outputs_written;     /* "get_unique_index_patch" bits */
461 
462    uint8_t clipdist_mask;
463    uint8_t culldist_mask;
464 
465    uint16_t lshs_vertex_stride;
466    uint16_t esgs_vertex_stride;
467    uint16_t gsvs_vertex_size;
468    uint8_t gs_input_verts_per_prim;
469    unsigned max_gsvs_emit_size;
470 
471    /* Set 0xf or 0x0 (4 bits) per each written output.
472     * ANDed with spi_shader_col_format.
473     */
474    unsigned colors_written_4bit;
475 
476    int constbuf0_num_slots;
477    unsigned num_memory_stores; /* standard spelling of the type; the other counters in this struct use it too */
478    uint8_t color_attr_index[2];
479    uint8_t color_interpolate[2];
480    uint8_t color_interpolate_loc[2];
481    uint8_t colors_read; /**< which color components are read by the FS */
482    uint8_t colors_written;
483    uint16_t output_color_types; /**< Each bit pair is enum si_color_output_type */
484    bool color0_writes_all_cbufs; /**< gl_FragColor */
485    bool reads_samplemask;   /**< does fragment shader read sample mask? */
486    bool reads_tess_factors; /**< If TES reads TESSINNER or TESSOUTER */
487    bool writes_z;           /**< does fragment shader write Z value? */
488    bool writes_stencil;     /**< does fragment shader write stencil value? */
489    bool writes_samplemask;  /**< does fragment shader write sample mask? */
490    bool writes_edgeflag;    /**< vertex shader outputs edgeflag */
491    bool uses_interp_color;
492    bool uses_persp_center_color;
493    bool uses_persp_centroid_color;
494    bool uses_persp_sample_color;
495    bool uses_persp_center;
496    bool uses_persp_centroid;
497    bool uses_persp_sample;
498    bool uses_linear_center;
499    bool uses_linear_centroid;
500    bool uses_linear_sample;
501    bool uses_interp_at_sample;
502    bool uses_instanceid;
503    bool uses_base_vertex;
504    bool uses_base_instance;
505    bool uses_drawid;
506    bool uses_primid;
507    bool uses_frontface;
508    bool uses_invocationid;
509    bool uses_thread_id[3];
510    bool uses_block_id[3];
511    bool uses_variable_block_size;
512    bool uses_grid_size;
513    bool uses_tg_size;
514    bool writes_position;
515    bool writes_psize;
516    bool writes_clipvertex;
517    bool writes_primid;
518    bool writes_viewport_index;
519    bool writes_layer;
520    bool uses_bindless_samplers;
521    bool uses_bindless_images;
522    bool uses_indirect_descriptor;
523    bool has_divergent_loop;
524    bool uses_sampleid;
525    bool uses_layer_id;
526    bool has_non_uniform_tex_access;
527 
528    bool uses_vmem_sampler_or_bvh;
529    bool uses_vmem_load_other; /* all other VMEM loads and atomics with return */
530 
531    /** Whether all codepaths write tess factors in all invocations. */
532    bool tessfactors_are_def_in_all_invocs;
533 
534    /* A flag to check if vrs2x2 can be enabled to reduce number of
535     * fragment shader invocations if flat shading.
536     */
537    bool allow_flat_shading;
538 
539    /* Optimization: if the texture bound to this texunit has been cleared to 1,
540     * then the draw can be skipped (see si_draw_vbo_skip_noop). Initially the
541     * value is 0xff (undetermined) and can be later changed to 0 (= false) or
542     * texunit + 1.
543     */
544    uint8_t writes_1_if_tex_is_1;
545 
546    /* frag coord and sample pos per component read mask. */
547    uint8_t reads_frag_coord_mask;
548    uint8_t reads_sample_pos_mask;
549 };
550 
551 /* A shader selector is a gallium CSO and contains shader variants and
552  * binaries for one NIR program. This can be shared by multiple contexts.
553  */
554 struct si_shader_selector {
555    struct util_live_shader base;
556    struct si_screen *screen;
557    struct util_queue_fence ready;
558    struct si_compiler_ctx_state compiler_ctx_state;
559    gl_shader_stage stage;
560 
561    simple_mtx_t mutex; /* NOTE(review): presumably guards the variant arrays below - confirm */
562    union si_shader_key *keys;
563    unsigned variants_count;
564    unsigned variants_max_count;
565    struct si_shader **variants;
566 
567    /* The compiled NIR shader without a prolog and/or epilog (not
568     * uploaded to a buffer object).
569     */
570    struct si_shader *main_shader_part;
571    struct si_shader *main_shader_part_ls;     /* as_ls is set in the key */
572    struct si_shader *main_shader_part_es;     /* as_es is set in the key */
573    struct si_shader *main_shader_part_ngg;    /* as_ngg is set in the key */
574    struct si_shader *main_shader_part_ngg_es; /* for Wave32 TES before legacy GS */
575 
576    struct nir_shader *nir;
577    void *nir_binary; /* NOTE(review): presumably a serialized form of nir, nir_size bytes long - confirm */
578    unsigned nir_size;
579 
580    struct si_shader_info info;
581 
582    enum pipe_shader_type pipe_shader_type;
583    uint8_t const_and_shader_buf_descriptors_index;
584    uint8_t sampler_and_images_descriptors_index;
585    uint8_t cs_shaderbufs_sgpr_index;
586    uint8_t cs_num_shaderbufs_in_user_sgprs;
587    uint8_t cs_images_sgpr_index;
588    uint8_t cs_images_num_sgprs;
589    uint8_t cs_num_images_in_user_sgprs;
590    unsigned ngg_cull_vert_threshold; /* UINT32_MAX = disabled */
591    enum mesa_prim rast_prim;
592 
593    /* GS parameters. */
594    bool tess_turns_off_ngg;
595 
596    /* bitmasks of used descriptor slots */
597    uint64_t active_const_and_shader_buffers;
598    uint64_t active_samplers_and_images;
599 };
600 
601 /* Valid shader configurations:
602  *
603  * API shaders           VS | TCS | TES | GS |pass| PS
604  * are compiled as:         |     |     |    |thru|
605  *                          |     |     |    |    |
606  * Only VS & PS:         VS |     |     |    |    | PS
607  * GFX6     - with GS:   ES |     |     | GS | VS | PS
608  *          - with tess: LS | HS  | VS  |    |    | PS
609  *          - with both: LS | HS  | ES  | GS | VS | PS
610  * GFX9     - with GS:   -> |     |     | GS | VS | PS
611  *          - with tess: -> | HS  | VS  |    |    | PS
612  *          - with both: -> | HS  | ->  | GS | VS | PS
613  *                          |     |     |    |    |
614  * NGG      - VS & PS:   GS |     |     |    |    | PS
615  * (GFX10+) - with GS:   -> |     |     | GS |    | PS
616  *          - with tess: -> | HS  | GS  |    |    | PS
617  *          - with both: -> | HS  | ->  | GS |    | PS
618  *
619  * -> = merged with the next stage
620  */
621 
622 /* Use the byte alignment for all following structure members for optimal
623  * shader key memory footprint.
624  */
625 #pragma pack(push, 1)
626 
627 /* Common TCS bits between the shader key and the epilog key. Embedded in si_shader_key_ge::part.tcs.epilog and si_shader_part_key::tcs_epilog.states. */
628 struct si_tcs_epilog_bits {
629    unsigned prim_mode : 3;
630    unsigned invoc0_tess_factors_are_def : 1;
631    unsigned tes_reads_tess_factors : 1;
632 };
633 
634 /* Common PS bits between the shader key and the prolog key. Embedded in si_shader_key_ps::part.prolog and si_shader_part_key::ps_prolog.states. */
635 struct si_ps_prolog_bits {
636    unsigned color_two_side : 1;
637    unsigned flatshade_colors : 1;
638    unsigned poly_stipple : 1;
639    unsigned force_persp_sample_interp : 1;
640    unsigned force_linear_sample_interp : 1;
641    unsigned force_persp_center_interp : 1;
642    unsigned force_linear_center_interp : 1;
643    unsigned bc_optimize_for_persp : 1;
644    unsigned bc_optimize_for_linear : 1;
645    unsigned samplemask_log_ps_iter : 3;
646 };
647 
648 /* Common PS bits between the shader key and the epilog key. Embedded in si_shader_key_ps::part.epilog and si_shader_part_key::ps_epilog.states. */
649 struct si_ps_epilog_bits {
650    unsigned spi_shader_col_format; /* full-width member, not a bit-field */
651    unsigned color_is_int8 : 8; /* NOTE(review): presumably one bit per color buffer - confirm */
652    unsigned color_is_int10 : 8; /* NOTE(review): presumably one bit per color buffer - confirm */
653    unsigned last_cbuf : 3;
654    unsigned alpha_func : 3;
655    unsigned alpha_to_one : 1;
656    unsigned alpha_to_coverage_via_mrtz : 1;  /* gfx11+ */
657    unsigned clamp_color : 1;
658    unsigned dual_src_blend_swizzle : 1;      /* gfx11+ */
659    unsigned rbplus_depth_only_opt:1;
660    unsigned kill_samplemask:1;
661 };
662 
663 union si_shader_part_key { /* one member per kind of separately compiled shader part */
664    struct {
665       struct si_tcs_epilog_bits states;
666       unsigned wave32 : 1;
667       unsigned noop_s_barrier : 1;
668    } tcs_epilog;
669    struct {
670       struct si_ps_prolog_bits states;
671       unsigned wave32 : 1;
672       unsigned num_input_sgprs : 6;
673       /* Color interpolation and two-side color selection. */
674       unsigned colors_read : 8;       /* color input components read */
675       unsigned num_interp_inputs : 5; /* BCOLOR is at this location */
676       unsigned num_fragcoord_components : 3;
677       unsigned wqm : 1;
678       char color_attr_index[2]; /* NOTE(review): plain char has implementation-defined signedness; si_shader_info::color_attr_index is uint8_t */
679       signed char color_interp_vgpr_index[2]; /* -1 == constant */
680    } ps_prolog;
681    struct {
682       struct si_ps_epilog_bits states;
683       unsigned wave32 : 1;
684       unsigned uses_discard : 1;
685       unsigned colors_written : 8;
686       unsigned color_types : 16; /* same width as si_shader_info::output_color_types */
687       unsigned writes_z : 1;
688       unsigned writes_stencil : 1;
689       unsigned writes_samplemask : 1;
690    } ps_epilog;
691 };
692 
693 /* The shader key for geometry stages (VS, TCS, TES, GS). Accessed as the "ge" member of union si_shader_key. */
694 struct si_shader_key_ge {
695    /* Prolog and epilog flags. */
696    union {
697       struct {
698          struct si_shader_selector *ls;      /* for merged LS-HS */
699          struct si_tcs_epilog_bits epilog;
700       } tcs; /* tessellation control shader */
701       struct {
702          struct si_shader_selector *es;      /* for merged ES-GS */
703       } gs;
704    } part;
705 
706    /* These three are initially set according to the NEXT_SHADER property,
707     * or guessed if the property doesn't seem correct.
708     */
709    unsigned as_es : 1;  /* whether it's a shader before GS */
710    unsigned as_ls : 1;  /* whether it's VS before TCS */
711    unsigned as_ngg : 1; /* whether it's the last GE stage and NGG is enabled,
712                            also set for the stage right before GS */
713 
714    /* Flags for monolithic compilation only. */
715    struct {
716       /* - If neither "is_one" nor "is_fetched" has a bit set, the instance
717        *   divisor is 0.
718        * - If "is_one" has a bit set, the instance divisor is 1.
719        * - If "is_fetched" has a bit set, the instance divisor will be loaded
720        *   from the constant buffer.
721        */
722       uint16_t instance_divisor_is_one;     /* bitmask of inputs */
723       uint16_t instance_divisor_is_fetched; /* bitmask of inputs */
724 
725       /* Whether fetch should be opencoded according to vs_fix_fetch.
726        * Otherwise, if vs_fix_fetch is non-zero, buffer_load_format_xyzw
727        * with minimal fixups is used. */
728       uint16_t vs_fetch_opencode; /* 16 bits, matching SI_MAX_ATTRIBS entries in vs_fix_fetch */
729       union si_vs_fix_fetch vs_fix_fetch[SI_MAX_ATTRIBS];
730 
731       union {
732          /* When PS needs PrimID and GS is disabled. */
733          unsigned vs_export_prim_id : 1;    /* VS and TES only */
734          unsigned gs_tri_strip_adj_fix : 1; /* GS only */
735       } u; /* the two flags share storage; which one applies depends on the stage */
736    } mono;
737 
738    /* Optimization flags for asynchronous compilation only. */
739    struct {
740       /* For HW VS (it can be VS, TES, GS) */
741       uint64_t kill_outputs; /* "get_unique_index" bits */
742       unsigned kill_clip_distances : 8;
743       unsigned kill_pointsize : 1;
744       unsigned kill_layer : 1;
745       unsigned remove_streamout : 1;
746 
747       /* For NGG VS and TES. */
748       unsigned ngg_culling : 13; /* SI_NGG_CULL_* (flag bits 0..4 plus the clip plane mask in bits 5..12) */
749 
750       /* For shaders where monolithic variants have better code.
751        *
752        * This is a flag that has no effect on code generation,
753        * but forces monolithic shaders to be used as soon as
754        * possible, because it's in the "opt" group.
755        */
756       unsigned prefer_mono : 1;
757 
758       /* VS and TCS have the same number of patch vertices. */
759       unsigned same_patch_vertices:1;
760 
761       unsigned inline_uniforms:1;
762 
763       /* This must be kept last to limit the number of variants
764        * depending only on the uniform values.
765        */
766       uint32_t inlined_uniform_values[MAX_INLINABLE_UNIFORMS];
767    } opt;
768 };
769 
770 struct si_shader_key_ps { /* The shader key for pixel shaders; the "ps" member of union si_shader_key. */
771    struct {
772       /* Prolog and epilog flags. */
773       struct si_ps_prolog_bits prolog;
774       struct si_ps_epilog_bits epilog;
775    } part;
776 
777    /* Flags for monolithic compilation only. */
778    struct {
779       unsigned poly_line_smoothing : 1;
780       unsigned point_smoothing : 1;
781       unsigned interpolate_at_sample_force_center : 1;
782       unsigned fbfetch_msaa : 1;
783       unsigned fbfetch_is_1D : 1;
784       unsigned fbfetch_layered : 1;
785    } mono;
786 
787    /* Optimization flags for asynchronous compilation only. */
788    struct {
789       /* For shaders where monolithic variants have better code.
790        *
791        * This is a flag that has no effect on code generation,
792        * but forces monolithic shaders to be used as soon as
793        * possible, because it's in the "opt" group.
794        */
795       unsigned prefer_mono : 1;
796       unsigned inline_uniforms:1;
797 
798       /* This eliminates the FRONT_FACE input VGPR as well as shader code using it. */
799       signed int force_front_face_input : 2; /* 0 = gl_FrontFacing, 1 = true, -1 = false; explicitly signed because a plain int bit-field may be unsigned */
800 
801       /* This must be kept last to limit the number of variants
802        * depending only on the uniform values.
803        */
804       uint32_t inlined_uniform_values[MAX_INLINABLE_UNIFORMS];
805    } opt;
806 };
807 
808 union si_shader_key {
809    struct si_shader_key_ge ge; /* geometry engine shaders */
810    struct si_shader_key_ps ps;
811 };
812 
813 /* Restore the pack alignment to default. */
814 #pragma pack(pop)
815 
816 /* GCN-specific shader info. */
817 struct si_shader_binary_info {
818    uint8_t vs_output_param_offset[NUM_TOTAL_VARYING_SLOTS];
819    uint32_t vs_output_ps_input_cntl[NUM_TOTAL_VARYING_SLOTS];
820    union si_input_info ps_inputs[SI_NUM_INTERP];
821    uint8_t num_ps_inputs;
822    uint8_t ps_colors_read;
823    uint8_t num_input_sgprs;
824    uint8_t num_input_vgprs;
825    bool uses_vmem_load_other; /* all other VMEM loads and atomics with return */
826    bool uses_vmem_sampler_or_bvh;
827    uint8_t num_fragcoord_components;
828    bool uses_instanceid;
829    uint8_t nr_pos_exports;
830    uint8_t nr_param_exports;
831    unsigned private_mem_vgprs;
832    unsigned max_simd_waves;
833 };
834 
/* Selects how the code buffer of a si_shader_binary is stored. */
enum si_shader_binary_type {
   /* The code buffer is an ELF. */
   SI_SHADER_BINARY_ELF,
   /* The code buffer is a raw buffer. */
   SI_SHADER_BINARY_RAW,
};
839 
840 struct si_shader_binary {
841    enum si_shader_binary_type type;
842 
843    /* Depends on binary type, either ELF or raw buffer. */
844    const char *code_buffer;
845    size_t code_size;
846    uint32_t exec_size;
847 
848    char *uploaded_code;
849    size_t uploaded_code_size;
850 
851    char *llvm_ir_string;
852 
853    const char *disasm_string;
854    size_t disasm_size;
855 
856    const unsigned *symbols;
857    unsigned num_symbols;
858 };
859 
/* Subgroup sizing values for GFX9 GS; filled in through gfx9_get_gs_info(). */
struct gfx9_gs_info {
   /* Per-subgroup vertex and primitive counts. */
   unsigned es_verts_per_subgroup;
   unsigned gs_prims_per_subgroup;
   unsigned gs_inst_prims_in_subgroup;
   unsigned max_prims_per_subgroup;

   /* ESGS ring size, in bytes. */
   unsigned esgs_ring_size;
};
867 
868 struct si_shader {
869    struct si_pm4_state pm4; /* base class */
870    struct si_compiler_ctx_state compiler_ctx_state;
871 
872    struct si_shader_selector *selector;
873    struct si_shader_selector *previous_stage_sel; /* for refcounting */
874 
875    struct si_shader_part *prolog;
876    struct si_shader *previous_stage; /* for GFX9 */
877    struct si_shader_part *epilog;
878    struct si_shader *gs_copy_shader;
879 
880    struct si_resource *bo;
881    /* gpu_address should be bo->gpu_address except if SQTT is
882     * in use.
883     */
884    uint64_t gpu_address;
885    struct si_resource *scratch_bo;
886    union si_shader_key key;
887    struct util_queue_fence ready;
888    bool compilation_failed;
889    bool is_monolithic;
890    bool is_optimized;
891    bool is_binary_shared;
892    bool is_gs_copy_shader;
893    uint8_t wave_size;
894 
895    /* The following data is all that's needed for binary shaders. */
896    struct si_shader_binary binary;
897    struct ac_shader_config config;
898    struct si_shader_binary_info info;
899 
900    /* SI_SGPR_VS_STATE_BITS */
901    bool uses_vs_state_provoking_vertex;
902    bool uses_gs_state_outprim;
903 
904    bool uses_base_instance;
905 
906    /* Shader key + LLVM IR + disassembly + statistics.
907     * Generated for debug contexts only.
908     */
909    char *shader_log;
910    size_t shader_log_size;
911 
912    struct gfx9_gs_info gs_info;
913 
914    /* Precomputed register values. */
915    union {
916       struct {
917          unsigned vgt_gsvs_ring_offset_1;
918          unsigned vgt_gsvs_ring_offset_2;
919          unsigned vgt_gsvs_ring_offset_3;
920          unsigned vgt_gsvs_ring_itemsize;
921          unsigned vgt_gs_max_vert_out;
922          unsigned vgt_gs_vert_itemsize;
923          unsigned vgt_gs_vert_itemsize_1;
924          unsigned vgt_gs_vert_itemsize_2;
925          unsigned vgt_gs_vert_itemsize_3;
926          unsigned vgt_gs_instance_cnt;
927          unsigned vgt_gs_onchip_cntl;
928          unsigned vgt_gs_max_prims_per_subgroup;
929          unsigned vgt_esgs_ring_itemsize;
930          unsigned spi_shader_pgm_rsrc3_gs;
931          unsigned spi_shader_pgm_rsrc4_gs;
932       } gs;
933 
934       struct {
935          /* Computed by gfx10_ngg_calculate_subgroup_info. */
936          uint16_t ngg_emit_size; /* in dwords */
937          uint16_t hw_max_esverts;
938          uint16_t max_gsprims;
939          uint16_t max_out_verts;
940          bool max_vert_out_per_gs_instance;
941          /* Register values. */
942          unsigned ge_max_output_per_subgroup;
943          unsigned ge_ngg_subgrp_cntl;
944          unsigned vgt_primitiveid_en;
945          unsigned vgt_gs_onchip_cntl;
946          unsigned vgt_gs_instance_cnt;
947          unsigned esgs_vertex_stride;
948          unsigned spi_vs_out_config;
949          unsigned spi_shader_pos_format;
950          unsigned pa_cl_vte_cntl;
951          unsigned vgt_gs_max_vert_out; /* for API GS */
952          unsigned ge_pc_alloc;         /* uconfig register */
953          unsigned spi_shader_pgm_rsrc3_gs;
954          unsigned spi_shader_pgm_rsrc4_gs;
955          unsigned vgt_shader_stages_en;
956       } ngg;
957 
958       struct {
959          unsigned vgt_gs_mode;
960          unsigned vgt_primitiveid_en;
961          unsigned vgt_reuse_off;
962          unsigned spi_vs_out_config;
963          unsigned spi_shader_pos_format;
964          unsigned pa_cl_vte_cntl;
965          unsigned ge_pc_alloc; /* uconfig register */
966       } vs;
967 
968       struct {
969          unsigned spi_ps_input_ena;
970          unsigned spi_ps_input_addr;
971          unsigned spi_baryc_cntl;
972          unsigned spi_ps_in_control;
973          unsigned spi_shader_z_format;
974          unsigned spi_shader_col_format;
975          unsigned cb_shader_mask;
976          unsigned db_shader_control;
977          unsigned num_interp;
978          bool writes_samplemask;
979       } ps;
980    };
981 
982    /* Precomputed register values. */
983    unsigned vgt_tf_param;                /* VGT_TF_PARAM */
984    unsigned vgt_vertex_reuse_block_cntl; /* VGT_VERTEX_REUSE_BLOCK_CNTL */
985    unsigned pa_cl_vs_out_cntl;
986    unsigned ge_cntl;
987 };
988 
989 struct si_shader_part {
990    struct si_shader_part *next;
991    union si_shader_part_key key;
992    struct si_shader_binary binary;
993    struct ac_shader_config config;
994 };
995 
996 /* si_shader.c */
997 struct ac_rtld_binary;
998 
999 void si_update_shader_binary_info(struct si_shader *shader, struct nir_shader *nir);
1000 bool si_compile_shader(struct si_screen *sscreen, struct ac_llvm_compiler *compiler,
1001                        struct si_shader *shader, struct util_debug_callback *debug);
1002 bool si_create_shader_variant(struct si_screen *sscreen, struct ac_llvm_compiler *compiler,
1003                               struct si_shader *shader, struct util_debug_callback *debug);
1004 void si_shader_destroy(struct si_shader *shader);
1005 unsigned si_shader_io_get_unique_index(unsigned semantic);
1006 bool si_shader_binary_upload(struct si_screen *sscreen, struct si_shader *shader,
1007                              uint64_t scratch_va);
1008 bool si_can_dump_shader(struct si_screen *sscreen, gl_shader_stage stage,
1009                         enum si_shader_dump_type dump_type);
1010 void si_shader_dump(struct si_screen *sscreen, struct si_shader *shader,
1011                     struct util_debug_callback *debug, FILE *f, bool check_debug_option);
1012 void si_shader_dump_stats_for_shader_db(struct si_screen *screen, struct si_shader *shader,
1013                                         struct util_debug_callback *debug);
1014 void si_multiwave_lds_size_workaround(struct si_screen *sscreen, unsigned *lds_size);
1015 const char *si_get_shader_name(const struct si_shader *shader);
1016 void si_shader_binary_clean(struct si_shader_binary *binary);
1017 struct nir_shader *si_deserialize_shader(struct si_shader_selector *sel);
1018 unsigned si_get_ps_num_interp(struct si_shader *ps);
1019 bool si_shader_binary_open(struct si_screen *screen, struct si_shader *shader,
1020                            struct ac_rtld_binary *rtld);
1021 bool si_get_external_symbol(enum amd_gfx_level gfx_level, void *data, const char *name,
1022                             uint64_t *value);
1023 
/* si_shader_info.c */

void si_nir_scan_shader(struct si_screen *sscreen,
                        const struct nir_shader *nir,
                        struct si_shader_info *info);
1027 
/* si_shader_nir.c */

extern const struct nir_lower_subgroups_options si_nir_subgroups_options;

bool si_alu_to_scalar_packed_math_filter(const struct nir_instr *instr,
                                         const void *data);

void si_nir_opts(struct si_screen *sscreen,
                 struct nir_shader *nir,
                 bool first);

void si_nir_late_opts(struct nir_shader *nir);

char *si_finalize_nir(struct pipe_screen *screen, void *nirptr);
1035 
/* si_state_shaders.cpp */

unsigned si_determine_wave_size(struct si_screen *sscreen, struct si_shader *shader);

void gfx9_get_gs_info(struct si_shader_selector *es,
                      struct si_shader_selector *gs,
                      struct gfx9_gs_info *out);

bool gfx10_is_ngg_passthrough(struct si_shader *shader);

bool si_should_clear_lds(struct si_screen *sscreen, const struct nir_shader *shader);
1043 
1044 /* Inline helpers. */
1045 
1046 /* Return the pointer to the main shader part's pointer. */
si_get_main_shader_part(struct si_shader_selector * sel,const union si_shader_key * key)1047 static inline struct si_shader **si_get_main_shader_part(struct si_shader_selector *sel,
1048                                                          const union si_shader_key *key)
1049 {
1050    if (sel->stage <= MESA_SHADER_GEOMETRY) {
1051       if (key->ge.as_ls)
1052          return &sel->main_shader_part_ls;
1053       if (key->ge.as_es && key->ge.as_ngg)
1054          return &sel->main_shader_part_ngg_es;
1055       if (key->ge.as_es)
1056          return &sel->main_shader_part_es;
1057       if (key->ge.as_ngg)
1058          return &sel->main_shader_part_ngg;
1059    }
1060    return &sel->main_shader_part;
1061 }
1062 
si_shader_uses_bindless_samplers(struct si_shader_selector * selector)1063 static inline bool si_shader_uses_bindless_samplers(struct si_shader_selector *selector)
1064 {
1065    return selector ? selector->info.uses_bindless_samplers : false;
1066 }
1067 
si_shader_uses_bindless_images(struct si_shader_selector * selector)1068 static inline bool si_shader_uses_bindless_images(struct si_shader_selector *selector)
1069 {
1070    return selector ? selector->info.uses_bindless_images : false;
1071 }
1072 
gfx10_edgeflags_have_effect(struct si_shader * shader)1073 static inline bool gfx10_edgeflags_have_effect(struct si_shader *shader)
1074 {
1075    if (shader->selector->stage == MESA_SHADER_VERTEX &&
1076        !shader->selector->info.base.vs.blit_sgprs_amd &&
1077        !(shader->key.ge.opt.ngg_culling & SI_NGG_CULL_LINES))
1078       return true;
1079 
1080    return false;
1081 }
1082 
gfx10_ngg_writes_user_edgeflags(struct si_shader * shader)1083 static inline bool gfx10_ngg_writes_user_edgeflags(struct si_shader *shader)
1084 {
1085    return gfx10_edgeflags_have_effect(shader) &&
1086           shader->selector->info.writes_edgeflag;
1087 }
1088 
si_shader_uses_streamout(const struct si_shader * shader)1089 static inline bool si_shader_uses_streamout(const struct si_shader *shader)
1090 {
1091    return shader->selector->stage <= MESA_SHADER_GEOMETRY &&
1092           shader->selector->info.enabled_streamout_buffer_mask &&
1093           !shader->key.ge.opt.remove_streamout;
1094 }
1095 
si_shader_uses_discard(struct si_shader * shader)1096 static inline bool si_shader_uses_discard(struct si_shader *shader)
1097 {
1098    /* Changes to this should also update ps_modifies_zs. */
1099    return shader->selector->info.base.fs.uses_discard ||
1100           shader->key.ps.part.prolog.poly_stipple ||
1101           shader->key.ps.mono.point_smoothing ||
1102           shader->key.ps.part.epilog.alpha_func != PIPE_FUNC_ALWAYS;
1103 }
1104 
1105 #ifdef __cplusplus
1106 }
1107 #endif
1108 
1109 #endif
1110