• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright 2012 Advanced Micro Devices, Inc.
3  *
4  * SPDX-License-Identifier: MIT
5  */
6 
7 /* The compiler middle-end architecture: Explaining (non-)monolithic shaders
8  * -------------------------------------------------------------------------
9  *
10  * Typically, there is one-to-one correspondence between API and HW shaders,
11  * that is, for every API shader, there is exactly one shader binary in
12  * the driver.
13  *
14  * The problem with that is that we also have to emulate some API states
15  * (e.g. alpha-test, and many others) in shaders too. The two obvious ways
16  * to deal with it are:
17  * - each shader has multiple variants for each combination of emulated states,
18  *   and the variants are compiled on demand, possibly relying on a shader
19  *   cache for good performance
20  * - patch shaders at the binary level
21  *
22  * This driver uses something completely different. The emulated states are
23  * usually implemented at the beginning or end of shaders. Therefore, we can
24  * split the shader into 3 parts:
25  * - prolog part (shader code dependent on states)
26  * - main part (the API shader)
27  * - epilog part (shader code dependent on states)
28  *
29  * Each part is compiled as a separate shader and the final binaries are
30  * concatenated. This type of shader is called non-monolithic, because it
31  * consists of multiple independent binaries. Creating a new shader variant
32  * is therefore only a concatenation of shader parts (binaries) and doesn't
33  * involve any compilation. The main shader parts are the only parts that are
34  * compiled when applications create shader objects. The prolog and epilog
35  * parts are compiled on the first use and saved, so that their binaries can
36  * be reused by many other shaders.
37  *
38  * One of the roles of the prolog part is to compute vertex buffer addresses
39  * for vertex shaders. A few of the roles of the epilog part are color buffer
40  * format conversions in pixel shaders that we have to do manually, and write
41  * tessellation factors in tessellation control shaders. The prolog and epilog
42  * have many other important responsibilities in various shader stages.
43  * They don't just "emulate legacy stuff".
44  *
45  * Monolithic shaders are shaders where the parts are combined before LLVM
46  * compilation, and the whole thing is compiled and optimized as one unit with
47  * one binary on the output. The result is the same as the non-monolithic
48  * shader, but the final code can be better, because LLVM can optimize across
49  * all shader parts. Monolithic shaders aren't usually used except for these
50  * special cases:
51  *
52  * 1) Some rarely-used states require modification of the main shader part
53  *    itself, and in such cases, only the monolithic shader variant is
54  *    compiled, and that's always done on the first use.
55  *
56  * 2) When we do cross-stage optimizations for separate shader objects and
57  *    e.g. eliminate unused shader varyings, the resulting optimized shader
58  *    variants are always compiled as monolithic shaders, and always
59  *    asynchronously (i.e. not stalling ongoing rendering). We call them
60  *    "optimized monolithic" shaders. The important property here is that
61  *    the non-monolithic unoptimized shader variant is always available for use
62  *    when the asynchronous compilation of the optimized shader is not done
63  *    yet.
64  *
65  * Starting with GFX9 chips, some shader stages are merged, and the number of
66  * shader parts per shader increased. The complete new list of shader parts is:
67  * - 1st shader: prolog part
68  * - 1st shader: main part
69  * - 2nd shader: main part
70  * - 2nd shader: epilog part
71  */
72 
73 /* How linking shader inputs and outputs between vertex, tessellation, and
74  * geometry shaders works.
75  *
76  * Inputs and outputs between shaders are stored in a buffer. This buffer
77  * lives in LDS (typical case for tessellation), but it can also live
78  * in memory (ESGS). Each input or output has a fixed location within a vertex.
79  * The highest used input or output determines the stride between vertices.
80  *
81  * Since GS and tessellation are only possible in the OpenGL core profile,
82  * only these semantics are valid for per-vertex data:
83  *
84  *   Name             Location
85  *
86  *   POSITION         0
87  *   VAR0..31         1..32
88  *   CLIP_DIST0..1    49..50
89  *   PSIZ             51
90  *
91  * For example, a shader only writing VAR0 has the output stride of 2.
92  *
93  * Only these semantics are valid for per-patch data:
94  *
95  *   Name             Location
96  *
97  *   TESSOUTER        0
98  *   TESSINNER        1
99  *   PATCH0..29       2..31
100  *
101  * That's how independent shaders agree on input and output locations.
102  * The si_shader_io_get_unique_index function assigns the locations.
103  *
104  * For tessellation, other required information for calculating the input and
105  * output addresses like the vertex stride, the patch stride, and the offsets
106  * where per-vertex and per-patch data start, is passed to the shader via
107  * user data SGPRs. The offsets and strides are calculated at draw time and
108  * aren't available at compile time.
109  */
110 
111 #ifndef SI_SHADER_H
112 #define SI_SHADER_H
113 
114 #include "shader_info.h"
115 #include "ac_binary.h"
116 #include "ac_gpu_info.h"
117 #include "util/mesa-blake3.h"
118 #include "util/u_live_shader_cache.h"
119 #include "util/u_queue.h"
120 #include "si_pm4.h"
121 
122 #ifdef __cplusplus
123 extern "C" {
124 #endif
125 
126 struct nir_shader;
127 struct nir_instr;
128 
129 #define SI_NUM_INTERP     32
130 #define SI_MAX_ATTRIBS    16 /* also the length of vs_fix_fetch[] in the GE shader key */
131 #define SI_MAX_VS_OUTPUTS 40
132 #define SI_USER_CLIP_PLANE_MASK  0x3F /* low 6 bits set */
133 
134 #define INTERP_MODE_COLOR  INTERP_MODE_COUNT /* presumably an extra mode placed after the last INTERP_MODE_* - confirm */
135 
136 #define SI_PS_INPUT_CNTL_0000          (S_028644_OFFSET(0x20) | S_028644_DEFAULT_VAL(0))
137 #define SI_PS_INPUT_CNTL_0001          (S_028644_OFFSET(0x20) | S_028644_DEFAULT_VAL(3))
138 #define SI_PS_INPUT_CNTL_UNUSED        SI_PS_INPUT_CNTL_0000
139 /* D3D9 behaviour for COLOR0 requires 0001. GL is undefined. */
140 #define SI_PS_INPUT_CNTL_UNUSED_COLOR0 SI_PS_INPUT_CNTL_0001
141 
142 /* SGPR user data indices. Groups restart from earlier enumerators, so values of different stages overlap. */
143 enum
144 {
145    SI_SGPR_INTERNAL_BINDINGS,
146    SI_SGPR_BINDLESS_SAMPLERS_AND_IMAGES,
147    SI_SGPR_CONST_AND_SHADER_BUFFERS, /* or just a constant buffer 0 pointer */
148    SI_SGPR_SAMPLERS_AND_IMAGES,
149    SI_NUM_RESOURCE_SGPRS,
150 
151    /* API VS, TES without GS, GS copy shader */
152    SI_SGPR_VS_STATE_BITS = SI_NUM_RESOURCE_SGPRS,
153    SI_NUM_VS_STATE_RESOURCE_SGPRS,
154 
155    /* all VS variants */
156    SI_SGPR_BASE_VERTEX = SI_NUM_VS_STATE_RESOURCE_SGPRS,
157    SI_SGPR_DRAWID,
158    SI_SGPR_START_INSTANCE,
159    SI_VS_NUM_USER_SGPR,
160 
161    SI_SGPR_VS_BLIT_DATA = SI_SGPR_CONST_AND_SHADER_BUFFERS, /* same index as the const/shader buffer slot */
162 
163    /* TES */
164    SI_SGPR_TES_OFFCHIP_LAYOUT = SI_NUM_VS_STATE_RESOURCE_SGPRS,
165    SI_SGPR_TES_OFFCHIP_ADDR,
166    SI_TES_NUM_USER_SGPR,
167 
168    /* GFX6-8: TCS only */
169    GFX6_SGPR_TCS_OFFCHIP_LAYOUT = SI_NUM_RESOURCE_SGPRS,
170    GFX6_SGPR_TCS_OFFCHIP_ADDR,
171    GFX6_SGPR_TCS_IN_LAYOUT,
172    GFX6_TCS_NUM_USER_SGPR,
173 
174    /* GFX9: Merged LS-HS (VS-TCS) only. */
175    GFX9_SGPR_TCS_OFFCHIP_LAYOUT = SI_VS_NUM_USER_SGPR,
176    GFX9_SGPR_TCS_OFFCHIP_ADDR,
177    GFX9_TCS_NUM_USER_SGPR,
178 
179    /* GS limits */
180    GFX6_GS_NUM_USER_SGPR = SI_NUM_RESOURCE_SGPRS,
181    SI_GSCOPY_NUM_USER_SGPR = SI_NUM_VS_STATE_RESOURCE_SGPRS,
182 
183    GFX9_SGPR_SMALL_PRIM_CULL_INFO = MAX2(SI_VS_NUM_USER_SGPR, SI_TES_NUM_USER_SGPR), /* first index past both VS and TES user SGPRs */
184    GFX9_SGPR_ATTRIBUTE_RING_ADDR,
185    GFX9_GS_NUM_USER_SGPR,
186 
187    /* PS only */
188    SI_SGPR_ALPHA_REF = SI_NUM_RESOURCE_SGPRS,
189    SI_PS_NUM_USER_SGPR,
190 
191    /* The value has to be 12, because the hw requires that descriptors
192     * are aligned to 4 SGPRs.
193     */
194    SI_SGPR_VS_VB_DESCRIPTOR_FIRST = 12,
195 };
196 
197 /* LLVM function parameter indices */
198 enum
199 {
200    SI_NUM_RESOURCE_PARAMS = 4,
201 
202    /* PS only parameters */
203    SI_PARAM_ALPHA_REF = SI_NUM_RESOURCE_PARAMS,
204    SI_PARAM_PRIM_MASK,
205    SI_PARAM_PERSP_SAMPLE,
206    SI_PARAM_PERSP_CENTER,
207    SI_PARAM_PERSP_CENTROID,
208    SI_PARAM_PERSP_PULL_MODEL,
209    SI_PARAM_LINEAR_SAMPLE,
210    SI_PARAM_LINEAR_CENTER,
211    SI_PARAM_LINEAR_CENTROID,
212    SI_PARAM_LINE_STIPPLE_TEX,
213    SI_PARAM_POS_X_FLOAT,
214    SI_PARAM_POS_Y_FLOAT,
215    SI_PARAM_POS_Z_FLOAT,
216    SI_PARAM_POS_W_FLOAT,
217    SI_PARAM_FRONT_FACE,
218    SI_PARAM_ANCILLARY,
219    SI_PARAM_SAMPLE_COVERAGE,
220    SI_PARAM_POS_FIXED_PT,
221 
222    SI_NUM_PARAMS = SI_PARAM_POS_FIXED_PT + 9, /* +1 to turn the last index into a count, +8 for COLOR[0..1] */
223 };
224 
225 /* These fields are only set in current_vs_state (except INDEXED) in si_context, and they are
226  * accessible in the shader via vs_state_bits in VS, TES, and GS.
227  */
228 #define VS_STATE_CLAMP_VERTEX_COLOR__SHIFT   0
229 #define VS_STATE_CLAMP_VERTEX_COLOR__MASK    0x1 /* Shared by VS and GS */
230 #define VS_STATE_INDEXED__SHIFT              1
231 #define VS_STATE_INDEXED__MASK               0x1 /* Shared by VS and GS */
232 
233 /* These fields are only set in current_gs_state in si_context, and they are accessible
234  * in the shader via vs_state_bits in legacy GS, the GS copy shader, and any NGG shader.
235  */
236 /* bit gap (bits 2..13 have no field defined here) */
237 /* The number of ES outputs is derived from the last output index of SI_UNIQUE_SLOT_* + 1, which
238  * can be 55 at most. The ESGS vertex stride in dwords is: NUM_ES_OUTPUTS * 4 + 1
239  * Only used by GFX9+ to compute LDS addresses of GS inputs.
240  */
241 #define GS_STATE_NUM_ES_OUTPUTS__SHIFT          14 /* bits 14..19 */
242 #define GS_STATE_NUM_ES_OUTPUTS__MASK           0x3f
243 #define GS_STATE_CULL_FACE_FRONT__SHIFT         20 /* bit 20 */
244 #define GS_STATE_CULL_FACE_FRONT__MASK          0x1
245 #define GS_STATE_CULL_FACE_BACK__SHIFT          21 /* bit 21 */
246 #define GS_STATE_CULL_FACE_BACK__MASK           0x1
247 /* Small prim filter precision = num_samples / quant_mode where num_samples is in {1, 2, 4, 8} and
248  * quant_mode is in {256, 1024, 4096}, which is equal to 1/2^n where n is between 5 and 12.
249  *
250  * Equation 1: Represent the value as 1/2^n.
251  * Assumption: log_samples <= 3 and log_quant_mode >= 8
252  *    num_samples / quant_mode =
253  *    2^log_samples / 2^log_quant_mode =
254  *    1 / 2^(log_quant_mode - log_samples) [because log_samples < log_quant_mode]
255  *
256  * Knowing that, we only need 4 bits to represent the FP32 exponent and thus the FP32 number.
257  *
258  * Equation 2: Encoding the exponent.
259  *    1/2^(15 - value) in FP32 = ((value | 0x70) << 23) in binary if value < 15
260  * Proof: With 0x70 = 112, we get FP32 exponent 2^(112 + value - 127) according to the FP32
261  *        definition, which can be simplified to 2^(value - 15), which is a negative exponent
262  *        for value < 15. Given that 2^-n = 1/2^n, the FP32 number is equal to 1/2^(15 - value).
263  *
264  * Equation 3: Convert quant_mode_enum to log_quant_mode.
265  * quant_mode_enum:
266  *    0 means 256  = 2^8  --> log2(256)  = 8
267  *    1 means 1024 = 2^10 --> log2(1024) = 10
268  *    2 means 4096 = 2^12 --> log2(4096) = 12
269  *
270  * Conversion to log_quant_mode:
271  *    log_quant_mode = quant_mode_enum * 2 + 8. Proof:
272  *       0 * 2 + 8 = 8
273  *       1 * 2 + 8 = 10
274  *       2 * 2 + 8 = 12
275  *
276  * Equation 4: Get the exponent value for Equation 2 from Equation 1.
277  *    15 - value = log_quant_mode - log_samples
278  *    value = 15 - (log_quant_mode + log_samples)
279  * NOTE(review): line 1 of Eq. 4 gives value = 15 - log_quant_mode + log_samples; confirm the sign.
280  * Combine equations 2, 3, and 4 to get the expression computing the FP32 number from log_samples
281  * and quant_mode_enum using integer ops:
282  *    (value | 0x70) << 23 =
283  *    ((15 - (log_quant_mode + log_samples)) | 0x70) << 23 =
284  *    ((15 - (quant_mode_enum * 2 + 8 + log_samples)) | 0x70) << 23 =
285  *    ((15 - quant_mode_enum * 2 - 8 - log_samples) | 0x70) << 23 =
286  *    ((7 - quant_mode_enum * 2 - log_samples) | 0x70) << 23 =
287  *
288  * Since "log_samples <= 3" and "quant_mode_enum * 2 <= 4", we need a SGPR field that stores:
289  *    triangle_precision = 7 - quant_mode_enum * 2 - log_samples
290  *
291  * Line precision ignores log_samples, so the shader should do:
292  *    line_precision = triangle_precision + log_samples
293  */
294 #define GS_STATE_SMALL_PRIM_PRECISION__SHIFT    22  /* triangle_precision, bits 22..24 */
295 #define GS_STATE_SMALL_PRIM_PRECISION__MASK     0x7
296 #define GS_STATE_SMALL_PRIM_PRECISION_LOG_SAMPLES__SHIFT 25 /* bits 25..26 */
297 #define GS_STATE_SMALL_PRIM_PRECISION_LOG_SAMPLES__MASK  0x3
298 #define GS_STATE_STREAMOUT_QUERY_ENABLED__SHIFT 27 /* bit 27 */
299 #define GS_STATE_STREAMOUT_QUERY_ENABLED__MASK  0x1
300 #define GS_STATE_PROVOKING_VTX_FIRST__SHIFT     28 /* bit 28 */
301 #define GS_STATE_PROVOKING_VTX_FIRST__MASK      0x1
302 #define GS_STATE_OUTPRIM__SHIFT                 29 /* bits 29..30 */
303 #define GS_STATE_OUTPRIM__MASK                  0x3
304 #define GS_STATE_PIPELINE_STATS_EMU__SHIFT      31 /* bit 31 */
305 #define GS_STATE_PIPELINE_STATS_EMU__MASK       0x1
306 
307 #define ENCODE_FIELD(field, value) (((unsigned)(value) & field##__MASK) << field##__SHIFT) /* mask the value, then shift it into place */
308 #define CLEAR_FIELD(field) (~((unsigned)field##__MASK << field##__SHIFT)) /* AND-mask that zeroes the field */
309 
310 /* This is called by functions that change states. "value" is expanded more than once: pass no side effects. */
311 #define SET_FIELD(var, field, value) do { \
312    assert((value) == ((unsigned)(value) & field##__MASK)); \
313    (var) &= CLEAR_FIELD(field); \
314    (var) |= ENCODE_FIELD(field, value); \
315 } while (0)
316 
317 /* This is called during shader compilation and returns LLVMValueRef. The field width is util_bitcount(mask). */
318 #define GET_FIELD(ctx, field) si_unpack_param((ctx), (ctx)->args->vs_state_bits, field##__SHIFT, \
319                                              util_bitcount(field##__MASK))
320 
321 enum
322 {
323    /* These represent the number of SGPRs the shader uses. */
324    SI_VS_BLIT_SGPRS_POS = 3,
325    SI_VS_BLIT_SGPRS_POS_COLOR = 7,
326    SI_VS_BLIT_SGPRS_POS_TEXCOORD = 9,
327 
328    MAX_SI_VS_BLIT_SGPRS = 10, /* largest count above (9) +1 for the attribute ring address */
329 };
330 
331 /* The following two are only set for vertex shaders that cull.
332  * TES and GS get the primitive type from shader_info.
333  */
334 #define SI_NGG_CULL_VS_TRIANGLES             (1 << 0)   /* this implies W, view.xy, and small prim culling */
335 #define SI_NGG_CULL_VS_LINES                 (1 << 1)   /* this implies W and view.xy culling */
336 #define SI_NGG_CULL_SMALL_LINES_DIAMOND_EXIT (1 << 2)   /* cull small lines according to the diamond exit rule */
337 #define SI_NGG_CULL_CLIP_PLANE_ENABLE(enable) (((enable) & 0xff) << 3) /* packs an 8-bit mask into bits 3..10 */
338 #define SI_NGG_CULL_GET_CLIP_PLANE_ENABLE(x)  (((x) >> 3) & 0xff)       /* extracts bits 3..10 again */
339 
340 struct si_shader_profile { /* element type of the si_shader_profiles[] table */
341    uint32_t blake3[BLAKE3_OUT_LEN32]; /* BLAKE3 hash stored as 32-bit words */
342    uint32_t options;                  /* presumably a bitmask of SI_PROFILE_* - confirm */
343 };
344 
345 extern struct si_shader_profile si_shader_profiles[]; /* defined elsewhere; size is not visible here */
346 unsigned si_get_num_shader_profiles(void);            /* presumably the element count of the table above - confirm */
347 
348 #define SI_PROFILE_WAVE32                    (1 << 0)
349 #define SI_PROFILE_GFX10_WAVE64              (1 << 1)
350 /* bit gap (bit 2 has no flag defined here) */
351 #define SI_PROFILE_VS_NO_BINNING             (1 << 3)
352 #define SI_PROFILE_GFX9_GFX10_PS_NO_BINNING  (1 << 4)
353 #define SI_PROFILE_CLAMP_DIV_BY_ZERO         (1 << 5)
354 #define SI_PROFILE_NO_OPT_UNIFORM_VARYINGS   (1 << 6)
355 
356 enum si_shader_dump_type { /* kinds of shader debug output; values are sequential from 0 */
357    SI_DUMP_SHADER_KEY,
358    SI_DUMP_INIT_NIR,       /* initial input NIR when shaders are created (before lowering) */
359    SI_DUMP_NIR,            /* final NIR after lowering when shader variants are created */
360    SI_DUMP_INIT_LLVM_IR,   /* initial LLVM IR before optimizations */
361    SI_DUMP_LLVM_IR,        /* final LLVM IR */
362    SI_DUMP_INIT_ACO_IR,    /* initial ACO IR before optimizations */
363    SI_DUMP_ACO_IR,         /* final ACO IR */
364    SI_DUMP_ASM,            /* final asm shaders */
365    SI_DUMP_STATS,          /* print statistics as shader-db */
366    SI_DUMP_ALWAYS,         /* presumably: dump regardless of debug flags - confirm */
367 };
368 
369 enum {
370    SI_UNIQUE_SLOT_POS = 0,
371 
372    /* Some shader stages use the highest used IO index
373     * to determine the size to allocate for inputs/outputs
374     * (in LDS, tess and GS rings), so VARn should be placed right
375     * after POSITION to make that size as small as possible.
376     */
377    SI_UNIQUE_SLOT_VAR0 = 1, /* 0..31 */
378 
379    /* Put 16-bit GLES varyings after 32-bit varyings. They can use the same indices as
380     * legacy desktop GL varyings because they are mutually exclusive.
381     */
382    SI_UNIQUE_SLOT_VAR0_16BIT = 33, /* 0..15 */
383 
384    /* Legacy GL-only varyings can alias GLES-only 16-bit varyings. */
385    SI_UNIQUE_SLOT_FOGC = 33,
386    SI_UNIQUE_SLOT_COL0,        /* 34 */
387    SI_UNIQUE_SLOT_COL1,        /* 35 */
388    SI_UNIQUE_SLOT_BFC0,        /* 36 */
389    SI_UNIQUE_SLOT_BFC1,        /* 37 */
390    SI_UNIQUE_SLOT_TEX0,        /* 38 */
391    SI_UNIQUE_SLOT_TEX1,        /* 39 */
392    SI_UNIQUE_SLOT_TEX2,        /* 40 */
393    SI_UNIQUE_SLOT_TEX3,        /* 41 */
394    SI_UNIQUE_SLOT_TEX4,        /* 42 */
395    SI_UNIQUE_SLOT_TEX5,        /* 43 */
396    SI_UNIQUE_SLOT_TEX6,        /* 44 */
397    SI_UNIQUE_SLOT_TEX7,        /* 45 */
398    SI_UNIQUE_SLOT_CLIP_VERTEX, /* 46 */
399 
400    /* Varyings present in both GLES and desktop GL must start at 49 after 16-bit varyings. */
401    SI_UNIQUE_SLOT_CLIP_DIST0 = 49,
402    SI_UNIQUE_SLOT_CLIP_DIST1,  /* 50 */
403    SI_UNIQUE_SLOT_PSIZ,        /* 51 */
404    /* These can't be written by LS, HS, and ES. */
405    SI_UNIQUE_SLOT_LAYER,        /* 52 */
406    SI_UNIQUE_SLOT_VIEWPORT,     /* 53 */
407    SI_UNIQUE_SLOT_PRIMITIVE_ID, /* 54 */
408 };
409 
410 /**
411  * For VS shader keys, describe any fixups required for vertex fetch.
412  *
413  * \ref log_size, \ref format, and the number of channels are interpreted as
414  * by \ref ac_build_opencoded_load_format.
415  *
416  * Note: all bits 0 (size = 1 byte, num channels = 1, format = float) is an
417  * impossible format and indicates that no fixup is needed (just use
418  * buffer_load_format_xyzw).
419  */
420 union si_vs_fix_fetch {
421    struct {
422       uint8_t log_size : 2;        /* log2 of bytes per channel: 1, 2, 4, or 8 bytes */
423       uint8_t num_channels_m1 : 2; /* number of channels minus 1 */
424       uint8_t format : 3;          /* AC_FETCH_FORMAT_xxx */
425       uint8_t reverse : 1;         /* reverse XYZ channels */
426    } u;
427    uint8_t bits; /* all fields above as one byte; 0 means "no fixup needed" */
428 };
429 
430 struct si_shader;
431 
432 /* State of the context creating the shader object; embedded in si_shader_selector. */
433 struct si_compiler_ctx_state {
434    /* Should only be used by si_init_shader_selector_async and
435     * si_build_shader_variant if thread_index == -1 (non-threaded). */
436    struct ac_llvm_compiler *compiler;
437 
438    /* Used if thread_index == -1 or if debug.async is true. */
439    struct util_debug_callback debug;
440 
441    /* Used for creating the log string for gallium/ddebug. */
442    bool is_debug_context;
443 };
444 
445 enum si_color_output_type { /* values 0..3; stored as bit pairs in si_shader_info::output_color_types */
446    SI_TYPE_ANY32,
447    SI_TYPE_FLOAT16,
448    SI_TYPE_INT16,
449    SI_TYPE_UINT16,
450 };
451 
452 union si_input_info { /* element type of si_shader_info::input[]; four byte-sized fields */
453    struct {
454       uint8_t semantic;
455       uint8_t interpolate;
456       uint8_t fp16_lo_hi_valid;
457       uint8_t usage_mask;
458    };
459    uint32_t _unused; /* this just forces 4-byte alignment */
460 };
461 
462 struct si_shader_info { /* per-shader information; embedded in si_shader_selector as "info" */
463    shader_info base;
464 
465    uint32_t options; /* bitmask of SI_PROFILE_* */
466 
467    uint8_t num_inputs;
468    uint8_t num_outputs;
469    union si_input_info input[PIPE_MAX_SHADER_INPUTS];
470    uint8_t output_semantic[PIPE_MAX_SHADER_OUTPUTS];
471    uint8_t output_usagemask[PIPE_MAX_SHADER_OUTPUTS];
472    uint8_t output_streams[PIPE_MAX_SHADER_OUTPUTS];
473    uint8_t output_type[PIPE_MAX_SHADER_OUTPUTS]; /* enum nir_alu_type */
474    uint8_t output_xfb_writemask[PIPE_MAX_SHADER_OUTPUTS];
475 
476    uint8_t num_streamout_components;
477    uint8_t num_vs_inputs;
478    uint8_t num_vbos_in_user_sgprs;
479    uint8_t num_stream_output_components[4]; /* for GS streams, not streamout */
480    uint16_t enabled_streamout_buffer_mask;
481 
482    uint64_t inputs_read; /* "get_unique_index" bits */
483    uint64_t tcs_inputs_via_temp;
484    uint64_t tcs_inputs_via_lds;
485 
486    /* For VS before {TCS, TES, GS} and TES before GS. */
487    uint64_t ls_es_outputs_written;     /* "get_unique_index" bits */
488    uint64_t outputs_written_before_ps; /* "get_unique_index" bits */
489    uint64_t tcs_outputs_written_for_tes;   /* "get_unique_index" bits */
490    uint32_t patch_outputs_written_for_tes; /* "get_unique_index_patch" bits */
491    uint32_t tess_levels_written_for_tes;   /* "get_unique_index_patch" bits */
492 
493    uint8_t clipdist_mask;
494    uint8_t culldist_mask;
495 
496    uint16_t esgs_vertex_stride;
497    uint16_t gsvs_vertex_size;
498    uint8_t gs_input_verts_per_prim;
499    unsigned max_gsvs_emit_size;
500 
501    /* Set 0xf or 0x0 (4 bits) per each written output.
502     * ANDed with spi_shader_col_format.
503     */
504    unsigned colors_written_4bit;
505 
506    int constbuf0_num_slots;
507    unsigned num_memory_stores; /* plain "unsigned" instead of the non-ISO "uint" typedef; same type */
508    uint8_t color_attr_index[2];
509    uint8_t color_interpolate[2];
510    uint8_t color_interpolate_loc[2];
511    uint8_t colors_read; /**< which color components are read by the FS */
512    uint8_t colors_written;
513    uint16_t output_color_types; /**< Each bit pair is enum si_color_output_type */
514    bool color0_writes_all_cbufs; /**< gl_FragColor */
515    bool reads_samplemask;   /**< does fragment shader read sample mask? */
516    bool reads_tess_factors; /**< If TES reads TESSINNER or TESSOUTER */
517    bool writes_z;           /**< does fragment shader write Z value? */
518    /* We need both because both can be present in different conditional blocks. */
519    bool output_z_equals_input_z; /**< gl_FragDepth == gl_FragCoord.z for any write */
520    bool output_z_is_not_input_z; /**< gl_FragDepth != gl_FragCoord.z for any write */
521    bool writes_stencil;     /**< does fragment shader write stencil value? */
522    bool writes_samplemask;  /**< does fragment shader write sample mask? */
523    bool writes_edgeflag;    /**< vertex shader outputs edgeflag */
524    bool uses_interp_color;
525    bool uses_persp_center_color;
526    bool uses_persp_centroid_color;
527    bool uses_persp_sample_color;
528    bool uses_persp_center;
529    bool uses_persp_centroid;
530    bool uses_persp_sample;
531    bool uses_linear_center;
532    bool uses_linear_centroid;
533    bool uses_linear_sample;
534    bool uses_interp_at_sample;
535    bool uses_instanceid;
536    bool uses_base_vertex;
537    bool uses_base_instance;
538    bool uses_drawid;
539    bool uses_primid;
540    bool uses_frontface;
541    bool uses_invocationid;
542    bool uses_thread_id[3];
543    bool uses_block_id[3];
544    bool uses_variable_block_size;
545    bool uses_grid_size;
546    bool uses_tg_size;
547    bool uses_atomic_ordered_add;
548    bool writes_position;
549    bool writes_psize;
550    bool writes_clipvertex;
551    bool writes_primid;
552    bool writes_viewport_index;
553    bool writes_layer;
554    bool uses_bindless_samplers;
555    bool uses_bindless_images;
556    bool uses_indirect_descriptor;
557    bool has_divergent_loop;
558    bool uses_sampleid;
559    bool uses_layer_id;
560    bool has_non_uniform_tex_access;
561    bool has_shadow_comparison;
562 
563    bool uses_vmem_sampler_or_bvh;
564    bool uses_vmem_load_other; /* all other VMEM loads and atomics with return */
565 
566    /** Whether all codepaths write tess factors in all invocations. */
567    bool tessfactors_are_def_in_all_invocs;
568 
569    /* A flag to check if vrs2x2 can be enabled to reduce the number of
570     * fragment shader invocations when flat shading.
571     */
572    bool allow_flat_shading;
573 
574    /* Optimization: if the texture bound to this texunit has been cleared to 1,
575     * then the draw can be skipped (see si_draw_vbo_skip_noop). Initially the
576     * value is 0xff (undetermined) and can be later changed to 0 (= false) or
577     * texunit + 1.
578     */
579    uint8_t writes_1_if_tex_is_1;
580 
581    /* frag coord and sample pos per component read mask. */
582    uint8_t reads_frag_coord_mask;
583    uint8_t reads_sample_pos_mask;
584 };
585 
586 /* A shader selector is a gallium CSO and contains shader variants and
587  * binaries for one NIR program. This can be shared by multiple contexts.
588  */
589 struct si_shader_selector {
590    struct util_live_shader base;
591    struct si_screen *screen;
592    struct util_queue_fence ready;
593    struct si_compiler_ctx_state compiler_ctx_state;
594    gl_shader_stage stage;
595 
596    simple_mtx_t mutex; /* presumably guards the variant list below - confirm */
597    union si_shader_key *keys;
598    unsigned variants_count;
599    unsigned variants_max_count;
600    struct si_shader **variants;
601 
602    /* The compiled NIR shader without a prolog and/or epilog (not
603     * uploaded to a buffer object).
604     *
605     * [0] for wave32, [1] for wave64.
606     */
607    struct si_shader *main_shader_part[2];
608    struct si_shader *main_shader_part_ls[2];     /* as_ls is set in the key */
609    struct si_shader *main_shader_part_es;        /* as_es && !as_ngg in the key */
610    struct si_shader *main_shader_part_ngg[2];    /* !as_es && as_ngg in the key */
611    struct si_shader *main_shader_part_ngg_es[2]; /* as_es && as_ngg in the key */
612 
613    struct nir_shader *nir;
614    void *nir_binary;
615    unsigned nir_size; /* presumably the size of nir_binary - confirm */
616 
617    struct si_shader_info info;
618 
619    uint8_t const_and_shader_buf_descriptors_index;
620    uint8_t sampler_and_images_descriptors_index;
621    uint8_t cs_shaderbufs_sgpr_index;
622    uint8_t cs_num_shaderbufs_in_user_sgprs;
623    uint8_t cs_images_sgpr_index;
624    uint8_t cs_images_num_sgprs;
625    uint8_t cs_num_images_in_user_sgprs;
626    unsigned ngg_cull_vert_threshold; /* UINT32_MAX = disabled */
627    enum mesa_prim rast_prim;
628 
629    /* GS parameters. */
630    bool tess_turns_off_ngg;
631 
632    /* bitmasks of used descriptor slots */
633    uint64_t active_const_and_shader_buffers;
634    uint64_t active_samplers_and_images;
635 };
636 
637 /* Valid shader configurations:
638  *
639  * API shaders           VS | TCS | TES | GS |pass| PS
640  * are compiled as:         |     |     |    |thru|
641  *                          |     |     |    |    |
642  * Only VS & PS:         VS |     |     |    |    | PS
643  * GFX6     - with GS:   ES |     |     | GS | VS | PS
644  *          - with tess: LS | HS  | VS  |    |    | PS
645  *          - with both: LS | HS  | ES  | GS | VS | PS
646  * GFX9     - with GS:   -> |     |     | GS | VS | PS
647  *          - with tess: -> | HS  | VS  |    |    | PS
648  *          - with both: -> | HS  | ->  | GS | VS | PS
649  *                          |     |     |    |    |
650  * NGG      - VS & PS:   GS |     |     |    |    | PS
651  * (GFX10+) - with GS:   -> |     |     | GS |    | PS
652  *          - with tess: -> | HS  | GS  |    |    | PS
653  *          - with both: -> | HS  | ->  | GS |    | PS
654  *
655  * -> = merged with the next stage
656  */
657 
658 /* Use byte alignment (1-byte packing, no padding) for all following structure members for
659  * optimal shader key memory footprint. It stays in effect until a matching #pragma pack(pop).
660  */
661 #pragma pack(push, 1)
662 
663 /* Common PS bits between the shader key and the prolog key (12 bits of bitfields in total). */
664 struct si_ps_prolog_bits {
665    unsigned color_two_side : 1;
666    unsigned flatshade_colors : 1;
667    unsigned poly_stipple : 1;
668    unsigned force_persp_sample_interp : 1;
669    unsigned force_linear_sample_interp : 1;
670    unsigned force_persp_center_interp : 1;
671    unsigned force_linear_center_interp : 1;
672    unsigned bc_optimize_for_persp : 1;
673    unsigned bc_optimize_for_linear : 1;
674    unsigned samplemask_log_ps_iter : 3;
675 };
676 
677 /* Common PS bits between the shader key and the epilog key. */
678 struct si_ps_epilog_bits {
679    unsigned spi_shader_col_format;           /* full-width word; the bitfields below add up to 30 bits */
680    unsigned color_is_int8 : 8;               /* presumably one bit per color buffer - confirm */
681    unsigned color_is_int10 : 8;              /* presumably one bit per color buffer - confirm */
682    unsigned last_cbuf : 3;
683    unsigned alpha_func : 3;
684    unsigned alpha_to_one : 1;
685    unsigned alpha_to_coverage_via_mrtz : 1;  /* gfx11+ or alpha_to_one */
686    unsigned clamp_color : 1;
687    unsigned dual_src_blend_swizzle : 1;      /* gfx11+ */
688    unsigned rbplus_depth_only_opt:1;
689    unsigned kill_z:1;
690    unsigned kill_stencil:1;
691    unsigned kill_samplemask:1;
692 };
693 
694 union si_shader_part_key { /* key of a PS prolog part or a PS epilog part; one member is used at a time */
695    struct {
696       struct si_ps_prolog_bits states;
697       unsigned use_aco : 1;
698       unsigned wave32 : 1;
699       unsigned num_input_sgprs : 6;
700       /* Color interpolation and two-side color selection. */
701       unsigned colors_read : 8;       /* color input components read */
702       unsigned num_interp_inputs : 5; /* BCOLOR is at this location */
703       unsigned num_fragcoord_components : 3;
704       unsigned wqm : 1;
705       char color_attr_index[2];
706       signed char color_interp_vgpr_index[2]; /* -1 == constant */
707    } ps_prolog;
708    struct {
709       struct si_ps_epilog_bits states;
710       unsigned use_aco : 1;
711       unsigned wave32 : 1;
712       unsigned uses_discard : 1;
713       unsigned colors_written : 8;
714       unsigned color_types : 16; /* presumably bit pairs of enum si_color_output_type - confirm */
715       unsigned writes_z : 1;
716       unsigned writes_stencil : 1;
717       unsigned writes_samplemask : 1;
718    } ps_epilog;
719 };
720 
721 /* The shader key for geometry stages (VS, TCS, TES, GS) */
722 struct si_shader_key_ge {
723    /* Prolog and epilog flags. */
724    union {
725       struct {
726          struct si_shader_selector *ls;      /* for merged LS-HS */
727       } tcs; /* tessellation control shader */
728       struct {
729          struct si_shader_selector *es;      /* for merged ES-GS */
730       } gs;
731    } part;
732 
733    /* These three are initially set according to the NEXT_SHADER property,
734     * or guessed if the property doesn't seem correct.
735     */
736    unsigned as_es : 1;  /* whether it's a shader before GS */
737    unsigned as_ls : 1;  /* whether it's VS before TCS */
738    unsigned as_ngg : 1; /* whether it's the last GE stage and NGG is enabled,
739                            also set for the stage right before GS */
740 
741    /* Flags for monolithic compilation only. */
742    struct {
743       /* - If neither "is_one" nor "is_fetched" has a bit set, the instance
744        *   divisor is 0.
745        * - If "is_one" has a bit set, the instance divisor is 1.
746        * - If "is_fetched" has a bit set, the instance divisor will be loaded
747        *   from the constant buffer.
748        */
749       uint16_t instance_divisor_is_one;     /* bitmask of inputs */
750       uint16_t instance_divisor_is_fetched; /* bitmask of inputs */
751 
752       /* Whether fetch should be opencoded according to vs_fix_fetch.
753        * Otherwise, if vs_fix_fetch is non-zero, buffer_load_format_xyzw
754        * with minimal fixups is used. */
755       uint16_t vs_fetch_opencode;
756       union si_vs_fix_fetch vs_fix_fetch[SI_MAX_ATTRIBS];
757 
758       union {
759          /* When PS needs PrimID and GS is disabled. */
760          unsigned vs_export_prim_id : 1;    /* VS and TES only */
761          unsigned gs_tri_strip_adj_fix : 1; /* GS only */
762       } u;
763 
764       /* Gfx12: When no streamout buffers are bound, streamout must be disabled. */
765       unsigned remove_streamout : 1;
766    } mono;
767 
768    /* Optimization flags for asynchronous compilation only. */
769    struct {
770       /* For HW VS (it can be VS, TES, GS) */
771       uint64_t kill_outputs; /* "get_unique_index" bits */
772       unsigned kill_clip_distances : 8;
773       unsigned kill_pointsize : 1;
774       unsigned kill_layer : 1;
775       unsigned remove_streamout : 1; /* distinct from mono.remove_streamout above */
776 
777       /* For NGG VS and TES. */
778       unsigned ngg_culling : 11; /* SI_NGG_CULL_*: 3 flag bits + 8 clip plane enable bits */
779 
780       /* If NGG VS streamout knows the number of vertices per primitive at compile time,
781        * it can put stores for all vertices in the same VMEM clause, instead of storing
782        * vertices for the 2nd and 3rd vertex conditionally because the primitive type is
783        * unknown.
784        */
785       unsigned ngg_vs_streamout_num_verts_per_prim : 2;
786 
787       /* For shaders where monolithic variants have better code.
788        *
789        * This is a flag that has no effect on code generation,
790        * but forces monolithic shaders to be used as soon as
791        * possible, because it's in the "opt" group.
792        */
793       unsigned prefer_mono : 1;
794 
795       /* VS and TCS have the same number of patch vertices. */
796       unsigned same_patch_vertices:1;
797 
798       /* For TCS. */
799       unsigned tes_prim_mode : 2;
800       unsigned tes_reads_tess_factors : 1;
801 
802       unsigned inline_uniforms:1;
803 
804       /* This must be kept last to limit the number of variants
805        * depending only on the uniform values.
806        */
807       uint32_t inlined_uniform_values[MAX_INLINABLE_UNIFORMS];
808    } opt;
809 };
810 
811 struct si_shader_key_ps {
812    struct {
813       /* Prolog and epilog flags. */
814       struct si_ps_prolog_bits prolog;
815       struct si_ps_epilog_bits epilog;
816    } part;
817 
818    /* Flags for monolithic compilation only. */
819    struct {
820       unsigned poly_line_smoothing : 1;
821       unsigned point_smoothing : 1;
822       unsigned interpolate_at_sample_force_center : 1;
823       unsigned fbfetch_msaa : 1;
824       unsigned fbfetch_is_1D : 1;
825       unsigned fbfetch_layered : 1;
826    } mono;
827 
828    /* Optimization flags for asynchronous compilation only. */
829    struct {
830       /* For shaders where monolithic variants have better code.
831        *
832        * This is a flag that has no effect on code generation,
833        * but forces monolithic shaders to be used as soon as
834        * possible, because it's in the "opt" group.
835        */
836       unsigned prefer_mono : 1;
837       unsigned inline_uniforms:1;
838 
839       /* This eliminates the FRONT_FACE input VGPR as well as shader code using it. */
840       int force_front_face_input : 2; /* 0 = gl_FrontFacing, 1 = true, -1 = false */
841 
842       /* This must be kept last to limit the number of variants
843        * depending only on the uniform values.
844        */
845       uint32_t inlined_uniform_values[MAX_INLINABLE_UNIFORMS];
846    } opt;
847 };
848 
849 union si_shader_key {
850    struct si_shader_key_ge ge; /* geometry engine shaders */
851    struct si_shader_key_ps ps;
852 };
853 
854 /* Restore the pack alignment to default. */
855 #pragma pack(pop)
856 
857 /* GCN-specific shader info. */
858 struct si_shader_binary_info {
859    uint8_t vs_output_param_offset[NUM_TOTAL_VARYING_SLOTS];
860    uint32_t vs_output_ps_input_cntl[NUM_TOTAL_VARYING_SLOTS];
861    union si_input_info ps_inputs[SI_NUM_INTERP];
862    uint8_t num_ps_inputs;
863    uint8_t ps_colors_read;
864    uint8_t num_input_sgprs;
865    uint8_t num_input_vgprs;
866    bool uses_vmem_load_other; /* all other VMEM loads and atomics with return */
867    bool uses_vmem_sampler_or_bvh;
868    uint8_t num_fragcoord_components;
869    bool uses_instanceid;
870    uint8_t nr_pos_exports;
871    uint8_t nr_param_exports;
872    unsigned private_mem_vgprs;
873    unsigned max_simd_waves;
874 };
875 
/* Format of the code stored in struct si_shader_binary. */
enum si_shader_binary_type {
   SI_SHADER_BINARY_ELF,
   SI_SHADER_BINARY_RAW,
};
880 
881 struct si_shader_binary {
882    enum si_shader_binary_type type;
883 
884    /* Depends on binary type, either ELF or raw buffer. */
885    const char *code_buffer;
886    size_t code_size;
887    uint32_t exec_size;
888 
889    char *uploaded_code;
890    size_t uploaded_code_size;
891 
892    char *llvm_ir_string;
893 
894    const char *disasm_string;
895    size_t disasm_size;
896 
897    const unsigned *symbols;
898    unsigned num_symbols;
899 };
900 
/* Subgroup sizing for GFX9 GS; filled in through the "out" parameter of
 * gfx9_get_gs_info().
 */
struct gfx9_gs_info {
   unsigned es_verts_per_subgroup;
   unsigned gs_prims_per_subgroup;
   unsigned gs_inst_prims_in_subgroup;
   unsigned max_prims_per_subgroup;
   unsigned esgs_ring_size; /* in bytes */
};
908 
909 struct si_shader {
910    struct si_pm4_state pm4; /* base class */
911    struct si_compiler_ctx_state compiler_ctx_state;
912 
913    struct si_shader_selector *selector;
914    struct si_shader_selector *previous_stage_sel; /* for refcounting */
915    struct si_shader *next_shader; /* Only used during compilation of LS and ES when merged. */
916 
917    struct si_shader_part *prolog;
918    struct si_shader *previous_stage; /* for GFX9 */
919    struct si_shader_part *epilog;
920    struct si_shader *gs_copy_shader;
921 
922    struct si_resource *bo;
923    /* gpu_address should be bo->gpu_address except if SQTT is
924     * in use.
925     */
926    uint64_t gpu_address;
927    /* Only used on GFX6-10 where the scratch address must be inserted into the shader binary.
928     * This is the scratch address that the current shader binary contains.
929     */
930    uint64_t scratch_va;
931    union si_shader_key key;
932    struct util_queue_fence ready;
933    bool compilation_failed;
934    bool is_monolithic;
935    bool is_optimized;
936    bool is_binary_shared;
937    bool is_gs_copy_shader;
938    uint8_t wave_size;
939    unsigned complete_shader_binary_size;
940 
941    /* The following data is all that's needed for binary shaders. */
942    struct si_shader_binary binary;
943    struct ac_shader_config config;
944    struct si_shader_binary_info info;
945 
946    /* SI_SGPR_VS_STATE_BITS */
947    bool uses_vs_state_provoking_vertex;
948    bool uses_gs_state_outprim;
949 
950    bool uses_base_instance;
951 
952    /* Shader key + LLVM IR + disassembly + statistics.
953     * Generated for debug contexts only.
954     */
955    char *shader_log;
956    size_t shader_log_size;
957 
958    struct gfx9_gs_info gs_info;
959 
960    /* Precomputed register values. */
961    union {
962       struct {
963          unsigned vgt_gsvs_ring_offset_1;
964          unsigned vgt_gsvs_ring_offset_2;
965          unsigned vgt_gsvs_ring_offset_3;
966          unsigned vgt_gsvs_ring_itemsize;
967          unsigned vgt_gs_max_vert_out;
968          unsigned vgt_gs_vert_itemsize;
969          unsigned vgt_gs_vert_itemsize_1;
970          unsigned vgt_gs_vert_itemsize_2;
971          unsigned vgt_gs_vert_itemsize_3;
972          unsigned vgt_gs_instance_cnt;
973          unsigned vgt_gs_onchip_cntl;
974          unsigned vgt_gs_max_prims_per_subgroup;
975          unsigned vgt_esgs_ring_itemsize;
976          unsigned spi_shader_pgm_rsrc3_gs;
977          unsigned spi_shader_pgm_rsrc4_gs;
978       } gs;
979 
980       struct {
981          /* Computed by gfx10_ngg_calculate_subgroup_info. */
982          uint16_t ngg_emit_size; /* in dwords */
983          uint16_t hw_max_esverts;
984          uint16_t max_gsprims;
985          uint16_t max_out_verts;
986          bool max_vert_out_per_gs_instance;
987          /* Register values. */
988          unsigned ge_max_output_per_subgroup;
989          unsigned ge_ngg_subgrp_cntl;
990          unsigned vgt_primitiveid_en;
991          unsigned vgt_gs_onchip_cntl;
992          unsigned vgt_gs_instance_cnt;
993          unsigned esgs_vertex_stride;
994          unsigned spi_vs_out_config;
995          unsigned spi_shader_pos_format;
996          unsigned pa_cl_vte_cntl;
997          unsigned vgt_gs_max_vert_out; /* for API GS */
998          unsigned ge_pc_alloc;         /* uconfig register */
999          unsigned spi_shader_pgm_rsrc3_gs;
1000          unsigned spi_shader_pgm_rsrc4_gs;
1001          unsigned vgt_shader_stages_en;
1002       } ngg;
1003 
1004       struct {
1005          unsigned vgt_gs_mode;
1006          unsigned vgt_primitiveid_en;
1007          unsigned vgt_reuse_off;
1008          unsigned spi_vs_out_config;
1009          unsigned spi_shader_pos_format;
1010          unsigned pa_cl_vte_cntl;
1011          unsigned ge_pc_alloc; /* uconfig register */
1012       } vs;
1013 
1014       struct {
1015          unsigned spi_ps_input_ena;
1016          unsigned spi_ps_input_addr;
1017          unsigned spi_baryc_cntl;
1018          unsigned spi_ps_in_control;
1019          unsigned spi_shader_z_format;
1020          unsigned spi_shader_col_format;
1021          unsigned cb_shader_mask;
1022          unsigned db_shader_control;
1023          unsigned num_interp;
1024          unsigned spi_gs_out_config_ps;
1025          unsigned pa_sc_hisz_control;
1026          bool writes_z;
1027          bool writes_stencil;
1028          bool writes_samplemask;
1029       } ps;
1030    };
1031 
1032    /* Precomputed register values. */
1033    unsigned vgt_tf_param;                /* VGT_TF_PARAM */
1034    unsigned vgt_vertex_reuse_block_cntl; /* VGT_VERTEX_REUSE_BLOCK_CNTL */
1035    unsigned pa_cl_vs_out_cntl;
1036    unsigned ge_cntl;
1037 };
1038 
1039 struct si_shader_part {
1040    struct si_shader_part *next;
1041    union si_shader_part_key key;
1042    struct si_shader_binary binary;
1043    struct ac_shader_config config;
1044 };
1045 
1046 /* si_shader.c */
1047 struct ac_rtld_binary;
1048 
1049 void si_update_shader_binary_info(struct si_shader *shader, struct nir_shader *nir);
1050 bool si_compile_shader(struct si_screen *sscreen, struct ac_llvm_compiler *compiler,
1051                        struct si_shader *shader, struct util_debug_callback *debug);
1052 bool si_create_shader_variant(struct si_screen *sscreen, struct ac_llvm_compiler *compiler,
1053                               struct si_shader *shader, struct util_debug_callback *debug);
1054 void si_shader_destroy(struct si_shader *shader);
1055 unsigned si_shader_io_get_unique_index(unsigned semantic);
1056 int si_shader_binary_upload(struct si_screen *sscreen, struct si_shader *shader,
1057                             uint64_t scratch_va);
1058 int si_shader_binary_upload_at(struct si_screen *sscreen, struct si_shader *shader,
1059                                uint64_t scratch_va, int64_t bo_offset);
1060 bool si_can_dump_shader(struct si_screen *sscreen, gl_shader_stage stage,
1061                         enum si_shader_dump_type dump_type);
1062 void si_shader_dump(struct si_screen *sscreen, struct si_shader *shader,
1063                     struct util_debug_callback *debug, FILE *f, bool check_debug_option);
1064 void si_shader_dump_stats_for_shader_db(struct si_screen *screen, struct si_shader *shader,
1065                                         struct util_debug_callback *debug);
1066 void si_multiwave_lds_size_workaround(struct si_screen *sscreen, unsigned *lds_size);
1067 const char *si_get_shader_name(const struct si_shader *shader);
1068 void si_shader_binary_clean(struct si_shader_binary *binary);
1069 struct nir_shader *si_deserialize_shader(struct si_shader_selector *sel);
1070 unsigned si_get_ps_num_interp(struct si_shader *ps);
1071 unsigned si_get_shader_prefetch_size(struct si_shader *shader);
1072 unsigned si_get_shader_binary_size(struct si_screen *screen, struct si_shader *shader);
1073 
/* si_shader_info.c */
void si_nir_scan_shader(struct si_screen *sscreen, struct nir_shader *nir,
                        struct si_shader_info *info);
1077 
/* si_shader_nir.c */
void si_lower_mediump_io(struct nir_shader *nir);

bool si_alu_to_scalar_packed_math_filter(const struct nir_instr *instr, const void *data);
void si_nir_opts(struct si_screen *sscreen, struct nir_shader *nir, bool first);
void si_nir_late_opts(struct nir_shader *nir);
char *si_finalize_nir(struct pipe_screen *screen, struct nir_shader *nir);
1085 
/* si_state_shaders.cpp */
unsigned si_shader_num_alloc_param_exports(struct si_shader *shader);
unsigned si_determine_wave_size(struct si_screen *sscreen, struct si_shader *shader);
void gfx9_get_gs_info(struct si_shader_selector *es, struct si_shader_selector *gs,
                      struct gfx9_gs_info *out);
bool gfx10_is_ngg_passthrough(struct si_shader *shader);
unsigned si_shader_lshs_vertex_stride(struct si_shader *ls);
bool si_should_clear_lds(struct si_screen *sscreen, const struct nir_shader *shader);
unsigned si_get_output_prim_simplified(const struct si_shader_selector *sel,
                                       const union si_shader_key *key);
1096 
/* Inline helpers. */
1098 
1099 /* Return the pointer to the main shader part's pointer. */
si_get_main_shader_part(struct si_shader_selector * sel,const union si_shader_key * key,unsigned wave_size)1100 static inline struct si_shader **si_get_main_shader_part(struct si_shader_selector *sel,
1101                                                          const union si_shader_key *key,
1102                                                          unsigned wave_size)
1103 {
1104    assert(wave_size == 32 || wave_size == 64);
1105    unsigned index = wave_size / 32 - 1;
1106 
1107    if (sel->stage <= MESA_SHADER_GEOMETRY) {
1108       if (key->ge.as_ls)
1109          return &sel->main_shader_part_ls[index];
1110       if (key->ge.as_es && key->ge.as_ngg)
1111          return &sel->main_shader_part_ngg_es[index];
1112       if (key->ge.as_es) {
1113          /* legacy GS only support wave 64 */
1114          assert(wave_size == 64);
1115          return &sel->main_shader_part_es;
1116       }
1117       if (key->ge.as_ngg)
1118          return &sel->main_shader_part_ngg[index];
1119    }
1120    return &sel->main_shader_part[index];
1121 }
1122 
gfx10_has_variable_edgeflags(struct si_shader * shader)1123 static inline bool gfx10_has_variable_edgeflags(struct si_shader *shader)
1124 {
1125    unsigned output_prim = si_get_output_prim_simplified(shader->selector, &shader->key);
1126 
1127    return shader->selector->stage == MESA_SHADER_VERTEX &&
1128           (output_prim == MESA_PRIM_TRIANGLES || output_prim == MESA_PRIM_UNKNOWN);
1129 }
1130 
si_shader_uses_streamout(const struct si_shader * shader)1131 static inline bool si_shader_uses_streamout(const struct si_shader *shader)
1132 {
1133    return shader->selector->stage <= MESA_SHADER_GEOMETRY &&
1134           shader->selector->info.enabled_streamout_buffer_mask &&
1135           !shader->key.ge.opt.remove_streamout &&
1136           !shader->key.ge.mono.remove_streamout;
1137 }
1138 
si_shader_uses_discard(struct si_shader * shader)1139 static inline bool si_shader_uses_discard(struct si_shader *shader)
1140 {
1141    /* Changes to this should also update ps_modifies_zs. */
1142    return shader->selector->info.base.fs.uses_discard ||
1143           shader->key.ps.part.prolog.poly_stipple ||
1144           shader->key.ps.mono.point_smoothing ||
1145           shader->key.ps.part.epilog.alpha_func != PIPE_FUNC_ALWAYS;
1146 }
1147 
si_shader_culling_enabled(struct si_shader * shader)1148 static inline bool si_shader_culling_enabled(struct si_shader *shader)
1149 {
1150    /* Legacy VS/TES/GS and ES don't cull in the shader. */
1151    if (!shader->key.ge.as_ngg || shader->key.ge.as_es) {
1152       assert(!shader->key.ge.opt.ngg_culling);
1153       return false;
1154    }
1155 
1156    if (shader->key.ge.opt.ngg_culling)
1157       return true;
1158 
1159    unsigned output_prim = si_get_output_prim_simplified(shader->selector, &shader->key);
1160 
1161    /* This enables NGG culling for non-monolithic TES and GS. */
1162    return shader->selector->ngg_cull_vert_threshold == 0 &&
1163           (output_prim == MESA_PRIM_TRIANGLES || output_prim == MESA_PRIM_LINES);
1164 }
1165 
1166 #ifdef __cplusplus
1167 }
1168 #endif
1169 
1170 #endif
1171