• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright 2012 Advanced Micro Devices, Inc.
3  *
4  * SPDX-License-Identifier: MIT
5  */
6 
7 /* The compiler middle-end architecture: Explaining (non-)monolithic shaders
8  * -------------------------------------------------------------------------
9  *
10  * Typically, there is one-to-one correspondence between API and HW shaders,
11  * that is, for every API shader, there is exactly one shader binary in
12  * the driver.
13  *
14  * The problem with that is that we also have to emulate some API states
15  * (e.g. alpha-test, and many others) in shaders too. The two obvious ways
16  * to deal with it are:
17  * - each shader has multiple variants for each combination of emulated states,
18  *   and the variants are compiled on demand, possibly relying on a shader
19  *   cache for good performance
20  * - patch shaders at the binary level
21  *
22  * This driver uses something completely different. The emulated states are
23  * usually implemented at the beginning or end of shaders. Therefore, we can
24  * split the shader into 3 parts:
25  * - prolog part (shader code dependent on states)
26  * - main part (the API shader)
27  * - epilog part (shader code dependent on states)
28  *
29  * Each part is compiled as a separate shader and the final binaries are
30  * concatenated. This type of shader is called non-monolithic, because it
31  * consists of multiple independent binaries. Creating a new shader variant
32  * is therefore only a concatenation of shader parts (binaries) and doesn't
33  * involve any compilation. The main shader parts are the only parts that are
34  * compiled when applications create shader objects. The prolog and epilog
35  * parts are compiled on the first use and saved, so that their binaries can
36  * be reused by many other shaders.
37  *
38  * One of the roles of the prolog part is to compute vertex buffer addresses
39  * for vertex shaders. A few of the roles of the epilog part are color buffer
40  * format conversions in pixel shaders that we have to do manually, and writing
41  * tessellation factors in tessellation control shaders. The prolog and epilog
42  * have many other important responsibilities in various shader stages.
43  * They don't just "emulate legacy stuff".
44  *
45  * Monolithic shaders are shaders where the parts are combined before LLVM
46  * compilation, and the whole thing is compiled and optimized as one unit with
47  * one binary on the output. The result is the same as the non-monolithic
48  * shader, but the final code can be better, because LLVM can optimize across
49  * all shader parts. Monolithic shaders aren't usually used except for these
50  * special cases:
51  *
52  * 1) Some rarely-used states require modification of the main shader part
53  *    itself, and in such cases, only the monolithic shader variant is
54  *    compiled, and that's always done on the first use.
55  *
56  * 2) When we do cross-stage optimizations for separate shader objects and
57  *    e.g. eliminate unused shader varyings, the resulting optimized shader
58  *    variants are always compiled as monolithic shaders, and always
59  *    asynchronously (i.e. not stalling ongoing rendering). We call them
60  *    "optimized monolithic" shaders. The important property here is that
61  *    the non-monolithic unoptimized shader variant is always available for use
62  *    when the asynchronous compilation of the optimized shader is not done
63  *    yet.
64  *
65  * Starting with GFX9 chips, some shader stages are merged, and the number of
66  * shader parts per shader increased. The complete new list of shader parts is:
67  * - 1st shader: prolog part
68  * - 1st shader: main part
69  * - 2nd shader: main part
70  * - 2nd shader: epilog part
71  */
72 
73 /* How linking shader inputs and outputs between vertex, tessellation, and
74  * geometry shaders works.
75  *
76  * Inputs and outputs between shaders are stored in a buffer. This buffer
77  * lives in LDS (typical case for tessellation), but it can also live
78  * in memory (ESGS). Each input or output has a fixed location within a vertex.
79  * The highest used input or output determines the stride between vertices.
80  *
81  * Since GS and tessellation are only possible in the OpenGL core profile,
82  * only these semantics are valid for per-vertex data:
83  *
84  *   Name             Location
85  *
86  *   POSITION         0
87  *   VAR0..31         1..32
88  *   CLIP_DIST0..1    49..50
89  *   PSIZ             51
90  *
91  * For example, a shader only writing VAR0 has the output stride of 2.
92  *
93  * Only these semantics are valid for per-patch data:
94  *
95  *   Name             Location
96  *
97  *   TESSOUTER        0
98  *   TESSINNER        1
99  *   PATCH0..29       2..31
100  *
101  * That's how independent shaders agree on input and output locations.
102  * The si_shader_io_get_unique_index function assigns the locations.
103  *
104  * For tessellation, other required information for calculating the input and
105  * output addresses like the vertex stride, the patch stride, and the offsets
106  * where per-vertex and per-patch data start, is passed to the shader via
107  * user data SGPRs. The offsets and strides are calculated at draw time and
108  * aren't available at compile time.
109  */
110 
111 #ifndef SI_SHADER_H
112 #define SI_SHADER_H
113 
114 #include "shader_info.h"
115 #include "ac_binary.h"
116 #include "ac_gpu_info.h"
117 #include "util/mesa-blake3.h"
118 #include "util/u_live_shader_cache.h"
119 #include "util/u_queue.h"
120 #include "si_pm4.h"
121 
122 #ifdef __cplusplus
123 extern "C" {
124 #endif
125 
126 struct nir_shader;
127 struct nir_instr;
128 
/* Limits and masks shared by the shader structures declared in this header. */
129 #define SI_NUM_INTERP     32
130 #define SI_MAX_ATTRIBS    16 /* array size of vs_fix_fetch[] in si_shader_key_ge */
131 #define SI_MAX_VS_OUTPUTS 40
132 #define SI_USER_CLIP_PLANE_MASK  0x3F /* 6 bits */
133 
/* Extra interpolation mode value that aliases INTERP_MODE_COUNT
 * (presumably one past the last regular INTERP_MODE_* value - TODO confirm). */
134 #define INTERP_MODE_COLOR  INTERP_MODE_COUNT
135 
/* PS input control values built from the S_028644_* register field macros.
 * Both use OFFSET(0x20) and differ only in DEFAULT_VAL (0 vs. 3).
 * NOTE(review): the 0000/0001 suffixes presumably name the default vector
 * (0,0,0,0) / (0,0,0,1) - confirm against the register documentation.
 */
136 #define SI_PS_INPUT_CNTL_0000          (S_028644_OFFSET(0x20) | S_028644_DEFAULT_VAL(0))
137 #define SI_PS_INPUT_CNTL_0001          (S_028644_OFFSET(0x20) | S_028644_DEFAULT_VAL(3))
138 #define SI_PS_INPUT_CNTL_UNUSED        SI_PS_INPUT_CNTL_0000
139 /* D3D9 behaviour for COLOR0 requires 0001. GL is undefined. */
140 #define SI_PS_INPUT_CNTL_UNUSED_COLOR0 SI_PS_INPUT_CNTL_0001
141 
142 /* SGPR user data indices.
 *
 * The resource pointers come first. After them, each shader stage numbers its
 * own SGPRs starting from a shared base (e.g. "= SI_NUM_RESOURCE_SGPRS"), so
 * enumerators that belong to different stages alias the same values on purpose.
 */
143 enum
144 {
145    SI_SGPR_INTERNAL_BINDINGS,
146    SI_SGPR_BINDLESS_SAMPLERS_AND_IMAGES,
147    SI_SGPR_CONST_AND_SHADER_BUFFERS, /* or just a constant buffer 0 pointer */
148    SI_SGPR_SAMPLERS_AND_IMAGES,
149    SI_NUM_RESOURCE_SGPRS, /* = 4 */
150 
151    /* API VS, TES without GS, GS copy shader */
152    SI_SGPR_VS_STATE_BITS = SI_NUM_RESOURCE_SGPRS,
153    SI_NUM_VS_STATE_RESOURCE_SGPRS, /* = 5 */
154 
155    /* all VS variants */
156    SI_SGPR_BASE_VERTEX = SI_NUM_VS_STATE_RESOURCE_SGPRS,
157    SI_SGPR_DRAWID,
158    SI_SGPR_START_INSTANCE,
159    SI_VS_NUM_USER_SGPR, /* = 8 */
160 
       /* Aliases SI_SGPR_CONST_AND_SHADER_BUFFERS (index 2). */
161    SI_SGPR_VS_BLIT_DATA = SI_SGPR_CONST_AND_SHADER_BUFFERS,
162 
163    /* TES */
164    SI_SGPR_TES_OFFCHIP_LAYOUT = SI_NUM_VS_STATE_RESOURCE_SGPRS,
165    SI_SGPR_TES_OFFCHIP_ADDR,
166    SI_TES_NUM_USER_SGPR, /* = 7 */
167 
168    /* GFX6-8: TCS only */
169    GFX6_SGPR_TCS_OFFCHIP_LAYOUT = SI_NUM_RESOURCE_SGPRS,
170    GFX6_SGPR_TCS_OFFCHIP_ADDR,
171    GFX6_SGPR_TCS_IN_LAYOUT,
172    GFX6_TCS_NUM_USER_SGPR, /* = 7 */
173 
174    /* GFX9: Merged LS-HS (VS-TCS) only. */
175    GFX9_SGPR_TCS_OFFCHIP_LAYOUT = SI_VS_NUM_USER_SGPR,
176    GFX9_SGPR_TCS_OFFCHIP_ADDR,
177    GFX9_TCS_NUM_USER_SGPR, /* = 10 */
178 
179    /* GS limits */
180    GFX6_GS_NUM_USER_SGPR = SI_NUM_RESOURCE_SGPRS,
181    SI_GSCOPY_NUM_USER_SGPR = SI_NUM_VS_STATE_RESOURCE_SGPRS,
182 
       /* Starts after whichever of the VS and TES SGPR sets is larger. */
183    GFX9_SGPR_SMALL_PRIM_CULL_INFO = MAX2(SI_VS_NUM_USER_SGPR, SI_TES_NUM_USER_SGPR),
184    GFX9_SGPR_ATTRIBUTE_RING_ADDR,
185    GFX9_GS_NUM_USER_SGPR,
186 
187    /* PS only */
188    SI_SGPR_SAMPLE_LOCS0 = SI_NUM_RESOURCE_SGPRS,
189    SI_SGPR_SAMPLE_LOCS1,
190    SI_SGPR_ALPHA_REF,
191    SI_PS_NUM_USER_SGPR, /* = 7 */
192 
193    /* The value has to be 12, because the hw requires that descriptors
194     * are aligned to 4 SGPRs.
195     */
196    SI_SGPR_VS_VB_DESCRIPTOR_FIRST = 12,
197 };
198 
199 /* LLVM function parameter indices.
 *
 * The first SI_NUM_RESOURCE_PARAMS indices are reserved; the PS-only
 * parameters are numbered consecutively after them.
 */
200 enum
201 {
202    SI_NUM_RESOURCE_PARAMS = 4,
203 
204    /* PS only parameters */
205    SI_PARAM_SAMPLE_LOCS0 = SI_NUM_RESOURCE_PARAMS,
206    SI_PARAM_SAMPLE_LOCS1,
207    SI_PARAM_ALPHA_REF,
208    SI_PARAM_PRIM_MASK,
209    SI_PARAM_PERSP_SAMPLE,
210    SI_PARAM_PERSP_CENTER,
211    SI_PARAM_PERSP_CENTROID,
212    SI_PARAM_PERSP_PULL_MODEL,
213    SI_PARAM_LINEAR_SAMPLE,
214    SI_PARAM_LINEAR_CENTER,
215    SI_PARAM_LINEAR_CENTROID,
216    SI_PARAM_LINE_STIPPLE_TEX,
217    SI_PARAM_POS_X_FLOAT,
218    SI_PARAM_POS_Y_FLOAT,
219    SI_PARAM_POS_Z_FLOAT,
220    SI_PARAM_POS_W_FLOAT,
221    SI_PARAM_FRONT_FACE,
222    SI_PARAM_ANCILLARY,
223    SI_PARAM_SAMPLE_COVERAGE,
224    SI_PARAM_POS_FIXED_PT, /* = 23, the last named index */
225 
       /* +1 turns the last index into a count, +8 for COLOR[0..1]. */
226    SI_NUM_PARAMS = SI_PARAM_POS_FIXED_PT + 9, /* +8 for COLOR[0..1] */
227 };
228 
229 /* These fields are only set in current_vs_state (except INDEXED) in si_context, and they are
230  * accessible in the shader via vs_state_bits in VS, TES, and GS.
 *
 * Every field FOO is described by a FOO__SHIFT / FOO__MASK pair. The mask is
 * unshifted, i.e. it is applied before shifting (see ENCODE_FIELD / CLEAR_FIELD).
231  */
232 #define VS_STATE_CLAMP_VERTEX_COLOR__SHIFT   0
233 #define VS_STATE_CLAMP_VERTEX_COLOR__MASK    0x1 /* Shared by VS and GS */
234 #define VS_STATE_INDEXED__SHIFT              1
235 #define VS_STATE_INDEXED__MASK               0x1 /* Shared by VS and GS */
236 
237 /* These fields are only set in current_gs_state in si_context, and they are accessible
238  * in the shader via vs_state_bits in legacy GS, the GS copy shader, and any NGG shader.
239  */
240 /* bit gap */
241 /* The number of ES outputs is derived from the last output index of SI_UNIQUE_SLOT_* + 1, which
242  * can be 55 at most. The ESGS vertex stride in dwords is: NUM_ES_OUTPUTS * 4 + 1
243  * Only used by GFX9+ to compute LDS addresses of GS inputs.
244  */
245 #define GS_STATE_NUM_ES_OUTPUTS__SHIFT          14
246 #define GS_STATE_NUM_ES_OUTPUTS__MASK           0x3f /* bits 14..19 */
247 #define GS_STATE_CULL_FACE_FRONT__SHIFT         20
248 #define GS_STATE_CULL_FACE_FRONT__MASK          0x1  /* bit 20 */
249 #define GS_STATE_CULL_FACE_BACK__SHIFT          21
250 #define GS_STATE_CULL_FACE_BACK__MASK           0x1  /* bit 21 */
251 /* Small prim filter precision = num_samples / quant_mode where num_samples is in {1, 2, 4, 8} and
252  * quant_mode is in {256, 1024, 4096}, which is equal to 1/2^n where n is between 5 and 12.
253  *
254  * Equation 1: Represent the value as 1/2^n.
255  * Assumption: log_samples <= 3 and log_quant_mode >= 8
256  *    num_samples / quant_mode =
257  *    2^log_samples / 2^log_quant_mode =
258  *    1 / 2^(log_quant_mode - log_samples) [because log_samples < log_quant_mode]
259  *
260  * Knowing that, we only need 4 bits to represent the FP32 exponent and thus the FP32 number.
261  *
262  * Equation 2: Encoding the exponent.
263  *    1/2^(15 - value) in FP32 = ((value | 0x70) << 23) in binary if value < 15
264  * Proof: With 0x70 = 112, we get FP32 exponent 2^(112 + value - 127) according to the FP32
265  *        definition, which can be simplified to 2^(value - 15), which is a negative exponent
266  *        for value < 15. Given that 2^-n = 1/2^n, the FP32 number is equal to 1/2^(15 - value).
267  *
268  * Equation 3: Convert quant_mode_enum to log_quant_mode.
269  * quant_mode_enum:
270  *    0 means 256  = 2^8  --> log2(256)  = 8
271  *    1 means 1024 = 2^10 --> log2(1024) = 10
272  *    2 means 4096 = 2^12 --> log2(4096) = 12
273  *
274  * Conversion to log_quant_mode:
275  *    log_quant_mode = quant_mode_enum * 2 + 8. Proof:
276  *       0 * 2 + 8 = 8
277  *       1 * 2 + 8 = 10
278  *       2 * 2 + 8 = 12
279  *
280  * Equation 4: Get the exponent value for Equation 2 from Equation 1.
281  *    15 - value = log_quant_mode - log_samples
282  *    value = 15 - (log_quant_mode + log_samples)
283  *
284  * Combine equations 2, 3, and 4 to get the expression computing the FP32 number from log_samples
285  * and quant_mode_enum using integer ops:
286  *    (value | 0x70) << 23 =
287  *    ((15 - (log_quant_mode + log_samples)) | 0x70) << 23 =
288  *    ((15 - (quant_mode_enum * 2 + 8 + log_samples)) | 0x70) << 23 =
289  *    ((15 - quant_mode_enum * 2 - 8 - log_samples) | 0x70) << 23 =
290  *    ((7 - quant_mode_enum * 2 - log_samples) | 0x70) << 23 =
291  *
292  * Since "log_samples <= 3" and "quant_mode_enum * 2 <= 4", we need a SGPR field that stores:
293  *    triangle_precision = 7 - quant_mode_enum * 2 - log_samples
294  *
295  * Line precision ignores log_samples, so the shader should do:
296  *    line_precision = triangle_precision + log_samples
297  */
/* Remaining GS state fields; together they occupy bits 22..31. */
298 #define GS_STATE_SMALL_PRIM_PRECISION__SHIFT    22  /* triangle_precision */
299 #define GS_STATE_SMALL_PRIM_PRECISION__MASK     0x7 /* bits 22..24 */
300 #define GS_STATE_SMALL_PRIM_PRECISION_LOG_SAMPLES__SHIFT 25
301 #define GS_STATE_SMALL_PRIM_PRECISION_LOG_SAMPLES__MASK  0x3 /* bits 25..26 */
302 #define GS_STATE_STREAMOUT_QUERY_ENABLED__SHIFT 27
303 #define GS_STATE_STREAMOUT_QUERY_ENABLED__MASK  0x1 /* bit 27 */
304 #define GS_STATE_PROVOKING_VTX_FIRST__SHIFT     28
305 #define GS_STATE_PROVOKING_VTX_FIRST__MASK      0x1 /* bit 28 */
306 #define GS_STATE_OUTPRIM__SHIFT                 29
307 #define GS_STATE_OUTPRIM__MASK                  0x3 /* bits 29..30 */
308 #define GS_STATE_PIPELINE_STATS_EMU__SHIFT      31
309 #define GS_STATE_PIPELINE_STATS_EMU__MASK       0x1 /* bit 31 */
310 
/* Helpers for the packed state words. "field" is the common prefix of a
 * FOO__SHIFT / FOO__MASK pair, where the mask is unshifted.
 *
 * ENCODE_FIELD: truncate "value" to the field width and move it to the
 *               field's bit position.
 * CLEAR_FIELD:  bitmask with every bit set except the bits of the field.
 */
311 #define ENCODE_FIELD(field, value) (((unsigned)(value) & field##__MASK) << field##__SHIFT)
312 #define CLEAR_FIELD(field) (~((unsigned)field##__MASK << field##__SHIFT))
313 
314 /* This is called by functions that change states.
 *
 * Asserts that "value" fits in the field, then replaces the field in "var"
 * and leaves all other bits untouched. "value" is expanded more than once,
 * so it must not have side effects. The includer must provide assert().
 */
315 #define SET_FIELD(var, field, value) do { \
316    assert((value) == ((unsigned)(value) & field##__MASK)); \
317    (var) &= CLEAR_FIELD(field); \
318    (var) |= ENCODE_FIELD(field, value); \
319 } while (0)
320 
321 /* This is called during shader compilation and returns LLVMValueRef.
 *
 * Unpacks the field from (ctx)->args->vs_state_bits via si_unpack_param();
 * the bit count passed is the population count of the field mask.
 */
322 #define GET_FIELD(ctx, field) si_unpack_param((ctx), (ctx)->args->vs_state_bits, field##__SHIFT, \
323                                              util_bitcount(field##__MASK))
324 
/* SGPR counts for the VS blit shader variants (position only, position +
 * color, position + texcoord) and the upper bound over all of them.
 */
325 enum
326 {
327    /* These represent the number of SGPRs the shader uses. */
328    SI_VS_BLIT_SGPRS_POS = 3,
329    SI_VS_BLIT_SGPRS_POS_COLOR = 7,
330    SI_VS_BLIT_SGPRS_POS_TEXCOORD = 9,
331 
332    MAX_SI_VS_BLIT_SGPRS = 10, /* +1 for the attribute ring address */
333 };
334 
335 /* The following two are only set for vertex shaders that cull.
336  * TES and GS get the primitive type from shader_info.
 *
 * Bit layout of the SI_NGG_CULL_* flags: bits 0..2 are single-bit flags and
 * bits 3..10 hold an 8-bit clip plane enable mask, 11 bits in total (the
 * width of the ngg_culling bit-field in si_shader_key_ge).
337  */
338 #define SI_NGG_CULL_VS_TRIANGLES             (1 << 0)   /* this implies W, view.xy, and small prim culling */
339 #define SI_NGG_CULL_VS_LINES                 (1 << 1)   /* this implies W and view.xy culling */
340 #define SI_NGG_CULL_SMALL_LINES_DIAMOND_EXIT (1 << 2)   /* cull small lines according to the diamond exit rule */
/* Pack / unpack the 8-bit clip plane enable mask stored at bits 3..10. */
341 #define SI_NGG_CULL_CLIP_PLANE_ENABLE(enable) (((enable) & 0xff) << 3)
342 #define SI_NGG_CULL_GET_CLIP_PLANE_ENABLE(x)  (((x) >> 3) & 0xff)
343 
/* One shader profile entry: a BLAKE3 hash stored as BLAKE3_OUT_LEN32 32-bit
 * words, plus an options word.
 * NOTE(review): "options" is presumably a bitmask of SI_PROFILE_* like
 * si_shader_info::options - confirm at the table definition.
 */
344 struct si_shader_profile {
345    uint32_t blake3[BLAKE3_OUT_LEN32];
346    uint32_t options;
347 };
348 
/* The profile table is defined elsewhere. Its length is not visible through
 * this declaration and is obtained from si_get_num_shader_profiles(). */
349 extern struct si_shader_profile si_shader_profiles[];
350 unsigned si_get_num_shader_profiles(void);
351 
/* SI_PROFILE_* option bits (stored in si_shader_info::options). Bit 2 is unused. */
352 #define SI_PROFILE_WAVE32                    (1 << 0)
353 #define SI_PROFILE_GFX10_WAVE64              (1 << 1)
354 /* bit gap */
355 #define SI_PROFILE_VS_NO_BINNING             (1 << 3)
356 #define SI_PROFILE_GFX9_GFX10_PS_NO_BINNING  (1 << 4)
357 #define SI_PROFILE_CLAMP_DIV_BY_ZERO         (1 << 5)
358 #define SI_PROFILE_NO_OPT_UNIFORM_VARYINGS   (1 << 6)
359 
/* Selects which kind of shader dump is being requested. */
360 enum si_shader_dump_type {
361    SI_DUMP_SHADER_KEY,
362    SI_DUMP_INIT_NIR,       /* initial input NIR when shaders are created (before lowering) */
363    SI_DUMP_NIR,            /* final NIR after lowering when shader variants are created */
364    SI_DUMP_INIT_LLVM_IR,   /* initial LLVM IR before optimizations */
365    SI_DUMP_LLVM_IR,        /* final LLVM IR */
366    SI_DUMP_INIT_ACO_IR,    /* initial ACO IR before optimizations */
367    SI_DUMP_ACO_IR,         /* final ACO IR */
368    SI_DUMP_ASM,            /* final asm shaders */
369    SI_DUMP_STATS,          /* print statistics as shader-db */
370    SI_DUMP_ALWAYS,         /* NOTE(review): presumably an unconditional dump - confirm at the users */
371 };
372 
/* Fixed I/O slot numbers that independently compiled shaders agree on.
 * Some ranges alias each other on purpose; see the comments below.
 */
373 enum {
374    SI_UNIQUE_SLOT_POS = 0,
375 
376    /* Some shader stages use the highest used IO index
377     * to determine the size to allocate for inputs/outputs
378     * (in LDS, tess and GS rings), so VARn should be placed right
379     * after POSITION to make that size as small as possible.
380     */
381    SI_UNIQUE_SLOT_VAR0 = 1, /* 0..31 */
382 
383    /* Put 16-bit GLES varyings after 32-bit varyings. They can use the same indices as
384     * legacy desktop GL varyings because they are mutually exclusive.
385     */
386    SI_UNIQUE_SLOT_VAR0_16BIT = 33, /* 0..15 */
387 
388    /* Legacy GL-only varyings can alias GLES-only 16-bit varyings. */
389    SI_UNIQUE_SLOT_FOGC = 33,
390    SI_UNIQUE_SLOT_COL0,
391    SI_UNIQUE_SLOT_COL1,
392    SI_UNIQUE_SLOT_BFC0,
393    SI_UNIQUE_SLOT_BFC1,
394    SI_UNIQUE_SLOT_TEX0,
395    SI_UNIQUE_SLOT_TEX1,
396    SI_UNIQUE_SLOT_TEX2,
397    SI_UNIQUE_SLOT_TEX3,
398    SI_UNIQUE_SLOT_TEX4,
399    SI_UNIQUE_SLOT_TEX5,
400    SI_UNIQUE_SLOT_TEX6,
401    SI_UNIQUE_SLOT_TEX7,
402    SI_UNIQUE_SLOT_CLIP_VERTEX, /* = 46 */
403 
404    /* Varyings present in both GLES and desktop GL must start at 49 after 16-bit varyings. */
405    SI_UNIQUE_SLOT_CLIP_DIST0 = 49,
406    SI_UNIQUE_SLOT_CLIP_DIST1,
407    SI_UNIQUE_SLOT_PSIZ,
408    /* These can't be written by LS, HS, and ES. */
409    SI_UNIQUE_SLOT_LAYER,
410    SI_UNIQUE_SLOT_VIEWPORT,
411    SI_UNIQUE_SLOT_PRIMITIVE_ID, /* = 54, the last slot */
412 };
413 
414 /**
415  * For VS shader keys, describe any fixups required for vertex fetch.
416  *
417  * \ref log_size, \ref format, and the number of channels are interpreted as
418  * by \ref ac_build_opencoded_load_format.
419  *
420  * Note: all bits 0 (size = 1 byte, num channels = 1, format = float) is an
421  * impossible format and indicates that no fixup is needed (just use
422  * buffer_load_format_xyzw).
 *
 * The bit-fields add up to 8 bits, so "bits" gives access to the whole
 * description as a single byte.
423  */
424 union si_vs_fix_fetch {
425    struct {
426       uint8_t log_size : 2;        /* log2 of bytes per channel: 1, 2, 4, or 8 bytes */
427       uint8_t num_channels_m1 : 2; /* number of channels minus 1 */
428       uint8_t format : 3;          /* AC_FETCH_FORMAT_xxx */
429       uint8_t reverse : 1;         /* reverse XYZ channels */
430    } u;
431    uint8_t bits;
432 };
433 
434 struct si_shader;
435 
436 /* State of the context creating the shader object.
 * Stored in si_shader_selector as "compiler_ctx_state".
 */
437 struct si_compiler_ctx_state {
438    /* Should only be used by si_init_shader_selector_async and
439     * si_build_shader_variant if thread_index == -1 (non-threaded). */
440    struct ac_llvm_compiler *compiler;
441 
442    /* Used if thread_index == -1 or if debug.async is true. */
443    struct util_debug_callback debug;
444 
445    /* Used for creating the log string for gallium/ddebug. */
446    bool is_debug_context;
447 };
448 
/* Type of a color output. The four values fit in 2 bits;
 * si_shader_info::output_color_types stores one of them per bit pair.
 */
449 enum si_color_output_type {
450    SI_TYPE_ANY32,
451    SI_TYPE_FLOAT16,
452    SI_TYPE_INT16,
453    SI_TYPE_UINT16,
454 };
455 
/* Description of one PS input: three bytes of payload in an anonymous struct,
 * overlaid with a 32-bit member so the union is 4-byte aligned.
 */
456 union si_ps_input_info {
457    struct {
458       uint8_t semantic;
459       uint8_t interpolate;
460       uint8_t fp16_lo_hi_valid;
461    };
462    uint32_t _unused; /* this just forces 4-byte alignment */
463 };
464 
/* Description of one shader input; element type of si_shader_info::input[]. */
465 struct si_vs_tcs_input_info {
466    uint8_t semantic;
467    uint8_t usage_mask; /* NOTE(review): presumably a per-component mask - confirm */
468 };
469 
/* Driver-side description of a shader: the generic shader_info in "base"
 * plus driver-specific I/O tables, masks, and usage flags. One instance is
 * embedded in si_shader_selector as "info".
 */
470 struct si_shader_info {
471    shader_info base;
472 
473    uint32_t options; /* bitmask of SI_PROFILE_* */
474 
       /* Input/output tables; the arrays are sized for the maximum number of
        * shader inputs/outputs, num_inputs/num_outputs presumably give the
        * used entry counts (TODO confirm). */
475    uint8_t num_inputs;
476    uint8_t num_outputs;
477    struct si_vs_tcs_input_info input[PIPE_MAX_SHADER_INPUTS];
478    uint8_t output_semantic[PIPE_MAX_SHADER_OUTPUTS];
479    uint8_t output_usagemask[PIPE_MAX_SHADER_OUTPUTS];
480    uint8_t output_streams[PIPE_MAX_SHADER_OUTPUTS];
481    uint8_t output_type[PIPE_MAX_SHADER_OUTPUTS]; /* enum nir_alu_type */
482    uint8_t output_xfb_writemask[PIPE_MAX_SHADER_OUTPUTS];
483 
484    uint8_t num_streamout_components;
485    uint8_t num_vs_inputs;
486    uint8_t num_vbos_in_user_sgprs;
487    uint8_t num_stream_output_components[4]; /* for GS streams, not streamout */
488    uint16_t enabled_streamout_buffer_mask;
489 
490    uint64_t inputs_read; /* "get_unique_index" bits */
491    uint64_t tcs_inputs_via_temp;
492    uint64_t tcs_inputs_via_lds;
493 
494    /* For VS before {TCS, TES, GS} and TES before GS. */
495    uint64_t ls_es_outputs_written;     /* "get_unique_index" bits */
496    uint64_t outputs_written_before_ps; /* "get_unique_index" bits */
497    uint64_t tcs_outputs_written_for_tes;   /* "get_unique_index" bits */
498    uint32_t patch_outputs_written_for_tes; /* "get_unique_index_patch" bits */
499    uint32_t tess_levels_written_for_tes;   /* "get_unique_index_patch" bits */
500 
501    uint8_t clipdist_mask;
502    uint8_t culldist_mask;
503 
504    uint16_t esgs_vertex_stride;
505    uint16_t gsvs_vertex_size;
506    uint8_t gs_input_verts_per_prim;
507    unsigned max_gsvs_emit_size;
508 
509    /* Set 0xf or 0x0 (4 bits) per each written output.
510     * ANDed with spi_shader_col_format.
511     */
512    unsigned colors_written_4bit;
513 
514    int constbuf0_num_slots;
515    uint8_t color_attr_index[2];
516    uint8_t color_interpolate[2];
517    uint8_t color_interpolate_loc[2];
518    uint8_t colors_read; /**< which color components are read by the FS */
519    uint8_t colors_written;
520    uint16_t output_color_types; /**< Each bit pair is enum si_color_output_type */
521    bool color0_writes_all_cbufs; /**< gl_FragColor */
522    bool reads_samplemask;   /**< does fragment shader read sample mask? */
523    bool reads_tess_factors; /**< If TES reads TESSINNER or TESSOUTER */
524    bool writes_z;           /**< does fragment shader write Z value? */
525    /* We need both because both can be present in different conditional blocks. */
526    bool output_z_equals_input_z; /**< gl_FragDepth == gl_FragCoord.z for any write */
527    bool output_z_is_not_input_z; /**< gl_FragDepth != gl_FragCoord.z for any write */
528    bool writes_stencil;     /**< does fragment shader write stencil value? */
529    bool writes_samplemask;  /**< does fragment shader write sample mask? */
530    bool writes_edgeflag;    /**< vertex shader outputs edgeflag */
531    bool uses_interp_color;
532    bool uses_persp_center_color;
533    bool uses_persp_centroid_color;
534    bool uses_persp_sample_color;
535    bool uses_persp_center;
536    bool uses_persp_centroid;
537    bool uses_persp_sample;
538    bool uses_linear_center;
539    bool uses_linear_centroid;
540    bool uses_linear_sample;
541    bool uses_interp_at_offset;
542    bool uses_interp_at_sample;
543    bool uses_instanceid;
544    bool uses_base_vertex;
545    bool uses_base_instance;
546    bool uses_drawid;
547    bool uses_primid;
548    bool uses_frontface;
549    bool uses_invocationid;
550    bool uses_thread_id[3];
551    bool uses_block_id[3];
552    bool uses_variable_block_size;
553    bool uses_grid_size;
554    bool uses_tg_size;
555    bool uses_atomic_ordered_add;
556    bool writes_position;
557    bool writes_psize;
558    bool writes_clipvertex;
559    bool writes_primid;
560    bool writes_viewport_index;
561    bool writes_layer;
562    bool uses_bindless_samplers;
563    bool uses_bindless_images;
564    bool uses_indirect_descriptor;
565    bool has_divergent_loop;
566    bool has_non_uniform_tex_access;
567    bool has_shadow_comparison;
568 
569    bool uses_vmem_sampler_or_bvh;
570    bool uses_vmem_load_other; /* all other VMEM loads and atomics with return */
571 
572    /** Whether all codepaths write tess factors in all invocations. */
573    bool tessfactors_are_def_in_all_invocs;
574 
575    /* A flag to check if vrs2x2 can be enabled to reduce the number of
576     * fragment shader invocations when flat shading is used.
577     */
578    bool allow_flat_shading;
579 
580    /* Optimization: if the texture bound to this texunit has been cleared to 1,
581     * then the draw can be skipped (see si_draw_vbo_skip_noop). Initially the
582     * value is 0xff (undetermined) and can be later changed to 0 (= false) or
583     * texunit + 1.
584     */
585    uint8_t writes_1_if_tex_is_1;
586 
587    /* frag coord and sample pos per component read mask. */
588    uint8_t reads_frag_coord_mask;
589 };
590 
591 /* A shader selector is a gallium CSO and contains shader variants and
592  * binaries for one NIR program. This can be shared by multiple contexts.
593  */
594 struct si_shader_selector {
595    struct util_live_shader base;
596    struct si_screen *screen;
597    struct util_queue_fence ready;
598    struct si_compiler_ctx_state compiler_ctx_state;
599    gl_shader_stage stage;
600 
       /* Variant storage: "keys" and "variants" are presumably parallel arrays
        * with variants_count used and variants_max_count allocated entries,
        * guarded by "mutex" - NOTE(review): confirm at the users. */
601    simple_mtx_t mutex;
602    union si_shader_key *keys;
603    unsigned variants_count;
604    unsigned variants_max_count;
605    struct si_shader **variants;
606 
607    /* The compiled NIR shader without a prolog and/or epilog (not
608     * uploaded to a buffer object).
609     *
610     * [0] for wave32, [1] for wave64.
611     */
612    struct si_shader *main_shader_part[2];
613    struct si_shader *main_shader_part_ls[2];     /* as_ls is set in the key */
614    struct si_shader *main_shader_part_es;        /* as_es && !as_ngg in the key */
615    struct si_shader *main_shader_part_ngg[2];    /* !as_es && as_ngg in the key */
616    struct si_shader *main_shader_part_ngg_es[2]; /* as_es && as_ngg in the key */
617 
618    struct nir_shader *nir;
       /* NOTE(review): presumably a serialized copy of "nir" of nir_size bytes - confirm. */
619    void *nir_binary;
620    unsigned nir_size;
621 
622    struct si_shader_info info;
623 
624    uint8_t const_and_shader_buf_descriptors_index;
625    uint8_t sampler_and_images_descriptors_index;
626    uint8_t cs_shaderbufs_sgpr_index;
627    uint8_t cs_num_shaderbufs_in_user_sgprs;
628    uint8_t cs_images_sgpr_index;
629    uint8_t cs_images_num_sgprs;
630    uint8_t cs_num_images_in_user_sgprs;
631    unsigned ngg_cull_vert_threshold; /* UINT32_MAX = disabled */
632    enum mesa_prim rast_prim;
633 
634    /* GS parameters. */
635    bool tess_turns_off_ngg;
636 
637    /* bitmasks of used descriptor slots */
638    uint64_t active_const_and_shader_buffers;
639    uint64_t active_samplers_and_images;
640 };
641 
642 /* Valid shader configurations:
643  *
644  * API shaders           VS | TCS | TES | GS |pass| PS
645  * are compiled as:         |     |     |    |thru|
646  *                          |     |     |    |    |
647  * Only VS & PS:         VS |     |     |    |    | PS
648  * GFX6     - with GS:   ES |     |     | GS | VS | PS
649  *          - with tess: LS | HS  | VS  |    |    | PS
650  *          - with both: LS | HS  | ES  | GS | VS | PS
651  * GFX9     - with GS:   -> |     |     | GS | VS | PS
652  *          - with tess: -> | HS  | VS  |    |    | PS
653  *          - with both: -> | HS  | ->  | GS | VS | PS
654  *                          |     |     |    |    |
655  * NGG      - VS & PS:   GS |     |     |    |    | PS
656  * (GFX10+) - with GS:   -> |     |     | GS |    | PS
657  *          - with tess: -> | HS  | GS  |    |    | PS
658  *          - with both: -> | HS  | ->  | GS |    | PS
659  *
660  * -> = merged with the next stage
661  */
662 
663 /* Use the byte alignment for all following structure members for optimal
664  * shader key memory footprint.
665  */
666 #pragma pack(push, 1)
667 
668 /* Common PS bits between the shader key and the prolog key.
 *
 * Embedded as "states" in si_shader_part_key::ps_prolog. The bit-fields add
 * up to 13 bits. The layout is part of the key, so the order and widths of
 * the members must not be changed casually.
 */
669 struct si_ps_prolog_bits {
670    unsigned color_two_side : 1;
671    unsigned flatshade_colors : 1;
672    unsigned poly_stipple : 1;
673    unsigned force_persp_sample_interp : 1;
674    unsigned force_linear_sample_interp : 1;
675    unsigned force_persp_center_interp : 1;
676    unsigned force_linear_center_interp : 1;
677    unsigned bc_optimize_for_persp : 1;
678    unsigned bc_optimize_for_linear : 1;
679    unsigned samplemask_log_ps_iter : 2;
680    unsigned get_frag_coord_from_pixel_coord : 1;
681    unsigned force_samplemask_to_helper_invocation : 1;
682 };
683 
684 /* Common PS bits between the shader key and the epilog key.
 *
 * Embedded as "states" in si_shader_part_key::ps_epilog. One full unsigned
 * member is followed by 27 bits of bit-fields.
 */
685 struct si_ps_epilog_bits {
686    unsigned spi_shader_col_format;
687    unsigned color_is_int8 : 8;
688    unsigned color_is_int10 : 8;
689    unsigned alpha_func : 3;
690    unsigned alpha_to_one : 1;
691    unsigned alpha_to_coverage_via_mrtz : 1;  /* gfx11+ or alpha_to_one */
692    unsigned clamp_color : 1;
693    unsigned dual_src_blend_swizzle : 1;      /* gfx11+ */
694    unsigned rbplus_depth_only_opt:1;
695    unsigned kill_z:1;
696    unsigned kill_stencil:1;
697    unsigned kill_samplemask:1;
698 };
699 
/* Key of a separately compiled PS shader part: either a prolog key or an
 * epilog key. Both alternatives start with the corresponding "states" bits,
 * followed by use_aco and wave32.
 */
700 union si_shader_part_key {
701    struct {
702       struct si_ps_prolog_bits states;
703       unsigned use_aco : 1;
704       unsigned wave32 : 1;
705       unsigned num_input_sgprs : 6;
706       /* Color interpolation and two-side color selection. */
707       unsigned colors_read : 8;       /* color input components read */
708       unsigned num_interp_inputs : 5; /* BCOLOR is at this location */
709       unsigned fragcoord_usage_mask : 4;
710       unsigned pixel_center_integer : 1;
711       unsigned wqm : 1;
712       char color_attr_index[2];
713       signed char color_interp_vgpr_index[2]; /* -1 == constant */
714    } ps_prolog;
715    struct {
716       struct si_ps_epilog_bits states;
717       unsigned use_aco : 1;
718       unsigned wave32 : 1;
719       unsigned uses_discard : 1;
720       unsigned colors_written : 8;
721       unsigned color_types : 16;
722       unsigned writes_all_cbufs : 1;
723       unsigned writes_z : 1;
724       unsigned writes_stencil : 1;
725       unsigned writes_samplemask : 1;
726    } ps_epilog;
727 };
728 
729 /* The shader key for geometry stages (VS, TCS, TES, GS)
 *
 * Layout: "part" (selector of the first shader for merged stages), the as_*
 * stage flags, "mono" (flags for monolithic compilation only), and "opt"
 * (optimization flags for asynchronous compilation only).
 */
730 struct si_shader_key_ge {
731    /* Prolog and epilog flags. */
732    union {
733       struct {
734          struct si_shader_selector *ls;      /* for merged LS-HS */
735       } tcs; /* tessellation control shader */
736       struct {
737          struct si_shader_selector *es;      /* for merged ES-GS */
738       } gs;
739    } part;
740 
741    /* These three are initially set according to the NEXT_SHADER property,
742     * or guessed if the property doesn't seem correct.
743     */
744    unsigned as_es : 1;  /* whether it's a shader before GS */
745    unsigned as_ls : 1;  /* whether it's VS before TCS */
746    unsigned as_ngg : 1; /* whether it's the last GE stage and NGG is enabled,
747                            also set for the stage right before GS */
748 
749    /* Flags for monolithic compilation only. */
750    struct {
751       /* - If neither "is_one" nor "is_fetched" has a bit set, the instance
752        *   divisor is 0.
753        * - If "is_one" has a bit set, the instance divisor is 1.
754        * - If "is_fetched" has a bit set, the instance divisor will be loaded
755        *   from the constant buffer.
756        */
757       uint16_t instance_divisor_is_one;     /* bitmask of inputs */
758       uint16_t instance_divisor_is_fetched; /* bitmask of inputs */
759 
760       /* Whether fetch should be opencoded according to vs_fix_fetch.
761        * Otherwise, if vs_fix_fetch is non-zero, buffer_load_format_xyzw
762        * with minimal fixups is used. */
763       uint16_t vs_fetch_opencode;
764       union si_vs_fix_fetch vs_fix_fetch[SI_MAX_ATTRIBS];
765 
          /* The two members share one bit; which one applies depends on the stage. */
766       union {
767          /* When PS needs PrimID and GS is disabled. */
768          unsigned vs_export_prim_id : 1;    /* VS and TES only */
769          unsigned gs_tri_strip_adj_fix : 1; /* GS only */
770       } u;
771 
772       /* Gfx12: When no streamout buffers are bound, streamout must be disabled. */
773       unsigned remove_streamout : 1;
774    } mono;
775 
776    /* Optimization flags for asynchronous compilation only. */
777    struct {
778       /* For HW VS (it can be VS, TES, GS) */
779       uint64_t kill_outputs; /* "get_unique_index" bits */
780       unsigned kill_clip_distances : 8;
781       unsigned kill_pointsize : 1;
782       unsigned kill_layer : 1;
783       unsigned remove_streamout : 1;
784 
785       /* For NGG VS and TES. */
786       unsigned ngg_culling : 11; /* SI_NGG_CULL_* */
787 
788       /* If NGG VS streamout knows the number of vertices per primitive at compile time,
789        * it can put stores for all vertices in the same VMEM clause, instead of storing
790        * vertices for the 2nd and 3rd vertex conditionally because the primitive type is
791        * unknown.
792        */
793       unsigned ngg_vs_streamout_num_verts_per_prim : 2;
794 
795       /* For shaders where monolithic variants have better code.
796        *
797        * This is a flag that has no effect on code generation,
798        * but forces monolithic shaders to be used as soon as
799        * possible, because it's in the "opt" group.
800        */
801       unsigned prefer_mono : 1;
802 
803       /* VS and TCS have the same number of patch vertices. */
804       unsigned same_patch_vertices:1;
805 
806       /* For TCS. */
807       unsigned tes_prim_mode : 2;
808       unsigned tes_reads_tess_factors : 1;
809 
810       unsigned inline_uniforms:1;
811 
812       /* This must be kept last to limit the number of variants
813        * depending only on the uniform values.
814        */
815       uint32_t inlined_uniform_values[MAX_INLINABLE_UNIFORMS];
816    } opt;
817 };
818 
819 struct si_shader_key_ps {
820    struct {
821       /* Prolog and epilog flags. */
822       struct si_ps_prolog_bits prolog;
823       struct si_ps_epilog_bits epilog;
824    } part;
825 
826    /* Flags for monolithic compilation only. */
827    struct {
828       unsigned force_mono : 1;
829       unsigned poly_line_smoothing : 1;
830       unsigned point_smoothing : 1;
831       unsigned interpolate_at_sample_force_center : 1;
832       unsigned fbfetch_msaa : 1;
833       unsigned fbfetch_is_1D : 1;
834       unsigned fbfetch_layered : 1;
835    } mono;
836 
837    /* Optimization flags for asynchronous compilation only. */
838    struct {
839       /* For shaders where monolithic variants have better code.
840        *
841        * This is a flag that has no effect on code generation,
842        * but forces monolithic shaders to be used as soon as
843        * possible, because it's in the "opt" group.
844        */
845       unsigned prefer_mono : 1;
846       unsigned inline_uniforms:1;
847 
848       /* This eliminates the FRONT_FACE input VGPR as well as shader code using it. */
849       int force_front_face_input : 2; /* 0 = gl_FrontFacing, 1 = true, -1 = false */
850 
851       /* This must be kept last to limit the number of variants
852        * depending only on the uniform values.
853        */
854       uint32_t inlined_uniform_values[MAX_INLINABLE_UNIFORMS];
855    } opt;
856 };
857 
858 union si_shader_key {
859    struct si_shader_key_ge ge; /* geometry engine shaders */
860    struct si_shader_key_ps ps;
861 };
862 
863 /* Restore the pack alignment to default. */
864 #pragma pack(pop)
865 
866 /* GCN-specific shader info. */
867 struct si_shader_binary_info {
868    uint8_t vs_output_param_offset[NUM_TOTAL_VARYING_SLOTS];
869    uint32_t vs_output_ps_input_cntl[NUM_TOTAL_VARYING_SLOTS];
870    union si_ps_input_info ps_inputs[SI_NUM_INTERP];
871    uint8_t num_ps_inputs;
872    uint8_t ps_colors_read;
873    uint8_t num_input_sgprs;
874    uint8_t num_input_vgprs;
875    bool uses_vmem_load_other; /* all other VMEM loads and atomics with return */
876    bool uses_vmem_sampler_or_bvh;
877    bool uses_instanceid;
878    uint8_t nr_pos_exports;
879    uint8_t nr_param_exports;
880    unsigned private_mem_vgprs;
881    unsigned max_simd_waves;
882 };
883 
/* Format of the data referenced by si_shader_binary::code_buffer. */
enum si_shader_binary_type {
   /* The code buffer is an ELF. */
   SI_SHADER_BINARY_ELF = 0,
   /* The code buffer is a raw buffer. */
   SI_SHADER_BINARY_RAW = 1,
};
888 
889 struct si_shader_binary {
890    enum si_shader_binary_type type;
891 
892    /* Depends on binary type, either ELF or raw buffer. */
893    const char *code_buffer;
894    size_t code_size;
895    uint32_t exec_size;
896 
897    char *uploaded_code;
898    size_t uploaded_code_size;
899 
900    char *llvm_ir_string;
901 
902    const char *disasm_string;
903    size_t disasm_size;
904 
905    const unsigned *symbols;
906    unsigned num_symbols;
907 };
908 
/* Subgroup sizing values for GS; filled in through the "out" parameter of
 * gfx9_get_gs_info().
 */
struct gfx9_gs_info {
   /* Per-subgroup counts. */
   unsigned es_verts_per_subgroup;
   unsigned gs_prims_per_subgroup;
   unsigned gs_inst_prims_in_subgroup;
   unsigned max_prims_per_subgroup;

   /* Size of the ESGS ring, in bytes. */
   unsigned esgs_ring_size;
};
916 
917 struct si_shader {
918    struct si_pm4_state pm4; /* base class */
919    struct si_compiler_ctx_state compiler_ctx_state;
920 
921    struct si_shader_selector *selector;
922    struct si_shader_selector *previous_stage_sel; /* for refcounting */
923    struct si_shader *next_shader; /* Only used during compilation of LS and ES when merged. */
924 
925    struct si_shader_part *prolog;
926    struct si_shader *previous_stage; /* for GFX9 */
927    struct si_shader_part *epilog;
928    struct si_shader *gs_copy_shader;
929 
930    struct si_resource *bo;
931    /* gpu_address should be bo->gpu_address except if SQTT is
932     * in use.
933     */
934    uint64_t gpu_address;
935    /* Only used on GFX6-10 where the scratch address must be inserted into the shader binary.
936     * This is the scratch address that the current shader binary contains.
937     */
938    uint64_t scratch_va;
939    union si_shader_key key;
940    struct util_queue_fence ready;
941    bool compilation_failed;
942    bool is_monolithic;
943    bool is_optimized;
944    bool is_binary_shared;
945    bool is_gs_copy_shader;
946    uint8_t wave_size;
947    unsigned complete_shader_binary_size;
948 
949    /* The following data is all that's needed for binary shaders. */
950    struct si_shader_binary binary;
951    struct ac_shader_config config;
952    struct si_shader_binary_info info;
953 
954    /* SI_SGPR_VS_STATE_BITS */
955    bool uses_vs_state_provoking_vertex;
956    bool uses_gs_state_outprim;
957 
958    bool uses_base_instance;
959 
960    /* Shader key + LLVM IR + disassembly + statistics.
961     * Generated for debug contexts only.
962     */
963    char *shader_log;
964    size_t shader_log_size;
965 
966    struct gfx9_gs_info gs_info;
967 
968    /* Precomputed register values. */
969    union {
970       struct {
971          unsigned vgt_gsvs_ring_offset_1;
972          unsigned vgt_gsvs_ring_offset_2;
973          unsigned vgt_gsvs_ring_offset_3;
974          unsigned vgt_gsvs_ring_itemsize;
975          unsigned vgt_gs_max_vert_out;
976          unsigned vgt_gs_vert_itemsize;
977          unsigned vgt_gs_vert_itemsize_1;
978          unsigned vgt_gs_vert_itemsize_2;
979          unsigned vgt_gs_vert_itemsize_3;
980          unsigned vgt_gs_instance_cnt;
981          unsigned vgt_gs_onchip_cntl;
982          unsigned vgt_gs_max_prims_per_subgroup;
983          unsigned vgt_esgs_ring_itemsize;
984          unsigned spi_shader_pgm_rsrc3_gs;
985          unsigned spi_shader_pgm_rsrc4_gs;
986       } gs;
987 
988       struct {
989          /* Computed by gfx10_ngg_calculate_subgroup_info. */
990          uint16_t ngg_emit_size; /* in dwords */
991          uint16_t hw_max_esverts;
992          uint16_t max_gsprims;
993          uint16_t max_out_verts;
994          bool max_vert_out_per_gs_instance;
995          /* Register values. */
996          unsigned ge_max_output_per_subgroup;
997          unsigned ge_ngg_subgrp_cntl;
998          unsigned vgt_primitiveid_en;
999          unsigned vgt_gs_onchip_cntl;
1000          unsigned vgt_gs_instance_cnt;
1001          unsigned esgs_vertex_stride;
1002          unsigned spi_vs_out_config;
1003          unsigned spi_shader_pos_format;
1004          unsigned pa_cl_vte_cntl;
1005          unsigned vgt_gs_max_vert_out; /* for API GS */
1006          unsigned ge_pc_alloc;         /* uconfig register */
1007          unsigned spi_shader_pgm_rsrc3_gs;
1008          unsigned spi_shader_pgm_rsrc4_gs;
1009          unsigned vgt_shader_stages_en;
1010       } ngg;
1011 
1012       struct {
1013          unsigned vgt_gs_mode;
1014          unsigned vgt_primitiveid_en;
1015          unsigned vgt_reuse_off;
1016          unsigned spi_vs_out_config;
1017          unsigned spi_shader_pos_format;
1018          unsigned pa_cl_vte_cntl;
1019          unsigned ge_pc_alloc; /* uconfig register */
1020       } vs;
1021 
1022       struct {
1023          unsigned spi_ps_input_ena;
1024          unsigned spi_ps_input_addr;
1025          unsigned spi_ps_in_control;
1026          unsigned spi_shader_z_format;
1027          unsigned spi_shader_col_format;
1028          unsigned cb_shader_mask;
1029          unsigned db_shader_control;
1030          unsigned num_interp;
1031          unsigned spi_gs_out_config_ps;
1032          unsigned pa_sc_hisz_control;
1033          bool writes_z;
1034          bool writes_stencil;
1035          bool writes_samplemask;
1036       } ps;
1037    };
1038 
1039    /* Precomputed register values. */
1040    unsigned vgt_tf_param;                /* VGT_TF_PARAM */
1041    unsigned vgt_vertex_reuse_block_cntl; /* VGT_VERTEX_REUSE_BLOCK_CNTL */
1042    unsigned pa_cl_vs_out_cntl;
1043    unsigned ge_cntl;
1044 };
1045 
1046 struct si_shader_part {
1047    struct si_shader_part *next;
1048    union si_shader_part_key key;
1049    struct si_shader_binary binary;
1050    unsigned num_vgprs;
1051    unsigned num_sgprs;
1052 };
1053 
1054 /* si_shader.c */
1055 struct ac_rtld_binary;
1056 
1057 bool si_compile_shader(struct si_screen *sscreen, struct ac_llvm_compiler *compiler,
1058                        struct si_shader *shader, struct util_debug_callback *debug);
1059 bool si_create_shader_variant(struct si_screen *sscreen, struct ac_llvm_compiler *compiler,
1060                               struct si_shader *shader, struct util_debug_callback *debug);
1061 void si_shader_destroy(struct si_shader *shader);
1062 unsigned si_shader_io_get_unique_index(unsigned semantic);
1063 int si_shader_binary_upload(struct si_screen *sscreen, struct si_shader *shader,
1064                             uint64_t scratch_va);
1065 int si_shader_binary_upload_at(struct si_screen *sscreen, struct si_shader *shader,
1066                                uint64_t scratch_va, int64_t bo_offset);
1067 bool si_can_dump_shader(struct si_screen *sscreen, gl_shader_stage stage,
1068                         enum si_shader_dump_type dump_type);
1069 void si_shader_dump(struct si_screen *sscreen, struct si_shader *shader,
1070                     struct util_debug_callback *debug, FILE *f, bool check_debug_option);
1071 void si_shader_dump_stats_for_shader_db(struct si_screen *screen, struct si_shader *shader,
1072                                         struct util_debug_callback *debug);
1073 void si_multiwave_lds_size_workaround(struct si_screen *sscreen, unsigned *lds_size);
1074 const char *si_get_shader_name(const struct si_shader *shader);
1075 void si_shader_binary_clean(struct si_shader_binary *binary);
1076 struct nir_shader *si_deserialize_shader(struct si_shader_selector *sel);
1077 unsigned si_get_ps_num_interp(struct si_shader *ps);
1078 unsigned si_get_shader_prefetch_size(struct si_shader *shader);
1079 unsigned si_get_shader_binary_size(struct si_screen *screen, struct si_shader *shader);
1080 
1081 /* si_shader_info.c */
1082 void si_nir_scan_shader(struct si_screen *sscreen, struct nir_shader *nir,
1083                         struct si_shader_info *info, bool colors_lowered);
1084 
1085 /* si_shader_nir.c */
1086 void si_lower_mediump_io(struct nir_shader *nir);
1087 
1088 bool si_alu_to_scalar_packed_math_filter(const struct nir_instr *instr, const void *data);
1089 void si_nir_opts(struct si_screen *sscreen, struct nir_shader *nir, bool has_array_temps);
1090 void si_nir_late_opts(struct nir_shader *nir);
1091 char *si_finalize_nir(struct pipe_screen *screen, struct nir_shader *nir);
1092 
1093 /* si_state_shaders.cpp */
1094 unsigned si_shader_num_alloc_param_exports(struct si_shader *shader);
1095 unsigned si_determine_wave_size(struct si_screen *sscreen, struct si_shader *shader);
1096 void gfx9_get_gs_info(struct si_shader_selector *es, struct si_shader_selector *gs,
1097                       struct gfx9_gs_info *out);
1098 bool gfx10_is_ngg_passthrough(struct si_shader *shader);
1099 unsigned si_shader_lshs_vertex_stride(struct si_shader *ls);
1100 bool si_should_clear_lds(struct si_screen *sscreen, const struct nir_shader *shader);
1101 unsigned si_get_output_prim_simplified(const struct si_shader_selector *sel,
1102                                        const union si_shader_key *key);
1103 
1104 /* Inline helpers. */
1105 
1106 /* Return the pointer to the main shader part's pointer. */
si_get_main_shader_part(struct si_shader_selector * sel,const union si_shader_key * key,unsigned wave_size)1107 static inline struct si_shader **si_get_main_shader_part(struct si_shader_selector *sel,
1108                                                          const union si_shader_key *key,
1109                                                          unsigned wave_size)
1110 {
1111    assert(wave_size == 32 || wave_size == 64);
1112    unsigned index = wave_size / 32 - 1;
1113 
1114    if (sel->stage <= MESA_SHADER_GEOMETRY) {
1115       if (key->ge.as_ls)
1116          return &sel->main_shader_part_ls[index];
1117       if (key->ge.as_es && key->ge.as_ngg)
1118          return &sel->main_shader_part_ngg_es[index];
1119       if (key->ge.as_es) {
1120          /* legacy GS only support wave 64 */
1121          assert(wave_size == 64);
1122          return &sel->main_shader_part_es;
1123       }
1124       if (key->ge.as_ngg)
1125          return &sel->main_shader_part_ngg[index];
1126    }
1127    return &sel->main_shader_part[index];
1128 }
1129 
gfx10_has_variable_edgeflags(struct si_shader * shader)1130 static inline bool gfx10_has_variable_edgeflags(struct si_shader *shader)
1131 {
1132    unsigned output_prim = si_get_output_prim_simplified(shader->selector, &shader->key);
1133 
1134    return shader->selector->stage == MESA_SHADER_VERTEX &&
1135           (output_prim == MESA_PRIM_TRIANGLES || output_prim == MESA_PRIM_UNKNOWN);
1136 }
1137 
si_shader_uses_streamout(const struct si_shader * shader)1138 static inline bool si_shader_uses_streamout(const struct si_shader *shader)
1139 {
1140    return shader->selector->stage <= MESA_SHADER_GEOMETRY &&
1141           shader->selector->info.enabled_streamout_buffer_mask &&
1142           !shader->key.ge.opt.remove_streamout &&
1143           !shader->key.ge.mono.remove_streamout;
1144 }
1145 
si_shader_uses_discard(struct si_shader * shader)1146 static inline bool si_shader_uses_discard(struct si_shader *shader)
1147 {
1148    /* Changes to this should also update ps_modifies_zs. */
1149    return shader->selector->info.base.fs.uses_discard ||
1150           shader->key.ps.part.prolog.poly_stipple ||
1151           shader->key.ps.mono.point_smoothing ||
1152           shader->key.ps.part.epilog.alpha_func != PIPE_FUNC_ALWAYS;
1153 }
1154 
si_shader_culling_enabled(struct si_shader * shader)1155 static inline bool si_shader_culling_enabled(struct si_shader *shader)
1156 {
1157    /* Legacy VS/TES/GS and ES don't cull in the shader. */
1158    if (!shader->key.ge.as_ngg || shader->key.ge.as_es) {
1159       assert(!shader->key.ge.opt.ngg_culling);
1160       return false;
1161    }
1162 
1163    if (shader->key.ge.opt.ngg_culling)
1164       return true;
1165 
1166    unsigned output_prim = si_get_output_prim_simplified(shader->selector, &shader->key);
1167 
1168    /* This enables NGG culling for non-monolithic TES and GS. */
1169    return shader->selector->ngg_cull_vert_threshold == 0 &&
1170           (output_prim == MESA_PRIM_TRIANGLES || output_prim == MESA_PRIM_LINES);
1171 }
1172 
1173 #ifdef __cplusplus
1174 }
1175 #endif
1176 
1177 #endif
1178