1 /*
2 * Copyright 2012 Advanced Micro Devices, Inc.
3 *
4 * SPDX-License-Identifier: MIT
5 */
6
7 /* The compiler middle-end architecture: Explaining (non-)monolithic shaders
8 * -------------------------------------------------------------------------
9 *
10 * Typically, there is one-to-one correspondence between API and HW shaders,
11 * that is, for every API shader, there is exactly one shader binary in
12 * the driver.
13 *
14 * The problem with that is that we also have to emulate some API states
15 * (e.g. alpha-test, and many others) in shaders too. The two obvious ways
16 * to deal with it are:
17 * - each shader has multiple variants for each combination of emulated states,
18 * and the variants are compiled on demand, possibly relying on a shader
19 * cache for good performance
20 * - patch shaders at the binary level
21 *
22 * This driver uses something completely different. The emulated states are
23 * usually implemented at the beginning or end of shaders. Therefore, we can
24 * split the shader into 3 parts:
25 * - prolog part (shader code dependent on states)
26 * - main part (the API shader)
27 * - epilog part (shader code dependent on states)
28 *
29 * Each part is compiled as a separate shader and the final binaries are
30 * concatenated. This type of shader is called non-monolithic, because it
31 * consists of multiple independent binaries. Creating a new shader variant
32 * is therefore only a concatenation of shader parts (binaries) and doesn't
33 * involve any compilation. The main shader parts are the only parts that are
34 * compiled when applications create shader objects. The prolog and epilog
35 * parts are compiled on the first use and saved, so that their binaries can
36 * be reused by many other shaders.
37 *
38 * One of the roles of the prolog part is to compute vertex buffer addresses
39 * for vertex shaders. A few of the roles of the epilog part are color buffer
40 * format conversions in pixel shaders that we have to do manually, and write
41 * tessellation factors in tessellation control shaders. The prolog and epilog
42 * have many other important responsibilities in various shader stages.
43 * They don't just "emulate legacy stuff".
44 *
45 * Monolithic shaders are shaders where the parts are combined before LLVM
46 * compilation, and the whole thing is compiled and optimized as one unit with
47 * one binary on the output. The result is the same as the non-monolithic
48 * shader, but the final code can be better, because LLVM can optimize across
49 * all shader parts. Monolithic shaders aren't usually used except for these
50 * special cases:
51 *
52 * 1) Some rarely-used states require modification of the main shader part
53 * itself, and in such cases, only the monolithic shader variant is
54 * compiled, and that's always done on the first use.
55 *
56 * 2) When we do cross-stage optimizations for separate shader objects and
57 * e.g. eliminate unused shader varyings, the resulting optimized shader
58 * variants are always compiled as monolithic shaders, and always
59 * asynchronously (i.e. not stalling ongoing rendering). We call them
60 * "optimized monolithic" shaders. The important property here is that
61 * the non-monolithic unoptimized shader variant is always available for use
62 * when the asynchronous compilation of the optimized shader is not done
63 * yet.
64 *
65 * Starting with GFX9 chips, some shader stages are merged, and the number of
66 * shader parts per shader increased. The complete new list of shader parts is:
67 * - 1st shader: prolog part
68 * - 1st shader: main part
69 * - 2nd shader: main part
70 * - 2nd shader: epilog part
71 */
72
73 /* How linking shader inputs and outputs between vertex, tessellation, and
74 * geometry shaders works.
75 *
76 * Inputs and outputs between shaders are stored in a buffer. This buffer
77 * lives in LDS (typical case for tessellation), but it can also live
78 * in memory (ESGS). Each input or output has a fixed location within a vertex.
79 * The highest used input or output determines the stride between vertices.
80 *
81 * Since GS and tessellation are only possible in the OpenGL core profile,
82 * only these semantics are valid for per-vertex data:
83 *
84 * Name Location
85 *
86 * POSITION 0
87 * VAR0..31 1..32
88 * CLIP_DIST0..1 49..50
89 * PSIZ 51
90 *
 * For example, a shader only writing VAR0 has the output stride of 2.
92 *
93 * Only these semantics are valid for per-patch data:
94 *
95 * Name Location
96 *
97 * TESSOUTER 0
98 * TESSINNER 1
99 * PATCH0..29 2..31
100 *
101 * That's how independent shaders agree on input and output locations.
102 * The si_shader_io_get_unique_index function assigns the locations.
103 *
104 * For tessellation, other required information for calculating the input and
105 * output addresses like the vertex stride, the patch stride, and the offsets
106 * where per-vertex and per-patch data start, is passed to the shader via
107 * user data SGPRs. The offsets and strides are calculated at draw time and
108 * aren't available at compile time.
109 */
110
111 #ifndef SI_SHADER_H
112 #define SI_SHADER_H
113
114 #include "shader_info.h"
115 #include "ac_binary.h"
116 #include "ac_gpu_info.h"
117 #include "util/mesa-blake3.h"
118 #include "util/u_live_shader_cache.h"
119 #include "util/u_queue.h"
120 #include "si_pm4.h"
121
122 #ifdef __cplusplus
123 extern "C" {
124 #endif
125
/* Forward declarations of NIR types that are defined elsewhere. */
struct nir_shader;
struct nir_instr;

/* Driver limits. */
#define SI_NUM_INTERP           32
#define SI_MAX_ATTRIBS          16
#define SI_MAX_VS_OUTPUTS       40
#define SI_USER_CLIP_PLANE_MASK 0x3F

/* An extra interpolation mode that takes the value right after the INTERP_MODE_* list. */
#define INTERP_MODE_COLOR INTERP_MODE_COUNT

/* Precomputed PS input control values: OFFSET 0x20 combined with DEFAULT_VAL 0 or 3. */
#define SI_PS_INPUT_CNTL_0000   (S_028644_OFFSET(0x20) | S_028644_DEFAULT_VAL(0))
#define SI_PS_INPUT_CNTL_0001   (S_028644_OFFSET(0x20) | S_028644_DEFAULT_VAL(3))
#define SI_PS_INPUT_CNTL_UNUSED SI_PS_INPUT_CNTL_0000
/* COLOR0 must read as 0001 to match D3D9 behaviour. GL leaves it undefined. */
#define SI_PS_INPUT_CNTL_UNUSED_COLOR0 SI_PS_INPUT_CNTL_0001
141
142 /* SGPR user data indices */
143 enum
144 {
145 SI_SGPR_INTERNAL_BINDINGS,
146 SI_SGPR_BINDLESS_SAMPLERS_AND_IMAGES,
147 SI_SGPR_CONST_AND_SHADER_BUFFERS, /* or just a constant buffer 0 pointer */
148 SI_SGPR_SAMPLERS_AND_IMAGES,
149 SI_NUM_RESOURCE_SGPRS,
150
151 /* API VS, TES without GS, GS copy shader */
152 SI_SGPR_VS_STATE_BITS = SI_NUM_RESOURCE_SGPRS,
153 SI_NUM_VS_STATE_RESOURCE_SGPRS,
154
155 /* all VS variants */
156 SI_SGPR_BASE_VERTEX = SI_NUM_VS_STATE_RESOURCE_SGPRS,
157 SI_SGPR_DRAWID,
158 SI_SGPR_START_INSTANCE,
159 SI_VS_NUM_USER_SGPR,
160
161 SI_SGPR_VS_BLIT_DATA = SI_SGPR_CONST_AND_SHADER_BUFFERS,
162
163 /* TES */
164 SI_SGPR_TES_OFFCHIP_LAYOUT = SI_NUM_VS_STATE_RESOURCE_SGPRS,
165 SI_SGPR_TES_OFFCHIP_ADDR,
166 SI_TES_NUM_USER_SGPR,
167
168 /* GFX6-8: TCS only */
169 GFX6_SGPR_TCS_OFFCHIP_LAYOUT = SI_NUM_RESOURCE_SGPRS,
170 GFX6_SGPR_TCS_OFFCHIP_ADDR,
171 GFX6_SGPR_TCS_IN_LAYOUT,
172 GFX6_TCS_NUM_USER_SGPR,
173
174 /* GFX9: Merged LS-HS (VS-TCS) only. */
175 GFX9_SGPR_TCS_OFFCHIP_LAYOUT = SI_VS_NUM_USER_SGPR,
176 GFX9_SGPR_TCS_OFFCHIP_ADDR,
177 GFX9_TCS_NUM_USER_SGPR,
178
179 /* GS limits */
180 GFX6_GS_NUM_USER_SGPR = SI_NUM_RESOURCE_SGPRS,
181 SI_GSCOPY_NUM_USER_SGPR = SI_NUM_VS_STATE_RESOURCE_SGPRS,
182
183 GFX9_SGPR_SMALL_PRIM_CULL_INFO = MAX2(SI_VS_NUM_USER_SGPR, SI_TES_NUM_USER_SGPR),
184 GFX9_SGPR_ATTRIBUTE_RING_ADDR,
185 GFX9_GS_NUM_USER_SGPR,
186
187 /* PS only */
188 SI_SGPR_SAMPLE_LOCS0 = SI_NUM_RESOURCE_SGPRS,
189 SI_SGPR_SAMPLE_LOCS1,
190 SI_SGPR_ALPHA_REF,
191 SI_PS_NUM_USER_SGPR,
192
193 /* The value has to be 12, because the hw requires that descriptors
194 * are aligned to 4 SGPRs.
195 */
196 SI_SGPR_VS_VB_DESCRIPTOR_FIRST = 12,
197 };
198
/* Indices of LLVM function parameters. */
enum
{
   SI_NUM_RESOURCE_PARAMS = 4,

   /* Parameters that only exist for PS; they start right after the resource parameters. */
   SI_PARAM_SAMPLE_LOCS0 = SI_NUM_RESOURCE_PARAMS,
   SI_PARAM_SAMPLE_LOCS1,
   SI_PARAM_ALPHA_REF,
   SI_PARAM_PRIM_MASK,
   SI_PARAM_PERSP_SAMPLE,
   SI_PARAM_PERSP_CENTER,
   SI_PARAM_PERSP_CENTROID,
   SI_PARAM_PERSP_PULL_MODEL,
   SI_PARAM_LINEAR_SAMPLE,
   SI_PARAM_LINEAR_CENTER,
   SI_PARAM_LINEAR_CENTROID,
   SI_PARAM_LINE_STIPPLE_TEX,
   SI_PARAM_POS_X_FLOAT,
   SI_PARAM_POS_Y_FLOAT,
   SI_PARAM_POS_Z_FLOAT,
   SI_PARAM_POS_W_FLOAT,
   SI_PARAM_FRONT_FACE,
   SI_PARAM_ANCILLARY,
   SI_PARAM_SAMPLE_COVERAGE,
   SI_PARAM_POS_FIXED_PT,

   /* One past the last index above, plus 8 more for COLOR[0..1]. */
   SI_NUM_PARAMS = SI_PARAM_POS_FIXED_PT + 9,
};
228
/* VS state bits. Except for INDEXED, these fields are only set in current_vs_state in
 * si_context. Shaders read them through vs_state_bits in VS, TES, and GS.
 */
#define VS_STATE_CLAMP_VERTEX_COLOR__SHIFT 0
#define VS_STATE_CLAMP_VERTEX_COLOR__MASK  0x1 /* Shared by VS and GS */
#define VS_STATE_INDEXED__SHIFT            1
#define VS_STATE_INDEXED__MASK             0x1 /* Shared by VS and GS */

/* GS state bits. These fields are only set in current_gs_state in si_context. Shaders read
 * them through vs_state_bits in legacy GS, the GS copy shader, and any NGG shader.
 */
/* bit gap */
/* NUM_ES_OUTPUTS is the last output index of SI_UNIQUE_SLOT_* + 1, so it is 55 at most.
 * The ESGS vertex stride in dwords is: NUM_ES_OUTPUTS * 4 + 1
 * Only GFX9+ uses it, to compute LDS addresses of GS inputs.
 */
#define GS_STATE_NUM_ES_OUTPUTS__SHIFT  14
#define GS_STATE_NUM_ES_OUTPUTS__MASK   0x3f
#define GS_STATE_CULL_FACE_FRONT__SHIFT 20
#define GS_STATE_CULL_FACE_FRONT__MASK  0x1
#define GS_STATE_CULL_FACE_BACK__SHIFT  21
#define GS_STATE_CULL_FACE_BACK__MASK   0x1
251 /* Small prim filter precision = num_samples / quant_mode where num_samples is in {1, 2, 4, 8} and
252 * quant_mode is in {256, 1024, 4096}, which is equal to 1/2^n where n is between 5 and 12.
253 *
254 * Equation 1: Represent the value as 1/2^n.
255 * Assumption: log_samples <= 3 and log_quant_mode >= 8
256 * num_samples / quant_mode =
257 * 2^log_samples / 2^log_quant_mode =
258 * 1 / 2^(log_quant_mode - log_samples) [because log_samples < log_quant_mode]
259 *
260 * Knowing that, we only need 4 bits to represent the FP32 exponent and thus the FP32 number.
261 *
262 * Equation 2: Encoding the exponent.
263 * 1/2^(15 - value) in FP32 = ((value | 0x70) << 23) in binary if value < 15
264 * Proof: With 0x70 = 112, we get FP32 exponent 2^(112 + value - 127) according to the FP32
265 * definition, which can be simplified to 2^(value - 15), which is a negative exponent
266 * for value < 15. Given that 2^-n = 1/2^n, the FP32 number is equal to 1/2^(15 - value).
267 *
268 * Equation 3: Convert quant_mode_enum to log_quant_mode.
269 * quant_mode_enum:
270 * 0 means 256 = 2^8 --> log2(256) = 8
271 * 1 means 1024 = 2^10 --> log2(1024) = 10
272 * 2 means 4096 = 2^12 --> log2(4096) = 12
273 *
274 * Conversion to log_quant_mode:
275 * log_quant_mode = quant_mode_enum * 2 + 8. Proof:
276 * 0 * 2 + 8 = 8
277 * 1 * 2 + 8 = 10
278 * 2 * 2 + 8 = 12
279 *
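 * Equation 4: Get the exponent value for Equation 2 from Equation 1.
 *    15 - value = log_quant_mode - log_samples
 *    value = 15 - (log_quant_mode + log_samples)
 *    NOTE(review): solving the previous line for "value" algebraically gives
 *    15 - (log_quant_mode - log_samples). The "+" form is the one the rest of this
 *    derivation and the 3-bit field depend on. Confirm which sign is intended.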
283 *
284 * Combine equations 2, 3, and 4 to get the expression computing the FP32 number from log_samples
285 * and quant_mode_enum using integer ops:
286 * (value | 0x70) << 23 =
287 * ((15 - (log_quant_mode + log_samples)) | 0x70) << 23 =
288 * ((15 - (quant_mode_enum * 2 + 8 + log_samples)) | 0x70) << 23 =
289 * ((15 - quant_mode_enum * 2 - 8 - log_samples) | 0x70) << 23 =
290 * ((7 - quant_mode_enum * 2 - log_samples) | 0x70) << 23 =
291 *
292 * Since "log_samples <= 3" and "quant_mode_enum * 2 <= 4", we need a SGPR field that stores:
293 * triangle_precision = 7 - quant_mode_enum * 2 - log_samples
294 *
295 * Line precision ignores log_samples, so the shader should do:
296 * line_precision = triangle_precision + log_samples
297 */
/* Stores triangle_precision. */
#define GS_STATE_SMALL_PRIM_PRECISION__SHIFT             22
#define GS_STATE_SMALL_PRIM_PRECISION__MASK              0x7
#define GS_STATE_SMALL_PRIM_PRECISION_LOG_SAMPLES__SHIFT 25
#define GS_STATE_SMALL_PRIM_PRECISION_LOG_SAMPLES__MASK  0x3
#define GS_STATE_STREAMOUT_QUERY_ENABLED__SHIFT          27
#define GS_STATE_STREAMOUT_QUERY_ENABLED__MASK           0x1
#define GS_STATE_PROVOKING_VTX_FIRST__SHIFT              28
#define GS_STATE_PROVOKING_VTX_FIRST__MASK               0x1
#define GS_STATE_OUTPRIM__SHIFT                          29
#define GS_STATE_OUTPRIM__MASK                           0x3
#define GS_STATE_PIPELINE_STATS_EMU__SHIFT               31
#define GS_STATE_PIPELINE_STATS_EMU__MASK                0x1
310
/* Mask "value" with field##__MASK and move it to bit position field##__SHIFT. */
#define ENCODE_FIELD(field, value) ((((unsigned)(value)) & (field##__MASK)) << (field##__SHIFT))
/* All bits set except the ones occupied by "field". */
#define CLEAR_FIELD(field) (~(((unsigned)(field##__MASK)) << (field##__SHIFT)))

/* Used by functions that change states: replaces "field" inside "var" with "value".
 * Asserts that "value" fits within the field mask. "value" is expanded more than once.
 */
#define SET_FIELD(var, field, value)                                       \
   do {                                                                    \
      assert((value) == ((unsigned)(value) & field##__MASK));              \
      (var) = ((var) & CLEAR_FIELD(field)) | ENCODE_FIELD(field, value);   \
   } while (0)

/* Used during shader compilation: unpacks "field" from vs_state_bits and returns LLVMValueRef. */
#define GET_FIELD(ctx, field)                                                 \
   si_unpack_param((ctx), (ctx)->args->vs_state_bits, field##__SHIFT,         \
                   util_bitcount(field##__MASK))
324
/* Number of SGPRs the shader uses for each VS blit variant. */
enum
{
   SI_VS_BLIT_SGPRS_POS = 3,
   SI_VS_BLIT_SGPRS_POS_COLOR = 7,
   SI_VS_BLIT_SGPRS_POS_TEXCOORD = 9,

   /* One more than the largest variant, for the attribute ring address. */
   MAX_SI_VS_BLIT_SGPRS = 10,
};
334
/* NGG culling flags.
 *
 * The first two are only set for vertex shaders that cull; TES and GS take the
 * primitive type from shader_info instead.
 */
#define SI_NGG_CULL_VS_TRIANGLES             (1 << 0) /* implies W, view.xy, and small prim culling */
#define SI_NGG_CULL_VS_LINES                 (1 << 1) /* implies W and view.xy culling */
#define SI_NGG_CULL_SMALL_LINES_DIAMOND_EXIT (1 << 2) /* small lines are culled by the diamond exit rule */
/* An 8-bit clip plane enable mask is packed above the three flag bits. */
#define SI_NGG_CULL_CLIP_PLANE_ENABLE(enable) (((enable) & 0xff) << 3)
#define SI_NGG_CULL_GET_CLIP_PLANE_ENABLE(x)  (((x) >> 3) & 0xff)
343
344 struct si_shader_profile {
345 uint32_t blake3[BLAKE3_OUT_LEN32];
346 uint32_t options;
347 };
348
349 extern struct si_shader_profile si_shader_profiles[];
350 unsigned si_get_num_shader_profiles(void);
351
/* SI_PROFILE_* option bits. */
#define SI_PROFILE_WAVE32                   (1 << 0)
#define SI_PROFILE_GFX10_WAVE64             (1 << 1)
/* bit gap */
#define SI_PROFILE_VS_NO_BINNING            (1 << 3)
#define SI_PROFILE_GFX9_GFX10_PS_NO_BINNING (1 << 4)
#define SI_PROFILE_CLAMP_DIV_BY_ZERO        (1 << 5)
#define SI_PROFILE_NO_OPT_UNIFORM_VARYINGS  (1 << 6)
359
/* Kinds of shader dumps. */
enum si_shader_dump_type {
   SI_DUMP_SHADER_KEY,
   SI_DUMP_INIT_NIR,     /* input NIR at shader creation, before lowering */
   SI_DUMP_NIR,          /* lowered NIR at shader variant creation */
   SI_DUMP_INIT_LLVM_IR, /* LLVM IR before optimizations */
   SI_DUMP_LLVM_IR,      /* final LLVM IR */
   SI_DUMP_INIT_ACO_IR,  /* ACO IR before optimizations */
   SI_DUMP_ACO_IR,       /* final ACO IR */
   SI_DUMP_ASM,          /* final shader assembly */
   SI_DUMP_STATS,        /* statistics printed as shader-db */
   SI_DUMP_ALWAYS,
};
372
/* Unique IO slot indices. */
enum {
   SI_UNIQUE_SLOT_POS = 0,

   /* Some shader stages size their input/output storage (in LDS, tess and GS rings)
    * by the highest used IO index, so VARn is placed right after POSITION to keep
    * that size as small as possible.
    */
   SI_UNIQUE_SLOT_VAR0 = 1, /* 0..31 */

   /* 16-bit GLES varyings follow the 32-bit varyings. They are mutually exclusive
    * with legacy desktop GL varyings, so both can use the same indices.
    */
   SI_UNIQUE_SLOT_VAR0_16BIT = 33, /* 0..15 */

   /* Legacy GL-only varyings, which can alias the GLES-only 16-bit varyings. */
   SI_UNIQUE_SLOT_FOGC = 33,
   SI_UNIQUE_SLOT_COL0,
   SI_UNIQUE_SLOT_COL1,
   SI_UNIQUE_SLOT_BFC0,
   SI_UNIQUE_SLOT_BFC1,
   SI_UNIQUE_SLOT_TEX0,
   SI_UNIQUE_SLOT_TEX1,
   SI_UNIQUE_SLOT_TEX2,
   SI_UNIQUE_SLOT_TEX3,
   SI_UNIQUE_SLOT_TEX4,
   SI_UNIQUE_SLOT_TEX5,
   SI_UNIQUE_SLOT_TEX6,
   SI_UNIQUE_SLOT_TEX7,
   SI_UNIQUE_SLOT_CLIP_VERTEX,

   /* Varyings that exist in both GLES and desktop GL must start at 49, after the
    * 16-bit varyings.
    */
   SI_UNIQUE_SLOT_CLIP_DIST0 = 49,
   SI_UNIQUE_SLOT_CLIP_DIST1,
   SI_UNIQUE_SLOT_PSIZ,
   /* LS, HS, and ES can't write the following. */
   SI_UNIQUE_SLOT_LAYER,
   SI_UNIQUE_SLOT_VIEWPORT,
   SI_UNIQUE_SLOT_PRIMITIVE_ID,
};
413
/**
 * Vertex fetch fixups stored in VS shader keys.
 *
 * \ref log_size, \ref format, and the channel count are interpreted the way
 * \ref ac_build_opencoded_load_format interprets them.
 *
 * Note: all bits 0 (size = 1 byte, num channels = 1, format = float) is an
 * impossible format, so it is used to mean that no fixup is needed (just use
 * buffer_load_format_xyzw).
 */
union si_vs_fix_fetch {
   struct {
      uint8_t log_size : 2;        /* log2 of bytes per channel: 1, 2, 4, or 8 bytes */
      uint8_t num_channels_m1 : 2; /* channel count minus 1 */
      uint8_t format : 3;          /* AC_FETCH_FORMAT_xxx */
      uint8_t reverse : 1;         /* XYZ channels are reversed */
   } u;
   uint8_t bits; /* all fields viewed as one byte */
};
433
434 struct si_shader;
435
436 /* State of the context creating the shader object. */
437 struct si_compiler_ctx_state {
438 /* Should only be used by si_init_shader_selector_async and
439 * si_build_shader_variant if thread_index == -1 (non-threaded). */
440 struct ac_llvm_compiler *compiler;
441
442 /* Used if thread_index == -1 or if debug.async is true. */
443 struct util_debug_callback debug;
444
445 /* Used for creating the log string for gallium/ddebug. */
446 bool is_debug_context;
447 };
448
/* Type of a color output. */
enum si_color_output_type {
   SI_TYPE_ANY32,
   SI_TYPE_FLOAT16,
   SI_TYPE_INT16,
   SI_TYPE_UINT16,
};
455
/* Description of one PS input. */
union si_ps_input_info {
   struct {
      uint8_t semantic;
      uint8_t interpolate;
      uint8_t fp16_lo_hi_valid;
   };
   /* Never read; its only purpose is to force 4-byte alignment. */
   uint32_t _unused;
};
464
/* Description of one shader input: its semantic and usage mask. */
struct si_vs_tcs_input_info {
   uint8_t semantic;
   uint8_t usage_mask;
};
469
470 struct si_shader_info {
471 shader_info base;
472
473 uint32_t options; /* bitmask of SI_PROFILE_* */
474
475 uint8_t num_inputs;
476 uint8_t num_outputs;
477 struct si_vs_tcs_input_info input[PIPE_MAX_SHADER_INPUTS];
478 uint8_t output_semantic[PIPE_MAX_SHADER_OUTPUTS];
479 uint8_t output_usagemask[PIPE_MAX_SHADER_OUTPUTS];
480 uint8_t output_streams[PIPE_MAX_SHADER_OUTPUTS];
481 uint8_t output_type[PIPE_MAX_SHADER_OUTPUTS]; /* enum nir_alu_type */
482 uint8_t output_xfb_writemask[PIPE_MAX_SHADER_OUTPUTS];
483
484 uint8_t num_streamout_components;
485 uint8_t num_vs_inputs;
486 uint8_t num_vbos_in_user_sgprs;
487 uint8_t num_stream_output_components[4]; /* for GS streams, not streamout */
488 uint16_t enabled_streamout_buffer_mask;
489
490 uint64_t inputs_read; /* "get_unique_index" bits */
491 uint64_t tcs_inputs_via_temp;
492 uint64_t tcs_inputs_via_lds;
493
494 /* For VS before {TCS, TES, GS} and TES before GS. */
495 uint64_t ls_es_outputs_written; /* "get_unique_index" bits */
496 uint64_t outputs_written_before_ps; /* "get_unique_index" bits */
497 uint64_t tcs_outputs_written_for_tes; /* "get_unique_index" bits */
498 uint32_t patch_outputs_written_for_tes; /* "get_unique_index_patch" bits */
499 uint32_t tess_levels_written_for_tes; /* "get_unique_index_patch" bits */
500
501 uint8_t clipdist_mask;
502 uint8_t culldist_mask;
503
504 uint16_t esgs_vertex_stride;
505 uint16_t gsvs_vertex_size;
506 uint8_t gs_input_verts_per_prim;
507 unsigned max_gsvs_emit_size;
508
509 /* Set 0xf or 0x0 (4 bits) per each written output.
510 * ANDed with spi_shader_col_format.
511 */
512 unsigned colors_written_4bit;
513
514 int constbuf0_num_slots;
515 uint8_t color_attr_index[2];
516 uint8_t color_interpolate[2];
517 uint8_t color_interpolate_loc[2];
518 uint8_t colors_read; /**< which color components are read by the FS */
519 uint8_t colors_written;
520 uint16_t output_color_types; /**< Each bit pair is enum si_color_output_type */
521 bool color0_writes_all_cbufs; /**< gl_FragColor */
522 bool reads_samplemask; /**< does fragment shader read sample mask? */
523 bool reads_tess_factors; /**< If TES reads TESSINNER or TESSOUTER */
524 bool writes_z; /**< does fragment shader write Z value? */
525 /* We need both because both can be present in different conditional blocks. */
526 bool output_z_equals_input_z; /**< gl_FragDepth == gl_FragCoord.z for any write */
527 bool output_z_is_not_input_z; /**< gl_FragDepth != gl_FragCoord.z for any write */
528 bool writes_stencil; /**< does fragment shader write stencil value? */
529 bool writes_samplemask; /**< does fragment shader write sample mask? */
530 bool writes_edgeflag; /**< vertex shader outputs edgeflag */
531 bool uses_interp_color;
532 bool uses_persp_center_color;
533 bool uses_persp_centroid_color;
534 bool uses_persp_sample_color;
535 bool uses_persp_center;
536 bool uses_persp_centroid;
537 bool uses_persp_sample;
538 bool uses_linear_center;
539 bool uses_linear_centroid;
540 bool uses_linear_sample;
541 bool uses_interp_at_offset;
542 bool uses_interp_at_sample;
543 bool uses_instanceid;
544 bool uses_base_vertex;
545 bool uses_base_instance;
546 bool uses_drawid;
547 bool uses_primid;
548 bool uses_frontface;
549 bool uses_invocationid;
550 bool uses_thread_id[3];
551 bool uses_block_id[3];
552 bool uses_variable_block_size;
553 bool uses_grid_size;
554 bool uses_tg_size;
555 bool uses_atomic_ordered_add;
556 bool writes_position;
557 bool writes_psize;
558 bool writes_clipvertex;
559 bool writes_primid;
560 bool writes_viewport_index;
561 bool writes_layer;
562 bool uses_bindless_samplers;
563 bool uses_bindless_images;
564 bool uses_indirect_descriptor;
565 bool has_divergent_loop;
566 bool has_non_uniform_tex_access;
567 bool has_shadow_comparison;
568
569 bool uses_vmem_sampler_or_bvh;
570 bool uses_vmem_load_other; /* all other VMEM loads and atomics with return */
571
572 /** Whether all codepaths write tess factors in all invocations. */
573 bool tessfactors_are_def_in_all_invocs;
574
575 /* A flag to check if vrs2x2 can be enabled to reduce number of
576 * fragment shader invocations if flat shading.
577 */
578 bool allow_flat_shading;
579
580 /* Optimization: if the texture bound to this texunit has been cleared to 1,
581 * then the draw can be skipped (see si_draw_vbo_skip_noop). Initially the
582 * value is 0xff (undetermined) and can be later changed to 0 (= false) or
583 * texunit + 1.
584 */
585 uint8_t writes_1_if_tex_is_1;
586
587 /* frag coord and sample pos per component read mask. */
588 uint8_t reads_frag_coord_mask;
589 };
590
591 /* A shader selector is a gallium CSO and contains shader variants and
592 * binaries for one NIR program. This can be shared by multiple contexts.
593 */
594 struct si_shader_selector {
595 struct util_live_shader base;
596 struct si_screen *screen;
597 struct util_queue_fence ready;
598 struct si_compiler_ctx_state compiler_ctx_state;
599 gl_shader_stage stage;
600
601 simple_mtx_t mutex;
602 union si_shader_key *keys;
603 unsigned variants_count;
604 unsigned variants_max_count;
605 struct si_shader **variants;
606
607 /* The compiled NIR shader without a prolog and/or epilog (not
608 * uploaded to a buffer object).
609 *
610 * [0] for wave32, [1] for wave64.
611 */
612 struct si_shader *main_shader_part[2];
613 struct si_shader *main_shader_part_ls[2]; /* as_ls is set in the key */
614 struct si_shader *main_shader_part_es; /* as_es && !as_ngg in the key */
615 struct si_shader *main_shader_part_ngg[2]; /* !as_es && as_ngg in the key */
616 struct si_shader *main_shader_part_ngg_es[2]; /* as_es && as_ngg in the key */
617
618 struct nir_shader *nir;
619 void *nir_binary;
620 unsigned nir_size;
621
622 struct si_shader_info info;
623
624 uint8_t const_and_shader_buf_descriptors_index;
625 uint8_t sampler_and_images_descriptors_index;
626 uint8_t cs_shaderbufs_sgpr_index;
627 uint8_t cs_num_shaderbufs_in_user_sgprs;
628 uint8_t cs_images_sgpr_index;
629 uint8_t cs_images_num_sgprs;
630 uint8_t cs_num_images_in_user_sgprs;
631 unsigned ngg_cull_vert_threshold; /* UINT32_MAX = disabled */
632 enum mesa_prim rast_prim;
633
634 /* GS parameters. */
635 bool tess_turns_off_ngg;
636
637 /* bitmasks of used descriptor slots */
638 uint64_t active_const_and_shader_buffers;
639 uint64_t active_samplers_and_images;
640 };
641
642 /* Valid shader configurations:
643 *
644 * API shaders VS | TCS | TES | GS |pass| PS
645 * are compiled as: | | | |thru|
646 * | | | | |
647 * Only VS & PS: VS | | | | | PS
648 * GFX6 - with GS: ES | | | GS | VS | PS
649 * - with tess: LS | HS | VS | | | PS
650 * - with both: LS | HS | ES | GS | VS | PS
651 * GFX9 - with GS: -> | | | GS | VS | PS
652 * - with tess: -> | HS | VS | | | PS
653 * - with both: -> | HS | -> | GS | VS | PS
654 * | | | | |
655 * NGG - VS & PS: GS | | | | | PS
656 * (GFX10+) - with GS: -> | | | GS | | PS
657 * - with tess: -> | HS | GS | | | PS
658 * - with both: -> | HS | -> | GS | | PS
659 *
660 * -> = merged with the next stage
661 */
662
663 /* Use the byte alignment for all following structure members for optimal
664 * shader key memory footprint.
665 */
666 #pragma pack(push, 1)
667
/* PS bits that the shader key and the prolog key have in common. */
struct si_ps_prolog_bits {
   unsigned color_two_side : 1;
   unsigned flatshade_colors : 1;
   unsigned poly_stipple : 1;
   unsigned force_persp_sample_interp : 1;
   unsigned force_linear_sample_interp : 1;
   unsigned force_persp_center_interp : 1;
   unsigned force_linear_center_interp : 1;
   unsigned bc_optimize_for_persp : 1;
   unsigned bc_optimize_for_linear : 1;
   unsigned samplemask_log_ps_iter : 2;
   unsigned get_frag_coord_from_pixel_coord : 1;
   unsigned force_samplemask_to_helper_invocation : 1;
};
683
/* PS bits that the shader key and the epilog key have in common. */
struct si_ps_epilog_bits {
   unsigned spi_shader_col_format;
   unsigned color_is_int8 : 8;
   unsigned color_is_int10 : 8;
   unsigned alpha_func : 3;
   unsigned alpha_to_one : 1;
   unsigned alpha_to_coverage_via_mrtz : 1; /* gfx11+ or alpha_to_one */
   unsigned clamp_color : 1;
   unsigned dual_src_blend_swizzle : 1; /* gfx11+ */
   unsigned rbplus_depth_only_opt : 1;
   unsigned kill_z : 1;
   unsigned kill_stencil : 1;
   unsigned kill_samplemask : 1;
};
699
700 union si_shader_part_key {
701 struct {
702 struct si_ps_prolog_bits states;
703 unsigned use_aco : 1;
704 unsigned wave32 : 1;
705 unsigned num_input_sgprs : 6;
706 /* Color interpolation and two-side color selection. */
707 unsigned colors_read : 8; /* color input components read */
708 unsigned num_interp_inputs : 5; /* BCOLOR is at this location */
709 unsigned fragcoord_usage_mask : 4;
710 unsigned pixel_center_integer : 1;
711 unsigned wqm : 1;
712 char color_attr_index[2];
713 signed char color_interp_vgpr_index[2]; /* -1 == constant */
714 } ps_prolog;
715 struct {
716 struct si_ps_epilog_bits states;
717 unsigned use_aco : 1;
718 unsigned wave32 : 1;
719 unsigned uses_discard : 1;
720 unsigned colors_written : 8;
721 unsigned color_types : 16;
722 unsigned writes_all_cbufs : 1;
723 unsigned writes_z : 1;
724 unsigned writes_stencil : 1;
725 unsigned writes_samplemask : 1;
726 } ps_epilog;
727 };
728
729 /* The shader key for geometry stages (VS, TCS, TES, GS) */
730 struct si_shader_key_ge {
731 /* Prolog and epilog flags. */
732 union {
733 struct {
734 struct si_shader_selector *ls; /* for merged LS-HS */
735 } tcs; /* tessellation control shader */
736 struct {
737 struct si_shader_selector *es; /* for merged ES-GS */
738 } gs;
739 } part;
740
741 /* These three are initially set according to the NEXT_SHADER property,
742 * or guessed if the property doesn't seem correct.
743 */
744 unsigned as_es : 1; /* whether it's a shader before GS */
745 unsigned as_ls : 1; /* whether it's VS before TCS */
746 unsigned as_ngg : 1; /* whether it's the last GE stage and NGG is enabled,
747 also set for the stage right before GS */
748
749 /* Flags for monolithic compilation only. */
750 struct {
751 /* - If neither "is_one" nor "is_fetched" has a bit set, the instance
752 * divisor is 0.
753 * - If "is_one" has a bit set, the instance divisor is 1.
754 * - If "is_fetched" has a bit set, the instance divisor will be loaded
755 * from the constant buffer.
756 */
757 uint16_t instance_divisor_is_one; /* bitmask of inputs */
758 uint16_t instance_divisor_is_fetched; /* bitmask of inputs */
759
760 /* Whether fetch should be opencoded according to vs_fix_fetch.
761 * Otherwise, if vs_fix_fetch is non-zero, buffer_load_format_xyzw
762 * with minimal fixups is used. */
763 uint16_t vs_fetch_opencode;
764 union si_vs_fix_fetch vs_fix_fetch[SI_MAX_ATTRIBS];
765
766 union {
767 /* When PS needs PrimID and GS is disabled. */
768 unsigned vs_export_prim_id : 1; /* VS and TES only */
769 unsigned gs_tri_strip_adj_fix : 1; /* GS only */
770 } u;
771
772 /* Gfx12: When no streamout buffers are bound, streamout must be disabled. */
773 unsigned remove_streamout : 1;
774 } mono;
775
776 /* Optimization flags for asynchronous compilation only. */
777 struct {
778 /* For HW VS (it can be VS, TES, GS) */
779 uint64_t kill_outputs; /* "get_unique_index" bits */
780 unsigned kill_clip_distances : 8;
781 unsigned kill_pointsize : 1;
782 unsigned kill_layer : 1;
783 unsigned remove_streamout : 1;
784
785 /* For NGG VS and TES. */
786 unsigned ngg_culling : 11; /* SI_NGG_CULL_* */
787
788 /* If NGG VS streamout knows the number of vertices per primitive at compile time,
789 * it can put stores for all vertices in the same VMEM clause, instead of storing
790 * vertices for the 2nd and 3rd vertex conditionally because the primitive type is
791 * unknown.
792 */
793 unsigned ngg_vs_streamout_num_verts_per_prim : 2;
794
795 /* For shaders where monolithic variants have better code.
796 *
797 * This is a flag that has no effect on code generation,
798 * but forces monolithic shaders to be used as soon as
799 * possible, because it's in the "opt" group.
800 */
801 unsigned prefer_mono : 1;
802
803 /* VS and TCS have the same number of patch vertices. */
804 unsigned same_patch_vertices:1;
805
806 /* For TCS. */
807 unsigned tes_prim_mode : 2;
808 unsigned tes_reads_tess_factors : 1;
809
810 unsigned inline_uniforms:1;
811
812 /* This must be kept last to limit the number of variants
813 * depending only on the uniform values.
814 */
815 uint32_t inlined_uniform_values[MAX_INLINABLE_UNIFORMS];
816 } opt;
817 };
818
819 struct si_shader_key_ps {
820 struct {
821 /* Prolog and epilog flags. */
822 struct si_ps_prolog_bits prolog;
823 struct si_ps_epilog_bits epilog;
824 } part;
825
826 /* Flags for monolithic compilation only. */
827 struct {
828 unsigned force_mono : 1;
829 unsigned poly_line_smoothing : 1;
830 unsigned point_smoothing : 1;
831 unsigned interpolate_at_sample_force_center : 1;
832 unsigned fbfetch_msaa : 1;
833 unsigned fbfetch_is_1D : 1;
834 unsigned fbfetch_layered : 1;
835 } mono;
836
837 /* Optimization flags for asynchronous compilation only. */
838 struct {
839 /* For shaders where monolithic variants have better code.
840 *
841 * This is a flag that has no effect on code generation,
842 * but forces monolithic shaders to be used as soon as
843 * possible, because it's in the "opt" group.
844 */
845 unsigned prefer_mono : 1;
846 unsigned inline_uniforms:1;
847
848 /* This eliminates the FRONT_FACE input VGPR as well as shader code using it. */
849 int force_front_face_input : 2; /* 0 = gl_FrontFacing, 1 = true, -1 = false */
850
851 /* This must be kept last to limit the number of variants
852 * depending only on the uniform values.
853 */
854 uint32_t inlined_uniform_values[MAX_INLINABLE_UNIFORMS];
855 } opt;
856 };
857
/* The shader key. "ge" and "ps" are union members and therefore occupy the
 * same storage; a given key is interpreted as one or the other.
 */
union si_shader_key {
   struct si_shader_key_ge ge; /* geometry engine shaders */
   struct si_shader_key_ps ps; /* pixel shaders */
};

/* Restore the pack alignment to default. */
#pragma pack(pop)
865
/* GCN-specific shader info.
 *
 * Stored per shader as si_shader::info, next to the shader binary and config.
 */
struct si_shader_binary_info {
   /* Per-varying-slot tables, indexed up to NUM_TOTAL_VARYING_SLOTS. */
   uint8_t vs_output_param_offset[NUM_TOTAL_VARYING_SLOTS];
   uint32_t vs_output_ps_input_cntl[NUM_TOTAL_VARYING_SLOTS];
   union si_ps_input_info ps_inputs[SI_NUM_INTERP];
   uint8_t num_ps_inputs; /* presumably the number of valid ps_inputs entries - TODO confirm */
   uint8_t ps_colors_read;
   uint8_t num_input_sgprs;
   uint8_t num_input_vgprs;
   bool uses_vmem_load_other; /* all other VMEM loads and atomics with return */
   bool uses_vmem_sampler_or_bvh;
   bool uses_instanceid;
   uint8_t nr_pos_exports;
   uint8_t nr_param_exports;
   unsigned private_mem_vgprs;
   unsigned max_simd_waves;
};
883
/* Tag stored in si_shader_binary::type; it tells whether
 * si_shader_binary::code_buffer holds an ELF or a raw buffer.
 */
enum si_shader_binary_type {
   SI_SHADER_BINARY_ELF,
   SI_SHADER_BINARY_RAW,
};
888
/* A shader binary and the data that travels with it.
 *
 * Embedded both in struct si_shader (a whole shader) and in
 * struct si_shader_part (a prolog or epilog).
 */
struct si_shader_binary {
   enum si_shader_binary_type type;

   /* Depends on binary type, either ELF or raw buffer. */
   const char *code_buffer;
   size_t code_size;
   uint32_t exec_size;

   /* NOTE(review): presumably a copy of the code as it was uploaded to the GPU;
    * confirm against si_shader_binary_upload().
    */
   char *uploaded_code;
   size_t uploaded_code_size;

   char *llvm_ir_string;

   const char *disasm_string;
   size_t disasm_size;

   const unsigned *symbols;
   unsigned num_symbols; /* presumably the element count of "symbols" - TODO confirm */
};
908
/* Subgroup sizing information for GS on GFX9.
 *
 * This is the type of the "out" parameter of gfx9_get_gs_info() and of
 * si_shader::gs_info.
 */
struct gfx9_gs_info {
   unsigned es_verts_per_subgroup;
   unsigned gs_prims_per_subgroup;
   unsigned gs_inst_prims_in_subgroup;
   unsigned max_prims_per_subgroup;
   unsigned esgs_ring_size; /* in bytes */
};
916
/* A single shader variant.
 *
 * Ties a shader selector and a shader key to the resulting binary, its
 * config and info, and the register values precomputed for it.
 */
struct si_shader {
   struct si_pm4_state pm4; /* base class */
   struct si_compiler_ctx_state compiler_ctx_state;

   struct si_shader_selector *selector;
   struct si_shader_selector *previous_stage_sel; /* for refcounting */
   struct si_shader *next_shader; /* Only used during compilation of LS and ES when merged. */

   /* Separately compiled parts and related shaders. */
   struct si_shader_part *prolog;
   struct si_shader *previous_stage; /* for GFX9 */
   struct si_shader_part *epilog;
   struct si_shader *gs_copy_shader;

   struct si_resource *bo;
   /* gpu_address should be bo->gpu_address except if SQTT is
    * in use.
    */
   uint64_t gpu_address;
   /* Only used on GFX6-10 where the scratch address must be inserted into the shader binary.
    * This is the scratch address that the current shader binary contains.
    */
   uint64_t scratch_va;
   union si_shader_key key;
   /* NOTE(review): presumably signalled once compilation has finished - confirm. */
   struct util_queue_fence ready;
   bool compilation_failed;
   bool is_monolithic;
   bool is_optimized;
   bool is_binary_shared;
   bool is_gs_copy_shader;
   uint8_t wave_size; /* si_get_main_shader_part() only accepts 32 or 64 */
   unsigned complete_shader_binary_size;

   /* The following data is all that's needed for binary shaders. */
   struct si_shader_binary binary;
   struct ac_shader_config config;
   struct si_shader_binary_info info;

   /* SI_SGPR_VS_STATE_BITS */
   bool uses_vs_state_provoking_vertex;
   bool uses_gs_state_outprim;

   bool uses_base_instance;

   /* Shader key + LLVM IR + disassembly + statistics.
    * Generated for debug contexts only.
    */
   char *shader_log;
   size_t shader_log_size;

   struct gfx9_gs_info gs_info;

   /* Precomputed register values.
    *
    * This is an anonymous union: gs, ngg, vs and ps share the same storage,
    * so only one of them is meaningful for a given shader.
    * NOTE(review): presumably chosen by the shader's stage and key - confirm
    * against the code that fills these in.
    */
   union {
      struct {
         unsigned vgt_gsvs_ring_offset_1;
         unsigned vgt_gsvs_ring_offset_2;
         unsigned vgt_gsvs_ring_offset_3;
         unsigned vgt_gsvs_ring_itemsize;
         unsigned vgt_gs_max_vert_out;
         unsigned vgt_gs_vert_itemsize;
         unsigned vgt_gs_vert_itemsize_1;
         unsigned vgt_gs_vert_itemsize_2;
         unsigned vgt_gs_vert_itemsize_3;
         unsigned vgt_gs_instance_cnt;
         unsigned vgt_gs_onchip_cntl;
         unsigned vgt_gs_max_prims_per_subgroup;
         unsigned vgt_esgs_ring_itemsize;
         unsigned spi_shader_pgm_rsrc3_gs;
         unsigned spi_shader_pgm_rsrc4_gs;
      } gs;

      struct {
         /* Computed by gfx10_ngg_calculate_subgroup_info. */
         uint16_t ngg_emit_size; /* in dwords */
         uint16_t hw_max_esverts;
         uint16_t max_gsprims;
         uint16_t max_out_verts;
         bool max_vert_out_per_gs_instance;
         /* Register values. */
         unsigned ge_max_output_per_subgroup;
         unsigned ge_ngg_subgrp_cntl;
         unsigned vgt_primitiveid_en;
         unsigned vgt_gs_onchip_cntl;
         unsigned vgt_gs_instance_cnt;
         unsigned esgs_vertex_stride;
         unsigned spi_vs_out_config;
         unsigned spi_shader_pos_format;
         unsigned pa_cl_vte_cntl;
         unsigned vgt_gs_max_vert_out; /* for API GS */
         unsigned ge_pc_alloc;         /* uconfig register */
         unsigned spi_shader_pgm_rsrc3_gs;
         unsigned spi_shader_pgm_rsrc4_gs;
         unsigned vgt_shader_stages_en;
      } ngg;

      struct {
         unsigned vgt_gs_mode;
         unsigned vgt_primitiveid_en;
         unsigned vgt_reuse_off;
         unsigned spi_vs_out_config;
         unsigned spi_shader_pos_format;
         unsigned pa_cl_vte_cntl;
         unsigned ge_pc_alloc; /* uconfig register */
      } vs;

      struct {
         unsigned spi_ps_input_ena;
         unsigned spi_ps_input_addr;
         unsigned spi_ps_in_control;
         unsigned spi_shader_z_format;
         unsigned spi_shader_col_format;
         unsigned cb_shader_mask;
         unsigned db_shader_control;
         unsigned num_interp;
         unsigned spi_gs_out_config_ps;
         unsigned pa_sc_hisz_control;
         bool writes_z;
         bool writes_stencil;
         bool writes_samplemask;
      } ps;
   };

   /* Precomputed register values (outside the union above, so they do not
    * overlap with the gs/ngg/vs/ps values).
    */
   unsigned vgt_tf_param;                /* VGT_TF_PARAM */
   unsigned vgt_vertex_reuse_block_cntl; /* VGT_VERTEX_REUSE_BLOCK_CNTL */
   unsigned pa_cl_vs_out_cntl;
   unsigned ge_cntl;
};
1045
/* A separately compiled shader part; si_shader::prolog and si_shader::epilog
 * point to objects of this type.
 */
struct si_shader_part {
   /* NOTE(review): presumably links parts into a singly linked list that is
    * searched by "key" - confirm against the code that creates parts.
    */
   struct si_shader_part *next;
   union si_shader_part_key key;
   struct si_shader_binary binary;
   unsigned num_vgprs;
   unsigned num_sgprs;
};
1053
/* si_shader.c */
/* Forward declaration; only the incomplete type is made visible here. */
struct ac_rtld_binary;

bool si_compile_shader(struct si_screen *sscreen, struct ac_llvm_compiler *compiler,
                       struct si_shader *shader, struct util_debug_callback *debug);
bool si_create_shader_variant(struct si_screen *sscreen, struct ac_llvm_compiler *compiler,
                              struct si_shader *shader, struct util_debug_callback *debug);
void si_shader_destroy(struct si_shader *shader);
unsigned si_shader_io_get_unique_index(unsigned semantic);
int si_shader_binary_upload(struct si_screen *sscreen, struct si_shader *shader,
                            uint64_t scratch_va);
/* Takes the same arguments as si_shader_binary_upload() plus a bo_offset. */
int si_shader_binary_upload_at(struct si_screen *sscreen, struct si_shader *shader,
                               uint64_t scratch_va, int64_t bo_offset);
bool si_can_dump_shader(struct si_screen *sscreen, gl_shader_stage stage,
                        enum si_shader_dump_type dump_type);
void si_shader_dump(struct si_screen *sscreen, struct si_shader *shader,
                    struct util_debug_callback *debug, FILE *f, bool check_debug_option);
void si_shader_dump_stats_for_shader_db(struct si_screen *screen, struct si_shader *shader,
                                        struct util_debug_callback *debug);
/* lds_size is passed by pointer, so the value may be modified in place. */
void si_multiwave_lds_size_workaround(struct si_screen *sscreen, unsigned *lds_size);
const char *si_get_shader_name(const struct si_shader *shader);
void si_shader_binary_clean(struct si_shader_binary *binary);
struct nir_shader *si_deserialize_shader(struct si_shader_selector *sel);
unsigned si_get_ps_num_interp(struct si_shader *ps);
unsigned si_get_shader_prefetch_size(struct si_shader *shader);
unsigned si_get_shader_binary_size(struct si_screen *screen, struct si_shader *shader);

/* si_shader_info.c */
void si_nir_scan_shader(struct si_screen *sscreen, struct nir_shader *nir,
                        struct si_shader_info *info, bool colors_lowered);

/* si_shader_nir.c */
void si_lower_mediump_io(struct nir_shader *nir);

bool si_alu_to_scalar_packed_math_filter(const struct nir_instr *instr, const void *data);
void si_nir_opts(struct si_screen *sscreen, struct nir_shader *nir, bool has_array_temps);
void si_nir_late_opts(struct nir_shader *nir);
char *si_finalize_nir(struct pipe_screen *screen, struct nir_shader *nir);

/* si_state_shaders.cpp */
unsigned si_shader_num_alloc_param_exports(struct si_shader *shader);
unsigned si_determine_wave_size(struct si_screen *sscreen, struct si_shader *shader);
/* The result is returned through "out". */
void gfx9_get_gs_info(struct si_shader_selector *es, struct si_shader_selector *gs,
                      struct gfx9_gs_info *out);
bool gfx10_is_ngg_passthrough(struct si_shader *shader);
unsigned si_shader_lshs_vertex_stride(struct si_shader *ls);
bool si_should_clear_lds(struct si_screen *sscreen, const struct nir_shader *shader);
/* Used by the inline helpers below to classify the output primitive type. */
unsigned si_get_output_prim_simplified(const struct si_shader_selector *sel,
                                       const union si_shader_key *key);
1103
1104 /* Inline helpers. */
1105
1106 /* Return the pointer to the main shader part's pointer. */
si_get_main_shader_part(struct si_shader_selector * sel,const union si_shader_key * key,unsigned wave_size)1107 static inline struct si_shader **si_get_main_shader_part(struct si_shader_selector *sel,
1108 const union si_shader_key *key,
1109 unsigned wave_size)
1110 {
1111 assert(wave_size == 32 || wave_size == 64);
1112 unsigned index = wave_size / 32 - 1;
1113
1114 if (sel->stage <= MESA_SHADER_GEOMETRY) {
1115 if (key->ge.as_ls)
1116 return &sel->main_shader_part_ls[index];
1117 if (key->ge.as_es && key->ge.as_ngg)
1118 return &sel->main_shader_part_ngg_es[index];
1119 if (key->ge.as_es) {
1120 /* legacy GS only support wave 64 */
1121 assert(wave_size == 64);
1122 return &sel->main_shader_part_es;
1123 }
1124 if (key->ge.as_ngg)
1125 return &sel->main_shader_part_ngg[index];
1126 }
1127 return &sel->main_shader_part[index];
1128 }
1129
gfx10_has_variable_edgeflags(struct si_shader * shader)1130 static inline bool gfx10_has_variable_edgeflags(struct si_shader *shader)
1131 {
1132 unsigned output_prim = si_get_output_prim_simplified(shader->selector, &shader->key);
1133
1134 return shader->selector->stage == MESA_SHADER_VERTEX &&
1135 (output_prim == MESA_PRIM_TRIANGLES || output_prim == MESA_PRIM_UNKNOWN);
1136 }
1137
si_shader_uses_streamout(const struct si_shader * shader)1138 static inline bool si_shader_uses_streamout(const struct si_shader *shader)
1139 {
1140 return shader->selector->stage <= MESA_SHADER_GEOMETRY &&
1141 shader->selector->info.enabled_streamout_buffer_mask &&
1142 !shader->key.ge.opt.remove_streamout &&
1143 !shader->key.ge.mono.remove_streamout;
1144 }
1145
si_shader_uses_discard(struct si_shader * shader)1146 static inline bool si_shader_uses_discard(struct si_shader *shader)
1147 {
1148 /* Changes to this should also update ps_modifies_zs. */
1149 return shader->selector->info.base.fs.uses_discard ||
1150 shader->key.ps.part.prolog.poly_stipple ||
1151 shader->key.ps.mono.point_smoothing ||
1152 shader->key.ps.part.epilog.alpha_func != PIPE_FUNC_ALWAYS;
1153 }
1154
si_shader_culling_enabled(struct si_shader * shader)1155 static inline bool si_shader_culling_enabled(struct si_shader *shader)
1156 {
1157 /* Legacy VS/TES/GS and ES don't cull in the shader. */
1158 if (!shader->key.ge.as_ngg || shader->key.ge.as_es) {
1159 assert(!shader->key.ge.opt.ngg_culling);
1160 return false;
1161 }
1162
1163 if (shader->key.ge.opt.ngg_culling)
1164 return true;
1165
1166 unsigned output_prim = si_get_output_prim_simplified(shader->selector, &shader->key);
1167
1168 /* This enables NGG culling for non-monolithic TES and GS. */
1169 return shader->selector->ngg_cull_vert_threshold == 0 &&
1170 (output_prim == MESA_PRIM_TRIANGLES || output_prim == MESA_PRIM_LINES);
1171 }
1172
1173 #ifdef __cplusplus
1174 }
1175 #endif
1176
1177 #endif
1178