1 /*
2 * Copyright 2012 Advanced Micro Devices, Inc.
3 *
4 * SPDX-License-Identifier: MIT
5 */
6
7 /* The compiler middle-end architecture: Explaining (non-)monolithic shaders
8 * -------------------------------------------------------------------------
9 *
10 * Typically, there is one-to-one correspondence between API and HW shaders,
11 * that is, for every API shader, there is exactly one shader binary in
12 * the driver.
13 *
14 * The problem with that is that we also have to emulate some API states
15 * (e.g. alpha-test, and many others) in shaders too. The two obvious ways
16 * to deal with it are:
17 * - each shader has multiple variants for each combination of emulated states,
18 * and the variants are compiled on demand, possibly relying on a shader
19 * cache for good performance
20 * - patch shaders at the binary level
21 *
22 * This driver uses something completely different. The emulated states are
23 * usually implemented at the beginning or end of shaders. Therefore, we can
24 * split the shader into 3 parts:
25 * - prolog part (shader code dependent on states)
26 * - main part (the API shader)
27 * - epilog part (shader code dependent on states)
28 *
29 * Each part is compiled as a separate shader and the final binaries are
30 * concatenated. This type of shader is called non-monolithic, because it
31 * consists of multiple independent binaries. Creating a new shader variant
32 * is therefore only a concatenation of shader parts (binaries) and doesn't
33 * involve any compilation. The main shader parts are the only parts that are
34 * compiled when applications create shader objects. The prolog and epilog
35 * parts are compiled on the first use and saved, so that their binaries can
36 * be reused by many other shaders.
37 *
38 * One of the roles of the prolog part is to compute vertex buffer addresses
39 * for vertex shaders. A few of the roles of the epilog part are color buffer
40 * format conversions in pixel shaders that we have to do manually, and write
41 * tessellation factors in tessellation control shaders. The prolog and epilog
42 * have many other important responsibilities in various shader stages.
43 * They don't just "emulate legacy stuff".
44 *
45 * Monolithic shaders are shaders where the parts are combined before LLVM
46 * compilation, and the whole thing is compiled and optimized as one unit with
47 * one binary on the output. The result is the same as the non-monolithic
48 * shader, but the final code can be better, because LLVM can optimize across
49 * all shader parts. Monolithic shaders aren't usually used except for these
50 * special cases:
51 *
52 * 1) Some rarely-used states require modification of the main shader part
53 * itself, and in such cases, only the monolithic shader variant is
54 * compiled, and that's always done on the first use.
55 *
56 * 2) When we do cross-stage optimizations for separate shader objects and
57 * e.g. eliminate unused shader varyings, the resulting optimized shader
58 * variants are always compiled as monolithic shaders, and always
59 * asynchronously (i.e. not stalling ongoing rendering). We call them
60 * "optimized monolithic" shaders. The important property here is that
61 * the non-monolithic unoptimized shader variant is always available for use
62 * when the asynchronous compilation of the optimized shader is not done
63 * yet.
64 *
65 * Starting with GFX9 chips, some shader stages are merged, and the number of
66 * shader parts per shader increased. The complete new list of shader parts is:
67 * - 1st shader: prolog part
68 * - 1st shader: main part
69 * - 2nd shader: main part
70 * - 2nd shader: epilog part
71 */
72
73 /* How linking shader inputs and outputs between vertex, tessellation, and
74 * geometry shaders works.
75 *
76 * Inputs and outputs between shaders are stored in a buffer. This buffer
77 * lives in LDS (typical case for tessellation), but it can also live
78 * in memory (ESGS). Each input or output has a fixed location within a vertex.
79 * The highest used input or output determines the stride between vertices.
80 *
81 * Since GS and tessellation are only possible in the OpenGL core profile,
82 * only these semantics are valid for per-vertex data:
83 *
84 * Name Location
85 *
86 * POSITION 0
87 * VAR0..31 1..32
88 * CLIP_DIST0..1 49..50
89 * PSIZ 51
90 *
 * For example, a shader only writing VAR0 has the output stride of 2.
92 *
93 * Only these semantics are valid for per-patch data:
94 *
95 * Name Location
96 *
97 * TESSOUTER 0
98 * TESSINNER 1
99 * PATCH0..29 2..31
100 *
101 * That's how independent shaders agree on input and output locations.
102 * The si_shader_io_get_unique_index function assigns the locations.
103 *
104 * For tessellation, other required information for calculating the input and
105 * output addresses like the vertex stride, the patch stride, and the offsets
106 * where per-vertex and per-patch data start, is passed to the shader via
107 * user data SGPRs. The offsets and strides are calculated at draw time and
108 * aren't available at compile time.
109 */
110
111 #ifndef SI_SHADER_H
112 #define SI_SHADER_H
113
114 #include "shader_info.h"
115 #include "ac_binary.h"
116 #include "ac_gpu_info.h"
117 #include "util/mesa-blake3.h"
118 #include "util/u_live_shader_cache.h"
119 #include "util/u_queue.h"
120 #include "si_pm4.h"
121
122 #ifdef __cplusplus
123 extern "C" {
124 #endif
125
126 struct nir_shader;
127 struct nir_instr;
128
/* Sizing limits. SI_NUM_INTERP is the length of si_shader_binary_info::ps_inputs and
 * SI_MAX_ATTRIBS is the length of si_shader_key_ge::mono.vs_fix_fetch.
 */
#define SI_NUM_INTERP 32
#define SI_MAX_ATTRIBS 16
#define SI_MAX_VS_OUTPUTS 40
#define SI_USER_CLIP_PLANE_MASK 0x3F /* bits 0..5 */

/* Defined as INTERP_MODE_COUNT, i.e. it reuses that value as one additional mode. */
#define INTERP_MODE_COLOR INTERP_MODE_COUNT

/* PS input control values. Both use OFFSET 0x20 and differ only in DEFAULT_VAL (0 vs. 3). */
#define SI_PS_INPUT_CNTL_0000 (S_028644_OFFSET(0x20) | S_028644_DEFAULT_VAL(0))
#define SI_PS_INPUT_CNTL_0001 (S_028644_OFFSET(0x20) | S_028644_DEFAULT_VAL(3))
#define SI_PS_INPUT_CNTL_UNUSED SI_PS_INPUT_CNTL_0000
/* D3D9 behaviour for COLOR0 requires 0001. GL is undefined. */
#define SI_PS_INPUT_CNTL_UNUSED_COLOR0 SI_PS_INPUT_CNTL_0001
141
/* SGPR user data indices
 *
 * The enumerators are grouped per shader stage. Most groups restart from the end of an
 * earlier group (e.g. "= SI_NUM_RESOURCE_SGPRS"), so enumerators that belong to different
 * groups share numeric values.
 */
enum
{
   SI_SGPR_INTERNAL_BINDINGS,
   SI_SGPR_BINDLESS_SAMPLERS_AND_IMAGES,
   SI_SGPR_CONST_AND_SHADER_BUFFERS, /* or just a constant buffer 0 pointer */
   SI_SGPR_SAMPLERS_AND_IMAGES,
   SI_NUM_RESOURCE_SGPRS, /* = 4 */

   /* API VS, TES without GS, GS copy shader */
   SI_SGPR_VS_STATE_BITS = SI_NUM_RESOURCE_SGPRS,
   SI_NUM_VS_STATE_RESOURCE_SGPRS, /* = 5 */

   /* all VS variants */
   SI_SGPR_BASE_VERTEX = SI_NUM_VS_STATE_RESOURCE_SGPRS,
   SI_SGPR_DRAWID,
   SI_SGPR_START_INSTANCE,
   SI_VS_NUM_USER_SGPR, /* = 8 */

   /* Same index as SI_SGPR_CONST_AND_SHADER_BUFFERS. */
   SI_SGPR_VS_BLIT_DATA = SI_SGPR_CONST_AND_SHADER_BUFFERS,

   /* TES */
   SI_SGPR_TES_OFFCHIP_LAYOUT = SI_NUM_VS_STATE_RESOURCE_SGPRS,
   SI_SGPR_TES_OFFCHIP_ADDR,
   SI_TES_NUM_USER_SGPR, /* = 7 */

   /* GFX6-8: TCS only */
   GFX6_SGPR_TCS_OFFCHIP_LAYOUT = SI_NUM_RESOURCE_SGPRS,
   GFX6_SGPR_TCS_OFFCHIP_ADDR,
   GFX6_SGPR_TCS_IN_LAYOUT,
   GFX6_TCS_NUM_USER_SGPR, /* = 7 */

   /* GFX9: Merged LS-HS (VS-TCS) only. The TCS SGPRs come after all VS SGPRs. */
   GFX9_SGPR_TCS_OFFCHIP_LAYOUT = SI_VS_NUM_USER_SGPR,
   GFX9_SGPR_TCS_OFFCHIP_ADDR,
   GFX9_TCS_NUM_USER_SGPR, /* = 10 */

   /* GS limits */
   GFX6_GS_NUM_USER_SGPR = SI_NUM_RESOURCE_SGPRS,
   SI_GSCOPY_NUM_USER_SGPR = SI_NUM_VS_STATE_RESOURCE_SGPRS,

   /* Starts after the larger of the VS and TES user SGPR counts (assuming MAX2 yields the
    * larger argument).
    */
   GFX9_SGPR_SMALL_PRIM_CULL_INFO = MAX2(SI_VS_NUM_USER_SGPR, SI_TES_NUM_USER_SGPR),
   GFX9_SGPR_ATTRIBUTE_RING_ADDR,
   GFX9_GS_NUM_USER_SGPR,

   /* PS only */
   SI_SGPR_ALPHA_REF = SI_NUM_RESOURCE_SGPRS,
   SI_PS_NUM_USER_SGPR, /* = 5 */

   /* The value has to be 12, because the hw requires that descriptors
    * are aligned to 4 SGPRs.
    */
   SI_SGPR_VS_VB_DESCRIPTOR_FIRST = 12,
};
196
/* Indices of LLVM function parameters. */
enum {
   /* Stage-specific parameters are numbered starting at this index. */
   SI_NUM_RESOURCE_PARAMS = 4,

   /* Parameters that exist only for PS. */
   SI_PARAM_ALPHA_REF = SI_NUM_RESOURCE_PARAMS,
   SI_PARAM_PRIM_MASK,

   SI_PARAM_PERSP_SAMPLE,
   SI_PARAM_PERSP_CENTER,
   SI_PARAM_PERSP_CENTROID,
   SI_PARAM_PERSP_PULL_MODEL,

   SI_PARAM_LINEAR_SAMPLE,
   SI_PARAM_LINEAR_CENTER,
   SI_PARAM_LINEAR_CENTROID,

   SI_PARAM_LINE_STIPPLE_TEX,

   SI_PARAM_POS_X_FLOAT,
   SI_PARAM_POS_Y_FLOAT,
   SI_PARAM_POS_Z_FLOAT,
   SI_PARAM_POS_W_FLOAT,

   SI_PARAM_FRONT_FACE,
   SI_PARAM_ANCILLARY,
   SI_PARAM_SAMPLE_COVERAGE,
   SI_PARAM_POS_FIXED_PT,

   /* One past the last named parameter, +8 for COLOR[0..1]. */
   SI_NUM_PARAMS = (SI_PARAM_POS_FIXED_PT + 1) + 8,
};
224
/* Fields of vs_state_bits that live in si_context::current_vs_state (INDEXED is the exception
 * and is set elsewhere). Shaders read them through vs_state_bits in VS, TES, and GS.
 */
#define VS_STATE_CLAMP_VERTEX_COLOR__SHIFT   0
#define VS_STATE_CLAMP_VERTEX_COLOR__MASK    0x1 /* Shared by VS and GS */
#define VS_STATE_INDEXED__SHIFT              1
#define VS_STATE_INDEXED__MASK               0x1 /* Shared by VS and GS */
232
/* Fields of vs_state_bits that live in si_context::current_gs_state. Shaders read them through
 * vs_state_bits in legacy GS, the GS copy shader, and any NGG shader.
 */
/* bit gap (the first field below starts at bit 14) */

/* NUM_ES_OUTPUTS = last output index of SI_UNIQUE_SLOT_* + 1, which is 55 at most.
 * ESGS vertex stride in dwords = NUM_ES_OUTPUTS * 4 + 1.
 * Only GFX9+ uses it, to compute LDS addresses of GS inputs.
 */
#define GS_STATE_NUM_ES_OUTPUTS__SHIFT    14
#define GS_STATE_NUM_ES_OUTPUTS__MASK     0x3f
#define GS_STATE_CULL_FACE_FRONT__SHIFT   20
#define GS_STATE_CULL_FACE_FRONT__MASK    0x1
#define GS_STATE_CULL_FACE_BACK__SHIFT    21
#define GS_STATE_CULL_FACE_BACK__MASK     0x1
247 /* Small prim filter precision = num_samples / quant_mode where num_samples is in {1, 2, 4, 8} and
248 * quant_mode is in {256, 1024, 4096}, which is equal to 1/2^n where n is between 5 and 12.
249 *
250 * Equation 1: Represent the value as 1/2^n.
251 * Assumption: log_samples <= 3 and log_quant_mode >= 8
252 * num_samples / quant_mode =
253 * 2^log_samples / 2^log_quant_mode =
254 * 1 / 2^(log_quant_mode - log_samples) [because log_samples < log_quant_mode]
255 *
256 * Knowing that, we only need 4 bits to represent the FP32 exponent and thus the FP32 number.
257 *
258 * Equation 2: Encoding the exponent.
259 * 1/2^(15 - value) in FP32 = ((value | 0x70) << 23) in binary if value < 15
260 * Proof: With 0x70 = 112, we get FP32 exponent 2^(112 + value - 127) according to the FP32
261 * definition, which can be simplified to 2^(value - 15), which is a negative exponent
262 * for value < 15. Given that 2^-n = 1/2^n, the FP32 number is equal to 1/2^(15 - value).
263 *
264 * Equation 3: Convert quant_mode_enum to log_quant_mode.
265 * quant_mode_enum:
266 * 0 means 256 = 2^8 --> log2(256) = 8
267 * 1 means 1024 = 2^10 --> log2(1024) = 10
268 * 2 means 4096 = 2^12 --> log2(4096) = 12
269 *
270 * Conversion to log_quant_mode:
271 * log_quant_mode = quant_mode_enum * 2 + 8. Proof:
272 * 0 * 2 + 8 = 8
273 * 1 * 2 + 8 = 10
274 * 2 * 2 + 8 = 12
275 *
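 * Equation 4: Get the exponent value for Equation 2 from Equation 1.
 *    15 - value = log_quant_mode - log_samples
 *    value = 15 - (log_quant_mode + log_samples)
 *    NOTE(review): solving the previous line for "value" gives
 *    15 - (log_quant_mode - log_samples); the sign of log_samples differs here and in
 *    everything derived from it below. Confirm which form matches the code that fills
 *    this field before relying on this derivation.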
279 *
280 * Combine equations 2, 3, and 4 to get the expression computing the FP32 number from log_samples
281 * and quant_mode_enum using integer ops:
282 * (value | 0x70) << 23 =
283 * ((15 - (log_quant_mode + log_samples)) | 0x70) << 23 =
284 * ((15 - (quant_mode_enum * 2 + 8 + log_samples)) | 0x70) << 23 =
285 * ((15 - quant_mode_enum * 2 - 8 - log_samples) | 0x70) << 23 =
286 * ((7 - quant_mode_enum * 2 - log_samples) | 0x70) << 23 =
287 *
288 * Since "log_samples <= 3" and "quant_mode_enum * 2 <= 4", we need a SGPR field that stores:
289 * triangle_precision = 7 - quant_mode_enum * 2 - log_samples
290 *
291 * Line precision ignores log_samples, so the shader should do:
292 * line_precision = triangle_precision + log_samples
293 */
/* Remaining current_gs_state fields; together they occupy bits 22..31. */
#define GS_STATE_SMALL_PRIM_PRECISION__SHIFT               22 /* triangle_precision */
#define GS_STATE_SMALL_PRIM_PRECISION__MASK                0x7
#define GS_STATE_SMALL_PRIM_PRECISION_LOG_SAMPLES__SHIFT   25
#define GS_STATE_SMALL_PRIM_PRECISION_LOG_SAMPLES__MASK    0x3
#define GS_STATE_STREAMOUT_QUERY_ENABLED__SHIFT            27
#define GS_STATE_STREAMOUT_QUERY_ENABLED__MASK             0x1
#define GS_STATE_PROVOKING_VTX_FIRST__SHIFT                28
#define GS_STATE_PROVOKING_VTX_FIRST__MASK                 0x1
#define GS_STATE_OUTPRIM__SHIFT                            29
#define GS_STATE_OUTPRIM__MASK                             0x3
#define GS_STATE_PIPELINE_STATS_EMU__SHIFT                 31
#define GS_STATE_PIPELINE_STATS_EMU__MASK                  0x1
306
/* Field helpers. "field" is a name prefix for which field##__MASK and field##__SHIFT exist
 * (e.g. GS_STATE_OUTPRIM).
 */

/* Returns "value" masked to the width of "field" and shifted to the field's bit position. */
#define ENCODE_FIELD(field, value) (((unsigned)(value) & field##__MASK) << field##__SHIFT)

/* Returns a mask with every bit set except the bits occupied by "field". */
#define CLEAR_FIELD(field) (~((unsigned)field##__MASK << field##__SHIFT))

/* This is called by functions that change states.
 *
 * Replaces the bits of "field" inside "var" with "value" and asserts that "value" fits in the
 * field. "value" is evaluated exactly once regardless of whether asserts are compiled in, so
 * an argument with side effects behaves the same in debug and release builds. It is captured
 * in a 64-bit temporary so that negative or too-wide values still trip the assertion.
 * "var" is evaluated more than once and must be a side-effect-free lvalue.
 */
#define SET_FIELD(var, field, value) do { \
   const uint64_t set_field_value_ = (uint64_t)(value); \
   assert(set_field_value_ == (set_field_value_ & field##__MASK)); \
   (var) &= CLEAR_FIELD(field); \
   (var) |= ENCODE_FIELD(field, set_field_value_); \
} while (0)

/* This is called during shader compilation and returns LLVMValueRef.
 * It unpacks "field" from (ctx)->args->vs_state_bits; the field width passed to
 * si_unpack_param is the number of set bits in field##__MASK.
 */
#define GET_FIELD(ctx, field) si_unpack_param((ctx), (ctx)->args->vs_state_bits, field##__SHIFT, \
                                              util_bitcount(field##__MASK))
320
/* SGPR counts used by the VS blit shader variants. */
enum {
   /* These represent the number of SGPRs the shader uses. */
   SI_VS_BLIT_SGPRS_POS = 3,
   SI_VS_BLIT_SGPRS_POS_COLOR = 7,
   SI_VS_BLIT_SGPRS_POS_TEXCOORD = 9,

   /* Largest count above, +1 for the attribute ring address. */
   MAX_SI_VS_BLIT_SGPRS = SI_VS_BLIT_SGPRS_POS_TEXCOORD + 1,
};
330
/* NGG culling flags.
 *
 * The first two are only set for vertex shaders that cull; TES and GS get the primitive
 * type from shader_info.
 */
#define SI_NGG_CULL_VS_TRIANGLES 0x1 /* this implies W, view.xy, and small prim culling */
#define SI_NGG_CULL_VS_LINES 0x2 /* this implies W and view.xy culling */
#define SI_NGG_CULL_SMALL_LINES_DIAMOND_EXIT 0x4 /* cull small lines according to the diamond exit rule */

/* An 8-bit clip plane enable mask is stored in bits 3..10. */
#define SI_NGG_CULL_CLIP_PLANE_ENABLE(enable) (((enable) & 0xff) << 3)
#define SI_NGG_CULL_GET_CLIP_PLANE_ENABLE(x) (((x) >> 3) & 0xff)
339
/* One shader profile entry: a BLAKE3 hash paired with option flags. */
struct si_shader_profile {
   uint32_t blake3[BLAKE3_OUT_LEN32];
   uint32_t options; /* presumably a bitmask of SI_PROFILE_*, like si_shader_info::options */
};
344
/* The profile table is defined elsewhere. Its length isn't visible through this declaration;
 * si_get_num_shader_profiles presumably returns it (confirm at the definition).
 */
extern struct si_shader_profile si_shader_profiles[];
unsigned si_get_num_shader_profiles(void);
347
/* Shader profile option bits (one bit each). */
#define SI_PROFILE_WAVE32                    0x01
#define SI_PROFILE_GFX10_WAVE64              0x02
/* bit gap (0x04 is unused) */
#define SI_PROFILE_VS_NO_BINNING             0x08
#define SI_PROFILE_GFX9_GFX10_PS_NO_BINNING  0x10
#define SI_PROFILE_CLAMP_DIV_BY_ZERO         0x20
#define SI_PROFILE_NO_OPT_UNIFORM_VARYINGS   0x40
355
/* Kinds of shader dump output. */
enum si_shader_dump_type {
   SI_DUMP_SHADER_KEY,

   /* initial input NIR when shaders are created (before lowering) */
   SI_DUMP_INIT_NIR,
   /* final NIR after lowering when shader variants are created */
   SI_DUMP_NIR,

   /* initial LLVM IR before optimizations */
   SI_DUMP_INIT_LLVM_IR,
   /* final LLVM IR */
   SI_DUMP_LLVM_IR,

   /* initial ACO IR before optimizations */
   SI_DUMP_INIT_ACO_IR,
   /* final ACO IR */
   SI_DUMP_ACO_IR,

   /* final asm shaders */
   SI_DUMP_ASM,
   /* print statistics as shader-db */
   SI_DUMP_STATS,

   SI_DUMP_ALWAYS,
};
368
/* Unique I/O slot numbers. */
enum {
   SI_UNIQUE_SLOT_POS = 0,

   /* Some shader stages size their input/output storage (in LDS, tess and GS rings) by the
    * highest used IO index, so VARn is placed right after POSITION to keep that size as
    * small as possible.
    */
   SI_UNIQUE_SLOT_VAR0 = 1, /* 0..31 */

   /* 16-bit GLES varyings come after the 32 slots of 32-bit varyings. They can use the same
    * indices as legacy desktop GL varyings because they are mutually exclusive.
    */
   SI_UNIQUE_SLOT_VAR0_16BIT = SI_UNIQUE_SLOT_VAR0 + 32, /* 0..15 */

   /* Legacy GL-only varyings can alias GLES-only 16-bit varyings. */
   SI_UNIQUE_SLOT_FOGC = SI_UNIQUE_SLOT_VAR0_16BIT,
   SI_UNIQUE_SLOT_COL0,
   SI_UNIQUE_SLOT_COL1,
   SI_UNIQUE_SLOT_BFC0,
   SI_UNIQUE_SLOT_BFC1,
   SI_UNIQUE_SLOT_TEX0,
   SI_UNIQUE_SLOT_TEX1,
   SI_UNIQUE_SLOT_TEX2,
   SI_UNIQUE_SLOT_TEX3,
   SI_UNIQUE_SLOT_TEX4,
   SI_UNIQUE_SLOT_TEX5,
   SI_UNIQUE_SLOT_TEX6,
   SI_UNIQUE_SLOT_TEX7,
   SI_UNIQUE_SLOT_CLIP_VERTEX,

   /* Varyings present in both GLES and desktop GL must start at 49, after the 16 slots of
    * 16-bit varyings.
    */
   SI_UNIQUE_SLOT_CLIP_DIST0 = SI_UNIQUE_SLOT_VAR0_16BIT + 16,
   SI_UNIQUE_SLOT_CLIP_DIST1,
   SI_UNIQUE_SLOT_PSIZ,

   /* These can't be written by LS, HS, and ES. */
   SI_UNIQUE_SLOT_LAYER,
   SI_UNIQUE_SLOT_VIEWPORT,
   SI_UNIQUE_SLOT_PRIMITIVE_ID,
};
409
/**
 * For VS shader keys, describe any fixups required for vertex fetch.
 *
 * \ref log_size, \ref format, and the number of channels are interpreted as
 * by \ref ac_build_opencoded_load_format.
 *
 * Note: all bits 0 (size = 1 byte, num channels = 1, format = float) is an
 * impossible format and indicates that no fixup is needed (just use
 * buffer_load_format_xyzw).
 *
 * The four bit-fields add up to 8 bits; \ref bits views the same byte as one integer.
 */
union si_vs_fix_fetch {
   struct {
      uint8_t log_size : 2;        /* log2 of bytes per channel (1, 2, 4, or 8 bytes) */
      uint8_t num_channels_m1 : 2; /* number of channels minus 1 */
      uint8_t format : 3;          /* AC_FETCH_FORMAT_xxx */
      uint8_t reverse : 1;         /* reverse XYZ channels */
   } u;
   uint8_t bits;
};
429
430 struct si_shader;
431
/* State of the context creating the shader object.
 *
 * Embedded by value in both si_shader_selector and si_shader.
 */
struct si_compiler_ctx_state {
   /* Should only be used by si_init_shader_selector_async and
    * si_build_shader_variant if thread_index == -1 (non-threaded). */
   struct ac_llvm_compiler *compiler;

   /* Used if thread_index == -1 or if debug.async is true. */
   struct util_debug_callback debug;

   /* Used for creating the log string for gallium/ddebug. */
   bool is_debug_context;
};
444
/* Color output types. The values fit in 2 bits; si_shader_info::output_color_types stores
 * one such bit pair per output.
 */
enum si_color_output_type {
   SI_TYPE_ANY32 = 0,
   SI_TYPE_FLOAT16,
   SI_TYPE_INT16,
   SI_TYPE_UINT16,
};
451
/* Description of one shader input, packed into four bytes.
 * Used for si_shader_info::input and si_shader_binary_info::ps_inputs.
 */
union si_input_info {
   struct {
      uint8_t semantic;
      uint8_t interpolate;
      uint8_t fp16_lo_hi_valid;
      uint8_t usage_mask;
   };
   uint32_t _unused; /* this just forces 4-byte alignment */
};
461
462 struct si_shader_info {
463 shader_info base;
464
465 uint32_t options; /* bitmask of SI_PROFILE_* */
466
467 uint8_t num_inputs;
468 uint8_t num_outputs;
469 union si_input_info input[PIPE_MAX_SHADER_INPUTS];
470 uint8_t output_semantic[PIPE_MAX_SHADER_OUTPUTS];
471 uint8_t output_usagemask[PIPE_MAX_SHADER_OUTPUTS];
472 uint8_t output_streams[PIPE_MAX_SHADER_OUTPUTS];
473 uint8_t output_type[PIPE_MAX_SHADER_OUTPUTS]; /* enum nir_alu_type */
474 uint8_t output_xfb_writemask[PIPE_MAX_SHADER_OUTPUTS];
475
476 uint8_t num_streamout_components;
477 uint8_t num_vs_inputs;
478 uint8_t num_vbos_in_user_sgprs;
479 uint8_t num_stream_output_components[4]; /* for GS streams, not streamout */
480 uint16_t enabled_streamout_buffer_mask;
481
482 uint64_t inputs_read; /* "get_unique_index" bits */
483 uint64_t tcs_inputs_via_temp;
484 uint64_t tcs_inputs_via_lds;
485
486 /* For VS before {TCS, TES, GS} and TES before GS. */
487 uint64_t ls_es_outputs_written; /* "get_unique_index" bits */
488 uint64_t outputs_written_before_ps; /* "get_unique_index" bits */
489 uint64_t tcs_outputs_written_for_tes; /* "get_unique_index" bits */
490 uint32_t patch_outputs_written_for_tes; /* "get_unique_index_patch" bits */
491 uint32_t tess_levels_written_for_tes; /* "get_unique_index_patch" bits */
492
493 uint8_t clipdist_mask;
494 uint8_t culldist_mask;
495
496 uint16_t esgs_vertex_stride;
497 uint16_t gsvs_vertex_size;
498 uint8_t gs_input_verts_per_prim;
499 unsigned max_gsvs_emit_size;
500
501 /* Set 0xf or 0x0 (4 bits) per each written output.
502 * ANDed with spi_shader_col_format.
503 */
504 unsigned colors_written_4bit;
505
506 int constbuf0_num_slots;
507 uint num_memory_stores;
508 uint8_t color_attr_index[2];
509 uint8_t color_interpolate[2];
510 uint8_t color_interpolate_loc[2];
511 uint8_t colors_read; /**< which color components are read by the FS */
512 uint8_t colors_written;
513 uint16_t output_color_types; /**< Each bit pair is enum si_color_output_type */
514 bool color0_writes_all_cbufs; /**< gl_FragColor */
515 bool reads_samplemask; /**< does fragment shader read sample mask? */
516 bool reads_tess_factors; /**< If TES reads TESSINNER or TESSOUTER */
517 bool writes_z; /**< does fragment shader write Z value? */
518 /* We need both because both can be present in different conditional blocks. */
519 bool output_z_equals_input_z; /**< gl_FragDepth == gl_FragCoord.z for any write */
520 bool output_z_is_not_input_z; /**< gl_FragDepth != gl_FragCoord.z for any write */
521 bool writes_stencil; /**< does fragment shader write stencil value? */
522 bool writes_samplemask; /**< does fragment shader write sample mask? */
523 bool writes_edgeflag; /**< vertex shader outputs edgeflag */
524 bool uses_interp_color;
525 bool uses_persp_center_color;
526 bool uses_persp_centroid_color;
527 bool uses_persp_sample_color;
528 bool uses_persp_center;
529 bool uses_persp_centroid;
530 bool uses_persp_sample;
531 bool uses_linear_center;
532 bool uses_linear_centroid;
533 bool uses_linear_sample;
534 bool uses_interp_at_sample;
535 bool uses_instanceid;
536 bool uses_base_vertex;
537 bool uses_base_instance;
538 bool uses_drawid;
539 bool uses_primid;
540 bool uses_frontface;
541 bool uses_invocationid;
542 bool uses_thread_id[3];
543 bool uses_block_id[3];
544 bool uses_variable_block_size;
545 bool uses_grid_size;
546 bool uses_tg_size;
547 bool uses_atomic_ordered_add;
548 bool writes_position;
549 bool writes_psize;
550 bool writes_clipvertex;
551 bool writes_primid;
552 bool writes_viewport_index;
553 bool writes_layer;
554 bool uses_bindless_samplers;
555 bool uses_bindless_images;
556 bool uses_indirect_descriptor;
557 bool has_divergent_loop;
558 bool uses_sampleid;
559 bool uses_layer_id;
560 bool has_non_uniform_tex_access;
561 bool has_shadow_comparison;
562
563 bool uses_vmem_sampler_or_bvh;
564 bool uses_vmem_load_other; /* all other VMEM loads and atomics with return */
565
566 /** Whether all codepaths write tess factors in all invocations. */
567 bool tessfactors_are_def_in_all_invocs;
568
569 /* A flag to check if vrs2x2 can be enabled to reduce number of
570 * fragment shader invocations if flat shading.
571 */
572 bool allow_flat_shading;
573
574 /* Optimization: if the texture bound to this texunit has been cleared to 1,
575 * then the draw can be skipped (see si_draw_vbo_skip_noop). Initially the
576 * value is 0xff (undetermined) and can be later changed to 0 (= false) or
577 * texunit + 1.
578 */
579 uint8_t writes_1_if_tex_is_1;
580
581 /* frag coord and sample pos per component read mask. */
582 uint8_t reads_frag_coord_mask;
583 uint8_t reads_sample_pos_mask;
584 };
585
/* A shader selector is a gallium CSO and contains shader variants and
 * binaries for one NIR program. This can be shared by multiple contexts.
 */
struct si_shader_selector {
   struct util_live_shader base;
   struct si_screen *screen;
   struct util_queue_fence ready;
   struct si_compiler_ctx_state compiler_ctx_state;
   gl_shader_stage stage;

   /* Variant storage. NOTE(review): presumably "mutex" guards these arrays and keys[i]
    * is the key of variants[i] - confirm against the code that adds variants.
    */
   simple_mtx_t mutex;
   union si_shader_key *keys;
   unsigned variants_count;
   unsigned variants_max_count;
   struct si_shader **variants;

   /* The compiled NIR shader without a prolog and/or epilog (not
    * uploaded to a buffer object).
    *
    * [0] for wave32, [1] for wave64.
    */
   struct si_shader *main_shader_part[2];
   struct si_shader *main_shader_part_ls[2];     /* as_ls is set in the key */
   struct si_shader *main_shader_part_es;        /* as_es && !as_ngg in the key */
   struct si_shader *main_shader_part_ngg[2];    /* !as_es && as_ngg in the key */
   struct si_shader *main_shader_part_ngg_es[2]; /* as_es && as_ngg in the key */

   /* The NIR program, plus a binary blob with its size in nir_size.
    * NOTE(review): nir_binary looks like a serialized form of the NIR - confirm.
    */
   struct nir_shader *nir;
   void *nir_binary;
   unsigned nir_size;

   struct si_shader_info info;

   uint8_t const_and_shader_buf_descriptors_index;
   uint8_t sampler_and_images_descriptors_index;
   uint8_t cs_shaderbufs_sgpr_index;
   uint8_t cs_num_shaderbufs_in_user_sgprs;
   uint8_t cs_images_sgpr_index;
   uint8_t cs_images_num_sgprs;
   uint8_t cs_num_images_in_user_sgprs;
   unsigned ngg_cull_vert_threshold; /* UINT32_MAX = disabled */
   enum mesa_prim rast_prim;

   /* GS parameters. */
   bool tess_turns_off_ngg;

   /* bitmasks of used descriptor slots */
   uint64_t active_const_and_shader_buffers;
   uint64_t active_samplers_and_images;
};
636
637 /* Valid shader configurations:
638 *
639 * API shaders VS | TCS | TES | GS |pass| PS
640 * are compiled as: | | | |thru|
641 * | | | | |
642 * Only VS & PS: VS | | | | | PS
643 * GFX6 - with GS: ES | | | GS | VS | PS
644 * - with tess: LS | HS | VS | | | PS
645 * - with both: LS | HS | ES | GS | VS | PS
646 * GFX9 - with GS: -> | | | GS | VS | PS
647 * - with tess: -> | HS | VS | | | PS
648 * - with both: -> | HS | -> | GS | VS | PS
649 * | | | | |
650 * NGG - VS & PS: GS | | | | | PS
651 * (GFX10+) - with GS: -> | | | GS | | PS
652 * - with tess: -> | HS | GS | | | PS
653 * - with both: -> | HS | -> | GS | | PS
654 *
655 * -> = merged with the next stage
656 */
657
658 /* Use the byte alignment for all following structure members for optimal
659 * shader key memory footprint.
660 */
661 #pragma pack(push, 1)
662
/* Common PS bits between the shader key and the prolog key.
 *
 * 12 bits in total. Embedded in si_shader_part_key::ps_prolog.states and
 * si_shader_key_ps::part.prolog.
 */
struct si_ps_prolog_bits {
   unsigned color_two_side : 1;
   unsigned flatshade_colors : 1;
   unsigned poly_stipple : 1;
   unsigned force_persp_sample_interp : 1;
   unsigned force_linear_sample_interp : 1;
   unsigned force_persp_center_interp : 1;
   unsigned force_linear_center_interp : 1;
   unsigned bc_optimize_for_persp : 1;
   unsigned bc_optimize_for_linear : 1;
   unsigned samplemask_log_ps_iter : 3;
};
676
/* Common PS bits between the shader key and the epilog key.
 *
 * Embedded in si_shader_part_key::ps_epilog.states and si_shader_key_ps::part.epilog.
 */
struct si_ps_epilog_bits {
   unsigned spi_shader_col_format; /* full 32-bit word, not a bit-field */
   unsigned color_is_int8 : 8;
   unsigned color_is_int10 : 8;
   unsigned last_cbuf : 3;
   unsigned alpha_func : 3;
   unsigned alpha_to_one : 1;
   unsigned alpha_to_coverage_via_mrtz : 1; /* gfx11+ or alpha_to_one */
   unsigned clamp_color : 1;
   unsigned dual_src_blend_swizzle : 1; /* gfx11+ */
   unsigned rbplus_depth_only_opt:1;
   unsigned kill_z:1;
   unsigned kill_stencil:1;
   unsigned kill_samplemask:1;
};
693
/* Key of a separately compiled shader part. The two members overlay each other:
 * ps_prolog for a pixel shader prolog, ps_epilog for a pixel shader epilog.
 */
union si_shader_part_key {
   struct {
      struct si_ps_prolog_bits states;
      unsigned use_aco : 1;
      unsigned wave32 : 1;
      unsigned num_input_sgprs : 6;
      /* Color interpolation and two-side color selection. */
      unsigned colors_read : 8;       /* color input components read */
      unsigned num_interp_inputs : 5; /* BCOLOR is at this location */
      unsigned num_fragcoord_components : 3;
      unsigned wqm : 1;
      /* NOTE(review): plain "char" has implementation-defined signedness, while
       * si_shader_info::color_attr_index is uint8_t - confirm the stored range.
       */
      char color_attr_index[2];
      signed char color_interp_vgpr_index[2]; /* -1 == constant */
   } ps_prolog;
   struct {
      struct si_ps_epilog_bits states;
      unsigned use_aco : 1;
      unsigned wave32 : 1;
      unsigned uses_discard : 1;
      unsigned colors_written : 8;
      unsigned color_types : 16;
      unsigned writes_z : 1;
      unsigned writes_stencil : 1;
      unsigned writes_samplemask : 1;
   } ps_epilog;
};
720
/* The shader key for geometry stages (VS, TCS, TES, GS)
 *
 * It has three groups: "part" (prolog and epilog flags), "mono" (monolithic compilation
 * only), and "opt" (optimizations for asynchronous compilation only), plus the as_* bits.
 */
struct si_shader_key_ge {
   /* Prolog and epilog flags. */
   union {
      struct {
         struct si_shader_selector *ls; /* for merged LS-HS */
      } tcs; /* tessellation control shader */
      struct {
         struct si_shader_selector *es; /* for merged ES-GS */
      } gs;
   } part;

   /* These three are initially set according to the NEXT_SHADER property,
    * or guessed if the property doesn't seem correct.
    */
   unsigned as_es : 1;  /* whether it's a shader before GS */
   unsigned as_ls : 1;  /* whether it's VS before TCS */
   unsigned as_ngg : 1; /* whether it's the last GE stage and NGG is enabled,
                           also set for the stage right before GS */

   /* Flags for monolithic compilation only. */
   struct {
      /* - If neither "is_one" nor "is_fetched" has a bit set, the instance
       *   divisor is 0.
       * - If "is_one" has a bit set, the instance divisor is 1.
       * - If "is_fetched" has a bit set, the instance divisor will be loaded
       *   from the constant buffer.
       */
      uint16_t instance_divisor_is_one;     /* bitmask of inputs */
      uint16_t instance_divisor_is_fetched; /* bitmask of inputs */

      /* Whether fetch should be opencoded according to vs_fix_fetch.
       * Otherwise, if vs_fix_fetch is non-zero, buffer_load_format_xyzw
       * with minimal fixups is used. */
      uint16_t vs_fetch_opencode;
      union si_vs_fix_fetch vs_fix_fetch[SI_MAX_ATTRIBS];

      /* The two flags share one bit because they apply to different stages. */
      union {
         /* When PS needs PrimID and GS is disabled. */
         unsigned vs_export_prim_id : 1;    /* VS and TES only */
         unsigned gs_tri_strip_adj_fix : 1; /* GS only */
      } u;

      /* Gfx12: When no streamout buffers are bound, streamout must be disabled. */
      unsigned remove_streamout : 1;
   } mono;

   /* Optimization flags for asynchronous compilation only. */
   struct {
      /* For HW VS (it can be VS, TES, GS) */
      uint64_t kill_outputs; /* "get_unique_index" bits */
      unsigned kill_clip_distances : 8;
      unsigned kill_pointsize : 1;
      unsigned kill_layer : 1;
      unsigned remove_streamout : 1;

      /* For NGG VS and TES. */
      unsigned ngg_culling : 11; /* SI_NGG_CULL_* */

      /* If NGG VS streamout knows the number of vertices per primitive at compile time,
       * it can put stores for all vertices in the same VMEM clause, instead of storing
       * vertices for the 2nd and 3rd vertex conditionally because the primitive type is
       * unknown.
       */
      unsigned ngg_vs_streamout_num_verts_per_prim : 2;

      /* For shaders where monolithic variants have better code.
       *
       * This is a flag that has no effect on code generation,
       * but forces monolithic shaders to be used as soon as
       * possible, because it's in the "opt" group.
       */
      unsigned prefer_mono : 1;

      /* VS and TCS have the same number of patch vertices. */
      unsigned same_patch_vertices:1;

      /* For TCS. */
      unsigned tes_prim_mode : 2;
      unsigned tes_reads_tess_factors : 1;

      unsigned inline_uniforms:1;

      /* This must be kept last to limit the number of variants
       * depending only on the uniform values.
       */
      uint32_t inlined_uniform_values[MAX_INLINABLE_UNIFORMS];
   } opt;
};
810
811 struct si_shader_key_ps {
812 struct {
813 /* Prolog and epilog flags. */
814 struct si_ps_prolog_bits prolog;
815 struct si_ps_epilog_bits epilog;
816 } part;
817
818 /* Flags for monolithic compilation only. */
819 struct {
820 unsigned poly_line_smoothing : 1;
821 unsigned point_smoothing : 1;
822 unsigned interpolate_at_sample_force_center : 1;
823 unsigned fbfetch_msaa : 1;
824 unsigned fbfetch_is_1D : 1;
825 unsigned fbfetch_layered : 1;
826 } mono;
827
828 /* Optimization flags for asynchronous compilation only. */
829 struct {
830 /* For shaders where monolithic variants have better code.
831 *
832 * This is a flag that has no effect on code generation,
833 * but forces monolithic shaders to be used as soon as
834 * possible, because it's in the "opt" group.
835 */
836 unsigned prefer_mono : 1;
837 unsigned inline_uniforms:1;
838
839 /* This eliminates the FRONT_FACE input VGPR as well as shader code using it. */
840 int force_front_face_input : 2; /* 0 = gl_FrontFacing, 1 = true, -1 = false */
841
842 /* This must be kept last to limit the number of variants
843 * depending only on the uniform values.
844 */
845 uint32_t inlined_uniform_values[MAX_INLINABLE_UNIFORMS];
846 } opt;
847 };
848
/* A shader key: the "ge" and "ps" members overlay each other. */
union si_shader_key {
   struct si_shader_key_ge ge; /* geometry engine shaders */
   struct si_shader_key_ps ps;
};
853
854 /* Restore the pack alignment to default. */
855 #pragma pack(pop)
856
/* GCN-specific shader info. */
struct si_shader_binary_info {
   /* Per-varying-slot tables, indexed up to NUM_TOTAL_VARYING_SLOTS. */
   uint8_t vs_output_param_offset[NUM_TOTAL_VARYING_SLOTS];
   uint32_t vs_output_ps_input_cntl[NUM_TOTAL_VARYING_SLOTS];
   /* PS inputs; num_ps_inputs is presumably the number of valid entries (confirm). */
   union si_input_info ps_inputs[SI_NUM_INTERP];
   uint8_t num_ps_inputs;
   uint8_t ps_colors_read;
   uint8_t num_input_sgprs;
   uint8_t num_input_vgprs;
   bool uses_vmem_load_other; /* all other VMEM loads and atomics with return */
   bool uses_vmem_sampler_or_bvh;
   uint8_t num_fragcoord_components;
   bool uses_instanceid;
   uint8_t nr_pos_exports;
   uint8_t nr_param_exports;
   unsigned private_mem_vgprs;
   unsigned max_simd_waves;
};
875
/* Tells how the code buffer of a si_shader_binary is stored: ELF or raw. */
enum si_shader_binary_type {
   SI_SHADER_BINARY_ELF,
   SI_SHADER_BINARY_RAW,
};
880
881 struct si_shader_binary {
882 enum si_shader_binary_type type;
883
884 /* Depends on binary type, either ELF or raw buffer. */
885 const char *code_buffer;
886 size_t code_size;
887 uint32_t exec_size;
888
889 char *uploaded_code;
890 size_t uploaded_code_size;
891
892 char *llvm_ir_string;
893
894 const char *disasm_string;
895 size_t disasm_size;
896
897 const unsigned *symbols;
898 unsigned num_symbols;
899 };
900
/* Subgroup sizing values filled in by gfx9_get_gs_info(). */
struct gfx9_gs_info {
   unsigned es_verts_per_subgroup;
   unsigned gs_prims_per_subgroup;
   unsigned gs_inst_prims_in_subgroup;
   unsigned max_prims_per_subgroup;
   unsigned esgs_ring_size; /* in bytes */
};
908
909 struct si_shader {
910 struct si_pm4_state pm4; /* base class */
911 struct si_compiler_ctx_state compiler_ctx_state;
912
913 struct si_shader_selector *selector;
914 struct si_shader_selector *previous_stage_sel; /* for refcounting */
915 struct si_shader *next_shader; /* Only used during compilation of LS and ES when merged. */
916
917 struct si_shader_part *prolog;
918 struct si_shader *previous_stage; /* for GFX9 */
919 struct si_shader_part *epilog;
920 struct si_shader *gs_copy_shader;
921
922 struct si_resource *bo;
923 /* gpu_address should be bo->gpu_address except if SQTT is
924 * in use.
925 */
926 uint64_t gpu_address;
927 /* Only used on GFX6-10 where the scratch address must be inserted into the shader binary.
928 * This is the scratch address that the current shader binary contains.
929 */
930 uint64_t scratch_va;
931 union si_shader_key key;
932 struct util_queue_fence ready;
933 bool compilation_failed;
934 bool is_monolithic;
935 bool is_optimized;
936 bool is_binary_shared;
937 bool is_gs_copy_shader;
938 uint8_t wave_size;
939 unsigned complete_shader_binary_size;
940
941 /* The following data is all that's needed for binary shaders. */
942 struct si_shader_binary binary;
943 struct ac_shader_config config;
944 struct si_shader_binary_info info;
945
946 /* SI_SGPR_VS_STATE_BITS */
947 bool uses_vs_state_provoking_vertex;
948 bool uses_gs_state_outprim;
949
950 bool uses_base_instance;
951
952 /* Shader key + LLVM IR + disassembly + statistics.
953 * Generated for debug contexts only.
954 */
955 char *shader_log;
956 size_t shader_log_size;
957
958 struct gfx9_gs_info gs_info;
959
960 /* Precomputed register values. */
961 union {
962 struct {
963 unsigned vgt_gsvs_ring_offset_1;
964 unsigned vgt_gsvs_ring_offset_2;
965 unsigned vgt_gsvs_ring_offset_3;
966 unsigned vgt_gsvs_ring_itemsize;
967 unsigned vgt_gs_max_vert_out;
968 unsigned vgt_gs_vert_itemsize;
969 unsigned vgt_gs_vert_itemsize_1;
970 unsigned vgt_gs_vert_itemsize_2;
971 unsigned vgt_gs_vert_itemsize_3;
972 unsigned vgt_gs_instance_cnt;
973 unsigned vgt_gs_onchip_cntl;
974 unsigned vgt_gs_max_prims_per_subgroup;
975 unsigned vgt_esgs_ring_itemsize;
976 unsigned spi_shader_pgm_rsrc3_gs;
977 unsigned spi_shader_pgm_rsrc4_gs;
978 } gs;
979
980 struct {
981 /* Computed by gfx10_ngg_calculate_subgroup_info. */
982 uint16_t ngg_emit_size; /* in dwords */
983 uint16_t hw_max_esverts;
984 uint16_t max_gsprims;
985 uint16_t max_out_verts;
986 bool max_vert_out_per_gs_instance;
987 /* Register values. */
988 unsigned ge_max_output_per_subgroup;
989 unsigned ge_ngg_subgrp_cntl;
990 unsigned vgt_primitiveid_en;
991 unsigned vgt_gs_onchip_cntl;
992 unsigned vgt_gs_instance_cnt;
993 unsigned esgs_vertex_stride;
994 unsigned spi_vs_out_config;
995 unsigned spi_shader_pos_format;
996 unsigned pa_cl_vte_cntl;
997 unsigned vgt_gs_max_vert_out; /* for API GS */
998 unsigned ge_pc_alloc; /* uconfig register */
999 unsigned spi_shader_pgm_rsrc3_gs;
1000 unsigned spi_shader_pgm_rsrc4_gs;
1001 unsigned vgt_shader_stages_en;
1002 } ngg;
1003
1004 struct {
1005 unsigned vgt_gs_mode;
1006 unsigned vgt_primitiveid_en;
1007 unsigned vgt_reuse_off;
1008 unsigned spi_vs_out_config;
1009 unsigned spi_shader_pos_format;
1010 unsigned pa_cl_vte_cntl;
1011 unsigned ge_pc_alloc; /* uconfig register */
1012 } vs;
1013
1014 struct {
1015 unsigned spi_ps_input_ena;
1016 unsigned spi_ps_input_addr;
1017 unsigned spi_baryc_cntl;
1018 unsigned spi_ps_in_control;
1019 unsigned spi_shader_z_format;
1020 unsigned spi_shader_col_format;
1021 unsigned cb_shader_mask;
1022 unsigned db_shader_control;
1023 unsigned num_interp;
1024 unsigned spi_gs_out_config_ps;
1025 unsigned pa_sc_hisz_control;
1026 bool writes_z;
1027 bool writes_stencil;
1028 bool writes_samplemask;
1029 } ps;
1030 };
1031
1032 /* Precomputed register values. */
1033 unsigned vgt_tf_param; /* VGT_TF_PARAM */
1034 unsigned vgt_vertex_reuse_block_cntl; /* VGT_VERTEX_REUSE_BLOCK_CNTL */
1035 unsigned pa_cl_vs_out_cntl;
1036 unsigned ge_cntl;
1037 };
1038
1039 struct si_shader_part {
1040 struct si_shader_part *next;
1041 union si_shader_part_key key;
1042 struct si_shader_binary binary;
1043 struct ac_shader_config config;
1044 };
1045
1046 /* si_shader.c */
1047 struct ac_rtld_binary;
1048
1049 void si_update_shader_binary_info(struct si_shader *shader, struct nir_shader *nir);
1050 bool si_compile_shader(struct si_screen *sscreen, struct ac_llvm_compiler *compiler,
1051 struct si_shader *shader, struct util_debug_callback *debug);
1052 bool si_create_shader_variant(struct si_screen *sscreen, struct ac_llvm_compiler *compiler,
1053 struct si_shader *shader, struct util_debug_callback *debug);
1054 void si_shader_destroy(struct si_shader *shader);
1055 unsigned si_shader_io_get_unique_index(unsigned semantic);
1056 int si_shader_binary_upload(struct si_screen *sscreen, struct si_shader *shader,
1057 uint64_t scratch_va);
1058 int si_shader_binary_upload_at(struct si_screen *sscreen, struct si_shader *shader,
1059 uint64_t scratch_va, int64_t bo_offset);
1060 bool si_can_dump_shader(struct si_screen *sscreen, gl_shader_stage stage,
1061 enum si_shader_dump_type dump_type);
1062 void si_shader_dump(struct si_screen *sscreen, struct si_shader *shader,
1063 struct util_debug_callback *debug, FILE *f, bool check_debug_option);
1064 void si_shader_dump_stats_for_shader_db(struct si_screen *screen, struct si_shader *shader,
1065 struct util_debug_callback *debug);
1066 void si_multiwave_lds_size_workaround(struct si_screen *sscreen, unsigned *lds_size);
1067 const char *si_get_shader_name(const struct si_shader *shader);
1068 void si_shader_binary_clean(struct si_shader_binary *binary);
1069 struct nir_shader *si_deserialize_shader(struct si_shader_selector *sel);
1070 unsigned si_get_ps_num_interp(struct si_shader *ps);
1071 unsigned si_get_shader_prefetch_size(struct si_shader *shader);
1072 unsigned si_get_shader_binary_size(struct si_screen *screen, struct si_shader *shader);
1073
1074 /* si_shader_info.c */
1075 void si_nir_scan_shader(struct si_screen *sscreen, struct nir_shader *nir,
1076 struct si_shader_info *info);
1077
1078 /* si_shader_nir.c */
1079 void si_lower_mediump_io(struct nir_shader *nir);
1080
1081 bool si_alu_to_scalar_packed_math_filter(const struct nir_instr *instr, const void *data);
1082 void si_nir_opts(struct si_screen *sscreen, struct nir_shader *nir, bool first);
1083 void si_nir_late_opts(struct nir_shader *nir);
1084 char *si_finalize_nir(struct pipe_screen *screen, struct nir_shader *nir);
1085
1086 /* si_state_shaders.cpp */
1087 unsigned si_shader_num_alloc_param_exports(struct si_shader *shader);
1088 unsigned si_determine_wave_size(struct si_screen *sscreen, struct si_shader *shader);
1089 void gfx9_get_gs_info(struct si_shader_selector *es, struct si_shader_selector *gs,
1090 struct gfx9_gs_info *out);
1091 bool gfx10_is_ngg_passthrough(struct si_shader *shader);
1092 unsigned si_shader_lshs_vertex_stride(struct si_shader *ls);
1093 bool si_should_clear_lds(struct si_screen *sscreen, const struct nir_shader *shader);
1094 unsigned si_get_output_prim_simplified(const struct si_shader_selector *sel,
1095 const union si_shader_key *key);
1096
1097 /* Inline helpers. */
1098
1099 /* Return the pointer to the main shader part's pointer. */
si_get_main_shader_part(struct si_shader_selector * sel,const union si_shader_key * key,unsigned wave_size)1100 static inline struct si_shader **si_get_main_shader_part(struct si_shader_selector *sel,
1101 const union si_shader_key *key,
1102 unsigned wave_size)
1103 {
1104 assert(wave_size == 32 || wave_size == 64);
1105 unsigned index = wave_size / 32 - 1;
1106
1107 if (sel->stage <= MESA_SHADER_GEOMETRY) {
1108 if (key->ge.as_ls)
1109 return &sel->main_shader_part_ls[index];
1110 if (key->ge.as_es && key->ge.as_ngg)
1111 return &sel->main_shader_part_ngg_es[index];
1112 if (key->ge.as_es) {
1113 /* legacy GS only support wave 64 */
1114 assert(wave_size == 64);
1115 return &sel->main_shader_part_es;
1116 }
1117 if (key->ge.as_ngg)
1118 return &sel->main_shader_part_ngg[index];
1119 }
1120 return &sel->main_shader_part[index];
1121 }
1122
gfx10_has_variable_edgeflags(struct si_shader * shader)1123 static inline bool gfx10_has_variable_edgeflags(struct si_shader *shader)
1124 {
1125 unsigned output_prim = si_get_output_prim_simplified(shader->selector, &shader->key);
1126
1127 return shader->selector->stage == MESA_SHADER_VERTEX &&
1128 (output_prim == MESA_PRIM_TRIANGLES || output_prim == MESA_PRIM_UNKNOWN);
1129 }
1130
si_shader_uses_streamout(const struct si_shader * shader)1131 static inline bool si_shader_uses_streamout(const struct si_shader *shader)
1132 {
1133 return shader->selector->stage <= MESA_SHADER_GEOMETRY &&
1134 shader->selector->info.enabled_streamout_buffer_mask &&
1135 !shader->key.ge.opt.remove_streamout &&
1136 !shader->key.ge.mono.remove_streamout;
1137 }
1138
si_shader_uses_discard(struct si_shader * shader)1139 static inline bool si_shader_uses_discard(struct si_shader *shader)
1140 {
1141 /* Changes to this should also update ps_modifies_zs. */
1142 return shader->selector->info.base.fs.uses_discard ||
1143 shader->key.ps.part.prolog.poly_stipple ||
1144 shader->key.ps.mono.point_smoothing ||
1145 shader->key.ps.part.epilog.alpha_func != PIPE_FUNC_ALWAYS;
1146 }
1147
si_shader_culling_enabled(struct si_shader * shader)1148 static inline bool si_shader_culling_enabled(struct si_shader *shader)
1149 {
1150 /* Legacy VS/TES/GS and ES don't cull in the shader. */
1151 if (!shader->key.ge.as_ngg || shader->key.ge.as_es) {
1152 assert(!shader->key.ge.opt.ngg_culling);
1153 return false;
1154 }
1155
1156 if (shader->key.ge.opt.ngg_culling)
1157 return true;
1158
1159 unsigned output_prim = si_get_output_prim_simplified(shader->selector, &shader->key);
1160
1161 /* This enables NGG culling for non-monolithic TES and GS. */
1162 return shader->selector->ngg_cull_vert_threshold == 0 &&
1163 (output_prim == MESA_PRIM_TRIANGLES || output_prim == MESA_PRIM_LINES);
1164 }
1165
1166 #ifdef __cplusplus
1167 }
1168 #endif
1169
1170 #endif
1171