1 /*
2 * Copyright 2012 Advanced Micro Devices, Inc.
3 *
4 * SPDX-License-Identifier: MIT
5 */
6
7 /* The compiler middle-end architecture: Explaining (non-)monolithic shaders
8 * -------------------------------------------------------------------------
9 *
10 * Typically, there is one-to-one correspondence between API and HW shaders,
11 * that is, for every API shader, there is exactly one shader binary in
12 * the driver.
13 *
14 * The problem with that is that we also have to emulate some API states
15 * (e.g. alpha-test, and many others) in shaders too. The two obvious ways
16 * to deal with it are:
17 * - each shader has multiple variants for each combination of emulated states,
18 * and the variants are compiled on demand, possibly relying on a shader
19 * cache for good performance
20 * - patch shaders at the binary level
21 *
22 * This driver uses something completely different. The emulated states are
23 * usually implemented at the beginning or end of shaders. Therefore, we can
24 * split the shader into 3 parts:
25 * - prolog part (shader code dependent on states)
26 * - main part (the API shader)
27 * - epilog part (shader code dependent on states)
28 *
29 * Each part is compiled as a separate shader and the final binaries are
30 * concatenated. This type of shader is called non-monolithic, because it
31 * consists of multiple independent binaries. Creating a new shader variant
32 * is therefore only a concatenation of shader parts (binaries) and doesn't
33 * involve any compilation. The main shader parts are the only parts that are
34 * compiled when applications create shader objects. The prolog and epilog
35 * parts are compiled on the first use and saved, so that their binaries can
36 * be reused by many other shaders.
37 *
38 * One of the roles of the prolog part is to compute vertex buffer addresses
39 * for vertex shaders. A few of the roles of the epilog part are color buffer
40 * format conversions in pixel shaders that we have to do manually, and write
41 * tessellation factors in tessellation control shaders. The prolog and epilog
42 * have many other important responsibilities in various shader stages.
43 * They don't just "emulate legacy stuff".
44 *
45 * Monolithic shaders are shaders where the parts are combined before LLVM
46 * compilation, and the whole thing is compiled and optimized as one unit with
47 * one binary on the output. The result is the same as the non-monolithic
48 * shader, but the final code can be better, because LLVM can optimize across
49 * all shader parts. Monolithic shaders aren't usually used except for these
50 * special cases:
51 *
52 * 1) Some rarely-used states require modification of the main shader part
53 * itself, and in such cases, only the monolithic shader variant is
54 * compiled, and that's always done on the first use.
55 *
56 * 2) When we do cross-stage optimizations for separate shader objects and
57 * e.g. eliminate unused shader varyings, the resulting optimized shader
58 * variants are always compiled as monolithic shaders, and always
59 * asynchronously (i.e. not stalling ongoing rendering). We call them
60 * "optimized monolithic" shaders. The important property here is that
61 * the non-monolithic unoptimized shader variant is always available for use
62 * when the asynchronous compilation of the optimized shader is not done
63 * yet.
64 *
65 * Starting with GFX9 chips, some shader stages are merged, and the number of
66 * shader parts per shader increased. The complete new list of shader parts is:
67 * - 1st shader: prolog part
68 * - 1st shader: main part
69 * - 2nd shader: main part
70 * - 2nd shader: epilog part
71 */
72
73 /* How linking shader inputs and outputs between vertex, tessellation, and
74 * geometry shaders works.
75 *
76 * Inputs and outputs between shaders are stored in a buffer. This buffer
77 * lives in LDS (typical case for tessellation), but it can also live
78 * in memory (ESGS). Each input or output has a fixed location within a vertex.
79 * The highest used input or output determines the stride between vertices.
80 *
81 * Since GS and tessellation are only possible in the OpenGL core profile,
82 * only these semantics are valid for per-vertex data:
83 *
84 * Name Location
85 *
86 * POSITION 0
87 * VAR0..31 1..32
88 * CLIP_DIST0..1 49..50
89 * PSIZ 51
90 *
91 * For example, a shader only writing VAR0 has the output stride of 2 (locations 0..1).
92 *
93 * Only these semantics are valid for per-patch data:
94 *
95 * Name Location
96 *
97 * TESSOUTER 0
98 * TESSINNER 1
99 * PATCH0..29 2..31
100 *
101 * That's how independent shaders agree on input and output locations.
102 * The si_shader_io_get_unique_index function assigns the locations.
103 *
104 * For tessellation, other required information for calculating the input and
105 * output addresses like the vertex stride, the patch stride, and the offsets
106 * where per-vertex and per-patch data start, is passed to the shader via
107 * user data SGPRs. The offsets and strides are calculated at draw time and
108 * aren't available at compile time.
109 */
110
111 #ifndef SI_SHADER_H
112 #define SI_SHADER_H
113
114 #include "shader_info.h"
115 #include "ac_binary.h"
116 #include "ac_gpu_info.h"
117 #include "util/mesa-sha1.h"
118 #include "util/u_live_shader_cache.h"
119 #include "util/u_queue.h"
120 #include "si_pm4.h"
121
122 #ifdef __cplusplus
123 extern "C" {
124 #endif
125
126 struct nir_shader;
127 struct nir_instr;
128 struct nir_lower_subgroups_options;
129
/* Fixed sizes and masks used to dimension arrays and bitfields in this header. */
#define SI_NUM_INTERP           32
#define SI_MAX_ATTRIBS          16
#define SI_MAX_VS_OUTPUTS       40
#define SI_USER_CLIP_PLANE_MASK 0x3F

/* An additional interpolation mode placed right after the last INTERP_MODE_* value. */
#define INTERP_MODE_COLOR INTERP_MODE_COUNT

/* Preset values built from the S_028644_* register field encoders.
 * Both use OFFSET(0x20); they differ only in DEFAULT_VAL (0 vs. 3).
 */
#define SI_PS_INPUT_CNTL_0000   (S_028644_OFFSET(0x20) | S_028644_DEFAULT_VAL(0))
#define SI_PS_INPUT_CNTL_0001   (S_028644_OFFSET(0x20) | S_028644_DEFAULT_VAL(3))
/* Value used for a PS input that nothing provides. */
#define SI_PS_INPUT_CNTL_UNUSED SI_PS_INPUT_CNTL_0000
/* D3D9 behaviour for COLOR0 requires 0001. GL is undefined. */
#define SI_PS_INPUT_CNTL_UNUSED_COLOR0 SI_PS_INPUT_CNTL_0001

/* Vector argument flags: bit 0 marks a color argument, bits 1..3 carry the
 * color component (3 bits, packed/unpacked by the two helpers below).
 */
#define SI_VECTOR_ARG_IS_COLOR               BITFIELD_BIT(0)
#define SI_VECTOR_ARG_COLOR_COMPONENT(x)     (((x) & 0x7) << 1)
#define SI_GET_VECTOR_ARG_COLOR_COMPONENT(x) (((x) >> 1) & 0x7)
146
147 /* SGPR user data indices */
148 enum
149 {
150 SI_SGPR_INTERNAL_BINDINGS,
151 SI_SGPR_BINDLESS_SAMPLERS_AND_IMAGES,
152 SI_SGPR_CONST_AND_SHADER_BUFFERS, /* or just a constant buffer 0 pointer */
153 SI_SGPR_SAMPLERS_AND_IMAGES,
154 SI_NUM_RESOURCE_SGPRS,
155
156 /* API VS, TES without GS, GS copy shader */
157 SI_SGPR_VS_STATE_BITS = SI_NUM_RESOURCE_SGPRS,
158 SI_NUM_VS_STATE_RESOURCE_SGPRS,
159
160 /* all VS variants */
161 SI_SGPR_BASE_VERTEX = SI_NUM_VS_STATE_RESOURCE_SGPRS,
162 SI_SGPR_DRAWID,
163 SI_SGPR_START_INSTANCE,
164 SI_VS_NUM_USER_SGPR,
165
166 SI_SGPR_VS_BLIT_DATA = SI_SGPR_CONST_AND_SHADER_BUFFERS,
167
168 /* TES */
169 SI_SGPR_TES_OFFCHIP_LAYOUT = SI_NUM_VS_STATE_RESOURCE_SGPRS,
170 SI_SGPR_TES_OFFCHIP_ADDR,
171 SI_TES_NUM_USER_SGPR,
172
173 /* GFX6-8: TCS only */
174 GFX6_SGPR_TCS_OFFCHIP_LAYOUT = SI_NUM_RESOURCE_SGPRS,
175 GFX6_SGPR_TCS_OFFCHIP_ADDR,
176 GFX6_SGPR_TCS_IN_LAYOUT,
177 GFX6_TCS_NUM_USER_SGPR,
178
179 /* GFX9: Merged LS-HS (VS-TCS) only. */
180 GFX9_SGPR_TCS_OFFCHIP_LAYOUT = SI_VS_NUM_USER_SGPR,
181 GFX9_SGPR_TCS_OFFCHIP_ADDR,
182 GFX9_TCS_NUM_USER_SGPR,
183
184 /* GS limits */
185 GFX6_GS_NUM_USER_SGPR = SI_NUM_RESOURCE_SGPRS,
186 SI_GSCOPY_NUM_USER_SGPR = SI_NUM_VS_STATE_RESOURCE_SGPRS,
187
188 GFX9_SGPR_SMALL_PRIM_CULL_INFO = MAX2(SI_VS_NUM_USER_SGPR, SI_TES_NUM_USER_SGPR),
189 GFX9_SGPR_ATTRIBUTE_RING_ADDR,
190 GFX9_GS_NUM_USER_SGPR,
191
192 /* PS only */
193 SI_SGPR_ALPHA_REF = SI_NUM_RESOURCE_SGPRS,
194 SI_PS_NUM_USER_SGPR,
195
196 /* The value has to be 12, because the hw requires that descriptors
197 * are aligned to 4 SGPRs.
198 */
199 SI_SGPR_VS_VB_DESCRIPTOR_FIRST = 12,
200 };
201
/* LLVM function parameter indices.
 *
 * The first SI_NUM_RESOURCE_PARAMS indices are skipped; the pixel shader
 * parameters are numbered consecutively after them.
 */
enum
{
   SI_NUM_RESOURCE_PARAMS = 4,

   /* PS only parameters */
   SI_PARAM_ALPHA_REF = SI_NUM_RESOURCE_PARAMS,
   SI_PARAM_PRIM_MASK,
   SI_PARAM_PERSP_SAMPLE,
   SI_PARAM_PERSP_CENTER,
   SI_PARAM_PERSP_CENTROID,
   SI_PARAM_PERSP_PULL_MODEL,
   SI_PARAM_LINEAR_SAMPLE,
   SI_PARAM_LINEAR_CENTER,
   SI_PARAM_LINEAR_CENTROID,
   SI_PARAM_LINE_STIPPLE_TEX,
   SI_PARAM_POS_X_FLOAT,
   SI_PARAM_POS_Y_FLOAT,
   SI_PARAM_POS_Z_FLOAT,
   SI_PARAM_POS_W_FLOAT,
   SI_PARAM_FRONT_FACE,
   SI_PARAM_ANCILLARY,
   SI_PARAM_SAMPLE_COVERAGE,
   SI_PARAM_POS_FIXED_PT,

   /* Total count: one past the last index above, +8 for COLOR[0..1]. */
   SI_NUM_PARAMS = SI_PARAM_POS_FIXED_PT + 9,
};
229
/* Bit layout of the 32-bit "vs_state_bits" word. Every field is described by
 * a <NAME>__SHIFT / <NAME>__MASK pair, where the mask is relative to bit 0
 * (i.e. not yet shifted).
 */

/* Fields set only in current_vs_state in si_context (except INDEXED) and
 * readable by the shader via vs_state_bits in VS, TES, and GS.
 */
#define VS_STATE_CLAMP_VERTEX_COLOR__SHIFT   0
#define VS_STATE_CLAMP_VERTEX_COLOR__MASK    0x1 /* Shared by VS and GS */
#define VS_STATE_INDEXED__SHIFT              1
#define VS_STATE_INDEXED__MASK               0x1 /* Shared by VS and GS */

/* Fields set only in current_vs_state in si_context and readable by the
 * shader via vs_state_bits in LS/HS.
 */
/* bit gap */
/* TCS output patch0 offset for per-patch outputs / 4
 *   - 64 outputs are implied by SI_UNIQUE_SLOT_* values.
 *   - max = 32(CPs) * 64(outputs) * 16(vec4) * 64(num_patches) * 2(inputs + outputs) / 4
 *         = 1M, clamped to 32K(LDS limit) / 4 = 8K
 *   - only used by si_llvm_tcs_build_end, it can be removed after NIR lowering replaces it
 */
#define VS_STATE_TCS_OUT_PATCH0_OFFSET__SHIFT 10
#define VS_STATE_TCS_OUT_PATCH0_OFFSET__MASK  0x3fff
#define VS_STATE_LS_OUT_VERTEX_SIZE__SHIFT    24
#define VS_STATE_LS_OUT_VERTEX_SIZE__MASK     0xff /* max 32 * 4 + 1 (to reduce LDS bank conflicts) */

/* Fields set only in current_gs_state in si_context and readable by the
 * shader via vs_state_bits in legacy GS, the GS copy shader, and any NGG shader.
 */
/* bit gap */
/* The number of ES outputs is derived from the last output index of
 * SI_UNIQUE_SLOT_* + 1, which can be 55 at most. The ESGS vertex stride in
 * dwords is: NUM_ES_OUTPUTS * 4 + 1
 * Only used by GFX9+ to compute LDS addresses of GS inputs.
 */
#define GS_STATE_NUM_ES_OUTPUTS__SHIFT              13
#define GS_STATE_NUM_ES_OUTPUTS__MASK               0x3f
/* Small prim filter precision = num_samples / quant_mode, which can only be
 * equal to 1/2^n where n is between 4 and 12. Knowing that, only 4 bits of the
 * FP32 exponent need to be stored.
 *   Encode: value = (fui(num_samples / quant_mode) >> 23) & 0xf;
 *   Decode to FP32: ((0x70 | value) << 23);
 * With 0x70 = 112, this gives 2^(112 + value - 127) = 2^(value - 15), which is
 * always a negative exponent and equals 1/2^(15 - value).
 */
#define GS_STATE_SMALL_PRIM_PRECISION_NO_AA__SHIFT  19
#define GS_STATE_SMALL_PRIM_PRECISION_NO_AA__MASK   0xf
#define GS_STATE_SMALL_PRIM_PRECISION__SHIFT        23
#define GS_STATE_SMALL_PRIM_PRECISION__MASK         0xf
#define GS_STATE_STREAMOUT_QUERY_ENABLED__SHIFT     27
#define GS_STATE_STREAMOUT_QUERY_ENABLED__MASK      0x1
#define GS_STATE_PROVOKING_VTX_FIRST__SHIFT         28
#define GS_STATE_PROVOKING_VTX_FIRST__MASK          0x1
#define GS_STATE_OUTPRIM__SHIFT                     29
#define GS_STATE_OUTPRIM__MASK                      0x3
#define GS_STATE_PIPELINE_STATS_EMU__SHIFT          31
#define GS_STATE_PIPELINE_STATS_EMU__MASK           0x1
282
/* Helpers operating on <field>__SHIFT / <field>__MASK pairs.
 *
 * ENCODE_FIELD: truncate "value" to the field width and move it into place.
 * CLEAR_FIELD:  an AND-mask with all bits of the field cleared.
 */
#define ENCODE_FIELD(field, value) (((unsigned)(value) & field##__MASK) << field##__SHIFT)
#define CLEAR_FIELD(field)         (~((unsigned)field##__MASK << field##__SHIFT))

/* This is called by functions that change states.
 *
 * Replaces "field" inside "var" with "value". The assertion catches values
 * that do not fit into the field. Note that "var" and "value" are each
 * evaluated more than once, so they must not have side effects.
 */
#define SET_FIELD(var, field, value)                              \
   do {                                                           \
      assert((value) == ((unsigned)(value) & field##__MASK));     \
      (var) &= CLEAR_FIELD(field);                                \
      (var) |= ENCODE_FIELD(field, value);                        \
   } while (0)

/* This is called during shader compilation and returns LLVMValueRef.
 * The field is extracted from (ctx)->args->vs_state_bits; its width is the
 * number of set bits in the field's mask.
 */
#define GET_FIELD(ctx, field)                                              \
   si_unpack_param((ctx), (ctx)->args->vs_state_bits, field##__SHIFT,      \
                   util_bitcount(field##__MASK))
296
/* SGPR counts for the VS blit shader variants. */
enum
{
   /* These represent the number of SGPRs the shader uses. */
   SI_VS_BLIT_SGPRS_POS          = 3,
   SI_VS_BLIT_SGPRS_POS_COLOR    = 7,
   SI_VS_BLIT_SGPRS_POS_TEXCOORD = 9,

   /* Largest count above, +1 for the attribute ring address. */
   MAX_SI_VS_BLIT_SGPRS = 10,
};
306
/* NGG culling option bits: five single-bit flags in bits 0..4 followed by an
 * 8-bit clip plane enable mask in bits 5..12 (13 bits in total).
 */
#define SI_NGG_CULL_TRIANGLES                (1 << 0) /* this implies W, view.xy, and small prim culling */
#define SI_NGG_CULL_BACK_FACE                (1 << 1) /* back faces */
#define SI_NGG_CULL_FRONT_FACE               (1 << 2) /* front faces */
#define SI_NGG_CULL_LINES                    (1 << 3) /* the primitive type is lines */
#define SI_NGG_CULL_SMALL_LINES_DIAMOND_EXIT (1 << 4) /* cull small lines according to the diamond exit rule */
/* Pack / unpack the clip plane enable mask. */
#define SI_NGG_CULL_CLIP_PLANE_ENABLE(enable) (((enable) & 0xff) << 5)
#define SI_NGG_CULL_GET_CLIP_PLANE_ENABLE(x)  (((x) >> 5) & 0xff)
314
315 struct si_shader_profile {
316 uint32_t sha1[SHA1_DIGEST_LENGTH32];
317 uint32_t options;
318 };
319
320 extern struct si_shader_profile si_shader_profiles[];
321 unsigned si_get_num_shader_profiles(void);
322
/* Shader profile option bits (SI_PROFILE_*). Bit 2 is intentionally unused. */
#define SI_PROFILE_WAVE32                   (1 << 0)
#define SI_PROFILE_GFX10_WAVE64             (1 << 1)
/* bit gap */
#define SI_PROFILE_VS_NO_BINNING            (1 << 3)
#define SI_PROFILE_GFX9_GFX10_PS_NO_BINNING (1 << 4)
#define SI_PROFILE_CLAMP_DIV_BY_ZERO        (1 << 5)
329
/* Selects which piece of shader information a dump request refers to. */
enum si_shader_dump_type {
   SI_DUMP_SHADER_KEY,
   /* initial input NIR when shaders are created (before lowering) */
   SI_DUMP_INIT_NIR,
   /* final NIR after lowering when shader variants are created */
   SI_DUMP_NIR,
   /* initial LLVM IR before optimizations */
   SI_DUMP_INIT_LLVM_IR,
   /* final LLVM IR */
   SI_DUMP_LLVM_IR,
   /* initial ACO IR before optimizations */
   SI_DUMP_INIT_ACO_IR,
   /* final ACO IR */
   SI_DUMP_ACO_IR,
   /* final asm shaders */
   SI_DUMP_ASM,
   /* print statistics as shader-db */
   SI_DUMP_STATS,
   SI_DUMP_ALWAYS,
};
342
/* Unique IO slot indices used to agree on input/output locations between
 * independently compiled shaders.
 */
enum {
   SI_UNIQUE_SLOT_POS = 0,

   /* Some shader stages use the highest used IO index to determine the size
    * to allocate for inputs/outputs (in LDS, tess and GS rings), so VARn is
    * placed right after POSITION to make that size as small as possible.
    */
   SI_UNIQUE_SLOT_VAR0 = 1, /* 0..31 */

   /* Put 16-bit GLES varyings after 32-bit varyings. They can use the same
    * indices as legacy desktop GL varyings because they are mutually exclusive.
    */
   SI_UNIQUE_SLOT_VAR0_16BIT = 33, /* 0..15 */

   /* Legacy GL-only varyings can alias GLES-only 16-bit varyings. */
   SI_UNIQUE_SLOT_FOGC = 33,
   SI_UNIQUE_SLOT_COL0,
   SI_UNIQUE_SLOT_COL1,
   SI_UNIQUE_SLOT_BFC0,
   SI_UNIQUE_SLOT_BFC1,
   SI_UNIQUE_SLOT_TEX0,
   SI_UNIQUE_SLOT_TEX1,
   SI_UNIQUE_SLOT_TEX2,
   SI_UNIQUE_SLOT_TEX3,
   SI_UNIQUE_SLOT_TEX4,
   SI_UNIQUE_SLOT_TEX5,
   SI_UNIQUE_SLOT_TEX6,
   SI_UNIQUE_SLOT_TEX7,
   SI_UNIQUE_SLOT_CLIP_VERTEX,

   /* Varyings present in both GLES and desktop GL must start at 49, i.e.
    * after the 16-bit varyings (33..48).
    */
   SI_UNIQUE_SLOT_CLIP_DIST0 = 49,
   SI_UNIQUE_SLOT_CLIP_DIST1,
   SI_UNIQUE_SLOT_PSIZ,
   /* These can't be written by LS, HS, and ES. */
   SI_UNIQUE_SLOT_LAYER,
   SI_UNIQUE_SLOT_VIEWPORT,
   SI_UNIQUE_SLOT_PRIMITIVE_ID,
};
383
/**
 * For VS shader keys, describe any fixups required for vertex fetch.
 *
 * \ref log_size, \ref format, and the number of channels are interpreted as
 * by \ref ac_build_opencoded_load_format.
 *
 * The whole descriptor fits into one byte, which can be read or compared at
 * once through \ref bits.
 *
 * Note: all bits 0 (size = 1 byte, num channels = 1, format = float) is an
 * impossible format and indicates that no fixup is needed (just use
 * buffer_load_format_xyzw).
 */
union si_vs_fix_fetch {
   struct {
      /* log2 of the channel size: 1, 2, 4, or 8 bytes per channel */
      uint8_t log_size : 2;
      /* number of channels minus 1 */
      uint8_t num_channels_m1 : 2;
      /* AC_FETCH_FORMAT_xxx */
      uint8_t format : 3;
      /* reverse XYZ channels */
      uint8_t reverse : 1;
   } u;
   uint8_t bits;
};
403
404 struct si_shader;
405
406 /* State of the context creating the shader object. */
407 struct si_compiler_ctx_state {
408 /* Should only be used by si_init_shader_selector_async and
409 * si_build_shader_variant if thread_index == -1 (non-threaded). */
410 struct ac_llvm_compiler *compiler;
411
412 /* Used if thread_index == -1 or if debug.async is true. */
413 struct util_debug_callback debug;
414
415 /* Used for creating the log string for gallium/ddebug. */
416 bool is_debug_context;
417 };
418
/* Type of a color output. The values fit in 2 bits so that several of them
 * can be packed as bit pairs into one integer.
 */
enum si_color_output_type {
   SI_TYPE_ANY32,
   SI_TYPE_FLOAT16,
   SI_TYPE_INT16,
   SI_TYPE_UINT16,
};
425
/* Description of one shader input, packed into four bytes. */
union si_input_info {
   struct {
      uint8_t semantic;
      uint8_t interpolate;
      uint8_t fp16_lo_hi_valid;
      uint8_t usage_mask;
   };
   /* Never accessed; this member just forces 4-byte alignment. */
   uint32_t _unused;
};
435
436 struct si_shader_info {
437 shader_info base;
438
439 uint32_t options; /* bitmask of SI_PROFILE_* */
440
441 uint8_t num_inputs;
442 uint8_t num_outputs;
443 union si_input_info input[PIPE_MAX_SHADER_INPUTS];
444 uint8_t output_semantic[PIPE_MAX_SHADER_OUTPUTS];
445 uint8_t output_usagemask[PIPE_MAX_SHADER_OUTPUTS];
446 uint8_t output_readmask[PIPE_MAX_SHADER_OUTPUTS];
447 uint8_t output_streams[PIPE_MAX_SHADER_OUTPUTS];
448 uint8_t output_type[PIPE_MAX_SHADER_OUTPUTS]; /* enum nir_alu_type */
449
450 uint8_t num_vs_inputs;
451 uint8_t num_vbos_in_user_sgprs;
452 uint8_t num_stream_output_components[4];
453 uint16_t enabled_streamout_buffer_mask;
454
455 uint64_t inputs_read; /* "get_unique_index" bits */
456 uint64_t tcs_vgpr_only_inputs; /* TCS inputs that are only in VGPRs, not LDS. */
457
458 uint64_t outputs_written_before_tes_gs; /* "get_unique_index" bits */
459 uint64_t outputs_written_before_ps; /* "get_unique_index" bits */
460 uint32_t patch_outputs_written; /* "get_unique_index_patch" bits */
461
462 uint8_t clipdist_mask;
463 uint8_t culldist_mask;
464
465 uint16_t lshs_vertex_stride;
466 uint16_t esgs_vertex_stride;
467 uint16_t gsvs_vertex_size;
468 uint8_t gs_input_verts_per_prim;
469 unsigned max_gsvs_emit_size;
470
471 /* Set 0xf or 0x0 (4 bits) per each written output.
472 * ANDed with spi_shader_col_format.
473 */
474 unsigned colors_written_4bit;
475
476 int constbuf0_num_slots;
477 uint num_memory_stores;
478 uint8_t color_attr_index[2];
479 uint8_t color_interpolate[2];
480 uint8_t color_interpolate_loc[2];
481 uint8_t colors_read; /**< which color components are read by the FS */
482 uint8_t colors_written;
483 uint16_t output_color_types; /**< Each bit pair is enum si_color_output_type */
484 bool color0_writes_all_cbufs; /**< gl_FragColor */
485 bool reads_samplemask; /**< does fragment shader read sample mask? */
486 bool reads_tess_factors; /**< If TES reads TESSINNER or TESSOUTER */
487 bool writes_z; /**< does fragment shader write Z value? */
488 bool writes_stencil; /**< does fragment shader write stencil value? */
489 bool writes_samplemask; /**< does fragment shader write sample mask? */
490 bool writes_edgeflag; /**< vertex shader outputs edgeflag */
491 bool uses_interp_color;
492 bool uses_persp_center_color;
493 bool uses_persp_centroid_color;
494 bool uses_persp_sample_color;
495 bool uses_persp_center;
496 bool uses_persp_centroid;
497 bool uses_persp_sample;
498 bool uses_linear_center;
499 bool uses_linear_centroid;
500 bool uses_linear_sample;
501 bool uses_interp_at_sample;
502 bool uses_instanceid;
503 bool uses_base_vertex;
504 bool uses_base_instance;
505 bool uses_drawid;
506 bool uses_primid;
507 bool uses_frontface;
508 bool uses_invocationid;
509 bool uses_thread_id[3];
510 bool uses_block_id[3];
511 bool uses_variable_block_size;
512 bool uses_grid_size;
513 bool uses_tg_size;
514 bool writes_position;
515 bool writes_psize;
516 bool writes_clipvertex;
517 bool writes_primid;
518 bool writes_viewport_index;
519 bool writes_layer;
520 bool uses_bindless_samplers;
521 bool uses_bindless_images;
522 bool uses_indirect_descriptor;
523 bool has_divergent_loop;
524 bool uses_sampleid;
525 bool uses_layer_id;
526 bool has_non_uniform_tex_access;
527
528 bool uses_vmem_sampler_or_bvh;
529 bool uses_vmem_load_other; /* all other VMEM loads and atomics with return */
530
531 /** Whether all codepaths write tess factors in all invocations. */
532 bool tessfactors_are_def_in_all_invocs;
533
534 /* A flag to check if vrs2x2 can be enabled to reduce number of
535 * fragment shader invocations if flat shading.
536 */
537 bool allow_flat_shading;
538
539 /* Optimization: if the texture bound to this texunit has been cleared to 1,
540 * then the draw can be skipped (see si_draw_vbo_skip_noop). Initially the
541 * value is 0xff (undetermined) and can be later changed to 0 (= false) or
542 * texunit + 1.
543 */
544 uint8_t writes_1_if_tex_is_1;
545
546 /* frag coord and sample pos per component read mask. */
547 uint8_t reads_frag_coord_mask;
548 uint8_t reads_sample_pos_mask;
549 };
550
551 /* A shader selector is a gallium CSO and contains shader variants and
552 * binaries for one NIR program. This can be shared by multiple contexts.
553 */
554 struct si_shader_selector {
555 struct util_live_shader base;
556 struct si_screen *screen;
557 struct util_queue_fence ready;
558 struct si_compiler_ctx_state compiler_ctx_state;
559 gl_shader_stage stage;
560
561 simple_mtx_t mutex;
562 union si_shader_key *keys;
563 unsigned variants_count;
564 unsigned variants_max_count;
565 struct si_shader **variants;
566
567 /* The compiled NIR shader without a prolog and/or epilog (not
568 * uploaded to a buffer object).
569 */
570 struct si_shader *main_shader_part;
571 struct si_shader *main_shader_part_ls; /* as_ls is set in the key */
572 struct si_shader *main_shader_part_es; /* as_es is set in the key */
573 struct si_shader *main_shader_part_ngg; /* as_ngg is set in the key */
574 struct si_shader *main_shader_part_ngg_es; /* for Wave32 TES before legacy GS */
575
576 struct nir_shader *nir;
577 void *nir_binary;
578 unsigned nir_size;
579
580 struct si_shader_info info;
581
582 enum pipe_shader_type pipe_shader_type;
583 uint8_t const_and_shader_buf_descriptors_index;
584 uint8_t sampler_and_images_descriptors_index;
585 uint8_t cs_shaderbufs_sgpr_index;
586 uint8_t cs_num_shaderbufs_in_user_sgprs;
587 uint8_t cs_images_sgpr_index;
588 uint8_t cs_images_num_sgprs;
589 uint8_t cs_num_images_in_user_sgprs;
590 unsigned ngg_cull_vert_threshold; /* UINT32_MAX = disabled */
591 enum mesa_prim rast_prim;
592
593 /* GS parameters. */
594 bool tess_turns_off_ngg;
595
596 /* bitmasks of used descriptor slots */
597 uint64_t active_const_and_shader_buffers;
598 uint64_t active_samplers_and_images;
599 };
600
601 /* Valid shader configurations:
602 *
603 * API shaders VS | TCS | TES | GS |pass| PS
604 * are compiled as: | | | |thru|
605 * | | | | |
606 * Only VS & PS: VS | | | | | PS
607 * GFX6 - with GS: ES | | | GS | VS | PS
608 * - with tess: LS | HS | VS | | | PS
609 * - with both: LS | HS | ES | GS | VS | PS
610 * GFX9 - with GS: -> | | | GS | VS | PS
611 * - with tess: -> | HS | VS | | | PS
612 * - with both: -> | HS | -> | GS | VS | PS
613 * | | | | |
614 * NGG - VS & PS: GS | | | | | PS
615 * (GFX10+) - with GS: -> | | | GS | | PS
616 * - with tess: -> | HS | GS | | | PS
617 * - with both: -> | HS | -> | GS | | PS
618 *
619 * -> = merged with the next stage
620 */
621
622 /* Use the byte alignment for all following structure members for optimal
623 * shader key memory footprint.
624 */
625 #pragma pack(push, 1)
626
/* Common TCS bits between the shader key and the epilog key (5 bits used). */
struct si_tcs_epilog_bits {
   unsigned prim_mode : 3;
   unsigned invoc0_tess_factors_are_def : 1;
   unsigned tes_reads_tess_factors : 1;
};
633
/* Common PS bits between the shader key and the prolog key (12 bits used). */
struct si_ps_prolog_bits {
   unsigned color_two_side : 1;
   unsigned flatshade_colors : 1;
   unsigned poly_stipple : 1;
   /* Interpolation overrides. */
   unsigned force_persp_sample_interp : 1;
   unsigned force_linear_sample_interp : 1;
   unsigned force_persp_center_interp : 1;
   unsigned force_linear_center_interp : 1;
   unsigned bc_optimize_for_persp : 1;
   unsigned bc_optimize_for_linear : 1;
   unsigned samplemask_log_ps_iter : 3;
};
647
/* Common PS bits between the shader key and the epilog key: one full word
 * for the color format followed by 28 bits of flags.
 */
struct si_ps_epilog_bits {
   unsigned spi_shader_col_format;
   unsigned color_is_int8 : 8;
   unsigned color_is_int10 : 8;
   unsigned last_cbuf : 3;
   unsigned alpha_func : 3;
   unsigned alpha_to_one : 1;
   unsigned alpha_to_coverage_via_mrtz : 1; /* gfx11+ */
   unsigned clamp_color : 1;
   unsigned dual_src_blend_swizzle : 1; /* gfx11+ */
   unsigned rbplus_depth_only_opt : 1;
   unsigned kill_samplemask : 1;
};
662
663 union si_shader_part_key {
664 struct {
665 struct si_tcs_epilog_bits states;
666 unsigned wave32 : 1;
667 unsigned noop_s_barrier : 1;
668 } tcs_epilog;
669 struct {
670 struct si_ps_prolog_bits states;
671 unsigned wave32 : 1;
672 unsigned num_input_sgprs : 6;
673 /* Color interpolation and two-side color selection. */
674 unsigned colors_read : 8; /* color input components read */
675 unsigned num_interp_inputs : 5; /* BCOLOR is at this location */
676 unsigned num_fragcoord_components : 3;
677 unsigned wqm : 1;
678 char color_attr_index[2];
679 signed char color_interp_vgpr_index[2]; /* -1 == constant */
680 } ps_prolog;
681 struct {
682 struct si_ps_epilog_bits states;
683 unsigned wave32 : 1;
684 unsigned uses_discard : 1;
685 unsigned colors_written : 8;
686 unsigned color_types : 16;
687 unsigned writes_z : 1;
688 unsigned writes_stencil : 1;
689 unsigned writes_samplemask : 1;
690 } ps_epilog;
691 };
692
693 /* The shader key for geometry stages (VS, TCS, TES, GS) */
694 struct si_shader_key_ge {
695 /* Prolog and epilog flags. */
696 union {
697 struct {
698 struct si_shader_selector *ls; /* for merged LS-HS */
699 struct si_tcs_epilog_bits epilog;
700 } tcs; /* tessellation control shader */
701 struct {
702 struct si_shader_selector *es; /* for merged ES-GS */
703 } gs;
704 } part;
705
706 /* These three are initially set according to the NEXT_SHADER property,
707 * or guessed if the property doesn't seem correct.
708 */
709 unsigned as_es : 1; /* whether it's a shader before GS */
710 unsigned as_ls : 1; /* whether it's VS before TCS */
711 unsigned as_ngg : 1; /* whether it's the last GE stage and NGG is enabled,
712 also set for the stage right before GS */
713
714 /* Flags for monolithic compilation only. */
715 struct {
716 /* - If neither "is_one" nor "is_fetched" has a bit set, the instance
717 * divisor is 0.
718 * - If "is_one" has a bit set, the instance divisor is 1.
719 * - If "is_fetched" has a bit set, the instance divisor will be loaded
720 * from the constant buffer.
721 */
722 uint16_t instance_divisor_is_one; /* bitmask of inputs */
723 uint16_t instance_divisor_is_fetched; /* bitmask of inputs */
724
725 /* Whether fetch should be opencoded according to vs_fix_fetch.
726 * Otherwise, if vs_fix_fetch is non-zero, buffer_load_format_xyzw
727 * with minimal fixups is used. */
728 uint16_t vs_fetch_opencode;
729 union si_vs_fix_fetch vs_fix_fetch[SI_MAX_ATTRIBS];
730
731 union {
732 /* When PS needs PrimID and GS is disabled. */
733 unsigned vs_export_prim_id : 1; /* VS and TES only */
734 unsigned gs_tri_strip_adj_fix : 1; /* GS only */
735 } u;
736 } mono;
737
738 /* Optimization flags for asynchronous compilation only. */
739 struct {
740 /* For HW VS (it can be VS, TES, GS) */
741 uint64_t kill_outputs; /* "get_unique_index" bits */
742 unsigned kill_clip_distances : 8;
743 unsigned kill_pointsize : 1;
744 unsigned kill_layer : 1;
745 unsigned remove_streamout : 1;
746
747 /* For NGG VS and TES. */
748 unsigned ngg_culling : 13; /* SI_NGG_CULL_* */
749
750 /* For shaders where monolithic variants have better code.
751 *
752 * This is a flag that has no effect on code generation,
753 * but forces monolithic shaders to be used as soon as
754 * possible, because it's in the "opt" group.
755 */
756 unsigned prefer_mono : 1;
757
758 /* VS and TCS have the same number of patch vertices. */
759 unsigned same_patch_vertices:1;
760
761 unsigned inline_uniforms:1;
762
763 /* This must be kept last to limit the number of variants
764 * depending only on the uniform values.
765 */
766 uint32_t inlined_uniform_values[MAX_INLINABLE_UNIFORMS];
767 } opt;
768 };
769
770 struct si_shader_key_ps {
771 struct {
772 /* Prolog and epilog flags. */
773 struct si_ps_prolog_bits prolog;
774 struct si_ps_epilog_bits epilog;
775 } part;
776
777 /* Flags for monolithic compilation only. */
778 struct {
779 unsigned poly_line_smoothing : 1;
780 unsigned point_smoothing : 1;
781 unsigned interpolate_at_sample_force_center : 1;
782 unsigned fbfetch_msaa : 1;
783 unsigned fbfetch_is_1D : 1;
784 unsigned fbfetch_layered : 1;
785 } mono;
786
787 /* Optimization flags for asynchronous compilation only. */
788 struct {
789 /* For shaders where monolithic variants have better code.
790 *
791 * This is a flag that has no effect on code generation,
792 * but forces monolithic shaders to be used as soon as
793 * possible, because it's in the "opt" group.
794 */
795 unsigned prefer_mono : 1;
796 unsigned inline_uniforms:1;
797
798 /* This eliminates the FRONT_FACE input VGPR as well as shader code using it. */
799 int force_front_face_input : 2; /* 0 = gl_FrontFacing, 1 = true, -1 = false */
800
801 /* This must be kept last to limit the number of variants
802 * depending only on the uniform values.
803 */
804 uint32_t inlined_uniform_values[MAX_INLINABLE_UNIFORMS];
805 } opt;
806 };
807
808 union si_shader_key {
809 struct si_shader_key_ge ge; /* geometry engine shaders */
810 struct si_shader_key_ps ps;
811 };
812
813 /* Restore the pack alignment to default. */
814 #pragma pack(pop)
815
816 /* GCN-specific shader info. */
817 struct si_shader_binary_info {
818 uint8_t vs_output_param_offset[NUM_TOTAL_VARYING_SLOTS];
819 uint32_t vs_output_ps_input_cntl[NUM_TOTAL_VARYING_SLOTS];
820 union si_input_info ps_inputs[SI_NUM_INTERP];
821 uint8_t num_ps_inputs;
822 uint8_t ps_colors_read;
823 uint8_t num_input_sgprs;
824 uint8_t num_input_vgprs;
825 bool uses_vmem_load_other; /* all other VMEM loads and atomics with return */
826 bool uses_vmem_sampler_or_bvh;
827 uint8_t num_fragcoord_components;
828 bool uses_instanceid;
829 uint8_t nr_pos_exports;
830 uint8_t nr_param_exports;
831 unsigned private_mem_vgprs;
832 unsigned max_simd_waves;
833 };
834
/* Container format of a shader binary's code buffer: ELF or a raw buffer. */
enum si_shader_binary_type {
   SI_SHADER_BINARY_ELF,
   SI_SHADER_BINARY_RAW,
};
839
840 struct si_shader_binary {
841 enum si_shader_binary_type type;
842
843 /* Depends on binary type, either ELF or raw buffer. */
844 const char *code_buffer;
845 size_t code_size;
846 uint32_t exec_size;
847
848 char *uploaded_code;
849 size_t uploaded_code_size;
850
851 char *llvm_ir_string;
852
853 const char *disasm_string;
854 size_t disasm_size;
855
856 const unsigned *symbols;
857 unsigned num_symbols;
858 };
859
/* GFX9 GS subgroup parameters. */
struct gfx9_gs_info {
   unsigned es_verts_per_subgroup;
   unsigned gs_prims_per_subgroup;
   unsigned gs_inst_prims_in_subgroup;
   unsigned max_prims_per_subgroup;
   /* in bytes */
   unsigned esgs_ring_size;
};
867
868 struct si_shader {
869 struct si_pm4_state pm4; /* base class */
870 struct si_compiler_ctx_state compiler_ctx_state;
871
872 struct si_shader_selector *selector;
873 struct si_shader_selector *previous_stage_sel; /* for refcounting */
874
875 struct si_shader_part *prolog;
876 struct si_shader *previous_stage; /* for GFX9 */
877 struct si_shader_part *epilog;
878 struct si_shader *gs_copy_shader;
879
880 struct si_resource *bo;
881 /* gpu_address should be bo->gpu_address except if SQTT is
882 * in use.
883 */
884 uint64_t gpu_address;
885 struct si_resource *scratch_bo;
886 union si_shader_key key;
887 struct util_queue_fence ready;
888 bool compilation_failed;
889 bool is_monolithic;
890 bool is_optimized;
891 bool is_binary_shared;
892 bool is_gs_copy_shader;
893 uint8_t wave_size;
894
895 /* The following data is all that's needed for binary shaders. */
896 struct si_shader_binary binary;
897 struct ac_shader_config config;
898 struct si_shader_binary_info info;
899
900 /* SI_SGPR_VS_STATE_BITS */
901 bool uses_vs_state_provoking_vertex;
902 bool uses_gs_state_outprim;
903
904 bool uses_base_instance;
905
906 /* Shader key + LLVM IR + disassembly + statistics.
907 * Generated for debug contexts only.
908 */
909 char *shader_log;
910 size_t shader_log_size;
911
912 struct gfx9_gs_info gs_info;
913
914 /* Precomputed register values. */
915 union {
916 struct {
917 unsigned vgt_gsvs_ring_offset_1;
918 unsigned vgt_gsvs_ring_offset_2;
919 unsigned vgt_gsvs_ring_offset_3;
920 unsigned vgt_gsvs_ring_itemsize;
921 unsigned vgt_gs_max_vert_out;
922 unsigned vgt_gs_vert_itemsize;
923 unsigned vgt_gs_vert_itemsize_1;
924 unsigned vgt_gs_vert_itemsize_2;
925 unsigned vgt_gs_vert_itemsize_3;
926 unsigned vgt_gs_instance_cnt;
927 unsigned vgt_gs_onchip_cntl;
928 unsigned vgt_gs_max_prims_per_subgroup;
929 unsigned vgt_esgs_ring_itemsize;
930 unsigned spi_shader_pgm_rsrc3_gs;
931 unsigned spi_shader_pgm_rsrc4_gs;
932 } gs;
933
934 struct {
935 /* Computed by gfx10_ngg_calculate_subgroup_info. */
936 uint16_t ngg_emit_size; /* in dwords */
937 uint16_t hw_max_esverts;
938 uint16_t max_gsprims;
939 uint16_t max_out_verts;
940 bool max_vert_out_per_gs_instance;
941 /* Register values. */
942 unsigned ge_max_output_per_subgroup;
943 unsigned ge_ngg_subgrp_cntl;
944 unsigned vgt_primitiveid_en;
945 unsigned vgt_gs_onchip_cntl;
946 unsigned vgt_gs_instance_cnt;
947 unsigned esgs_vertex_stride;
948 unsigned spi_vs_out_config;
949 unsigned spi_shader_pos_format;
950 unsigned pa_cl_vte_cntl;
951 unsigned vgt_gs_max_vert_out; /* for API GS */
952 unsigned ge_pc_alloc; /* uconfig register */
953 unsigned spi_shader_pgm_rsrc3_gs;
954 unsigned spi_shader_pgm_rsrc4_gs;
955 unsigned vgt_shader_stages_en;
956 } ngg;
957
958 struct {
959 unsigned vgt_gs_mode;
960 unsigned vgt_primitiveid_en;
961 unsigned vgt_reuse_off;
962 unsigned spi_vs_out_config;
963 unsigned spi_shader_pos_format;
964 unsigned pa_cl_vte_cntl;
965 unsigned ge_pc_alloc; /* uconfig register */
966 } vs;
967
968 struct {
969 unsigned spi_ps_input_ena;
970 unsigned spi_ps_input_addr;
971 unsigned spi_baryc_cntl;
972 unsigned spi_ps_in_control;
973 unsigned spi_shader_z_format;
974 unsigned spi_shader_col_format;
975 unsigned cb_shader_mask;
976 unsigned db_shader_control;
977 unsigned num_interp;
978 bool writes_samplemask;
979 } ps;
980 };
981
982 /* Precomputed register values. */
983 unsigned vgt_tf_param; /* VGT_TF_PARAM */
984 unsigned vgt_vertex_reuse_block_cntl; /* VGT_VERTEX_REUSE_BLOCK_CNTL */
985 unsigned pa_cl_vs_out_cntl;
986 unsigned ge_cntl;
987 };
988
989 struct si_shader_part {
990 struct si_shader_part *next;
991 union si_shader_part_key key;
992 struct si_shader_binary binary;
993 struct ac_shader_config config;
994 };
995
996 /* si_shader.c */
997 struct ac_rtld_binary;
998
999 void si_update_shader_binary_info(struct si_shader *shader, struct nir_shader *nir);
1000 bool si_compile_shader(struct si_screen *sscreen, struct ac_llvm_compiler *compiler,
1001 struct si_shader *shader, struct util_debug_callback *debug);
1002 bool si_create_shader_variant(struct si_screen *sscreen, struct ac_llvm_compiler *compiler,
1003 struct si_shader *shader, struct util_debug_callback *debug);
1004 void si_shader_destroy(struct si_shader *shader);
1005 unsigned si_shader_io_get_unique_index(unsigned semantic);
1006 bool si_shader_binary_upload(struct si_screen *sscreen, struct si_shader *shader,
1007 uint64_t scratch_va);
1008 bool si_can_dump_shader(struct si_screen *sscreen, gl_shader_stage stage,
1009 enum si_shader_dump_type dump_type);
1010 void si_shader_dump(struct si_screen *sscreen, struct si_shader *shader,
1011 struct util_debug_callback *debug, FILE *f, bool check_debug_option);
1012 void si_shader_dump_stats_for_shader_db(struct si_screen *screen, struct si_shader *shader,
1013 struct util_debug_callback *debug);
1014 void si_multiwave_lds_size_workaround(struct si_screen *sscreen, unsigned *lds_size);
1015 const char *si_get_shader_name(const struct si_shader *shader);
1016 void si_shader_binary_clean(struct si_shader_binary *binary);
1017 struct nir_shader *si_deserialize_shader(struct si_shader_selector *sel);
1018 unsigned si_get_ps_num_interp(struct si_shader *ps);
1019 bool si_shader_binary_open(struct si_screen *screen, struct si_shader *shader,
1020 struct ac_rtld_binary *rtld);
1021 bool si_get_external_symbol(enum amd_gfx_level gfx_level, void *data, const char *name,
1022 uint64_t *value);
1023
1024 /* si_shader_info.c */
1025 void si_nir_scan_shader(struct si_screen *sscreen, const struct nir_shader *nir,
1026 struct si_shader_info *info);
1027
1028 /* si_shader_nir.c */
1029 extern const struct nir_lower_subgroups_options si_nir_subgroups_options;
1030
1031 bool si_alu_to_scalar_packed_math_filter(const struct nir_instr *instr, const void *data);
1032 void si_nir_opts(struct si_screen *sscreen, struct nir_shader *nir, bool first);
1033 void si_nir_late_opts(struct nir_shader *nir);
1034 char *si_finalize_nir(struct pipe_screen *screen, void *nirptr);
1035
1036 /* si_state_shaders.cpp */
1037 unsigned si_determine_wave_size(struct si_screen *sscreen, struct si_shader *shader);
1038 void gfx9_get_gs_info(struct si_shader_selector *es, struct si_shader_selector *gs,
1039 struct gfx9_gs_info *out);
1040 bool gfx10_is_ngg_passthrough(struct si_shader *shader);
1041
1042 bool si_should_clear_lds(struct si_screen *sscreen, const struct nir_shader *shader);
1043
1044 /* Inline helpers. */
1045
1046 /* Return the pointer to the main shader part's pointer. */
si_get_main_shader_part(struct si_shader_selector * sel,const union si_shader_key * key)1047 static inline struct si_shader **si_get_main_shader_part(struct si_shader_selector *sel,
1048 const union si_shader_key *key)
1049 {
1050 if (sel->stage <= MESA_SHADER_GEOMETRY) {
1051 if (key->ge.as_ls)
1052 return &sel->main_shader_part_ls;
1053 if (key->ge.as_es && key->ge.as_ngg)
1054 return &sel->main_shader_part_ngg_es;
1055 if (key->ge.as_es)
1056 return &sel->main_shader_part_es;
1057 if (key->ge.as_ngg)
1058 return &sel->main_shader_part_ngg;
1059 }
1060 return &sel->main_shader_part;
1061 }
1062
si_shader_uses_bindless_samplers(struct si_shader_selector * selector)1063 static inline bool si_shader_uses_bindless_samplers(struct si_shader_selector *selector)
1064 {
1065 return selector ? selector->info.uses_bindless_samplers : false;
1066 }
1067
si_shader_uses_bindless_images(struct si_shader_selector * selector)1068 static inline bool si_shader_uses_bindless_images(struct si_shader_selector *selector)
1069 {
1070 return selector ? selector->info.uses_bindless_images : false;
1071 }
1072
gfx10_edgeflags_have_effect(struct si_shader * shader)1073 static inline bool gfx10_edgeflags_have_effect(struct si_shader *shader)
1074 {
1075 if (shader->selector->stage == MESA_SHADER_VERTEX &&
1076 !shader->selector->info.base.vs.blit_sgprs_amd &&
1077 !(shader->key.ge.opt.ngg_culling & SI_NGG_CULL_LINES))
1078 return true;
1079
1080 return false;
1081 }
1082
gfx10_ngg_writes_user_edgeflags(struct si_shader * shader)1083 static inline bool gfx10_ngg_writes_user_edgeflags(struct si_shader *shader)
1084 {
1085 return gfx10_edgeflags_have_effect(shader) &&
1086 shader->selector->info.writes_edgeflag;
1087 }
1088
si_shader_uses_streamout(const struct si_shader * shader)1089 static inline bool si_shader_uses_streamout(const struct si_shader *shader)
1090 {
1091 return shader->selector->stage <= MESA_SHADER_GEOMETRY &&
1092 shader->selector->info.enabled_streamout_buffer_mask &&
1093 !shader->key.ge.opt.remove_streamout;
1094 }
1095
si_shader_uses_discard(struct si_shader * shader)1096 static inline bool si_shader_uses_discard(struct si_shader *shader)
1097 {
1098 /* Changes to this should also update ps_modifies_zs. */
1099 return shader->selector->info.base.fs.uses_discard ||
1100 shader->key.ps.part.prolog.poly_stipple ||
1101 shader->key.ps.mono.point_smoothing ||
1102 shader->key.ps.part.epilog.alpha_func != PIPE_FUNC_ALWAYS;
1103 }
1104
1105 #ifdef __cplusplus
1106 }
1107 #endif
1108
1109 #endif
1110