1 /*
2 * Copyright 2012 Advanced Micro Devices, Inc.
3 * All Rights Reserved.
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * on the rights to use, copy, modify, merge, publish, distribute, sub
9 * license, and/or sell copies of the Software, and to permit persons to whom
10 * the Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
14 * Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
20 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
21 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
22 * USE OR OTHER DEALINGS IN THE SOFTWARE.
23 */
24
25 /* The compiler middle-end architecture: Explaining (non-)monolithic shaders
26 * -------------------------------------------------------------------------
27 *
28 * Typically, there is one-to-one correspondence between API and HW shaders,
29 * that is, for every API shader, there is exactly one shader binary in
30 * the driver.
31 *
32 * The problem with that is that we also have to emulate some API states
33 * (e.g. alpha-test, and many others) in shaders too. The two obvious ways
34 * to deal with it are:
35 * - each shader has multiple variants for each combination of emulated states,
36 * and the variants are compiled on demand, possibly relying on a shader
37 * cache for good performance
38 * - patch shaders at the binary level
39 *
40 * This driver uses something completely different. The emulated states are
41 * usually implemented at the beginning or end of shaders. Therefore, we can
42 * split the shader into 3 parts:
43 * - prolog part (shader code dependent on states)
44 * - main part (the API shader)
45 * - epilog part (shader code dependent on states)
46 *
47 * Each part is compiled as a separate shader and the final binaries are
48 * concatenated. This type of shader is called non-monolithic, because it
49 * consists of multiple independent binaries. Creating a new shader variant
50 * is therefore only a concatenation of shader parts (binaries) and doesn't
51 * involve any compilation. The main shader parts are the only parts that are
52 * compiled when applications create shader objects. The prolog and epilog
53 * parts are compiled on the first use and saved, so that their binaries can
54 * be reused by many other shaders.
55 *
56 * One of the roles of the prolog part is to compute vertex buffer addresses
57 * for vertex shaders. A few of the roles of the epilog part are color buffer
58 * format conversions in pixel shaders that we have to do manually, and write
59 * tessellation factors in tessellation control shaders. The prolog and epilog
60 * have many other important responsibilities in various shader stages.
61 * They don't just "emulate legacy stuff".
62 *
63 * Monolithic shaders are shaders where the parts are combined before LLVM
64 * compilation, and the whole thing is compiled and optimized as one unit with
65 * one binary on the output. The result is the same as the non-monolithic
66 * shader, but the final code can be better, because LLVM can optimize across
67 * all shader parts. Monolithic shaders aren't usually used except for these
68 * special cases:
69 *
70 * 1) Some rarely-used states require modification of the main shader part
71 * itself, and in such cases, only the monolithic shader variant is
72 * compiled, and that's always done on the first use.
73 *
74 * 2) When we do cross-stage optimizations for separate shader objects and
75 * e.g. eliminate unused shader varyings, the resulting optimized shader
76 * variants are always compiled as monolithic shaders, and always
77 * asynchronously (i.e. not stalling ongoing rendering). We call them
78 * "optimized monolithic" shaders. The important property here is that
79 * the non-monolithic unoptimized shader variant is always available for use
80 * when the asynchronous compilation of the optimized shader is not done
81 * yet.
82 *
83 * Starting with GFX9 chips, some shader stages are merged, and the number of
84 * shader parts per shader increased. The complete new list of shader parts is:
85 * - 1st shader: prolog part
86 * - 1st shader: main part
87 * - 2nd shader: prolog part
88 * - 2nd shader: main part
89 * - 2nd shader: epilog part
90 */
91
92 /* How linking shader inputs and outputs between vertex, tessellation, and
93 * geometry shaders works.
94 *
95 * Inputs and outputs between shaders are stored in a buffer. This buffer
96 * lives in LDS (typical case for tessellation), but it can also live
97 * in memory (ESGS). Each input or output has a fixed location within a vertex.
98 * The highest used input or output determines the stride between vertices.
99 *
100 * Since GS and tessellation are only possible in the OpenGL core profile,
101 * only these semantics are valid for per-vertex data:
102 *
 *   Name             Location
 *
 *   POSITION         0
 *   PSIZE            1
 *   CLIPDIST0..1     2..3
 *   CULLDIST0..1     (not implemented)
 *   GENERIC0..31     4..35
110 *
111 * For example, a shader only writing GENERIC0 has the output stride of 5.
112 *
113 * Only these semantics are valid for per-patch data:
114 *
 *   Name             Location
 *
 *   TESSOUTER        0
 *   TESSINNER        1
 *   PATCH0..29       2..31
120 *
121 * That's how independent shaders agree on input and output locations.
122 * The si_shader_io_get_unique_index function assigns the locations.
123 *
124 * For tessellation, other required information for calculating the input and
125 * output addresses like the vertex stride, the patch stride, and the offsets
126 * where per-vertex and per-patch data start, is passed to the shader via
127 * user data SGPRs. The offsets and strides are calculated at draw time and
128 * aren't available at compile time.
129 */
130
131 #ifndef SI_SHADER_H
132 #define SI_SHADER_H
133
134 #include "ac_binary.h"
135 #include "ac_llvm_build.h"
136 #include "ac_llvm_util.h"
137 #include "util/simple_mtx.h"
138 #include "util/u_inlines.h"
139 #include "util/u_live_shader_cache.h"
140 #include "util/u_queue.h"
141 #include "si_pm4.h"
142
143 #include <stdio.h>
144
145 #ifdef __cplusplus
146 extern "C" {
147 #endif
148
/* Use LDS symbols when supported by LLVM. This can be disabled to test the
 * old path on newer LLVM for now, and should be removed in the long term.
 */
#define USE_LDS_SYMBOLS (true)

/* Forward declarations. */
struct nir_shader;
struct si_shader;
struct si_context;

/* Driver limits. */
#define SI_MAX_ATTRIBS          16
#define SI_MAX_VS_OUTPUTS       40
#define SI_USER_CLIP_PLANE_MASK 0x3F

/* Bits 9, 19 and 29. */
#define SI_NGG_PRIM_EDGE_FLAG_BITS ((1 << 9) | (1 << 19) | (1 << 29))

/* SPI_PS_INPUT_CNTL values that make an input read a default value
 * (DEFAULT_VAL 0 -> "0000", DEFAULT_VAL 3 -> "0001").
 */
#define SI_PS_INPUT_CNTL_0000 (S_028644_OFFSET(0x20) | S_028644_DEFAULT_VAL(0))
#define SI_PS_INPUT_CNTL_0001 (S_028644_OFFSET(0x20) | S_028644_DEFAULT_VAL(3))

/* Value used for PS inputs that are not written by the previous stage. */
#define SI_PS_INPUT_CNTL_UNUSED SI_PS_INPUT_CNTL_0000
/* D3D9 behaviour for COLOR0 requires 0001. GL is undefined. */
#define SI_PS_INPUT_CNTL_UNUSED_COLOR0 SI_PS_INPUT_CNTL_0001
168
/* SGPR user data indices.
 *
 * The first SI_NUM_RESOURCE_SGPRS entries come first in every group below.
 * After them, each group restarts its own numbering from one of the
 * *_NUM_* counters, so enumerators from different groups intentionally
 * share the same values.
 */
enum
{
   SI_SGPR_INTERNAL_BINDINGS,
   SI_SGPR_BINDLESS_SAMPLERS_AND_IMAGES,
   SI_SGPR_CONST_AND_SHADER_BUFFERS, /* or just a constant buffer 0 pointer */
   SI_SGPR_SAMPLERS_AND_IMAGES,
   SI_NUM_RESOURCE_SGPRS,

   /* API VS, TES without GS, GS copy shader */
   SI_SGPR_VS_STATE_BITS = SI_NUM_RESOURCE_SGPRS,
   SI_NUM_VS_STATE_RESOURCE_SGPRS,

   /* all VS variants */
   SI_SGPR_BASE_VERTEX = SI_NUM_VS_STATE_RESOURCE_SGPRS,
   SI_SGPR_DRAWID,
   SI_SGPR_START_INSTANCE,
   SI_VS_NUM_USER_SGPR,

   /* Aliases the SI_SGPR_CONST_AND_SHADER_BUFFERS slot. */
   SI_SGPR_VS_BLIT_DATA = SI_SGPR_CONST_AND_SHADER_BUFFERS,

   /* TES */
   SI_SGPR_TES_OFFCHIP_LAYOUT = SI_NUM_VS_STATE_RESOURCE_SGPRS,
   SI_SGPR_TES_OFFCHIP_ADDR,
   SI_TES_NUM_USER_SGPR,

   /* GFX6-8: TCS only */
   GFX6_SGPR_TCS_OFFCHIP_LAYOUT = SI_NUM_RESOURCE_SGPRS,
   GFX6_SGPR_TCS_OUT_OFFSETS,
   GFX6_SGPR_TCS_OUT_LAYOUT,
   GFX6_SGPR_TCS_IN_LAYOUT,
   GFX6_TCS_NUM_USER_SGPR,

   /* GFX9: Merged LS-HS (VS-TCS) only. These start after the VS SGPRs. */
   GFX9_SGPR_TCS_OFFCHIP_LAYOUT = SI_VS_NUM_USER_SGPR,
   GFX9_SGPR_TCS_OUT_OFFSETS,
   GFX9_SGPR_TCS_OUT_LAYOUT,
   GFX9_TCS_NUM_USER_SGPR,

   /* GS limits */
   GFX6_GS_NUM_USER_SGPR = SI_NUM_RESOURCE_SGPRS,
   SI_GSCOPY_NUM_USER_SGPR = SI_NUM_VS_STATE_RESOURCE_SGPRS,

   /* Starts after whichever of the VS and TES SGPR sets is larger. */
   GFX9_SGPR_SMALL_PRIM_CULL_INFO = MAX2(SI_VS_NUM_USER_SGPR, SI_TES_NUM_USER_SGPR),
   GFX9_SGPR_ATTRIBUTE_RING_ADDR,
   GFX9_GS_NUM_USER_SGPR,

   /* PS only */
   SI_SGPR_ALPHA_REF = SI_NUM_RESOURCE_SGPRS,
   SI_PS_NUM_USER_SGPR,

   /* The value has to be 12, because the hw requires that descriptors
    * are aligned to 4 SGPRs.
    */
   SI_SGPR_VS_VB_DESCRIPTOR_FIRST = 12,
};
225
/* Indices of LLVM function parameters. */
enum {
   /* The resource parameters occupy indices 0..3. */
   SI_NUM_RESOURCE_PARAMS = 4,

   /* Parameters that exist only in pixel shaders. */
   SI_PARAM_ALPHA_REF = SI_NUM_RESOURCE_PARAMS,
   SI_PARAM_PRIM_MASK,

   SI_PARAM_PERSP_SAMPLE,
   SI_PARAM_PERSP_CENTER,
   SI_PARAM_PERSP_CENTROID,
   SI_PARAM_PERSP_PULL_MODEL,

   SI_PARAM_LINEAR_SAMPLE,
   SI_PARAM_LINEAR_CENTER,
   SI_PARAM_LINEAR_CENTROID,

   SI_PARAM_LINE_STIPPLE_TEX,

   SI_PARAM_POS_X_FLOAT,
   SI_PARAM_POS_Y_FLOAT,
   SI_PARAM_POS_Z_FLOAT,
   SI_PARAM_POS_W_FLOAT,

   SI_PARAM_FRONT_FACE,
   SI_PARAM_ANCILLARY,
   SI_PARAM_SAMPLE_COVERAGE,
   SI_PARAM_POS_FIXED_PT,

   /* +1 to turn the last index into a count, +8 for COLOR[0..1]. */
   SI_NUM_PARAMS = SI_PARAM_POS_FIXED_PT + 9,
};
253
/* Bit-field layout of the 32-bit "vs_state_bits" value. Every field is
 * described by a <NAME>__SHIFT / <NAME>__MASK pair; the mask is unshifted.
 */

/* These fields are only set in current_vs_state (except INDEXED) in si_context, and they are
 * accessible in the shader via vs_state_bits in VS, TES, and GS.
 */
/* bit 0, shared by VS and GS */
#define VS_STATE_CLAMP_VERTEX_COLOR__SHIFT 0
#define VS_STATE_CLAMP_VERTEX_COLOR__MASK  0x1
/* bit 1, shared by VS and GS */
#define VS_STATE_INDEXED__SHIFT 1
#define VS_STATE_INDEXED__MASK  0x1

/* These fields are only set in current_vs_state in si_context, and they are accessible
 * in the shader via vs_state_bits in LS/HS.
 */
/* bit gap */
/* bits 11..23 */
#define VS_STATE_LS_OUT_PATCH_SIZE__SHIFT 11
#define VS_STATE_LS_OUT_PATCH_SIZE__MASK  0x1fff
/* bits 24..31 */
#define VS_STATE_LS_OUT_VERTEX_SIZE__SHIFT 24
#define VS_STATE_LS_OUT_VERTEX_SIZE__MASK  0xff

/* These fields are only set in current_gs_state in si_context, and they are accessible
 * in the shader via vs_state_bits in legacy GS, the GS copy shader, and any NGG shader.
 */
/* bit gap */
/* bits 18..21 */
#define GS_STATE_SMALL_PRIM_PRECISION_NO_AA__SHIFT 18
#define GS_STATE_SMALL_PRIM_PRECISION_NO_AA__MASK  0xf
/* bits 22..25 */
#define GS_STATE_SMALL_PRIM_PRECISION__SHIFT 22
#define GS_STATE_SMALL_PRIM_PRECISION__MASK  0xf
/* bit 26 */
#define GS_STATE_STREAMOUT_QUERY_ENABLED__SHIFT 26
#define GS_STATE_STREAMOUT_QUERY_ENABLED__MASK  0x1
/* bits 27..28 */
#define GS_STATE_PROVOKING_VTX_INDEX__SHIFT 27
#define GS_STATE_PROVOKING_VTX_INDEX__MASK  0x3
/* bits 29..30 */
#define GS_STATE_OUTPRIM__SHIFT 29
#define GS_STATE_OUTPRIM__MASK  0x3
/* bit 31 */
#define GS_STATE_PIPELINE_STATS_EMU__SHIFT 31
#define GS_STATE_PIPELINE_STATS_EMU__MASK  0x1
287
/* Helpers for bit fields described by a <field>__SHIFT / <field>__MASK pair
 * (the mask is unshifted).
 */

/* Mask "value" to the width of the field and move it to the field position. */
#define ENCODE_FIELD(field, value) \
   (((unsigned)(value) & field##__MASK) << field##__SHIFT)

/* All-ones except for the bits occupied by the field. */
#define CLEAR_FIELD(field) \
   (~((unsigned)field##__MASK << field##__SHIFT))

/* This is called by functions that change states.
 *
 * Replaces the field inside "var" with "value". The assertion checks that
 * "value" fits in the field. Note that "var" and "value" are evaluated more
 * than once.
 */
#define SET_FIELD(var, field, value)                                       \
   do {                                                                    \
      assert((value) == ((unsigned)(value) & field##__MASK));              \
      (var) = ((var) & CLEAR_FIELD(field)) | ENCODE_FIELD(field, value);   \
   } while (0)

/* This is called during shader compilation and returns LLVMValueRef.
 *
 * Unpacks the field from (ctx)->vs_state_bits; the field width is the number
 * of bits set in the mask.
 */
#define GET_FIELD(ctx, field) \
   si_unpack_param((ctx), (ctx)->vs_state_bits, field##__SHIFT, util_bitcount(field##__MASK))
301
/* These represent the number of SGPRs the shader uses for each VS blit
 * variant.
 */
enum {
   SI_VS_BLIT_SGPRS_POS = 3,
   SI_VS_BLIT_SGPRS_POS_COLOR = 7,
   SI_VS_BLIT_SGPRS_POS_TEXCOORD = 9,
};
309
/* NGG culling flags: bits 0..4 are boolean flags, bits 5..12 hold an 8-bit
 * clip plane enable mask.
 */
/* this implies W, view.xy, and small prim culling */
#define SI_NGG_CULL_TRIANGLES (1 << 0)
/* back faces */
#define SI_NGG_CULL_BACK_FACE (1 << 1)
/* front faces */
#define SI_NGG_CULL_FRONT_FACE (1 << 2)
/* the primitive type is lines */
#define SI_NGG_CULL_LINES (1 << 3)
/* cull small lines according to the diamond exit rule */
#define SI_NGG_CULL_SMALL_LINES_DIAMOND_EXIT (1 << 4)
/* Pack / unpack the 8-bit clip plane enable mask. */
#define SI_NGG_CULL_CLIP_PLANE_ENABLE(enable) (((enable) & 0xff) << 5)
#define SI_NGG_CULL_GET_CLIP_PLANE_ENABLE(x)  (((x) >> 5) & 0xff)

/* SI_PROFILE_* bits, stored in si_shader_info::options. */
#define SI_PROFILE_WAVE32                    (1 << 0)
#define SI_PROFILE_WAVE64                    (1 << 1)
#define SI_PROFILE_IGNORE_LLVM13_DISCARD_BUG (1 << 2)
#define SI_PROFILE_VS_NO_BINNING             (1 << 3)
#define SI_PROFILE_PS_NO_BINNING             (1 << 4)
#define SI_PROFILE_CLAMP_DIV_BY_ZERO         (1 << 5)
324
/**
 * For VS shader keys, describe any fixups required for vertex fetch.
 *
 * \ref log_size, \ref format, and the number of channels are interpreted as
 * by \ref ac_build_opencoded_load_format.
 *
 * Note: all bits 0 (size = 1 byte, num channels = 1, format = float) is an
 * impossible format and indicates that no fixup is needed (just use
 * buffer_load_format_xyzw).
 *
 * The bit-fields add up to 8 bits, and \ref bits overlays them so that the
 * whole description can be read or compared as one byte.
 */
union si_vs_fix_fetch {
   struct {
      uint8_t log_size : 2;        /* log2 of bytes per channel: 1, 2, 4, or 8 bytes */
      uint8_t num_channels_m1 : 2; /* number of channels minus 1 */
      uint8_t format : 3;          /* AC_FETCH_FORMAT_xxx */
      uint8_t reverse : 1;         /* reverse XYZ channels */
   } u;
   uint8_t bits;
};
344
struct si_shader;

/* State of the context creating the shader object.
 *
 * A copy is embedded in both si_shader_selector and si_shader.
 */
struct si_compiler_ctx_state {
   /* Should only be used by si_init_shader_selector_async and
    * si_build_shader_variant if thread_index == -1 (non-threaded). */
   struct ac_llvm_compiler *compiler;

   /* Used if thread_index == -1 or if debug.async is true. */
   struct util_debug_callback debug;

   /* Used for creating the log string for gallium/ddebug. */
   bool is_debug_context;
};
359
/* Type of a color output. The values must fit in 2 bits, because
 * si_shader_info::output_color_types stores one of them per bit pair.
 */
enum si_color_output_type {
   SI_TYPE_ANY32 = 0,
   SI_TYPE_FLOAT16 = 1,
   SI_TYPE_INT16 = 2,
   SI_TYPE_UINT16 = 3,
};
366
/* Description of one shader input (see si_shader_info::input).
 *
 * The four byte-sized fields are overlaid with a uint32_t whose only purpose
 * is to force 4-byte alignment of the union.
 */
union si_input_info {
   struct {
      ubyte semantic;
      ubyte interpolate;
      ubyte fp16_lo_hi_valid;
      ubyte usage_mask;
   };
   uint32_t _unused; /* this just forces 4-byte alignment */
};
376
/* Information gathered about one shader. An instance is embedded in
 * si_shader_selector (the "info" member).
 */
struct si_shader_info {
   shader_info base;

   uint32_t options; /* bitmask of SI_PROFILE_* */

   /* Inputs and outputs; the arrays below are indexed by input/output slot. */
   ubyte num_inputs;
   ubyte num_outputs;
   union si_input_info input[PIPE_MAX_SHADER_INPUTS];
   ubyte output_semantic[PIPE_MAX_SHADER_OUTPUTS];
   ubyte output_usagemask[PIPE_MAX_SHADER_OUTPUTS];
   ubyte output_readmask[PIPE_MAX_SHADER_OUTPUTS];
   ubyte output_streams[PIPE_MAX_SHADER_OUTPUTS];
   ubyte output_type[PIPE_MAX_SHADER_OUTPUTS]; /* enum nir_alu_type */

   ubyte num_vs_inputs;
   ubyte num_vbos_in_user_sgprs;
   ubyte num_stream_output_components[4];
   uint16_t enabled_streamout_buffer_mask;

   uint64_t inputs_read; /* "get_unique_index" bits */
   uint64_t tcs_vgpr_only_inputs; /* TCS inputs that are only in VGPRs, not LDS. */

   uint64_t outputs_written_before_ps; /* "get_unique_index" bits */
   uint64_t outputs_written; /* "get_unique_index" bits */
   uint32_t patch_outputs_written; /* "get_unique_index_patch" bits */

   ubyte clipdist_mask;
   ubyte culldist_mask;

   uint16_t lshs_vertex_stride;
   uint16_t esgs_itemsize; /* vertex stride */
   uint16_t gsvs_vertex_size;
   ubyte gs_input_verts_per_prim;
   unsigned max_gsvs_emit_size;

   /* Set 0xf or 0x0 (4 bits) per each written output.
    * ANDed with spi_shader_col_format.
    */
   unsigned colors_written_4bit;

   int constbuf0_num_slots;
   uint num_memory_stores;
   ubyte color_attr_index[2];
   ubyte color_interpolate[2];
   ubyte color_interpolate_loc[2];
   ubyte colors_read; /**< which color components are read by the FS */
   ubyte colors_written;
   uint16_t output_color_types; /**< Each bit pair is enum si_color_output_type */
   bool vs_needs_prolog;
   bool color0_writes_all_cbufs; /**< gl_FragColor */
   bool reads_samplemask;   /**< does fragment shader read sample mask? */
   bool reads_tess_factors; /**< If TES reads TESSINNER or TESSOUTER */
   bool writes_z;           /**< does fragment shader write Z value? */
   bool writes_stencil;     /**< does fragment shader write stencil value? */
   bool writes_samplemask;  /**< does fragment shader write sample mask? */
   bool writes_edgeflag;    /**< vertex shader outputs edgeflag */
   bool uses_interp_color;
   bool uses_persp_center_color;
   bool uses_persp_centroid_color;
   bool uses_persp_sample_color;
   bool uses_persp_center;
   bool uses_persp_centroid;
   bool uses_persp_sample;
   bool uses_linear_center;
   bool uses_linear_centroid;
   bool uses_linear_sample;
   bool uses_interp_at_sample;
   bool uses_instanceid;
   bool uses_base_vertex;
   bool uses_base_instance;
   bool uses_drawid;
   bool uses_primid;
   bool uses_frontface;
   bool uses_invocationid;
   bool uses_thread_id[3];
   bool uses_block_id[3];
   bool uses_variable_block_size;
   bool uses_grid_size;
   bool uses_subgroup_info;
   bool writes_position;
   bool writes_psize;
   bool writes_clipvertex;
   bool writes_primid;
   bool writes_viewport_index;
   bool writes_layer;
   bool uses_bindless_samplers;
   bool uses_bindless_images;
   bool uses_indirect_descriptor;
   bool has_divergent_loop;

   bool uses_vmem_sampler_or_bvh;
   bool uses_vmem_load_other; /* all other VMEM loads and atomics with return */

   /** Whether all codepaths write tess factors in all invocations. */
   bool tessfactors_are_def_in_all_invocs;

   /* A flag to check if vrs2x2 can be enabled to reduce number of
    * fragment shader invocations if flat shading.
    */
   bool allow_flat_shading;

   /* Optimization: if the texture bound to this texunit has been cleared to 1,
    * then the draw can be skipped (see si_draw_vbo_skip_noop). Initially the
    * value is 0xff (undetermined) and can be later changed to 0 (= false) or
    * texunit + 1.
    */
   uint8_t writes_1_if_tex_is_1;
};
485
/* A shader selector is a gallium CSO and contains shader variants and
 * binaries for one NIR program. This can be shared by multiple contexts.
 */
struct si_shader_selector {
   struct util_live_shader base;
   struct si_screen *screen;
   struct util_queue_fence ready;
   struct si_compiler_ctx_state compiler_ctx_state;
   gl_shader_stage stage;

   /* Shader variants and their keys.
    * NOTE(review): presumably "keys" and "variants" are parallel arrays that
    * hold variants_count used entries out of variants_max_count allocated,
    * and are protected by "mutex" — confirm against si_state_shaders.
    */
   simple_mtx_t mutex;
   union si_shader_key *keys;
   unsigned variants_count;
   unsigned variants_max_count;
   struct si_shader **variants;

   /* The compiled NIR shader without a prolog and/or epilog (not
    * uploaded to a buffer object).
    */
   struct si_shader *main_shader_part;
   struct si_shader *main_shader_part_ls;     /* as_ls is set in the key */
   struct si_shader *main_shader_part_es;     /* as_es is set in the key */
   struct si_shader *main_shader_part_ngg;    /* as_ngg is set in the key */
   struct si_shader *main_shader_part_ngg_es; /* for Wave32 TES before legacy GS */

   /* The NIR program, and a binary blob of nir_size bytes.
    * NOTE(review): nir_binary looks like a serialized form of "nir" — confirm.
    */
   struct nir_shader *nir;
   void *nir_binary;
   unsigned nir_size;

   struct si_shader_info info;

   enum pipe_shader_type pipe_shader_type;
   ubyte const_and_shader_buf_descriptors_index;
   ubyte sampler_and_images_descriptors_index;
   ubyte cs_shaderbufs_sgpr_index;
   ubyte cs_num_shaderbufs_in_user_sgprs;
   ubyte cs_images_sgpr_index;
   ubyte cs_images_num_sgprs;
   ubyte cs_num_images_in_user_sgprs;
   unsigned ngg_cull_vert_threshold; /* UINT32_MAX = disabled */
   enum pipe_prim_type rast_prim;

   /* GS parameters. */
   bool tess_turns_off_ngg;

   /* bitmasks of used descriptor slots */
   uint64_t active_const_and_shader_buffers;
   uint64_t active_samplers_and_images;
};
535
/* Valid shader configurations:
 *
 * API shaders           VS | TCS | TES | GS |pass| PS
 * are compiled as:         |     |     |    |thru|
 *                          |     |     |    |    |
 * Only VS & PS:         VS |     |     |    |    | PS
 * GFX6     - with GS:   ES |     |     | GS | VS | PS
 *          - with tess: LS | HS  | VS  |    |    | PS
 *          - with both: LS | HS  | ES  | GS | VS | PS
 * GFX9     - with GS:   -> |     |     | GS | VS | PS
 *          - with tess: -> | HS  | VS  |    |    | PS
 *          - with both: -> | HS  | ->  | GS | VS | PS
 *                          |     |     |    |    |
 * NGG      - VS & PS:   GS |     |     |    |    | PS
 * (GFX10+) - with GS:   -> |     |     | GS |    | PS
 *          - with tess: -> | HS  | GS  |    |    | PS
 *          - with both: -> | HS  | ->  | GS |    | PS
 *
 * -> = merged with the next stage
 */
556
/* Pack all following structure members on 1-byte boundaries to minimize the
 * memory footprint of shader keys.
 */
560 #pragma pack(push, 1)
561
/* Common VS bits between the shader key and the prolog key.
 *
 * Embedded in si_shader_part_key::vs_prolog and in si_shader_key_ge::part.
 */
struct si_vs_prolog_bits {
   /* - If neither "is_one" nor "is_fetched" has a bit set, the instance
    *   divisor is 0.
    * - If "is_one" has a bit set, the instance divisor is 1.
    * - If "is_fetched" has a bit set, the instance divisor will be loaded
    *   from the constant buffer.
    */
   uint16_t instance_divisor_is_one;     /* bitmask of inputs */
   uint16_t instance_divisor_is_fetched; /* bitmask of inputs */
   unsigned ls_vgpr_fix : 1;
};
574
/* Common TCS bits between the shader key and the epilog key.
 *
 * Embedded in si_shader_part_key::tcs_epilog and in
 * si_shader_key_ge::part.tcs.
 */
struct si_tcs_epilog_bits {
   unsigned prim_mode : 3;
   unsigned invoc0_tess_factors_are_def : 1;
   unsigned tes_reads_tess_factors : 1;
};
581
/* Common PS bits between the shader key and the prolog key.
 *
 * Embedded in si_shader_part_key::ps_prolog and in si_shader_key_ps::part.
 */
struct si_ps_prolog_bits {
   unsigned color_two_side : 1;
   unsigned flatshade_colors : 1;
   unsigned poly_stipple : 1;
   unsigned force_persp_sample_interp : 1;
   unsigned force_linear_sample_interp : 1;
   unsigned force_persp_center_interp : 1;
   unsigned force_linear_center_interp : 1;
   unsigned bc_optimize_for_persp : 1;
   unsigned bc_optimize_for_linear : 1;
   unsigned samplemask_log_ps_iter : 3;
};
595
/* Common PS bits between the shader key and the epilog key.
 *
 * Embedded in si_shader_part_key::ps_epilog and in si_shader_key_ps::part.
 */
struct si_ps_epilog_bits {
   unsigned spi_shader_col_format;
   unsigned color_is_int8 : 8;
   unsigned color_is_int10 : 8;
   unsigned last_cbuf : 3;
   unsigned alpha_func : 3;
   unsigned alpha_to_one : 1;
   unsigned alpha_to_coverage_via_mrtz : 1; /* gfx11+ */
   unsigned clamp_color : 1;
   unsigned dual_src_blend_swizzle : 1; /* gfx11+ */
};
608
/* Key identifying one shader part (prolog or epilog); stored in
 * si_shader_part::key. Exactly one member of the union is meaningful,
 * depending on which kind of part the key describes.
 */
union si_shader_part_key {
   /* Vertex shader prolog. */
   struct {
      struct si_vs_prolog_bits states;
      unsigned wave32 : 1;
      unsigned num_input_sgprs : 6;
      /* For merged stages such as LS-HS, HS input VGPRs are first. */
      unsigned num_merged_next_stage_vgprs : 3;
      unsigned num_inputs : 5;
      unsigned as_ls : 1;
      unsigned as_es : 1;
      unsigned as_ngg : 1;
      unsigned load_vgprs_after_culling : 1;
      /* Prologs for monolithic shaders shouldn't set EXEC. */
      unsigned is_monolithic : 1;
   } vs_prolog;
   /* Tessellation control shader epilog. */
   struct {
      struct si_tcs_epilog_bits states;
      unsigned wave32 : 1;
      unsigned noop_s_barrier : 1;
   } tcs_epilog;
   /* Pixel shader prolog. */
   struct {
      struct si_ps_prolog_bits states;
      unsigned wave32 : 1;
      unsigned num_input_sgprs : 6;
      unsigned num_input_vgprs : 5;
      /* Color interpolation and two-side color selection. */
      unsigned colors_read : 8;       /* color input components read */
      unsigned num_interp_inputs : 5; /* BCOLOR is at this location */
      unsigned face_vgpr_index : 5;
      unsigned ancillary_vgpr_index : 5;
      unsigned sample_coverage_vgpr_index : 5;
      unsigned wqm : 1;
      char color_attr_index[2];
      signed char color_interp_vgpr_index[2]; /* -1 == constant */
   } ps_prolog;
   /* Pixel shader epilog. */
   struct {
      struct si_ps_epilog_bits states;
      unsigned wave32 : 1;
      unsigned uses_discard : 1;
      unsigned colors_written : 8;
      unsigned color_types : 16;
      unsigned writes_z : 1;
      unsigned writes_stencil : 1;
      unsigned writes_samplemask : 1;
   } ps_epilog;
};
655
/* The shader key for geometry stages (VS, TCS, TES, GS)
 *
 * The key is split into three groups: "part" (prolog/epilog flags),
 * "mono" (flags for monolithic compilation only) and "opt" (optimization
 * flags for asynchronous compilation only).
 */
struct si_shader_key_ge {
   /* Prolog and epilog flags. */
   union {
      struct {
         struct si_vs_prolog_bits prolog;
      } vs;
      struct {
         struct si_vs_prolog_bits ls_prolog; /* for merged LS-HS */
         struct si_shader_selector *ls;      /* for merged LS-HS */
         struct si_tcs_epilog_bits epilog;
      } tcs; /* tessellation control shader */
      struct {
         struct si_vs_prolog_bits vs_prolog; /* for merged ES-GS */
         struct si_shader_selector *es;      /* for merged ES-GS */
      } gs;
   } part;

   /* These three are initially set according to the NEXT_SHADER property,
    * or guessed if the property doesn't seem correct.
    */
   unsigned as_es : 1;  /* whether it's a shader before GS */
   unsigned as_ls : 1;  /* whether it's VS before TCS */
   unsigned as_ngg : 1; /* whether it's the last GE stage and NGG is enabled,
                           also set for the stage right before GS */

   /* Flags for monolithic compilation only. */
   struct {
      /* Whether fetch should be opencoded according to vs_fix_fetch.
       * Otherwise, if vs_fix_fetch is non-zero, buffer_load_format_xyzw
       * with minimal fixups is used. */
      uint16_t vs_fetch_opencode;
      union si_vs_fix_fetch vs_fix_fetch[SI_MAX_ATTRIBS];

      /* The two flags share the same bit; which one applies depends on
       * the shader stage.
       */
      union {
         /* When PS needs PrimID and GS is disabled. */
         unsigned vs_export_prim_id : 1;    /* VS and TES only */
         unsigned gs_tri_strip_adj_fix : 1; /* GS only */
      } u;
   } mono;

   /* Optimization flags for asynchronous compilation only. */
   struct {
      /* For HW VS (it can be VS, TES, GS) */
      uint64_t kill_outputs; /* "get_unique_index" bits */
      unsigned kill_clip_distances : 8;
      unsigned kill_pointsize : 1;
      unsigned remove_streamout : 1;

      /* For NGG VS and TES. */
      unsigned ngg_culling : 13; /* SI_NGG_CULL_* */

      /* For shaders where monolithic variants have better code.
       *
       * This is a flag that has no effect on code generation,
       * but forces monolithic shaders to be used as soon as
       * possible, because it's in the "opt" group.
       */
      unsigned prefer_mono : 1;

      /* VS and TCS have the same number of patch vertices. */
      unsigned same_patch_vertices:1;

      unsigned inline_uniforms:1;

      /* This must be kept last to limit the number of variants
       * depending only on the uniform values.
       */
      uint32_t inlined_uniform_values[MAX_INLINABLE_UNIFORMS];
   } opt;
};
727
/* The shader key for pixel shaders.
 *
 * Like si_shader_key_ge, it is split into "part" (prolog/epilog flags),
 * "mono" (monolithic compilation only) and "opt" (asynchronous compilation
 * only).
 */
struct si_shader_key_ps {
   struct {
      /* Prolog and epilog flags. */
      struct si_ps_prolog_bits prolog;
      struct si_ps_epilog_bits epilog;
   } part;

   /* Flags for monolithic compilation only. */
   struct {
      unsigned poly_line_smoothing : 1;
      unsigned point_smoothing : 1;
      unsigned interpolate_at_sample_force_center : 1;
      unsigned fbfetch_msaa : 1;
      unsigned fbfetch_is_1D : 1;
      unsigned fbfetch_layered : 1;
   } mono;

   /* Optimization flags for asynchronous compilation only. */
   struct {
      /* For shaders where monolithic variants have better code.
       *
       * This is a flag that has no effect on code generation,
       * but forces monolithic shaders to be used as soon as
       * possible, because it's in the "opt" group.
       */
      unsigned prefer_mono : 1;
      unsigned inline_uniforms:1;

      /* This must be kept last to limit the number of variants
       * depending only on the uniform values.
       */
      uint32_t inlined_uniform_values[MAX_INLINABLE_UNIFORMS];
   } opt;
};
762
/* The key of a shader variant: the geometry-engine layout or the pixel
 * shader layout, overlaid in the same storage.
 */
union si_shader_key {
   struct si_shader_key_ge ge; /* geometry engine shaders */
   struct si_shader_key_ps ps; /* pixel shaders */
};
767
768 /* Restore the pack alignment to default. */
769 #pragma pack(pop)
770
/* GCN-specific shader info. Stored per compiled shader in si_shader::info. */
struct si_shader_binary_info {
   ubyte vs_output_param_offset[NUM_TOTAL_VARYING_SLOTS];
   uint64_t vs_output_param_mask; /* which params to export, indexed by "base" */
   uint32_t vs_output_ps_input_cntl[NUM_TOTAL_VARYING_SLOTS];
   ubyte num_input_sgprs;
   ubyte num_input_vgprs;
   bool uses_vmem_load_other; /* all other VMEM loads and atomics with return */
   bool uses_vmem_sampler_or_bvh;
   /* Signed VGPR indices.
    * NOTE(review): presumably -1 means "not present" — confirm in si_shader.c.
    */
   signed char face_vgpr_index;
   signed char ancillary_vgpr_index;
   signed char sample_coverage_vgpr_index;
   bool uses_instanceid;
   ubyte nr_pos_exports;
   ubyte nr_param_exports;
   unsigned private_mem_vgprs;
   unsigned max_simd_waves;
};
789
/* The binary of a shader or shader part: an ELF image, an optional copy of
 * the uploaded code, and an optional LLVM IR string.
 * NOTE(review): ownership and lifetime of these buffers are not visible in
 * this header — confirm in si_shader.c before freeing or sharing them.
 */
struct si_shader_binary {
   const char *elf_buffer;
   size_t elf_size; /* size of elf_buffer */

   char *uploaded_code;
   size_t uploaded_code_size; /* size of uploaded_code */

   char *llvm_ir_string;
};
799
/* Subgroup sizing information for GS; stored in si_shader::gs_info. */
struct gfx9_gs_info {
   unsigned es_verts_per_subgroup;
   unsigned gs_prims_per_subgroup;
   unsigned gs_inst_prims_in_subgroup;
   unsigned max_prims_per_subgroup;
   unsigned esgs_ring_size; /* in bytes */
};
807
/* Number of bits in si_vgt_stages_key, and the number of distinct key values
 * (the size of a table indexed by the key).
 */
#define SI_NUM_VGT_STAGES_KEY_BITS 8
#define SI_NUM_VGT_STAGES_STATES   (1 << SI_NUM_VGT_STAGES_KEY_BITS)

/* The VGT_SHADER_STAGES key used to index the table of precomputed values.
 * Some fields are set by state-change calls, most are set by draw_vbo.
 *
 * The big-endian branch declares the same bit-fields in reverse order.
 * NOTE(review): presumably so that "index" has the same value for the same
 * flags on both endiannesses — confirm.
 */
union si_vgt_stages_key {
   struct {
#if UTIL_ARCH_LITTLE_ENDIAN
      uint8_t tess : 1;
      uint8_t gs : 1;
      uint8_t ngg_passthrough : 1;
      uint8_t ngg : 1;       /* gfx10+ */
      uint8_t streamout : 1; /* only used with NGG */
      uint8_t hs_wave32 : 1;
      uint8_t gs_wave32 : 1;
      uint8_t vs_wave32 : 1;
#else /* UTIL_ARCH_BIG_ENDIAN */
      uint8_t vs_wave32 : 1;
      uint8_t gs_wave32 : 1;
      uint8_t hs_wave32 : 1;
      uint8_t streamout : 1;
      uint8_t ngg : 1;
      uint8_t ngg_passthrough : 1;
      uint8_t gs : 1;
      uint8_t tess : 1;
#endif
   } u;
   uint8_t index; /* all flags as one byte */
};
838
/* One compiled shader variant (or main shader part) of a shader selector.
 *
 * It starts with a si_pm4_state, which serves as the base class.
 */
struct si_shader {
   struct si_pm4_state pm4; /* base class */
   struct si_compiler_ctx_state compiler_ctx_state;

   struct si_shader_selector *selector;
   struct si_shader_selector *previous_stage_sel; /* for refcounting */

   /* Shader parts that this shader is combined with. */
   struct si_shader_part *prolog;
   struct si_shader *previous_stage; /* for GFX9 */
   struct si_shader_part *epilog;
   struct si_shader *gs_copy_shader;

   struct si_resource *bo;
   struct si_resource *scratch_bo;
   union si_shader_key key;
   struct util_queue_fence ready;
   bool compilation_failed;
   bool is_monolithic;
   bool is_optimized;
   bool is_binary_shared;
   bool is_gs_copy_shader;
   uint8_t wave_size;

   /* The following data is all that's needed for binary shaders. */
   struct si_shader_binary binary;
   struct ac_shader_config config;
   struct si_shader_binary_info info;

   /* SI_SGPR_VS_STATE_BITS */
   bool uses_vs_state_provoking_vertex;
   bool uses_gs_state_outprim;

   bool uses_base_instance;

   struct {
      uint16_t ngg_emit_size; /* in dwords */
      uint16_t hw_max_esverts;
      uint16_t max_gsprims;
      uint16_t max_out_verts;
      uint16_t prim_amp_factor;
      bool max_vert_out_per_gs_instance;
   } ngg;

   /* Shader key + LLVM IR + disassembly + statistics.
    * Generated for debug contexts only.
    */
   char *shader_log;
   size_t shader_log_size;

   struct gfx9_gs_info gs_info;

   /* Precomputed context register values, saved here. Only one member of
    * the union is used for a given shader.
    */
   union {
      struct {
         unsigned vgt_gsvs_ring_offset_1;
         unsigned vgt_gsvs_ring_offset_2;
         unsigned vgt_gsvs_ring_offset_3;
         unsigned vgt_gsvs_ring_itemsize;
         unsigned vgt_gs_max_vert_out;
         unsigned vgt_gs_vert_itemsize;
         unsigned vgt_gs_vert_itemsize_1;
         unsigned vgt_gs_vert_itemsize_2;
         unsigned vgt_gs_vert_itemsize_3;
         unsigned vgt_gs_instance_cnt;
         unsigned vgt_gs_onchip_cntl;
         unsigned vgt_gs_max_prims_per_subgroup;
         unsigned vgt_esgs_ring_itemsize;
         unsigned spi_shader_pgm_rsrc3_gs;
         unsigned spi_shader_pgm_rsrc4_gs;
      } gs;

      struct {
         unsigned ge_max_output_per_subgroup;
         unsigned ge_ngg_subgrp_cntl;
         unsigned vgt_primitiveid_en;
         unsigned vgt_gs_onchip_cntl;
         unsigned vgt_gs_instance_cnt;
         unsigned vgt_esgs_ring_itemsize;
         unsigned spi_vs_out_config;
         unsigned spi_shader_idx_format;
         unsigned spi_shader_pos_format;
         unsigned pa_cl_vte_cntl;
         unsigned pa_cl_ngg_cntl;
         unsigned vgt_gs_max_vert_out; /* for API GS */
         unsigned ge_pc_alloc;         /* uconfig register */
         unsigned spi_shader_pgm_rsrc3_gs;
         unsigned spi_shader_pgm_rsrc4_gs;
         union si_vgt_stages_key vgt_stages;
      } ngg;

      struct {
         unsigned vgt_gs_mode;
         unsigned vgt_primitiveid_en;
         unsigned vgt_reuse_off;
         unsigned spi_vs_out_config;
         unsigned spi_shader_pos_format;
         unsigned pa_cl_vte_cntl;
         unsigned ge_pc_alloc; /* uconfig register */
      } vs;

      struct {
         unsigned spi_ps_input_ena;
         unsigned spi_ps_input_addr;
         unsigned spi_baryc_cntl;
         unsigned spi_ps_in_control;
         unsigned spi_shader_z_format;
         unsigned spi_shader_col_format;
         unsigned cb_shader_mask;
         unsigned db_shader_control;
         unsigned num_interp;
      } ps;
   } ctx_reg;

   /* Other precomputed register values, saved here. */
   unsigned vgt_tf_param;                /* VGT_TF_PARAM */
   unsigned vgt_vertex_reuse_block_cntl; /* VGT_VERTEX_REUSE_BLOCK_CNTL */
   unsigned pa_cl_vs_out_cntl;
   unsigned ge_cntl;
};
958
/* A shader part carrying its own key, binary and config.
 * si_shader::prolog and si_shader::epilog point to objects of this type.
 */
struct si_shader_part {
   struct si_shader_part *next; /* presumably chains parts into a list; confirm at the users */
   union si_shader_part_key key;
   struct si_shader_binary binary;
   struct ac_shader_config config;
};
965
/* Declarations of functions defined in other source files, grouped by the
 * file named in each section comment.
 * NOTE(review): only the prototypes are visible here; any remark below about
 * what a function does is an assumption to confirm against its definition.
 */

/* si_shader.c */
void si_update_shader_binary_info(struct si_shader *shader, nir_shader *nir);
/* The two functions below return bool; presumably true on success (confirm). */
bool si_compile_shader(struct si_screen *sscreen, struct ac_llvm_compiler *compiler,
                       struct si_shader *shader, struct util_debug_callback *debug);
bool si_create_shader_variant(struct si_screen *sscreen, struct ac_llvm_compiler *compiler,
                              struct si_shader *shader, struct util_debug_callback *debug);
void si_shader_destroy(struct si_shader *shader);
unsigned si_shader_io_get_unique_index_patch(unsigned semantic);
unsigned si_shader_io_get_unique_index(unsigned semantic, bool is_varying);
bool si_shader_binary_upload(struct si_screen *sscreen, struct si_shader *shader,
                             uint64_t scratch_va);
void si_shader_dump(struct si_screen *sscreen, struct si_shader *shader,
                    struct util_debug_callback *debug, FILE *f, bool check_debug_option);
void si_shader_dump_stats_for_shader_db(struct si_screen *screen, struct si_shader *shader,
                                        struct util_debug_callback *debug);
/* lds_size is a non-const pointer; presumably adjusted in place (confirm). */
void si_multiwave_lds_size_workaround(struct si_screen *sscreen, unsigned *lds_size);
const char *si_get_shader_name(const struct si_shader *shader);
void si_shader_binary_clean(struct si_shader_binary *binary);
struct nir_shader *si_deserialize_shader(struct si_shader_selector *sel);
unsigned si_get_ps_num_interp(struct si_shader *ps);

/* si_shader_info.c */
void si_nir_scan_shader(struct si_screen *sscreen, const struct nir_shader *nir,
                        struct si_shader_info *info);

/* si_shader_llvm_gs.c */
struct si_shader *si_generate_gs_copy_shader(struct si_screen *sscreen,
                                             struct ac_llvm_compiler *compiler,
                                             struct si_shader_selector *gs_selector,
                                             const struct pipe_stream_output_info *so,
                                             struct util_debug_callback *debug);

/* si_shader_nir.c */
void si_nir_opts(struct si_screen *sscreen, struct nir_shader *nir, bool first);
void si_nir_late_opts(nir_shader *nir);
char *si_finalize_nir(struct pipe_screen *screen, void *nirptr);

/* si_state_shaders.cpp */
unsigned si_determine_wave_size(struct si_screen *sscreen, struct si_shader *shader);
/* Result is written through the "out" pointer (the function returns void). */
void gfx9_get_gs_info(struct si_shader_selector *es, struct si_shader_selector *gs,
                      struct gfx9_gs_info *out);
bool gfx10_is_ngg_passthrough(struct si_shader *shader);
1008
1009 /* Inline helpers. */
1010
1011 /* Return the pointer to the main shader part's pointer. */
si_get_main_shader_part(struct si_shader_selector * sel,const union si_shader_key * key)1012 static inline struct si_shader **si_get_main_shader_part(struct si_shader_selector *sel,
1013 const union si_shader_key *key)
1014 {
1015 if (sel->stage <= MESA_SHADER_GEOMETRY) {
1016 if (key->ge.as_ls)
1017 return &sel->main_shader_part_ls;
1018 if (key->ge.as_es && key->ge.as_ngg)
1019 return &sel->main_shader_part_ngg_es;
1020 if (key->ge.as_es)
1021 return &sel->main_shader_part_es;
1022 if (key->ge.as_ngg)
1023 return &sel->main_shader_part_ngg;
1024 }
1025 return &sel->main_shader_part;
1026 }
1027
si_shader_uses_bindless_samplers(struct si_shader_selector * selector)1028 static inline bool si_shader_uses_bindless_samplers(struct si_shader_selector *selector)
1029 {
1030 return selector ? selector->info.uses_bindless_samplers : false;
1031 }
1032
si_shader_uses_bindless_images(struct si_shader_selector * selector)1033 static inline bool si_shader_uses_bindless_images(struct si_shader_selector *selector)
1034 {
1035 return selector ? selector->info.uses_bindless_images : false;
1036 }
1037
gfx10_edgeflags_have_effect(struct si_shader * shader)1038 static inline bool gfx10_edgeflags_have_effect(struct si_shader *shader)
1039 {
1040 if (shader->selector->stage == MESA_SHADER_VERTEX &&
1041 !shader->selector->info.base.vs.blit_sgprs_amd &&
1042 !(shader->key.ge.opt.ngg_culling & SI_NGG_CULL_LINES))
1043 return true;
1044
1045 return false;
1046 }
1047
gfx10_ngg_writes_user_edgeflags(struct si_shader * shader)1048 static inline bool gfx10_ngg_writes_user_edgeflags(struct si_shader *shader)
1049 {
1050 return gfx10_edgeflags_have_effect(shader) &&
1051 shader->selector->info.writes_edgeflag;
1052 }
1053
si_shader_uses_streamout(struct si_shader * shader)1054 static inline bool si_shader_uses_streamout(struct si_shader *shader)
1055 {
1056 return shader->selector->stage <= MESA_SHADER_GEOMETRY &&
1057 shader->selector->info.enabled_streamout_buffer_mask &&
1058 !shader->key.ge.opt.remove_streamout;
1059 }
1060
si_shader_uses_discard(struct si_shader * shader)1061 static inline bool si_shader_uses_discard(struct si_shader *shader)
1062 {
1063 /* Changes to this should also update ps_modifies_zs. */
1064 return shader->selector->info.base.fs.uses_discard ||
1065 shader->key.ps.part.prolog.poly_stipple ||
1066 shader->key.ps.mono.point_smoothing ||
1067 shader->key.ps.part.epilog.alpha_func != PIPE_FUNC_ALWAYS;
1068 }
1069
1070 #ifdef __cplusplus
1071 }
1072 #endif
1073
1074 #endif
1075