1 /* 2 * Copyright 2012 Advanced Micro Devices, Inc. 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * on the rights to use, copy, modify, merge, publish, distribute, sub 8 * license, and/or sell copies of the Software, and to permit persons to whom 9 * the Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, 19 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR 20 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE 21 * USE OR OTHER DEALINGS IN THE SOFTWARE. 22 * 23 * Authors: 24 * Tom Stellard <thomas.stellard@amd.com> 25 * Michel Dänzer <michel.daenzer@amd.com> 26 * Christian König <christian.koenig@amd.com> 27 */ 28 29 /* How linking shader inputs and outputs between vertex, tessellation, and 30 * geometry shaders works. 31 * 32 * Inputs and outputs between shaders are stored in a buffer. This buffer 33 * lives in LDS (typical case for tessellation), but it can also live 34 * in memory (ESGS). Each input or output has a fixed location within a vertex. 35 * The highest used input or output determines the stride between vertices. 36 * 37 * Since GS and tessellation are only possible in the OpenGL core profile, 38 * only these semantics are valid for per-vertex data: 39 * 40 * Name Location 41 * 42 * POSITION 0 43 * PSIZE 1 44 * CLIPDIST0..1 2..3 45 * CULLDIST0..1 (not implemented) 46 * GENERIC0..31 4..35 47 * 48 * For example, a shader only writing GENERIC0 has the output stride of 5. 49 * 50 * Only these semantics are valid for per-patch data: 51 * 52 * Name Location 53 * 54 * TESSOUTER 0 55 * TESSINNER 1 56 * PATCH0..29 2..31 57 * 58 * That's how independent shaders agree on input and output locations. 59 * The si_shader_io_get_unique_index function assigns the locations. 60 * 61 * For tessellation, other required information for calculating the input and 62 * output addresses like the vertex stride, the patch stride, and the offsets 63 * where per-vertex and per-patch data start, is passed to the shader via 64 * user data SGPRs. The offsets and strides are calculated at draw time and 65 * aren't available at compile time. 66 */ 67 68 #ifndef SI_SHADER_H 69 #define SI_SHADER_H 70 71 #include <llvm-c/Core.h> /* LLVMModuleRef */ 72 #include <llvm-c/TargetMachine.h> 73 #include "tgsi/tgsi_scan.h" 74 #include "util/u_queue.h" 75 #include "si_state.h" 76 77 struct radeon_shader_binary; 78 struct radeon_shader_reloc; 79 80 #define SI_MAX_VS_OUTPUTS 40 81 82 /* SGPR user data indices */ 83 enum { 84 SI_SGPR_RW_BUFFERS, /* rings (& stream-out, VS only) */ 85 SI_SGPR_RW_BUFFERS_HI, 86 SI_SGPR_CONST_BUFFERS, 87 SI_SGPR_CONST_BUFFERS_HI, 88 SI_SGPR_SAMPLERS, /* images & sampler states interleaved */ 89 SI_SGPR_SAMPLERS_HI, 90 SI_SGPR_IMAGES, 91 SI_SGPR_IMAGES_HI, 92 SI_SGPR_SHADER_BUFFERS, 93 SI_SGPR_SHADER_BUFFERS_HI, 94 SI_NUM_RESOURCE_SGPRS, 95 96 /* all VS variants */ 97 SI_SGPR_VERTEX_BUFFERS = SI_NUM_RESOURCE_SGPRS, 98 SI_SGPR_VERTEX_BUFFERS_HI, 99 SI_SGPR_BASE_VERTEX, 100 SI_SGPR_START_INSTANCE, 101 SI_SGPR_DRAWID, 102 SI_ES_NUM_USER_SGPR, 103 104 /* hw VS only */ 105 SI_SGPR_VS_STATE_BITS = SI_ES_NUM_USER_SGPR, 106 SI_VS_NUM_USER_SGPR, 107 108 /* hw LS only */ 109 SI_SGPR_LS_OUT_LAYOUT = SI_ES_NUM_USER_SGPR, 110 SI_LS_NUM_USER_SGPR, 111 112 /* both TCS and TES */ 113 SI_SGPR_TCS_OFFCHIP_LAYOUT = SI_NUM_RESOURCE_SGPRS, 114 SI_TES_NUM_USER_SGPR, 115 116 /* TCS only */ 117 SI_SGPR_TCS_OUT_OFFSETS = SI_TES_NUM_USER_SGPR, 118 SI_SGPR_TCS_OUT_LAYOUT, 119 SI_SGPR_TCS_IN_LAYOUT, 120 SI_TCS_NUM_USER_SGPR, 121 122 /* GS limits */ 123 SI_GS_NUM_USER_SGPR = SI_NUM_RESOURCE_SGPRS, 124 SI_GSCOPY_NUM_USER_SGPR = SI_SGPR_RW_BUFFERS_HI + 1, 125 126 /* PS only */ 127 SI_SGPR_ALPHA_REF = SI_NUM_RESOURCE_SGPRS, 128 SI_PS_NUM_USER_SGPR, 129 130 /* CS only */ 131 SI_SGPR_GRID_SIZE = SI_NUM_RESOURCE_SGPRS, 132 SI_SGPR_BLOCK_SIZE = SI_SGPR_GRID_SIZE + 3, 133 SI_CS_NUM_USER_SGPR = SI_SGPR_BLOCK_SIZE + 3 134 }; 135 136 /* LLVM function parameter indices */ 137 enum { 138 SI_PARAM_RW_BUFFERS, 139 SI_PARAM_CONST_BUFFERS, 140 SI_PARAM_SAMPLERS, 141 SI_PARAM_IMAGES, 142 SI_PARAM_SHADER_BUFFERS, 143 SI_NUM_RESOURCE_PARAMS, 144 145 /* VS only parameters */ 146 SI_PARAM_VERTEX_BUFFERS = SI_NUM_RESOURCE_PARAMS, 147 SI_PARAM_BASE_VERTEX, 148 SI_PARAM_START_INSTANCE, 149 SI_PARAM_DRAWID, 150 /* [0] = clamp vertex color, VS as VS only */ 151 SI_PARAM_VS_STATE_BITS, 152 /* same value as TCS_IN_LAYOUT, VS as LS only */ 153 SI_PARAM_LS_OUT_LAYOUT = SI_PARAM_DRAWID + 1, 154 /* the other VS parameters are assigned dynamically */ 155 156 /* Layout of TCS outputs in the offchip buffer 157 * [0:8] = the number of patches per threadgroup. 158 * [9:15] = the number of output vertices per patch. 159 * [16:31] = the offset of per patch attributes in the buffer in bytes. 160 */ 161 SI_PARAM_TCS_OFFCHIP_LAYOUT = SI_NUM_RESOURCE_PARAMS, /* for TCS & TES */ 162 163 /* TCS only parameters. */ 164 165 /* Offsets where TCS outputs and TCS patch outputs live in LDS: 166 * [0:15] = TCS output patch0 offset / 16, max = NUM_PATCHES * 32 * 32 167 * [16:31] = TCS output patch0 offset for per-patch / 16, max = NUM_PATCHES*32*32* + 32*32 168 */ 169 SI_PARAM_TCS_OUT_OFFSETS, 170 171 /* Layout of TCS outputs / TES inputs: 172 * [0:12] = stride between output patches in dwords, num_outputs * num_vertices * 4, max = 32*32*4 173 * [13:20] = stride between output vertices in dwords = num_inputs * 4, max = 32*4 174 * [26:31] = gl_PatchVerticesIn, max = 32 175 */ 176 SI_PARAM_TCS_OUT_LAYOUT, 177 178 /* Layout of LS outputs / TCS inputs 179 * [0:12] = stride between patches in dwords = num_inputs * num_vertices * 4, max = 32*32*4 180 * [13:20] = stride between vertices in dwords = num_inputs * 4, max = 32*4 181 */ 182 SI_PARAM_TCS_IN_LAYOUT, 183 184 SI_PARAM_TCS_OC_LDS, 185 SI_PARAM_TESS_FACTOR_OFFSET, 186 SI_PARAM_PATCH_ID, 187 SI_PARAM_REL_IDS, 188 189 /* GS only parameters */ 190 SI_PARAM_GS2VS_OFFSET = SI_NUM_RESOURCE_PARAMS, 191 SI_PARAM_GS_WAVE_ID, 192 SI_PARAM_VTX0_OFFSET, 193 SI_PARAM_VTX1_OFFSET, 194 SI_PARAM_PRIMITIVE_ID, 195 SI_PARAM_VTX2_OFFSET, 196 SI_PARAM_VTX3_OFFSET, 197 SI_PARAM_VTX4_OFFSET, 198 SI_PARAM_VTX5_OFFSET, 199 SI_PARAM_GS_INSTANCE_ID, 200 201 /* PS only parameters */ 202 SI_PARAM_ALPHA_REF = SI_NUM_RESOURCE_PARAMS, 203 SI_PARAM_PRIM_MASK, 204 SI_PARAM_PERSP_SAMPLE, 205 SI_PARAM_PERSP_CENTER, 206 SI_PARAM_PERSP_CENTROID, 207 SI_PARAM_PERSP_PULL_MODEL, 208 SI_PARAM_LINEAR_SAMPLE, 209 SI_PARAM_LINEAR_CENTER, 210 SI_PARAM_LINEAR_CENTROID, 211 SI_PARAM_LINE_STIPPLE_TEX, 212 SI_PARAM_POS_X_FLOAT, 213 SI_PARAM_POS_Y_FLOAT, 214 SI_PARAM_POS_Z_FLOAT, 215 SI_PARAM_POS_W_FLOAT, 216 SI_PARAM_FRONT_FACE, 217 SI_PARAM_ANCILLARY, 218 SI_PARAM_SAMPLE_COVERAGE, 219 SI_PARAM_POS_FIXED_PT, 220 221 /* CS only parameters */ 222 SI_PARAM_GRID_SIZE = SI_NUM_RESOURCE_PARAMS, 223 SI_PARAM_BLOCK_SIZE, 224 SI_PARAM_BLOCK_ID, 225 SI_PARAM_THREAD_ID, 226 227 SI_NUM_PARAMS = SI_PARAM_POS_FIXED_PT + 9, /* +8 for COLOR[0..1] */ 228 }; 229 230 /* SI-specific system values. */ 231 enum { 232 TGSI_SEMANTIC_DEFAULT_TESSOUTER_SI = TGSI_SEMANTIC_COUNT, 233 TGSI_SEMANTIC_DEFAULT_TESSINNER_SI, 234 }; 235 236 /* For VS shader key fix_fetch. */ 237 enum { 238 SI_FIX_FETCH_NONE = 0, 239 SI_FIX_FETCH_A2_SNORM, 240 SI_FIX_FETCH_A2_SSCALED, 241 SI_FIX_FETCH_A2_SINT, 242 SI_FIX_FETCH_RGBA_32_UNORM, 243 SI_FIX_FETCH_RGBX_32_UNORM, 244 SI_FIX_FETCH_RGBA_32_SNORM, 245 SI_FIX_FETCH_RGBX_32_SNORM, 246 SI_FIX_FETCH_RGBA_32_USCALED, 247 SI_FIX_FETCH_RGBA_32_SSCALED, 248 SI_FIX_FETCH_RGBA_32_FIXED, 249 SI_FIX_FETCH_RGBX_32_FIXED, 250 }; 251 252 struct si_shader; 253 254 /* State of the context creating the shader object. */ 255 struct si_compiler_ctx_state { 256 /* Should only be used by si_init_shader_selector_async and 257 * si_build_shader_variant if thread_index == -1 (non-threaded). */ 258 LLVMTargetMachineRef tm; 259 260 /* Used if thread_index == -1 or if debug.async is true. */ 261 struct pipe_debug_callback debug; 262 263 /* Used for creating the log string for gallium/ddebug. */ 264 bool is_debug_context; 265 }; 266 267 /* A shader selector is a gallium CSO and contains shader variants and 268 * binaries for one TGSI program. This can be shared by multiple contexts. 269 */ 270 struct si_shader_selector { 271 struct si_screen *screen; 272 struct util_queue_fence ready; 273 struct si_compiler_ctx_state compiler_ctx_state; 274 275 pipe_mutex mutex; 276 struct si_shader *first_variant; /* immutable after the first variant */ 277 struct si_shader *last_variant; /* mutable */ 278 279 /* The compiled TGSI shader expecting a prolog and/or epilog (not 280 * uploaded to a buffer). 281 */ 282 struct si_shader *main_shader_part; 283 284 struct si_shader *gs_copy_shader; 285 286 struct tgsi_token *tokens; 287 struct pipe_stream_output_info so; 288 struct tgsi_shader_info info; 289 290 /* PIPE_SHADER_[VERTEX|FRAGMENT|...] */ 291 unsigned type; 292 293 /* GS parameters. */ 294 unsigned esgs_itemsize; 295 unsigned gs_input_verts_per_prim; 296 unsigned gs_output_prim; 297 unsigned gs_max_out_vertices; 298 unsigned gs_num_invocations; 299 unsigned max_gs_stream; /* count - 1 */ 300 unsigned gsvs_vertex_size; 301 unsigned max_gsvs_emit_size; 302 303 /* PS parameters. */ 304 unsigned color_attr_index[2]; 305 unsigned db_shader_control; 306 /* Set 0xf or 0x0 (4 bits) per each written output. 307 * ANDed with spi_shader_col_format. 308 */ 309 unsigned colors_written_4bit; 310 311 /* CS parameters */ 312 unsigned local_size; 313 314 uint64_t outputs_written; /* "get_unique_index" bits */ 315 uint32_t patch_outputs_written; /* "get_unique_index" bits */ 316 uint32_t outputs_written2; /* "get_unique_index2" bits */ 317 318 uint64_t inputs_read; /* "get_unique_index" bits */ 319 uint32_t inputs_read2; /* "get_unique_index2" bits */ 320 }; 321 322 /* Valid shader configurations: 323 * 324 * API shaders VS | TCS | TES | GS |pass| PS 325 * are compiled as: | | | |thru| 326 * | | | | | 327 * Only VS & PS: VS | -- | -- | -- | -- | PS 328 * With GS: ES | -- | -- | GS | VS | PS 329 * With Tessel.: LS | HS | VS | -- | -- | PS 330 * With both: LS | HS | ES | GS | VS | PS 331 */ 332 333 /* Common VS bits between the shader key and the prolog key. */ 334 struct si_vs_prolog_bits { 335 unsigned instance_divisors[SI_NUM_VERTEX_BUFFERS]; 336 }; 337 338 /* Common VS bits between the shader key and the epilog key. */ 339 struct si_vs_epilog_bits { 340 unsigned export_prim_id:1; /* when PS needs it and GS is disabled */ 341 }; 342 343 /* Common TCS bits between the shader key and the epilog key. */ 344 struct si_tcs_epilog_bits { 345 unsigned prim_mode:3; 346 }; 347 348 struct si_gs_prolog_bits { 349 unsigned tri_strip_adj_fix:1; 350 }; 351 352 /* Common PS bits between the shader key and the prolog key. */ 353 struct si_ps_prolog_bits { 354 unsigned color_two_side:1; 355 unsigned flatshade_colors:1; 356 unsigned poly_stipple:1; 357 unsigned force_persp_sample_interp:1; 358 unsigned force_linear_sample_interp:1; 359 unsigned force_persp_center_interp:1; 360 unsigned force_linear_center_interp:1; 361 unsigned bc_optimize_for_persp:1; 362 unsigned bc_optimize_for_linear:1; 363 }; 364 365 /* Common PS bits between the shader key and the epilog key. */ 366 struct si_ps_epilog_bits { 367 unsigned spi_shader_col_format; 368 unsigned color_is_int8:8; 369 unsigned color_is_int10:8; 370 unsigned last_cbuf:3; 371 unsigned alpha_func:3; 372 unsigned alpha_to_one:1; 373 unsigned poly_line_smoothing:1; 374 unsigned clamp_color:1; 375 }; 376 377 union si_shader_part_key { 378 struct { 379 struct si_vs_prolog_bits states; 380 unsigned num_input_sgprs:5; 381 unsigned last_input:4; 382 } vs_prolog; 383 struct { 384 struct si_vs_epilog_bits states; 385 unsigned prim_id_param_offset:5; 386 } vs_epilog; 387 struct { 388 struct si_tcs_epilog_bits states; 389 } tcs_epilog; 390 struct { 391 struct si_gs_prolog_bits states; 392 } gs_prolog; 393 struct { 394 struct si_ps_prolog_bits states; 395 unsigned num_input_sgprs:5; 396 unsigned num_input_vgprs:5; 397 /* Color interpolation and two-side color selection. */ 398 unsigned colors_read:8; /* color input components read */ 399 unsigned num_interp_inputs:5; /* BCOLOR is at this location */ 400 unsigned face_vgpr_index:5; 401 unsigned wqm:1; 402 char color_attr_index[2]; 403 char color_interp_vgpr_index[2]; /* -1 == constant */ 404 } ps_prolog; 405 struct { 406 struct si_ps_epilog_bits states; 407 unsigned colors_written:8; 408 unsigned writes_z:1; 409 unsigned writes_stencil:1; 410 unsigned writes_samplemask:1; 411 } ps_epilog; 412 }; 413 414 struct si_shader_key { 415 /* Prolog and epilog flags. */ 416 union { 417 struct { 418 struct si_ps_prolog_bits prolog; 419 struct si_ps_epilog_bits epilog; 420 } ps; 421 struct { 422 struct si_vs_prolog_bits prolog; 423 struct si_vs_epilog_bits epilog; 424 } vs; 425 struct { 426 struct si_tcs_epilog_bits epilog; 427 } tcs; /* tessellation control shader */ 428 struct { 429 struct si_vs_epilog_bits epilog; /* same as VS */ 430 } tes; /* tessellation evaluation shader */ 431 struct { 432 struct si_gs_prolog_bits prolog; 433 } gs; 434 } part; 435 436 /* These two are initially set according to the NEXT_SHADER property, 437 * or guessed if the property doesn't seem correct. 438 */ 439 unsigned as_es:1; /* export shader */ 440 unsigned as_ls:1; /* local shader */ 441 442 /* Flags for monolithic compilation only. */ 443 union { 444 struct { 445 /* One nibble for every input: SI_FIX_FETCH_* enums. */ 446 uint64_t fix_fetch; 447 } vs; 448 struct { 449 uint64_t inputs_to_copy; /* for fixed-func TCS */ 450 } tcs; 451 } mono; 452 453 /* Optimization flags for asynchronous compilation only. */ 454 union { 455 struct { 456 uint64_t kill_outputs; /* "get_unique_index" bits */ 457 uint32_t kill_outputs2; /* "get_unique_index2" bits */ 458 unsigned clip_disable:1; 459 } hw_vs; /* HW VS (it can be VS, TES, GS) */ 460 } opt; 461 }; 462 463 struct si_shader_config { 464 unsigned num_sgprs; 465 unsigned num_vgprs; 466 unsigned spilled_sgprs; 467 unsigned spilled_vgprs; 468 unsigned private_mem_vgprs; 469 unsigned lds_size; 470 unsigned spi_ps_input_ena; 471 unsigned spi_ps_input_addr; 472 unsigned float_mode; 473 unsigned scratch_bytes_per_wave; 474 unsigned rsrc1; 475 unsigned rsrc2; 476 }; 477 478 enum { 479 /* SPI_PS_INPUT_CNTL_i.OFFSET[0:4] */ 480 EXP_PARAM_OFFSET_0 = 0, 481 EXP_PARAM_OFFSET_31 = 31, 482 /* SPI_PS_INPUT_CNTL_i.DEFAULT_VAL[0:1] */ 483 EXP_PARAM_DEFAULT_VAL_0000 = 64, 484 EXP_PARAM_DEFAULT_VAL_0001, 485 EXP_PARAM_DEFAULT_VAL_1110, 486 EXP_PARAM_DEFAULT_VAL_1111, 487 EXP_PARAM_UNDEFINED = 255, 488 }; 489 490 /* GCN-specific shader info. */ 491 struct si_shader_info { 492 ubyte vs_output_param_offset[SI_MAX_VS_OUTPUTS]; 493 ubyte num_input_sgprs; 494 ubyte num_input_vgprs; 495 char face_vgpr_index; 496 bool uses_instanceid; 497 ubyte nr_pos_exports; 498 ubyte nr_param_exports; 499 }; 500 501 struct si_shader { 502 struct si_compiler_ctx_state compiler_ctx_state; 503 504 struct si_shader_selector *selector; 505 struct si_shader *next_variant; 506 507 struct si_shader_part *prolog; 508 struct si_shader_part *epilog; 509 510 struct si_pm4_state *pm4; 511 struct r600_resource *bo; 512 struct r600_resource *scratch_bo; 513 struct si_shader_key key; 514 struct util_queue_fence optimized_ready; 515 bool compilation_failed; 516 bool is_monolithic; 517 bool is_optimized; 518 bool is_binary_shared; 519 bool is_gs_copy_shader; 520 521 /* The following data is all that's needed for binary shaders. */ 522 struct radeon_shader_binary binary; 523 struct si_shader_config config; 524 struct si_shader_info info; 525 526 /* Shader key + LLVM IR + disassembly + statistics. 527 * Generated for debug contexts only. 528 */ 529 char *shader_log; 530 size_t shader_log_size; 531 }; 532 533 struct si_shader_part { 534 struct si_shader_part *next; 535 union si_shader_part_key key; 536 struct radeon_shader_binary binary; 537 struct si_shader_config config; 538 }; 539 540 /* si_shader.c */ 541 struct si_shader * 542 si_generate_gs_copy_shader(struct si_screen *sscreen, 543 LLVMTargetMachineRef tm, 544 struct si_shader_selector *gs_selector, 545 struct pipe_debug_callback *debug); 546 int si_compile_tgsi_shader(struct si_screen *sscreen, 547 LLVMTargetMachineRef tm, 548 struct si_shader *shader, 549 bool is_monolithic, 550 struct pipe_debug_callback *debug); 551 int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm, 552 struct si_shader *shader, 553 struct pipe_debug_callback *debug); 554 int si_compile_llvm(struct si_screen *sscreen, 555 struct radeon_shader_binary *binary, 556 struct si_shader_config *conf, 557 LLVMTargetMachineRef tm, 558 LLVMModuleRef mod, 559 struct pipe_debug_callback *debug, 560 unsigned processor, 561 const char *name); 562 void si_shader_destroy(struct si_shader *shader); 563 unsigned si_shader_io_get_unique_index(unsigned semantic_name, unsigned index); 564 unsigned si_shader_io_get_unique_index2(unsigned name, unsigned index); 565 int si_shader_binary_upload(struct si_screen *sscreen, struct si_shader *shader); 566 void si_shader_dump(struct si_screen *sscreen, struct si_shader *shader, 567 struct pipe_debug_callback *debug, unsigned processor, 568 FILE *f, bool check_debug_option); 569 void si_multiwave_lds_size_workaround(struct si_screen *sscreen, 570 unsigned *lds_size); 571 void si_shader_apply_scratch_relocs(struct si_context *sctx, 572 struct si_shader *shader, 573 struct si_shader_config *config, 574 uint64_t scratch_va); 575 void si_shader_binary_read_config(struct radeon_shader_binary *binary, 576 struct si_shader_config *conf, 577 unsigned symbol_offset); 578 unsigned si_get_spi_shader_z_format(bool writes_z, bool writes_stencil, 579 bool writes_samplemask); 580 581 #endif 582