1 /*
2 * Copyright 2012 Advanced Micro Devices, Inc.
3 *
4 * SPDX-License-Identifier: MIT
5 */
6
7 #if LLVM_AVAILABLE
8 #include "ac_llvm_util.h"
9 #endif
10
11 #include "ac_nir.h"
12 #include "ac_shader_util.h"
13 #include "compiler/nir/nir_serialize.h"
14 #include "nir/tgsi_to_nir.h"
15 #include "si_build_pm4.h"
16 #include "sid.h"
17 #include "util/crc32.h"
18 #include "util/disk_cache.h"
19 #include "util/hash_table.h"
20 #include "util/mesa-sha1.h"
21 #include "util/u_async_debug.h"
22 #include "util/u_math.h"
23 #include "util/u_memory.h"
24 #include "util/u_prim.h"
25 #include "tgsi/tgsi_from_mesa.h"
26
27 static void si_update_tess_in_out_patch_vertices(struct si_context *sctx);
28
si_determine_wave_size(struct si_screen * sscreen,struct si_shader * shader)29 unsigned si_determine_wave_size(struct si_screen *sscreen, struct si_shader *shader)
30 {
31 /* There are a few uses that pass shader=NULL here, expecting the default compute wave size. */
32 struct si_shader_info *info = shader ? &shader->selector->info : NULL;
33 gl_shader_stage stage = shader ? shader->selector->stage : MESA_SHADER_COMPUTE;
34
35 if (sscreen->info.gfx_level < GFX10)
36 return 64;
37
38 /* Legacy GS only supports Wave64. */
39 if ((stage == MESA_SHADER_VERTEX && shader->key.ge.as_es && !shader->key.ge.as_ngg) ||
40 (stage == MESA_SHADER_TESS_EVAL && shader->key.ge.as_es && !shader->key.ge.as_ngg) ||
41 (stage == MESA_SHADER_GEOMETRY && !shader->key.ge.as_ngg))
42 return 64;
43
44 /* Workgroup sizes that are not divisible by 64 use Wave32. */
45 if (stage == MESA_SHADER_COMPUTE && info && !info->base.workgroup_size_variable &&
46 (info->base.workgroup_size[0] *
47 info->base.workgroup_size[1] *
48 info->base.workgroup_size[2]) % 64 != 0)
49 return 32;
50
51 /* AMD_DEBUG wave flags override everything else. */
52 if (sscreen->debug_flags &
53 (stage == MESA_SHADER_COMPUTE ? DBG(W32_CS) :
54 stage == MESA_SHADER_FRAGMENT ? DBG(W32_PS) : DBG(W32_GE)))
55 return 32;
56
57 if (sscreen->debug_flags &
58 (stage == MESA_SHADER_COMPUTE ? DBG(W64_CS) :
59 stage == MESA_SHADER_FRAGMENT ? DBG(W64_PS) : DBG(W64_GE)))
60 return 64;
61
62 /* Shader profiles. */
63 if (info && info->options & SI_PROFILE_WAVE32)
64 return 32;
65
66 if (info && info->options & SI_PROFILE_GFX10_WAVE64 &&
67 (sscreen->info.gfx_level == GFX10 || sscreen->info.gfx_level == GFX10_3))
68 return 64;
69
70 /* Gfx10: Pixel shaders without interp instructions don't suffer from reduced interpolation
71 * performance in Wave32, so use Wave32. This helps Piano and Voloplosion.
72 *
73 * Gfx11: Prefer Wave64 to take advantage of doubled VALU performance.
74 */
75 if (sscreen->info.gfx_level < GFX11 && stage == MESA_SHADER_FRAGMENT && !info->num_inputs)
76 return 32;
77
78 /* Gfx10: There are a few very rare cases where VS is better with Wave32, and there are no
79 * known cases where Wave64 is better.
80 *
81 * Wave32 is disabled for GFX10 when culling is active as a workaround for #6457. I don't
82 * know why this helps.
83 *
84 * Gfx11: Prefer Wave64 because it's slightly better than Wave32.
85 */
86 if (stage <= MESA_SHADER_GEOMETRY &&
87 (sscreen->info.gfx_level == GFX10 || sscreen->info.gfx_level == GFX10_3) &&
88 !(sscreen->info.gfx_level == GFX10 && shader && shader->key.ge.opt.ngg_culling))
89 return 32;
90
91 /* TODO: Merged shaders must use the same wave size because the driver doesn't recompile
92 * individual shaders of merged shaders to match the wave size between them.
93 */
94 bool merged_shader = stage <= MESA_SHADER_GEOMETRY && shader && !shader->is_gs_copy_shader &&
95 (shader->key.ge.as_ls || shader->key.ge.as_es ||
96 stage == MESA_SHADER_TESS_CTRL || stage == MESA_SHADER_GEOMETRY);
97
98 /* Divergent loops in Wave64 can end up having too many iterations in one half of the wave
99 * while the other half is idling but occupying VGPRs, preventing other waves from launching.
100 * Wave32 eliminates the idling half to allow the next wave to start.
101 *
102 * Gfx11: Wave32 continues to be faster with divergent loops despite worse VALU performance.
103 */
104 if (!merged_shader && info && info->has_divergent_loop)
105 return 32;
106
107 return 64;
108 }
109
110 /* SHADER_CACHE */
111
112 /**
113 * Return the IR key for the shader cache.
114 */
si_get_ir_cache_key(struct si_shader_selector * sel,bool ngg,bool es,unsigned wave_size,unsigned char ir_sha1_cache_key[20])115 void si_get_ir_cache_key(struct si_shader_selector *sel, bool ngg, bool es,
116 unsigned wave_size, unsigned char ir_sha1_cache_key[20])
117 {
118 struct blob blob = {};
119 unsigned ir_size;
120 void *ir_binary;
121
122 if (sel->nir_binary) {
123 ir_binary = sel->nir_binary;
124 ir_size = sel->nir_size;
125 } else {
126 assert(sel->nir);
127
128 blob_init(&blob);
129 /* Keep debug info if NIR debug prints are in use. */
130 nir_serialize(&blob, sel->nir, NIR_DEBUG(PRINT) == 0);
131 ir_binary = blob.data;
132 ir_size = blob.size;
133 }
134
135 /* These settings affect the compilation, but they are not derived
136 * from the input shader IR.
137 */
138 unsigned shader_variant_flags = 0;
139
140 if (ngg)
141 shader_variant_flags |= 1 << 0;
142 if (sel->nir)
143 shader_variant_flags |= 1 << 1;
144 if (wave_size == 32)
145 shader_variant_flags |= 1 << 2;
146
147 /* bit gap */
148
149 /* use_ngg_culling disables NGG passthrough for non-culling shaders to reduce context
150 * rolls, which can be changed with AMD_DEBUG=nonggc or AMD_DEBUG=nggc.
151 */
152 if (sel->screen->use_ngg_culling)
153 shader_variant_flags |= 1 << 4;
154 if (sel->screen->record_llvm_ir)
155 shader_variant_flags |= 1 << 5;
156 if (sel->screen->info.has_image_opcodes)
157 shader_variant_flags |= 1 << 6;
158 if (sel->screen->options.no_infinite_interp)
159 shader_variant_flags |= 1 << 7;
160 if (sel->screen->options.clamp_div_by_zero)
161 shader_variant_flags |= 1 << 8;
162 if ((sel->stage == MESA_SHADER_VERTEX ||
163 sel->stage == MESA_SHADER_TESS_EVAL ||
164 sel->stage == MESA_SHADER_GEOMETRY) &&
165 !es &&
166 sel->screen->options.vrs2x2)
167 shader_variant_flags |= 1 << 10;
168 if (sel->screen->options.inline_uniforms)
169 shader_variant_flags |= 1 << 11;
170 if (sel->screen->options.clear_lds)
171 shader_variant_flags |= 1 << 12;
172
173 struct mesa_sha1 ctx;
174 _mesa_sha1_init(&ctx);
175 _mesa_sha1_update(&ctx, &shader_variant_flags, 4);
176 _mesa_sha1_update(&ctx, ir_binary, ir_size);
177 _mesa_sha1_final(&ctx, ir_sha1_cache_key);
178
179 if (ir_binary == blob.data)
180 blob_finish(&blob);
181 }
182
183 /** Copy "data" to "ptr" and return the next dword following copied data. */
write_data(uint32_t * ptr,const void * data,unsigned size)184 static uint32_t *write_data(uint32_t *ptr, const void *data, unsigned size)
185 {
186 /* data may be NULL if size == 0 */
187 if (size)
188 memcpy(ptr, data, size);
189 ptr += DIV_ROUND_UP(size, 4);
190 return ptr;
191 }
192
193 /** Read data from "ptr". Return the next dword following the data. */
read_data(uint32_t * ptr,void * data,unsigned size)194 static uint32_t *read_data(uint32_t *ptr, void *data, unsigned size)
195 {
196 memcpy(data, ptr, size);
197 ptr += DIV_ROUND_UP(size, 4);
198 return ptr;
199 }
200
201 /**
202 * Write the size as uint followed by the data. Return the next dword
203 * following the copied data.
204 */
write_chunk(uint32_t * ptr,const void * data,unsigned size)205 static uint32_t *write_chunk(uint32_t *ptr, const void *data, unsigned size)
206 {
207 *ptr++ = size;
208 return write_data(ptr, data, size);
209 }
210
211 /**
212 * Read the size as uint followed by the data. Return both via parameters.
213 * Return the next dword following the data.
214 */
read_chunk(uint32_t * ptr,void ** data,unsigned * size)215 static uint32_t *read_chunk(uint32_t *ptr, void **data, unsigned *size)
216 {
217 *size = *ptr++;
218 assert(*data == NULL);
219 if (!*size)
220 return ptr;
221 *data = malloc(*size);
222 return read_data(ptr, *data, *size);
223 }
224
225 struct si_shader_blob_head {
226 uint32_t size;
227 uint32_t type;
228 uint32_t crc32;
229 };
230
231 /**
232 * Return the shader binary in a buffer.
233 */
si_get_shader_binary(struct si_shader * shader)234 static uint32_t *si_get_shader_binary(struct si_shader *shader)
235 {
236 /* There is always a size of data followed by the data itself. */
237 unsigned llvm_ir_size =
238 shader->binary.llvm_ir_string ? strlen(shader->binary.llvm_ir_string) + 1 : 0;
239
240 /* Refuse to allocate overly large buffers and guard against integer
241 * overflow. */
242 if (shader->binary.code_size > UINT_MAX / 4 || llvm_ir_size > UINT_MAX / 4 ||
243 shader->binary.num_symbols > UINT_MAX / 32)
244 return NULL;
245
246 unsigned size = sizeof(struct si_shader_blob_head) +
247 align(sizeof(shader->config), 4) +
248 align(sizeof(shader->info), 4) +
249 4 + 4 + align(shader->binary.code_size, 4) +
250 4 + shader->binary.num_symbols * 8 +
251 4 + align(llvm_ir_size, 4);
252 uint32_t *buffer = (uint32_t*)CALLOC(1, size);
253 if (!buffer)
254 return NULL;
255
256 struct si_shader_blob_head *head = (struct si_shader_blob_head *)buffer;
257 head->type = shader->binary.type;
258 head->size = size;
259
260 uint32_t *data = buffer + sizeof(*head) / 4;
261 uint32_t *ptr = data;
262
263 ptr = write_data(ptr, &shader->config, sizeof(shader->config));
264 ptr = write_data(ptr, &shader->info, sizeof(shader->info));
265 ptr = write_data(ptr, &shader->binary.exec_size, 4);
266 ptr = write_chunk(ptr, shader->binary.code_buffer, shader->binary.code_size);
267 ptr = write_chunk(ptr, shader->binary.symbols, shader->binary.num_symbols * 8);
268 ptr = write_chunk(ptr, shader->binary.llvm_ir_string, llvm_ir_size);
269 assert((char *)ptr - (char *)buffer == (ptrdiff_t)size);
270
271 /* Compute CRC32. */
272 head->crc32 = util_hash_crc32(data, size - sizeof(*head));
273
274 return buffer;
275 }
276
si_load_shader_binary(struct si_shader * shader,void * binary)277 static bool si_load_shader_binary(struct si_shader *shader, void *binary)
278 {
279 struct si_shader_blob_head *head = (struct si_shader_blob_head *)binary;
280 unsigned chunk_size;
281 unsigned code_size;
282
283 uint32_t *ptr = (uint32_t *)binary + sizeof(*head) / 4;
284 if (util_hash_crc32(ptr, head->size - sizeof(*head)) != head->crc32) {
285 fprintf(stderr, "radeonsi: binary shader has invalid CRC32\n");
286 return false;
287 }
288
289 shader->binary.type = (enum si_shader_binary_type)head->type;
290 ptr = read_data(ptr, &shader->config, sizeof(shader->config));
291 ptr = read_data(ptr, &shader->info, sizeof(shader->info));
292 ptr = read_data(ptr, &shader->binary.exec_size, 4);
293 ptr = read_chunk(ptr, (void **)&shader->binary.code_buffer, &code_size);
294 shader->binary.code_size = code_size;
295 ptr = read_chunk(ptr, (void **)&shader->binary.symbols, &chunk_size);
296 shader->binary.num_symbols = chunk_size / 8;
297 ptr = read_chunk(ptr, (void **)&shader->binary.llvm_ir_string, &chunk_size);
298
299 if (!shader->is_gs_copy_shader &&
300 shader->selector->stage == MESA_SHADER_GEOMETRY && !shader->key.ge.as_ngg) {
301 shader->gs_copy_shader = CALLOC_STRUCT(si_shader);
302 if (!shader->gs_copy_shader)
303 return false;
304
305 shader->gs_copy_shader->is_gs_copy_shader = true;
306
307 if (!si_load_shader_binary(shader->gs_copy_shader, (uint8_t*)binary + head->size)) {
308 FREE(shader->gs_copy_shader);
309 shader->gs_copy_shader = NULL;
310 return false;
311 }
312
313 util_queue_fence_init(&shader->gs_copy_shader->ready);
314 shader->gs_copy_shader->selector = shader->selector;
315 shader->gs_copy_shader->is_gs_copy_shader = true;
316 shader->gs_copy_shader->wave_size =
317 si_determine_wave_size(shader->selector->screen, shader->gs_copy_shader);
318
319 si_shader_binary_upload(shader->selector->screen, shader->gs_copy_shader, 0);
320 }
321
322 return true;
323 }
324
325 /**
326 * Insert a shader into the cache. It's assumed the shader is not in the cache.
327 * Use si_shader_cache_load_shader before calling this.
328 */
si_shader_cache_insert_shader(struct si_screen * sscreen,unsigned char ir_sha1_cache_key[20],struct si_shader * shader,bool insert_into_disk_cache)329 void si_shader_cache_insert_shader(struct si_screen *sscreen, unsigned char ir_sha1_cache_key[20],
330 struct si_shader *shader, bool insert_into_disk_cache)
331 {
332 uint32_t *hw_binary;
333 struct hash_entry *entry;
334 uint8_t key[CACHE_KEY_SIZE];
335 bool memory_cache_full = sscreen->shader_cache_size >= sscreen->shader_cache_max_size;
336
337 if (!insert_into_disk_cache && memory_cache_full)
338 return;
339
340 entry = _mesa_hash_table_search(sscreen->shader_cache, ir_sha1_cache_key);
341 if (entry)
342 return; /* already added */
343
344 hw_binary = si_get_shader_binary(shader);
345 if (!hw_binary)
346 return;
347
348 unsigned size = *hw_binary;
349
350 if (shader->selector->stage == MESA_SHADER_GEOMETRY && !shader->key.ge.as_ngg) {
351 uint32_t *gs_copy_binary = si_get_shader_binary(shader->gs_copy_shader);
352 if (!gs_copy_binary) {
353 FREE(hw_binary);
354 return;
355 }
356
357 /* Combine both binaries. */
358 size += *gs_copy_binary;
359 uint32_t *combined_binary = (uint32_t*)MALLOC(size);
360 if (!combined_binary) {
361 FREE(hw_binary);
362 FREE(gs_copy_binary);
363 return;
364 }
365
366 memcpy(combined_binary, hw_binary, *hw_binary);
367 memcpy(combined_binary + *hw_binary / 4, gs_copy_binary, *gs_copy_binary);
368 FREE(hw_binary);
369 FREE(gs_copy_binary);
370 hw_binary = combined_binary;
371 }
372
373 if (!memory_cache_full) {
374 if (_mesa_hash_table_insert(sscreen->shader_cache,
375 mem_dup(ir_sha1_cache_key, 20),
376 hw_binary) == NULL) {
377 FREE(hw_binary);
378 return;
379 }
380
381 sscreen->shader_cache_size += size;
382 }
383
384 if (sscreen->disk_shader_cache && insert_into_disk_cache) {
385 disk_cache_compute_key(sscreen->disk_shader_cache, ir_sha1_cache_key, 20, key);
386 disk_cache_put(sscreen->disk_shader_cache, key, hw_binary, size, NULL);
387 }
388
389 if (memory_cache_full)
390 FREE(hw_binary);
391 }
392
si_shader_cache_load_shader(struct si_screen * sscreen,unsigned char ir_sha1_cache_key[20],struct si_shader * shader)393 bool si_shader_cache_load_shader(struct si_screen *sscreen, unsigned char ir_sha1_cache_key[20],
394 struct si_shader *shader)
395 {
396 struct hash_entry *entry = _mesa_hash_table_search(sscreen->shader_cache, ir_sha1_cache_key);
397
398 if (entry) {
399 if (si_load_shader_binary(shader, entry->data)) {
400 p_atomic_inc(&sscreen->num_memory_shader_cache_hits);
401 return true;
402 }
403 }
404 p_atomic_inc(&sscreen->num_memory_shader_cache_misses);
405
406 if (!sscreen->disk_shader_cache)
407 return false;
408
409 unsigned char sha1[CACHE_KEY_SIZE];
410 disk_cache_compute_key(sscreen->disk_shader_cache, ir_sha1_cache_key, 20, sha1);
411
412 size_t total_size;
413 uint32_t *buffer = (uint32_t*)disk_cache_get(sscreen->disk_shader_cache, sha1, &total_size);
414 if (buffer) {
415 unsigned size = *buffer;
416 unsigned gs_copy_binary_size = 0;
417
418 /* The GS copy shader binary is after the GS binary. */
419 if (shader->selector->stage == MESA_SHADER_GEOMETRY && !shader->key.ge.as_ngg)
420 gs_copy_binary_size = buffer[size / 4];
421
422 if (total_size >= sizeof(uint32_t) && size + gs_copy_binary_size == total_size) {
423 if (si_load_shader_binary(shader, buffer)) {
424 free(buffer);
425 si_shader_cache_insert_shader(sscreen, ir_sha1_cache_key, shader, false);
426 p_atomic_inc(&sscreen->num_disk_shader_cache_hits);
427 return true;
428 }
429 } else {
430 /* Something has gone wrong discard the item from the cache and
431 * rebuild/link from source.
432 */
433 assert(!"Invalid radeonsi shader disk cache item!");
434 disk_cache_remove(sscreen->disk_shader_cache, sha1);
435 }
436 }
437
438 free(buffer);
439 p_atomic_inc(&sscreen->num_disk_shader_cache_misses);
440 return false;
441 }
442
si_shader_cache_key_hash(const void * key)443 static uint32_t si_shader_cache_key_hash(const void *key)
444 {
445 /* Take the first dword of SHA1. */
446 return *(uint32_t *)key;
447 }
448
si_shader_cache_key_equals(const void * a,const void * b)449 static bool si_shader_cache_key_equals(const void *a, const void *b)
450 {
451 /* Compare SHA1s. */
452 return memcmp(a, b, 20) == 0;
453 }
454
si_destroy_shader_cache_entry(struct hash_entry * entry)455 static void si_destroy_shader_cache_entry(struct hash_entry *entry)
456 {
457 FREE((void *)entry->key);
458 FREE(entry->data);
459 }
460
si_init_shader_cache(struct si_screen * sscreen)461 bool si_init_shader_cache(struct si_screen *sscreen)
462 {
463 (void)simple_mtx_init(&sscreen->shader_cache_mutex, mtx_plain);
464 sscreen->shader_cache =
465 _mesa_hash_table_create(NULL, si_shader_cache_key_hash, si_shader_cache_key_equals);
466 sscreen->shader_cache_size = 0;
467 /* Maximum size: 64MB on 32 bits, 1GB else */
468 sscreen->shader_cache_max_size = ((sizeof(void *) == 4) ? 64 : 1024) * 1024 * 1024;
469
470 return sscreen->shader_cache != NULL;
471 }
472
si_destroy_shader_cache(struct si_screen * sscreen)473 void si_destroy_shader_cache(struct si_screen *sscreen)
474 {
475 if (sscreen->shader_cache)
476 _mesa_hash_table_destroy(sscreen->shader_cache, si_destroy_shader_cache_entry);
477 simple_mtx_destroy(&sscreen->shader_cache_mutex);
478 }
479
480 /* SHADER STATES */
481
si_shader_encode_vgprs(struct si_shader * shader)482 unsigned si_shader_encode_vgprs(struct si_shader *shader)
483 {
484 assert(shader->selector->screen->info.gfx_level >= GFX10 || shader->wave_size == 64);
485 return shader->config.num_vgprs / (shader->wave_size == 32 ? 8 : 4) - 1;
486 }
487
si_shader_encode_sgprs(struct si_shader * shader)488 unsigned si_shader_encode_sgprs(struct si_shader *shader)
489 {
490 if (shader->selector->screen->info.gfx_level >= GFX10)
491 return 0; /* Gfx10+ don't have the SGPRS field and always allocate 128 SGPRs. */
492
493 return shader->config.num_sgprs / 8 - 1;
494 }
495
si_shader_mem_ordered(struct si_shader * shader)496 bool si_shader_mem_ordered(struct si_shader *shader)
497 {
498 if (shader->selector->screen->info.gfx_level < GFX10)
499 return false;
500
501 /* Return true if both types of VMEM that return something are used. */
502 return shader->info.uses_vmem_sampler_or_bvh &&
503 (shader->info.uses_vmem_load_other ||
504 shader->config.scratch_bytes_per_wave);
505 }
506
si_set_tesseval_regs(struct si_screen * sscreen,const struct si_shader_selector * tes,struct si_shader * shader)507 static void si_set_tesseval_regs(struct si_screen *sscreen, const struct si_shader_selector *tes,
508 struct si_shader *shader)
509 {
510 const struct si_shader_info *info = &tes->info;
511 enum tess_primitive_mode tes_prim_mode = info->base.tess._primitive_mode;
512 unsigned tes_spacing = info->base.tess.spacing;
513 bool tes_vertex_order_cw = !info->base.tess.ccw;
514 bool tes_point_mode = info->base.tess.point_mode;
515 unsigned type, partitioning, topology, distribution_mode;
516
517 switch (tes_prim_mode) {
518 case TESS_PRIMITIVE_ISOLINES:
519 type = V_028B6C_TESS_ISOLINE;
520 break;
521 case TESS_PRIMITIVE_TRIANGLES:
522 type = V_028B6C_TESS_TRIANGLE;
523 break;
524 case TESS_PRIMITIVE_QUADS:
525 type = V_028B6C_TESS_QUAD;
526 break;
527 default:
528 assert(0);
529 return;
530 }
531
532 switch (tes_spacing) {
533 case TESS_SPACING_FRACTIONAL_ODD:
534 partitioning = V_028B6C_PART_FRAC_ODD;
535 break;
536 case TESS_SPACING_FRACTIONAL_EVEN:
537 partitioning = V_028B6C_PART_FRAC_EVEN;
538 break;
539 case TESS_SPACING_EQUAL:
540 partitioning = V_028B6C_PART_INTEGER;
541 break;
542 default:
543 assert(0);
544 return;
545 }
546
547 if (tes_point_mode)
548 topology = V_028B6C_OUTPUT_POINT;
549 else if (tes_prim_mode == TESS_PRIMITIVE_ISOLINES)
550 topology = V_028B6C_OUTPUT_LINE;
551 else if (tes_vertex_order_cw)
552 /* for some reason, this must be the other way around */
553 topology = V_028B6C_OUTPUT_TRIANGLE_CCW;
554 else
555 topology = V_028B6C_OUTPUT_TRIANGLE_CW;
556
557 if (sscreen->info.has_distributed_tess) {
558 if (sscreen->info.family == CHIP_FIJI || sscreen->info.family >= CHIP_POLARIS10)
559 distribution_mode = V_028B6C_TRAPEZOIDS;
560 else
561 distribution_mode = V_028B6C_DONUTS;
562 } else
563 distribution_mode = V_028B6C_NO_DIST;
564
565 shader->vgt_tf_param = S_028B6C_TYPE(type) | S_028B6C_PARTITIONING(partitioning) |
566 S_028B6C_TOPOLOGY(topology) |
567 S_028B6C_DISTRIBUTION_MODE(distribution_mode);
568 }
569
570 /* Polaris needs different VTX_REUSE_DEPTH settings depending on
571 * whether the "fractional odd" tessellation spacing is used.
572 *
573 * Possible VGT configurations and which state should set the register:
574 *
575 * Reg set in | VGT shader configuration | Value
576 * ------------------------------------------------------
577 * VS as VS | VS | 30
578 * VS as ES | ES -> GS -> VS | 30
579 * TES as VS | LS -> HS -> VS | 14 or 30
580 * TES as ES | LS -> HS -> ES -> GS -> VS | 14 or 30
581 */
polaris_set_vgt_vertex_reuse(struct si_screen * sscreen,struct si_shader_selector * sel,struct si_shader * shader)582 static void polaris_set_vgt_vertex_reuse(struct si_screen *sscreen, struct si_shader_selector *sel,
583 struct si_shader *shader)
584 {
585 if (sscreen->info.family < CHIP_POLARIS10 || sscreen->info.gfx_level >= GFX10)
586 return;
587
588 /* VS as VS, or VS as ES: */
589 if ((sel->stage == MESA_SHADER_VERTEX &&
590 (!shader->key.ge.as_ls && !shader->is_gs_copy_shader)) ||
591 /* TES as VS, or TES as ES: */
592 sel->stage == MESA_SHADER_TESS_EVAL) {
593 unsigned vtx_reuse_depth = 30;
594
595 if (sel->stage == MESA_SHADER_TESS_EVAL &&
596 sel->info.base.tess.spacing == TESS_SPACING_FRACTIONAL_ODD)
597 vtx_reuse_depth = 14;
598
599 shader->vgt_vertex_reuse_block_cntl = vtx_reuse_depth;
600 }
601 }
602
603 static struct si_pm4_state *
si_get_shader_pm4_state(struct si_shader * shader,void (* emit_func)(struct si_context * ctx,unsigned index))604 si_get_shader_pm4_state(struct si_shader *shader,
605 void (*emit_func)(struct si_context *ctx, unsigned index))
606 {
607 si_pm4_clear_state(&shader->pm4, shader->selector->screen, false);
608 shader->pm4.atom.emit = emit_func;
609 return &shader->pm4;
610 }
611
si_get_num_vs_user_sgprs(struct si_shader * shader,unsigned num_always_on_user_sgprs)612 static unsigned si_get_num_vs_user_sgprs(struct si_shader *shader,
613 unsigned num_always_on_user_sgprs)
614 {
615 struct si_shader_selector *vs =
616 shader->previous_stage_sel ? shader->previous_stage_sel : shader->selector;
617 unsigned num_vbos_in_user_sgprs = vs->info.num_vbos_in_user_sgprs;
618
619 /* 1 SGPR is reserved for the vertex buffer pointer. */
620 assert(num_always_on_user_sgprs <= SI_SGPR_VS_VB_DESCRIPTOR_FIRST - 1);
621
622 if (num_vbos_in_user_sgprs)
623 return SI_SGPR_VS_VB_DESCRIPTOR_FIRST + num_vbos_in_user_sgprs * 4;
624
625 /* Add the pointer to VBO descriptors. */
626 return num_always_on_user_sgprs + 1;
627 }
628
629 /* Return VGPR_COMP_CNT for the API vertex shader. This can be hw LS, LSHS, ES, ESGS, VS. */
si_get_vs_vgpr_comp_cnt(struct si_screen * sscreen,struct si_shader * shader,bool legacy_vs_prim_id)630 static unsigned si_get_vs_vgpr_comp_cnt(struct si_screen *sscreen, struct si_shader *shader,
631 bool legacy_vs_prim_id)
632 {
633 assert(shader->selector->stage == MESA_SHADER_VERTEX ||
634 (shader->previous_stage_sel && shader->previous_stage_sel->stage == MESA_SHADER_VERTEX));
635
636 /* GFX6-9 LS (VertexID, RelAutoIndex, InstanceID / StepRate0, InstanceID)
637 * GFX6-9 ES,VS (VertexID, InstanceID / StepRate0, VSPrimID, InstanceID)
638 * GFX10-11 LS (VertexID, RelAutoIndex, UserVGPR1, UserVGPR2 or InstanceID)
639 * GFX10-11 ES,VS (VertexID, UserVGPR1, UserVGPR2 or VSPrimID, UserVGPR3 or InstanceID)
640 */
641 bool is_ls = shader->selector->stage == MESA_SHADER_TESS_CTRL || shader->key.ge.as_ls;
642 unsigned max = 0;
643
644 if (shader->info.uses_instanceid) {
645 if (sscreen->info.gfx_level >= GFX10)
646 max = MAX2(max, 3);
647 else if (is_ls)
648 max = MAX2(max, 2); /* use (InstanceID / StepRate0) because StepRate0 == 1 */
649 else
650 max = MAX2(max, 1); /* use (InstanceID / StepRate0) because StepRate0 == 1 */
651 }
652
653 if (legacy_vs_prim_id)
654 max = MAX2(max, 2); /* VSPrimID */
655
656 /* GFX11: We prefer to compute RelAutoIndex using (WaveID * WaveSize + ThreadID).
657 * Older chips didn't have WaveID in LS.
658 */
659 if (is_ls && sscreen->info.gfx_level <= GFX10_3)
660 max = MAX2(max, 1); /* RelAutoIndex */
661
662 return max;
663 }
664
si_get_shader_prefetch_size(struct si_shader * shader)665 unsigned si_get_shader_prefetch_size(struct si_shader *shader)
666 {
667 /* inst_pref_size is calculated in cache line size granularity */
668 assert(!(shader->bo->b.b.width0 & 0x7f));
669 return MIN2(shader->bo->b.b.width0, 8064) / 128;
670 }
671
si_shader_ls(struct si_screen * sscreen,struct si_shader * shader)672 static void si_shader_ls(struct si_screen *sscreen, struct si_shader *shader)
673 {
674 struct si_pm4_state *pm4;
675 uint64_t va;
676
677 assert(sscreen->info.gfx_level <= GFX8);
678
679 pm4 = si_get_shader_pm4_state(shader, NULL);
680 if (!pm4)
681 return;
682
683 va = shader->bo->gpu_address;
684 si_pm4_set_reg(pm4, R_00B520_SPI_SHADER_PGM_LO_LS, va >> 8);
685
686 shader->config.rsrc1 = S_00B528_VGPRS(si_shader_encode_vgprs(shader)) |
687 S_00B528_SGPRS(si_shader_encode_sgprs(shader)) |
688 S_00B528_VGPR_COMP_CNT(si_get_vs_vgpr_comp_cnt(sscreen, shader, false)) |
689 S_00B528_DX10_CLAMP(1) |
690 S_00B528_FLOAT_MODE(shader->config.float_mode);
691 shader->config.rsrc2 = S_00B52C_USER_SGPR(si_get_num_vs_user_sgprs(shader, SI_VS_NUM_USER_SGPR)) |
692 S_00B52C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0);
693 si_pm4_finalize(pm4);
694 }
695
si_shader_hs(struct si_screen * sscreen,struct si_shader * shader)696 static void si_shader_hs(struct si_screen *sscreen, struct si_shader *shader)
697 {
698 struct si_pm4_state *pm4 = si_get_shader_pm4_state(shader, NULL);
699 if (!pm4)
700 return;
701
702 uint64_t va = shader->bo->gpu_address;
703 unsigned num_user_sgprs = sscreen->info.gfx_level >= GFX9 ?
704 si_get_num_vs_user_sgprs(shader, GFX9_TCS_NUM_USER_SGPR) :
705 GFX6_TCS_NUM_USER_SGPR;
706
707 if (sscreen->info.gfx_level >= GFX11) {
708 si_pm4_set_reg_idx3(pm4, R_00B404_SPI_SHADER_PGM_RSRC4_HS,
709 ac_apply_cu_en(S_00B404_INST_PREF_SIZE(si_get_shader_prefetch_size(shader)) |
710 S_00B404_CU_EN(0xffff),
711 C_00B404_CU_EN, 16, &sscreen->info));
712
713 si_pm4_set_reg(pm4, R_00B520_SPI_SHADER_PGM_LO_LS, va >> 8);
714 } else if (sscreen->info.gfx_level >= GFX10) {
715 si_pm4_set_reg(pm4, R_00B520_SPI_SHADER_PGM_LO_LS, va >> 8);
716 } else if (sscreen->info.gfx_level >= GFX9) {
717 si_pm4_set_reg(pm4, R_00B410_SPI_SHADER_PGM_LO_LS, va >> 8);
718 } else {
719 si_pm4_set_reg(pm4, R_00B420_SPI_SHADER_PGM_LO_HS, va >> 8);
720 si_pm4_set_reg(pm4, R_00B424_SPI_SHADER_PGM_HI_HS,
721 S_00B424_MEM_BASE(sscreen->info.address32_hi >> 8));
722 }
723
724 si_pm4_set_reg(pm4, R_00B428_SPI_SHADER_PGM_RSRC1_HS,
725 S_00B428_VGPRS(si_shader_encode_vgprs(shader)) |
726 S_00B428_SGPRS(si_shader_encode_sgprs(shader)) |
727 S_00B428_DX10_CLAMP(1) |
728 S_00B428_MEM_ORDERED(si_shader_mem_ordered(shader)) |
729 S_00B428_FLOAT_MODE(shader->config.float_mode) |
730 S_00B428_LS_VGPR_COMP_CNT(sscreen->info.gfx_level >= GFX9 ?
731 si_get_vs_vgpr_comp_cnt(sscreen, shader, false) : 0));
732
733 shader->config.rsrc2 = S_00B42C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0) |
734 S_00B42C_USER_SGPR(num_user_sgprs);
735
736 if (sscreen->info.gfx_level >= GFX10)
737 shader->config.rsrc2 |= S_00B42C_USER_SGPR_MSB_GFX10(num_user_sgprs >> 5);
738 else if (sscreen->info.gfx_level >= GFX9)
739 shader->config.rsrc2 |= S_00B42C_USER_SGPR_MSB_GFX9(num_user_sgprs >> 5);
740 else
741 shader->config.rsrc2 |= S_00B42C_OC_LDS_EN(1);
742
743 if (sscreen->info.gfx_level <= GFX8)
744 si_pm4_set_reg(pm4, R_00B42C_SPI_SHADER_PGM_RSRC2_HS, shader->config.rsrc2);
745
746 si_pm4_finalize(pm4);
747 }
748
si_emit_shader_es(struct si_context * sctx,unsigned index)749 static void si_emit_shader_es(struct si_context *sctx, unsigned index)
750 {
751 struct si_shader *shader = sctx->queued.named.es;
752
753 radeon_begin(&sctx->gfx_cs);
754 radeon_opt_set_context_reg(sctx, R_028AAC_VGT_ESGS_RING_ITEMSIZE,
755 SI_TRACKED_VGT_ESGS_RING_ITEMSIZE,
756 shader->selector->info.esgs_vertex_stride / 4);
757
758 if (shader->selector->stage == MESA_SHADER_TESS_EVAL)
759 radeon_opt_set_context_reg(sctx, R_028B6C_VGT_TF_PARAM, SI_TRACKED_VGT_TF_PARAM,
760 shader->vgt_tf_param);
761
762 if (shader->vgt_vertex_reuse_block_cntl)
763 radeon_opt_set_context_reg(sctx, R_028C58_VGT_VERTEX_REUSE_BLOCK_CNTL,
764 SI_TRACKED_VGT_VERTEX_REUSE_BLOCK_CNTL,
765 shader->vgt_vertex_reuse_block_cntl);
766 radeon_end_update_context_roll(sctx);
767 }
768
si_shader_es(struct si_screen * sscreen,struct si_shader * shader)769 static void si_shader_es(struct si_screen *sscreen, struct si_shader *shader)
770 {
771 struct si_pm4_state *pm4;
772 unsigned num_user_sgprs;
773 unsigned vgpr_comp_cnt;
774 uint64_t va;
775 unsigned oc_lds_en;
776
777 assert(sscreen->info.gfx_level <= GFX8);
778
779 pm4 = si_get_shader_pm4_state(shader, si_emit_shader_es);
780 if (!pm4)
781 return;
782
783 va = shader->bo->gpu_address;
784
785 if (shader->selector->stage == MESA_SHADER_VERTEX) {
786 vgpr_comp_cnt = si_get_vs_vgpr_comp_cnt(sscreen, shader, false);
787 num_user_sgprs = si_get_num_vs_user_sgprs(shader, SI_VS_NUM_USER_SGPR);
788 } else if (shader->selector->stage == MESA_SHADER_TESS_EVAL) {
789 vgpr_comp_cnt = shader->selector->info.uses_primid ? 3 : 2;
790 num_user_sgprs = SI_TES_NUM_USER_SGPR;
791 } else
792 unreachable("invalid shader selector type");
793
794 oc_lds_en = shader->selector->stage == MESA_SHADER_TESS_EVAL ? 1 : 0;
795
796 si_pm4_set_reg(pm4, R_00B320_SPI_SHADER_PGM_LO_ES, va >> 8);
797 si_pm4_set_reg(pm4, R_00B324_SPI_SHADER_PGM_HI_ES,
798 S_00B324_MEM_BASE(sscreen->info.address32_hi >> 8));
799 si_pm4_set_reg(pm4, R_00B328_SPI_SHADER_PGM_RSRC1_ES,
800 S_00B328_VGPRS(si_shader_encode_vgprs(shader)) |
801 S_00B328_SGPRS(si_shader_encode_sgprs(shader)) |
802 S_00B328_VGPR_COMP_CNT(vgpr_comp_cnt) |
803 S_00B328_DX10_CLAMP(1) |
804 S_00B328_FLOAT_MODE(shader->config.float_mode));
805 si_pm4_set_reg(pm4, R_00B32C_SPI_SHADER_PGM_RSRC2_ES,
806 S_00B32C_USER_SGPR(num_user_sgprs) | S_00B32C_OC_LDS_EN(oc_lds_en) |
807 S_00B32C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0));
808
809 if (shader->selector->stage == MESA_SHADER_TESS_EVAL)
810 si_set_tesseval_regs(sscreen, shader->selector, shader);
811
812 polaris_set_vgt_vertex_reuse(sscreen, shader->selector, shader);
813 si_pm4_finalize(pm4);
814 }
815
gfx9_get_gs_info(struct si_shader_selector * es,struct si_shader_selector * gs,struct gfx9_gs_info * out)816 void gfx9_get_gs_info(struct si_shader_selector *es, struct si_shader_selector *gs,
817 struct gfx9_gs_info *out)
818 {
819 unsigned gs_num_invocations = MAX2(gs->info.base.gs.invocations, 1);
820 unsigned input_prim = gs->info.base.gs.input_primitive;
821 bool uses_adjacency =
822 input_prim >= MESA_PRIM_LINES_ADJACENCY && input_prim <= MESA_PRIM_TRIANGLE_STRIP_ADJACENCY;
823
824 /* All these are in dwords: */
825 /* We can't allow using the whole LDS, because GS waves compete with
826 * other shader stages for LDS space. */
827 const unsigned max_lds_size = 8 * 1024;
828 const unsigned esgs_itemsize = es->info.esgs_vertex_stride / 4;
829 unsigned esgs_lds_size;
830
831 /* All these are per subgroup: */
832 const unsigned max_out_prims = 32 * 1024;
833 const unsigned max_es_verts = 255;
834 const unsigned ideal_gs_prims = 64;
835 unsigned max_gs_prims, gs_prims;
836 unsigned min_es_verts, es_verts, worst_case_es_verts;
837
838 if (uses_adjacency || gs_num_invocations > 1)
839 max_gs_prims = 127 / gs_num_invocations;
840 else
841 max_gs_prims = 255;
842
843 /* MAX_PRIMS_PER_SUBGROUP = gs_prims * max_vert_out * gs_invocations.
844 * Make sure we don't go over the maximum value.
845 */
846 if (gs->info.base.gs.vertices_out > 0) {
847 max_gs_prims =
848 MIN2(max_gs_prims, max_out_prims / (gs->info.base.gs.vertices_out * gs_num_invocations));
849 }
850 assert(max_gs_prims > 0);
851
852 /* If the primitive has adjacency, halve the number of vertices
853 * that will be reused in multiple primitives.
854 */
855 min_es_verts = gs->info.gs_input_verts_per_prim / (uses_adjacency ? 2 : 1);
856
857 gs_prims = MIN2(ideal_gs_prims, max_gs_prims);
858 worst_case_es_verts = MIN2(min_es_verts * gs_prims, max_es_verts);
859
860 /* Compute ESGS LDS size based on the worst case number of ES vertices
861 * needed to create the target number of GS prims per subgroup.
862 */
863 esgs_lds_size = esgs_itemsize * worst_case_es_verts;
864
865 /* If total LDS usage is too big, refactor partitions based on ratio
866 * of ESGS item sizes.
867 */
868 if (esgs_lds_size > max_lds_size) {
869 /* Our target GS Prims Per Subgroup was too large. Calculate
870 * the maximum number of GS Prims Per Subgroup that will fit
871 * into LDS, capped by the maximum that the hardware can support.
872 */
873 gs_prims = MIN2((max_lds_size / (esgs_itemsize * min_es_verts)), max_gs_prims);
874 assert(gs_prims > 0);
875 worst_case_es_verts = MIN2(min_es_verts * gs_prims, max_es_verts);
876
877 esgs_lds_size = esgs_itemsize * worst_case_es_verts;
878 assert(esgs_lds_size <= max_lds_size);
879 }
880
881 /* Now calculate remaining ESGS information. */
882 if (esgs_lds_size)
883 es_verts = MIN2(esgs_lds_size / esgs_itemsize, max_es_verts);
884 else
885 es_verts = max_es_verts;
886
887 /* Vertices for adjacency primitives are not always reused, so restore
888 * it for ES_VERTS_PER_SUBGRP.
889 */
890 min_es_verts = gs->info.gs_input_verts_per_prim;
891
892 /* For normal primitives, the VGT only checks if they are past the ES
893 * verts per subgroup after allocating a full GS primitive and if they
894 * are, kick off a new subgroup. But if those additional ES verts are
895 * unique (e.g. not reused) we need to make sure there is enough LDS
896 * space to account for those ES verts beyond ES_VERTS_PER_SUBGRP.
897 */
898 es_verts -= min_es_verts - 1;
899
900 out->es_verts_per_subgroup = es_verts;
901 out->gs_prims_per_subgroup = gs_prims;
902 out->gs_inst_prims_in_subgroup = gs_prims * gs_num_invocations;
903 out->max_prims_per_subgroup = out->gs_inst_prims_in_subgroup * gs->info.base.gs.vertices_out;
904 out->esgs_ring_size = esgs_lds_size;
905
906 assert(out->max_prims_per_subgroup <= max_out_prims);
907 }
908
gfx9_set_gs_sgpr_num_es_outputs(struct si_context * sctx,unsigned esgs_vertex_stride)909 static void gfx9_set_gs_sgpr_num_es_outputs(struct si_context *sctx, unsigned esgs_vertex_stride)
910 {
911 /* The stride must always be odd (e.g. a multiple of 4 + 1) to reduce LDS bank conflicts. */
912 assert(esgs_vertex_stride % 4 == 1);
913 unsigned num_es_outputs = (esgs_vertex_stride - 1) / 4;
914
915 /* If there are no ES outputs, GS doesn't use this SGPR field, so only set it if the number
916 * is non-zero.
917 */
918 if (num_es_outputs)
919 SET_FIELD(sctx->current_gs_state, GS_STATE_NUM_ES_OUTPUTS, num_es_outputs);
920 }
921
si_emit_shader_gs(struct si_context * sctx,unsigned index)922 static void si_emit_shader_gs(struct si_context *sctx, unsigned index)
923 {
924 struct si_shader *shader = sctx->queued.named.gs;
925
926 if (sctx->gfx_level >= GFX9)
927 gfx9_set_gs_sgpr_num_es_outputs(sctx, shader->key.ge.part.gs.es->info.esgs_vertex_stride / 4);
928
929 radeon_begin(&sctx->gfx_cs);
930
931 /* R_028A60_VGT_GSVS_RING_OFFSET_1, R_028A64_VGT_GSVS_RING_OFFSET_2
932 * R_028A68_VGT_GSVS_RING_OFFSET_3 */
933 radeon_opt_set_context_reg3(
934 sctx, R_028A60_VGT_GSVS_RING_OFFSET_1, SI_TRACKED_VGT_GSVS_RING_OFFSET_1,
935 shader->gs.vgt_gsvs_ring_offset_1, shader->gs.vgt_gsvs_ring_offset_2,
936 shader->gs.vgt_gsvs_ring_offset_3);
937
938 /* R_028AB0_VGT_GSVS_RING_ITEMSIZE */
939 radeon_opt_set_context_reg(sctx, R_028AB0_VGT_GSVS_RING_ITEMSIZE,
940 SI_TRACKED_VGT_GSVS_RING_ITEMSIZE,
941 shader->gs.vgt_gsvs_ring_itemsize);
942
943 /* R_028B38_VGT_GS_MAX_VERT_OUT */
944 radeon_opt_set_context_reg(sctx, R_028B38_VGT_GS_MAX_VERT_OUT, SI_TRACKED_VGT_GS_MAX_VERT_OUT,
945 shader->gs.vgt_gs_max_vert_out);
946
947 /* R_028B5C_VGT_GS_VERT_ITEMSIZE, R_028B60_VGT_GS_VERT_ITEMSIZE_1
948 * R_028B64_VGT_GS_VERT_ITEMSIZE_2, R_028B68_VGT_GS_VERT_ITEMSIZE_3 */
949 radeon_opt_set_context_reg4(
950 sctx, R_028B5C_VGT_GS_VERT_ITEMSIZE, SI_TRACKED_VGT_GS_VERT_ITEMSIZE,
951 shader->gs.vgt_gs_vert_itemsize, shader->gs.vgt_gs_vert_itemsize_1,
952 shader->gs.vgt_gs_vert_itemsize_2, shader->gs.vgt_gs_vert_itemsize_3);
953
954 /* R_028B90_VGT_GS_INSTANCE_CNT */
955 radeon_opt_set_context_reg(sctx, R_028B90_VGT_GS_INSTANCE_CNT, SI_TRACKED_VGT_GS_INSTANCE_CNT,
956 shader->gs.vgt_gs_instance_cnt);
957
958 if (sctx->gfx_level >= GFX9) {
959 /* R_028A44_VGT_GS_ONCHIP_CNTL */
960 radeon_opt_set_context_reg(sctx, R_028A44_VGT_GS_ONCHIP_CNTL, SI_TRACKED_VGT_GS_ONCHIP_CNTL,
961 shader->gs.vgt_gs_onchip_cntl);
962 /* R_028A94_VGT_GS_MAX_PRIMS_PER_SUBGROUP */
963 if (sctx->gfx_level == GFX9) {
964 radeon_opt_set_context_reg(sctx, R_028A94_VGT_GS_MAX_PRIMS_PER_SUBGROUP,
965 SI_TRACKED_VGT_GS_MAX_PRIMS_PER_SUBGROUP,
966 shader->gs.vgt_gs_max_prims_per_subgroup);
967 }
968
969 if (shader->key.ge.part.gs.es->stage == MESA_SHADER_TESS_EVAL)
970 radeon_opt_set_context_reg(sctx, R_028B6C_VGT_TF_PARAM, SI_TRACKED_VGT_TF_PARAM,
971 shader->vgt_tf_param);
972 if (shader->vgt_vertex_reuse_block_cntl)
973 radeon_opt_set_context_reg(sctx, R_028C58_VGT_VERTEX_REUSE_BLOCK_CNTL,
974 SI_TRACKED_VGT_VERTEX_REUSE_BLOCK_CNTL,
975 shader->vgt_vertex_reuse_block_cntl);
976 }
977 radeon_end_update_context_roll(sctx);
978
979 /* These don't cause any context rolls. */
980 radeon_begin_again(&sctx->gfx_cs);
981 if (sctx->gfx_level >= GFX7) {
982 if (sctx->screen->info.uses_kernel_cu_mask) {
983 radeon_opt_set_sh_reg_idx(sctx, R_00B21C_SPI_SHADER_PGM_RSRC3_GS,
984 SI_TRACKED_SPI_SHADER_PGM_RSRC3_GS,
985 3, shader->gs.spi_shader_pgm_rsrc3_gs);
986 } else {
987 radeon_opt_set_sh_reg(sctx, R_00B21C_SPI_SHADER_PGM_RSRC3_GS,
988 SI_TRACKED_SPI_SHADER_PGM_RSRC3_GS,
989 shader->gs.spi_shader_pgm_rsrc3_gs);
990 }
991 }
992 if (sctx->gfx_level >= GFX10) {
993 if (sctx->screen->info.uses_kernel_cu_mask) {
994 radeon_opt_set_sh_reg_idx(sctx, R_00B204_SPI_SHADER_PGM_RSRC4_GS,
995 SI_TRACKED_SPI_SHADER_PGM_RSRC4_GS,
996 3, shader->gs.spi_shader_pgm_rsrc4_gs);
997 } else {
998 radeon_opt_set_sh_reg(sctx, R_00B204_SPI_SHADER_PGM_RSRC4_GS,
999 SI_TRACKED_SPI_SHADER_PGM_RSRC4_GS,
1000 shader->gs.spi_shader_pgm_rsrc4_gs);
1001 }
1002 }
1003 radeon_end();
1004 }
1005
si_shader_gs(struct si_screen * sscreen,struct si_shader * shader)1006 static void si_shader_gs(struct si_screen *sscreen, struct si_shader *shader)
1007 {
1008 struct si_shader_selector *sel = shader->selector;
1009 const uint8_t *num_components = sel->info.num_stream_output_components;
1010 unsigned gs_num_invocations = sel->info.base.gs.invocations;
1011 struct si_pm4_state *pm4;
1012 uint64_t va;
1013 unsigned max_stream = util_last_bit(sel->info.base.gs.active_stream_mask);
1014 unsigned offset;
1015
1016 assert(sscreen->info.gfx_level < GFX11); /* gfx11 doesn't have the legacy pipeline */
1017
1018 pm4 = si_get_shader_pm4_state(shader, si_emit_shader_gs);
1019 if (!pm4)
1020 return;
1021
1022 offset = num_components[0] * sel->info.base.gs.vertices_out;
1023 shader->gs.vgt_gsvs_ring_offset_1 = offset;
1024
1025 if (max_stream >= 2)
1026 offset += num_components[1] * sel->info.base.gs.vertices_out;
1027 shader->gs.vgt_gsvs_ring_offset_2 = offset;
1028
1029 if (max_stream >= 3)
1030 offset += num_components[2] * sel->info.base.gs.vertices_out;
1031 shader->gs.vgt_gsvs_ring_offset_3 = offset;
1032
1033 if (max_stream >= 4)
1034 offset += num_components[3] * sel->info.base.gs.vertices_out;
1035 shader->gs.vgt_gsvs_ring_itemsize = offset;
1036
1037 /* The GSVS_RING_ITEMSIZE register takes 15 bits */
1038 assert(offset < (1 << 15));
1039
1040 shader->gs.vgt_gs_max_vert_out = sel->info.base.gs.vertices_out;
1041
1042 shader->gs.vgt_gs_vert_itemsize = num_components[0];
1043 shader->gs.vgt_gs_vert_itemsize_1 = (max_stream >= 2) ? num_components[1] : 0;
1044 shader->gs.vgt_gs_vert_itemsize_2 = (max_stream >= 3) ? num_components[2] : 0;
1045 shader->gs.vgt_gs_vert_itemsize_3 = (max_stream >= 4) ? num_components[3] : 0;
1046
1047 shader->gs.vgt_gs_instance_cnt =
1048 S_028B90_CNT(MIN2(gs_num_invocations, 127)) | S_028B90_ENABLE(gs_num_invocations > 0);
1049
1050 /* Copy over fields from the GS copy shader to make them easily accessible from GS. */
1051 shader->pa_cl_vs_out_cntl = shader->gs_copy_shader->pa_cl_vs_out_cntl;
1052
1053 va = shader->bo->gpu_address;
1054
1055 if (sscreen->info.gfx_level >= GFX9) {
1056 unsigned input_prim = sel->info.base.gs.input_primitive;
1057 gl_shader_stage es_stage = shader->key.ge.part.gs.es->stage;
1058 unsigned es_vgpr_comp_cnt, gs_vgpr_comp_cnt;
1059
1060 if (es_stage == MESA_SHADER_VERTEX) {
1061 es_vgpr_comp_cnt = si_get_vs_vgpr_comp_cnt(sscreen, shader, false);
1062 } else if (es_stage == MESA_SHADER_TESS_EVAL)
1063 es_vgpr_comp_cnt = shader->key.ge.part.gs.es->info.uses_primid ? 3 : 2;
1064 else
1065 unreachable("invalid shader selector type");
1066
1067 /* If offsets 4, 5 are used, GS_VGPR_COMP_CNT is ignored and
1068 * VGPR[0:4] are always loaded.
1069 */
1070 if (sel->info.uses_invocationid)
1071 gs_vgpr_comp_cnt = 3; /* VGPR3 contains InvocationID. */
1072 else if (sel->info.uses_primid)
1073 gs_vgpr_comp_cnt = 2; /* VGPR2 contains PrimitiveID. */
1074 else if (input_prim >= MESA_PRIM_TRIANGLES)
1075 gs_vgpr_comp_cnt = 1; /* VGPR1 contains offsets 2, 3 */
1076 else
1077 gs_vgpr_comp_cnt = 0; /* VGPR0 contains offsets 0, 1 */
1078
1079 unsigned num_user_sgprs;
1080 if (es_stage == MESA_SHADER_VERTEX)
1081 num_user_sgprs = si_get_num_vs_user_sgprs(shader, GFX9_GS_NUM_USER_SGPR);
1082 else
1083 num_user_sgprs = GFX9_GS_NUM_USER_SGPR;
1084
1085 if (sscreen->info.gfx_level >= GFX10) {
1086 si_pm4_set_reg(pm4, R_00B320_SPI_SHADER_PGM_LO_ES, va >> 8);
1087 } else {
1088 si_pm4_set_reg(pm4, R_00B210_SPI_SHADER_PGM_LO_ES, va >> 8);
1089 }
1090
1091 uint32_t rsrc1 = S_00B228_VGPRS(si_shader_encode_vgprs(shader)) |
1092 S_00B228_SGPRS(si_shader_encode_sgprs(shader)) |
1093 S_00B228_DX10_CLAMP(1) |
1094 S_00B228_MEM_ORDERED(si_shader_mem_ordered(shader)) |
1095 S_00B228_FLOAT_MODE(shader->config.float_mode) |
1096 S_00B228_GS_VGPR_COMP_CNT(gs_vgpr_comp_cnt);
1097 uint32_t rsrc2 = S_00B22C_USER_SGPR(num_user_sgprs) |
1098 S_00B22C_ES_VGPR_COMP_CNT(es_vgpr_comp_cnt) |
1099 S_00B22C_OC_LDS_EN(es_stage == MESA_SHADER_TESS_EVAL) |
1100 S_00B22C_LDS_SIZE(shader->config.lds_size) |
1101 S_00B22C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0);
1102
1103 if (sscreen->info.gfx_level >= GFX10) {
1104 rsrc2 |= S_00B22C_USER_SGPR_MSB_GFX10(num_user_sgprs >> 5);
1105 } else {
1106 rsrc2 |= S_00B22C_USER_SGPR_MSB_GFX9(num_user_sgprs >> 5);
1107 }
1108
1109 si_pm4_set_reg(pm4, R_00B228_SPI_SHADER_PGM_RSRC1_GS, rsrc1);
1110 si_pm4_set_reg(pm4, R_00B22C_SPI_SHADER_PGM_RSRC2_GS, rsrc2);
1111
1112 shader->gs.spi_shader_pgm_rsrc3_gs =
1113 ac_apply_cu_en(S_00B21C_CU_EN(0xffff) |
1114 S_00B21C_WAVE_LIMIT(0x3F),
1115 C_00B21C_CU_EN, 0, &sscreen->info);
1116 shader->gs.spi_shader_pgm_rsrc4_gs =
1117 ac_apply_cu_en(S_00B204_CU_EN_GFX10(0xffff) |
1118 S_00B204_SPI_SHADER_LATE_ALLOC_GS_GFX10(0),
1119 C_00B204_CU_EN_GFX10, 16, &sscreen->info);
1120
1121 shader->gs.vgt_gs_onchip_cntl =
1122 S_028A44_ES_VERTS_PER_SUBGRP(shader->gs_info.es_verts_per_subgroup) |
1123 S_028A44_GS_PRIMS_PER_SUBGRP(shader->gs_info.gs_prims_per_subgroup) |
1124 S_028A44_GS_INST_PRIMS_IN_SUBGRP(shader->gs_info.gs_inst_prims_in_subgroup);
1125 shader->gs.vgt_gs_max_prims_per_subgroup =
1126 S_028A94_MAX_PRIMS_PER_SUBGROUP(shader->gs_info.max_prims_per_subgroup);
1127 shader->gs.vgt_esgs_ring_itemsize = shader->key.ge.part.gs.es->info.esgs_vertex_stride / 4;
1128
1129 if (es_stage == MESA_SHADER_TESS_EVAL)
1130 si_set_tesseval_regs(sscreen, shader->key.ge.part.gs.es, shader);
1131
1132 polaris_set_vgt_vertex_reuse(sscreen, shader->key.ge.part.gs.es, shader);
1133 } else {
1134 shader->gs.spi_shader_pgm_rsrc3_gs =
1135 ac_apply_cu_en(S_00B21C_CU_EN(0xffff) |
1136 S_00B21C_WAVE_LIMIT(0x3F),
1137 C_00B21C_CU_EN, 0, &sscreen->info);
1138
1139 si_pm4_set_reg(pm4, R_00B220_SPI_SHADER_PGM_LO_GS, va >> 8);
1140 si_pm4_set_reg(pm4, R_00B224_SPI_SHADER_PGM_HI_GS,
1141 S_00B224_MEM_BASE(sscreen->info.address32_hi >> 8));
1142
1143 si_pm4_set_reg(pm4, R_00B228_SPI_SHADER_PGM_RSRC1_GS,
1144 S_00B228_VGPRS(si_shader_encode_vgprs(shader)) |
1145 S_00B228_SGPRS(si_shader_encode_sgprs(shader)) |
1146 S_00B228_DX10_CLAMP(1) |
1147 S_00B228_FLOAT_MODE(shader->config.float_mode));
1148 si_pm4_set_reg(pm4, R_00B22C_SPI_SHADER_PGM_RSRC2_GS,
1149 S_00B22C_USER_SGPR(GFX6_GS_NUM_USER_SGPR) |
1150 S_00B22C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0));
1151 }
1152 si_pm4_finalize(pm4);
1153 }
1154
gfx10_is_ngg_passthrough(struct si_shader * shader)1155 bool gfx10_is_ngg_passthrough(struct si_shader *shader)
1156 {
1157 struct si_shader_selector *sel = shader->selector;
1158
1159 /* Never use NGG passthrough if culling is possible even when it's not used by this shader,
1160 * so that we don't get context rolls when enabling and disabling NGG passthrough.
1161 */
1162 if (sel->screen->use_ngg_culling)
1163 return false;
1164
1165 /* The definition of NGG passthrough is:
1166 * - user GS is turned off (no amplification, no GS instancing, and no culling)
1167 * - VGT_ESGS_RING_ITEMSIZE is ignored (behaving as if it was equal to 1)
1168 * - vertex indices are packed into 1 VGPR
1169 * - Navi23 and later chips can optionally skip the gs_alloc_req message
1170 *
1171 * NGG passthrough still allows the use of LDS.
1172 */
1173 return sel->stage != MESA_SHADER_GEOMETRY && !shader->key.ge.opt.ngg_culling;
1174 }
1175
1176 template <enum si_has_tess HAS_TESS>
gfx10_emit_shader_ngg(struct si_context * sctx,unsigned index)1177 static void gfx10_emit_shader_ngg(struct si_context *sctx, unsigned index)
1178 {
1179 struct si_shader *shader = sctx->queued.named.gs;
1180
1181 if (shader->selector->stage == MESA_SHADER_GEOMETRY)
1182 gfx9_set_gs_sgpr_num_es_outputs(sctx, shader->ngg.esgs_vertex_stride);
1183
1184 radeon_begin(&sctx->gfx_cs);
1185 if (HAS_TESS) {
1186 radeon_opt_set_context_reg(sctx, R_028B6C_VGT_TF_PARAM, SI_TRACKED_VGT_TF_PARAM,
1187 shader->vgt_tf_param);
1188 }
1189 radeon_opt_set_context_reg(sctx, R_0287FC_GE_MAX_OUTPUT_PER_SUBGROUP,
1190 SI_TRACKED_GE_MAX_OUTPUT_PER_SUBGROUP,
1191 shader->ngg.ge_max_output_per_subgroup);
1192 radeon_opt_set_context_reg(sctx, R_028B4C_GE_NGG_SUBGRP_CNTL, SI_TRACKED_GE_NGG_SUBGRP_CNTL,
1193 shader->ngg.ge_ngg_subgrp_cntl);
1194 radeon_opt_set_context_reg(sctx, R_028A84_VGT_PRIMITIVEID_EN, SI_TRACKED_VGT_PRIMITIVEID_EN,
1195 shader->ngg.vgt_primitiveid_en);
1196 if (sctx->gfx_level < GFX11) {
1197 radeon_opt_set_context_reg(sctx, R_028A44_VGT_GS_ONCHIP_CNTL, SI_TRACKED_VGT_GS_ONCHIP_CNTL,
1198 shader->ngg.vgt_gs_onchip_cntl);
1199 }
1200 radeon_opt_set_context_reg(sctx, R_028B38_VGT_GS_MAX_VERT_OUT, SI_TRACKED_VGT_GS_MAX_VERT_OUT,
1201 shader->ngg.vgt_gs_max_vert_out);
1202 radeon_opt_set_context_reg(sctx, R_028B90_VGT_GS_INSTANCE_CNT, SI_TRACKED_VGT_GS_INSTANCE_CNT,
1203 shader->ngg.vgt_gs_instance_cnt);
1204 radeon_opt_set_context_reg(sctx, R_0286C4_SPI_VS_OUT_CONFIG, SI_TRACKED_SPI_VS_OUT_CONFIG,
1205 shader->ngg.spi_vs_out_config);
1206 radeon_opt_set_context_reg(sctx, R_02870C_SPI_SHADER_POS_FORMAT,
1207 SI_TRACKED_SPI_SHADER_POS_FORMAT,
1208 shader->ngg.spi_shader_pos_format);
1209 radeon_opt_set_context_reg(sctx, R_028818_PA_CL_VTE_CNTL, SI_TRACKED_PA_CL_VTE_CNTL,
1210 shader->ngg.pa_cl_vte_cntl);
1211 radeon_end_update_context_roll(sctx);
1212
1213 /* These don't cause a context roll. */
1214 radeon_begin_again(&sctx->gfx_cs);
1215 if (sctx->screen->info.uses_kernel_cu_mask) {
1216 radeon_opt_set_sh_reg_idx(sctx, R_00B21C_SPI_SHADER_PGM_RSRC3_GS,
1217 SI_TRACKED_SPI_SHADER_PGM_RSRC3_GS,
1218 3, shader->ngg.spi_shader_pgm_rsrc3_gs);
1219 radeon_opt_set_sh_reg_idx(sctx, R_00B204_SPI_SHADER_PGM_RSRC4_GS,
1220 SI_TRACKED_SPI_SHADER_PGM_RSRC4_GS,
1221 3, shader->ngg.spi_shader_pgm_rsrc4_gs);
1222 } else {
1223 radeon_opt_set_sh_reg(sctx, R_00B21C_SPI_SHADER_PGM_RSRC3_GS,
1224 SI_TRACKED_SPI_SHADER_PGM_RSRC3_GS,
1225 shader->ngg.spi_shader_pgm_rsrc3_gs);
1226 radeon_opt_set_sh_reg(sctx, R_00B204_SPI_SHADER_PGM_RSRC4_GS,
1227 SI_TRACKED_SPI_SHADER_PGM_RSRC4_GS,
1228 shader->ngg.spi_shader_pgm_rsrc4_gs);
1229 }
1230 radeon_opt_set_uconfig_reg(sctx, R_030980_GE_PC_ALLOC, SI_TRACKED_GE_PC_ALLOC,
1231 shader->ngg.ge_pc_alloc);
1232 radeon_end();
1233 }
1234
1235 template <enum si_has_tess HAS_TESS>
gfx11_dgpu_emit_shader_ngg(struct si_context * sctx,unsigned index)1236 static void gfx11_dgpu_emit_shader_ngg(struct si_context *sctx, unsigned index)
1237 {
1238 struct si_shader *shader = sctx->queued.named.gs;
1239
1240 if (shader->selector->stage == MESA_SHADER_GEOMETRY)
1241 gfx9_set_gs_sgpr_num_es_outputs(sctx, shader->ngg.esgs_vertex_stride);
1242
1243 radeon_begin(&sctx->gfx_cs);
1244 gfx11_begin_packed_context_regs();
1245 if (HAS_TESS) {
1246 gfx11_opt_set_context_reg(R_028B6C_VGT_TF_PARAM, SI_TRACKED_VGT_TF_PARAM,
1247 shader->vgt_tf_param);
1248 }
1249 gfx11_opt_set_context_reg(R_0287FC_GE_MAX_OUTPUT_PER_SUBGROUP,
1250 SI_TRACKED_GE_MAX_OUTPUT_PER_SUBGROUP,
1251 shader->ngg.ge_max_output_per_subgroup);
1252 gfx11_opt_set_context_reg(R_028B4C_GE_NGG_SUBGRP_CNTL, SI_TRACKED_GE_NGG_SUBGRP_CNTL,
1253 shader->ngg.ge_ngg_subgrp_cntl);
1254 gfx11_opt_set_context_reg(R_028A84_VGT_PRIMITIVEID_EN, SI_TRACKED_VGT_PRIMITIVEID_EN,
1255 shader->ngg.vgt_primitiveid_en);
1256 gfx11_opt_set_context_reg(R_028B38_VGT_GS_MAX_VERT_OUT, SI_TRACKED_VGT_GS_MAX_VERT_OUT,
1257 shader->ngg.vgt_gs_max_vert_out);
1258 gfx11_opt_set_context_reg(R_028B90_VGT_GS_INSTANCE_CNT, SI_TRACKED_VGT_GS_INSTANCE_CNT,
1259 shader->ngg.vgt_gs_instance_cnt);
1260 gfx11_opt_set_context_reg(R_0286C4_SPI_VS_OUT_CONFIG, SI_TRACKED_SPI_VS_OUT_CONFIG,
1261 shader->ngg.spi_vs_out_config);
1262 gfx11_opt_set_context_reg(R_02870C_SPI_SHADER_POS_FORMAT, SI_TRACKED_SPI_SHADER_POS_FORMAT,
1263 shader->ngg.spi_shader_pos_format);
1264 gfx11_opt_set_context_reg(R_028818_PA_CL_VTE_CNTL, SI_TRACKED_PA_CL_VTE_CNTL,
1265 shader->ngg.pa_cl_vte_cntl);
1266 gfx11_end_packed_context_regs();
1267
1268 assert(!sctx->screen->info.uses_kernel_cu_mask);
1269 if (sctx->screen->info.has_set_sh_pairs_packed) {
1270 gfx11_opt_push_gfx_sh_reg(R_00B21C_SPI_SHADER_PGM_RSRC3_GS,
1271 SI_TRACKED_SPI_SHADER_PGM_RSRC3_GS,
1272 shader->gs.spi_shader_pgm_rsrc3_gs);
1273 gfx11_opt_push_gfx_sh_reg(R_00B204_SPI_SHADER_PGM_RSRC4_GS,
1274 SI_TRACKED_SPI_SHADER_PGM_RSRC4_GS,
1275 shader->gs.spi_shader_pgm_rsrc4_gs);
1276 } else {
1277 if (sctx->screen->info.uses_kernel_cu_mask) {
1278 radeon_opt_set_sh_reg_idx(sctx, R_00B21C_SPI_SHADER_PGM_RSRC3_GS,
1279 SI_TRACKED_SPI_SHADER_PGM_RSRC3_GS,
1280 3, shader->ngg.spi_shader_pgm_rsrc3_gs);
1281 radeon_opt_set_sh_reg_idx(sctx, R_00B204_SPI_SHADER_PGM_RSRC4_GS,
1282 SI_TRACKED_SPI_SHADER_PGM_RSRC4_GS,
1283 3, shader->ngg.spi_shader_pgm_rsrc4_gs);
1284 } else {
1285 radeon_opt_set_sh_reg(sctx, R_00B21C_SPI_SHADER_PGM_RSRC3_GS,
1286 SI_TRACKED_SPI_SHADER_PGM_RSRC3_GS,
1287 shader->ngg.spi_shader_pgm_rsrc3_gs);
1288 radeon_opt_set_sh_reg(sctx, R_00B204_SPI_SHADER_PGM_RSRC4_GS,
1289 SI_TRACKED_SPI_SHADER_PGM_RSRC4_GS,
1290 shader->ngg.spi_shader_pgm_rsrc4_gs);
1291 }
1292 }
1293
1294 radeon_opt_set_uconfig_reg(sctx, R_030980_GE_PC_ALLOC, SI_TRACKED_GE_PC_ALLOC,
1295 shader->ngg.ge_pc_alloc);
1296 radeon_end();
1297 }
1298
si_get_input_prim(const struct si_shader_selector * gs,const union si_shader_key * key)1299 unsigned si_get_input_prim(const struct si_shader_selector *gs, const union si_shader_key *key)
1300 {
1301 if (gs->stage == MESA_SHADER_GEOMETRY)
1302 return gs->info.base.gs.input_primitive;
1303
1304 if (gs->stage == MESA_SHADER_TESS_EVAL) {
1305 if (gs->info.base.tess.point_mode)
1306 return MESA_PRIM_POINTS;
1307 if (gs->info.base.tess._primitive_mode == TESS_PRIMITIVE_ISOLINES)
1308 return MESA_PRIM_LINES;
1309 return MESA_PRIM_TRIANGLES;
1310 }
1311
1312 if (key->ge.opt.ngg_culling & SI_NGG_CULL_LINES)
1313 return MESA_PRIM_LINES;
1314
1315 return MESA_PRIM_TRIANGLES; /* worst case for all callers */
1316 }
1317
si_get_vs_out_cntl(const struct si_shader_selector * sel,const struct si_shader * shader,bool ngg)1318 static unsigned si_get_vs_out_cntl(const struct si_shader_selector *sel,
1319 const struct si_shader *shader, bool ngg)
1320 {
1321 /* Clip distances can be killed, but cull distances can't. */
1322 unsigned clipcull_mask = (sel->info.clipdist_mask & ~shader->key.ge.opt.kill_clip_distances) |
1323 sel->info.culldist_mask;
1324 bool writes_psize = sel->info.writes_psize && !shader->key.ge.opt.kill_pointsize;
1325 bool writes_layer = sel->info.writes_layer && !shader->key.ge.opt.kill_layer;
1326 bool misc_vec_ena = writes_psize || (sel->info.writes_edgeflag && !ngg) ||
1327 writes_layer || sel->info.writes_viewport_index ||
1328 sel->screen->options.vrs2x2;
1329
1330 return S_02881C_VS_OUT_CCDIST0_VEC_ENA((clipcull_mask & 0x0F) != 0) |
1331 S_02881C_VS_OUT_CCDIST1_VEC_ENA((clipcull_mask & 0xF0) != 0) |
1332 S_02881C_USE_VTX_POINT_SIZE(writes_psize) |
1333 S_02881C_USE_VTX_EDGE_FLAG(sel->info.writes_edgeflag && !ngg) |
1334 S_02881C_USE_VTX_VRS_RATE(sel->screen->options.vrs2x2) |
1335 S_02881C_USE_VTX_RENDER_TARGET_INDX(writes_layer) |
1336 S_02881C_USE_VTX_VIEWPORT_INDX(sel->info.writes_viewport_index) |
1337 S_02881C_VS_OUT_MISC_VEC_ENA(misc_vec_ena) |
1338 S_02881C_VS_OUT_MISC_SIDE_BUS_ENA(misc_vec_ena ||
1339 (sel->screen->info.gfx_level >= GFX10_3 &&
1340 shader->info.nr_pos_exports > 1));
1341 }
1342
1343 /**
1344 * Prepare the PM4 image for \p shader, which will run as a merged ESGS shader
1345 * in NGG mode.
1346 */
gfx10_shader_ngg(struct si_screen * sscreen,struct si_shader * shader)1347 static void gfx10_shader_ngg(struct si_screen *sscreen, struct si_shader *shader)
1348 {
1349 const struct si_shader_selector *gs_sel = shader->selector;
1350 const struct si_shader_info *gs_info = &gs_sel->info;
1351 const gl_shader_stage gs_stage = shader->selector->stage;
1352 const struct si_shader_selector *es_sel =
1353 shader->previous_stage_sel ? shader->previous_stage_sel : shader->selector;
1354 const struct si_shader_info *es_info = &es_sel->info;
1355 const gl_shader_stage es_stage = es_sel->stage;
1356 unsigned num_user_sgprs;
1357 unsigned es_vgpr_comp_cnt, gs_vgpr_comp_cnt;
1358 uint64_t va;
1359 bool window_space = gs_sel->stage == MESA_SHADER_VERTEX ?
1360 gs_info->base.vs.window_space_position : 0;
1361 bool es_enable_prim_id = shader->key.ge.mono.u.vs_export_prim_id || es_info->uses_primid;
1362 unsigned gs_num_invocations = gs_sel->stage == MESA_SHADER_GEOMETRY ?
1363 CLAMP(gs_info->base.gs.invocations, 1, 32) : 0;
1364 unsigned input_prim = si_get_input_prim(gs_sel, &shader->key);
1365 bool break_wave_at_eoi = false;
1366
1367 struct si_pm4_state *pm4 = si_get_shader_pm4_state(shader, NULL);
1368 if (!pm4)
1369 return;
1370
1371 if (sscreen->info.has_set_context_pairs_packed) {
1372 if (es_stage == MESA_SHADER_TESS_EVAL)
1373 pm4->atom.emit = gfx11_dgpu_emit_shader_ngg<TESS_ON>;
1374 else
1375 pm4->atom.emit = gfx11_dgpu_emit_shader_ngg<TESS_OFF>;
1376 } else {
1377 if (es_stage == MESA_SHADER_TESS_EVAL)
1378 pm4->atom.emit = gfx10_emit_shader_ngg<TESS_ON>;
1379 else
1380 pm4->atom.emit = gfx10_emit_shader_ngg<TESS_OFF>;
1381 }
1382
1383 va = shader->bo->gpu_address;
1384
1385 if (es_stage == MESA_SHADER_VERTEX) {
1386 es_vgpr_comp_cnt = si_get_vs_vgpr_comp_cnt(sscreen, shader, false);
1387
1388 if (es_info->base.vs.blit_sgprs_amd) {
1389 num_user_sgprs =
1390 SI_SGPR_VS_BLIT_DATA + es_info->base.vs.blit_sgprs_amd;
1391 } else {
1392 num_user_sgprs = si_get_num_vs_user_sgprs(shader, GFX9_GS_NUM_USER_SGPR);
1393 }
1394 } else {
1395 assert(es_stage == MESA_SHADER_TESS_EVAL);
1396 es_vgpr_comp_cnt = es_enable_prim_id ? 3 : 2;
1397 num_user_sgprs = GFX9_GS_NUM_USER_SGPR;
1398
1399 if (es_enable_prim_id || gs_info->uses_primid)
1400 break_wave_at_eoi = true;
1401 }
1402
1403 /* Primitives with adjancency can only occur without tessellation. */
1404 assert(gs_info->gs_input_verts_per_prim <= 3 || es_stage == MESA_SHADER_VERTEX);
1405
1406 /* If offsets 4, 5 are used, GS_VGPR_COMP_CNT is ignored and
1407 * VGPR[0:4] are always loaded.
1408 *
1409 * Vertex shaders always need to load VGPR3, because they need to
1410 * pass edge flags for decomposed primitives (such as quads) to the PA
1411 * for the GL_LINE polygon mode to skip rendering lines on inner edges.
1412 */
1413 if (gs_info->uses_invocationid ||
1414 (gfx10_edgeflags_have_effect(shader) && !gfx10_is_ngg_passthrough(shader)))
1415 gs_vgpr_comp_cnt = 3; /* VGPR3 contains InvocationID, edge flags. */
1416 else if ((gs_stage == MESA_SHADER_GEOMETRY && gs_info->uses_primid) ||
1417 (gs_stage == MESA_SHADER_VERTEX && shader->key.ge.mono.u.vs_export_prim_id))
1418 gs_vgpr_comp_cnt = 2; /* VGPR2 contains PrimitiveID. */
1419 else if (input_prim >= MESA_PRIM_TRIANGLES && !gfx10_is_ngg_passthrough(shader))
1420 gs_vgpr_comp_cnt = 1; /* VGPR1 contains offsets 2, 3 */
1421 else
1422 gs_vgpr_comp_cnt = 0; /* VGPR0 contains offsets 0, 1 */
1423
1424 si_pm4_set_reg(pm4, R_00B320_SPI_SHADER_PGM_LO_ES, va >> 8);
1425 si_pm4_set_reg(pm4, R_00B228_SPI_SHADER_PGM_RSRC1_GS,
1426 S_00B228_VGPRS(si_shader_encode_vgprs(shader)) |
1427 S_00B228_FLOAT_MODE(shader->config.float_mode) |
1428 S_00B228_DX10_CLAMP(1) |
1429 S_00B228_MEM_ORDERED(si_shader_mem_ordered(shader)) |
1430 S_00B228_GS_VGPR_COMP_CNT(gs_vgpr_comp_cnt));
1431 si_pm4_set_reg(pm4, R_00B22C_SPI_SHADER_PGM_RSRC2_GS,
1432 S_00B22C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0) |
1433 S_00B22C_USER_SGPR(num_user_sgprs) |
1434 S_00B22C_ES_VGPR_COMP_CNT(es_vgpr_comp_cnt) |
1435 S_00B22C_USER_SGPR_MSB_GFX10(num_user_sgprs >> 5) |
1436 S_00B22C_OC_LDS_EN(es_stage == MESA_SHADER_TESS_EVAL) |
1437 S_00B22C_LDS_SIZE(shader->config.lds_size));
1438
1439 /* Set register values emitted conditionally in gfx10_emit_shader_ngg_*. */
1440 shader->ngg.spi_shader_pos_format =
1441 S_02870C_POS0_EXPORT_FORMAT(V_02870C_SPI_SHADER_4COMP) |
1442 S_02870C_POS1_EXPORT_FORMAT(shader->info.nr_pos_exports > 1 ? V_02870C_SPI_SHADER_4COMP
1443 : V_02870C_SPI_SHADER_NONE) |
1444 S_02870C_POS2_EXPORT_FORMAT(shader->info.nr_pos_exports > 2 ? V_02870C_SPI_SHADER_4COMP
1445 : V_02870C_SPI_SHADER_NONE) |
1446 S_02870C_POS3_EXPORT_FORMAT(shader->info.nr_pos_exports > 3 ? V_02870C_SPI_SHADER_4COMP
1447 : V_02870C_SPI_SHADER_NONE);
1448 shader->ngg.ge_max_output_per_subgroup = S_0287FC_MAX_VERTS_PER_SUBGROUP(shader->ngg.max_out_verts);
1449 shader->ngg.vgt_gs_instance_cnt =
1450 S_028B90_ENABLE(gs_num_invocations > 1) |
1451 S_028B90_CNT(gs_num_invocations) |
1452 S_028B90_EN_MAX_VERT_OUT_PER_GS_INSTANCE(shader->ngg.max_vert_out_per_gs_instance);
1453 shader->pa_cl_vs_out_cntl = si_get_vs_out_cntl(shader->selector, shader, true);
1454
1455 if (gs_stage == MESA_SHADER_GEOMETRY) {
1456 shader->ngg.esgs_vertex_stride = es_sel->info.esgs_vertex_stride / 4;
1457 shader->ngg.vgt_gs_max_vert_out = gs_sel->info.base.gs.vertices_out;
1458 shader->ngg.ge_ngg_subgrp_cntl = S_028B4C_PRIM_AMP_FACTOR(gs_sel->info.base.gs.vertices_out);
1459 } else {
1460 shader->ngg.esgs_vertex_stride = 1;
1461 shader->ngg.vgt_gs_max_vert_out = 1;
1462 shader->ngg.ge_ngg_subgrp_cntl = S_028B4C_PRIM_AMP_FACTOR(1);
1463 }
1464
1465 if (es_stage == MESA_SHADER_TESS_EVAL)
1466 si_set_tesseval_regs(sscreen, es_sel, shader);
1467
1468 shader->ngg.vgt_primitiveid_en =
1469 S_028A84_NGG_DISABLE_PROVOK_REUSE(shader->key.ge.mono.u.vs_export_prim_id ||
1470 gs_sel->info.writes_primid);
1471
1472 unsigned late_alloc_wave64, cu_mask;
1473
1474 ac_compute_late_alloc(&sscreen->info, true, shader->key.ge.opt.ngg_culling,
1475 shader->config.scratch_bytes_per_wave > 0,
1476 &late_alloc_wave64, &cu_mask);
1477
1478 /* Oversubscribe PC. This improves performance when there are too many varyings. */
1479 unsigned oversub_pc_lines, oversub_pc_factor = 1;
1480
1481 if (shader->key.ge.opt.ngg_culling) {
1482 /* Be more aggressive with NGG culling. */
1483 if (shader->info.nr_param_exports > 4)
1484 oversub_pc_factor = 4;
1485 else if (shader->info.nr_param_exports > 2)
1486 oversub_pc_factor = 3;
1487 else
1488 oversub_pc_factor = 2;
1489 }
1490 oversub_pc_lines = late_alloc_wave64 ? (sscreen->info.pc_lines / 4) * oversub_pc_factor : 0;
1491 shader->ngg.ge_pc_alloc = S_030980_OVERSUB_EN(oversub_pc_lines > 0) |
1492 S_030980_NUM_PC_LINES(oversub_pc_lines - 1);
1493 shader->ngg.vgt_primitiveid_en |= S_028A84_PRIMITIVEID_EN(es_enable_prim_id);
1494 shader->ngg.spi_shader_pgm_rsrc3_gs =
1495 ac_apply_cu_en(S_00B21C_CU_EN(cu_mask) |
1496 S_00B21C_WAVE_LIMIT(0x3F),
1497 C_00B21C_CU_EN, 0, &sscreen->info);
1498 shader->ngg.spi_shader_pgm_rsrc4_gs = S_00B204_SPI_SHADER_LATE_ALLOC_GS_GFX10(late_alloc_wave64);
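/* VS_EXPORT_COUNT is the number of parameter exports minus one; MAX2 keeps the field
 * valid when nothing is exported, and that case is flagged with NO_PC_EXPORT instead. */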
1499 shader->ngg.spi_vs_out_config =
1500 S_0286C4_VS_EXPORT_COUNT(MAX2(shader->info.nr_param_exports, 1) - 1) |
1501 S_0286C4_NO_PC_EXPORT(shader->info.nr_param_exports == 0);
1502
1503 if (sscreen->info.gfx_level >= GFX11) {
1504 shader->ngg.spi_shader_pgm_rsrc4_gs |=
1505 ac_apply_cu_en(S_00B204_CU_EN_GFX11(0x1) |
1506 S_00B204_INST_PREF_SIZE(si_get_shader_prefetch_size(shader)),
1507 C_00B204_CU_EN_GFX11, 16, &sscreen->info);
1508 } else {
1509 shader->ngg.spi_shader_pgm_rsrc4_gs |=
1510 ac_apply_cu_en(S_00B204_CU_EN_GFX10(0xffff),
1511 C_00B204_CU_EN_GFX10, 16, &sscreen->info);
1512 }
1513
1514 if (sscreen->info.gfx_level >= GFX11) {
1515 /* This should be <= 252 for performance on Gfx11. 256 works too but is slower. */
1516 unsigned max_prim_grp_size = 252;
1517 unsigned prim_amp_factor = gs_stage == MESA_SHADER_GEOMETRY ?
1518 gs_sel->info.base.gs.vertices_out : 1;
1519
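/* Dividing by the amplification factor keeps the amplified primitive count of one
 * group within the recommended size above. */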
1520 shader->ge_cntl = S_03096C_PRIMS_PER_SUBGRP(shader->ngg.max_gsprims) |
1521 S_03096C_VERTS_PER_SUBGRP(shader->ngg.hw_max_esverts) |
1522 S_03096C_BREAK_PRIMGRP_AT_EOI(break_wave_at_eoi) |
1523 S_03096C_PRIM_GRP_SIZE_GFX11(
1524 CLAMP(max_prim_grp_size / MAX2(prim_amp_factor, 1), 1, 256));
1525 } else {
1526 shader->ge_cntl = S_03096C_PRIM_GRP_SIZE_GFX10(shader->ngg.max_gsprims) |
1527 S_03096C_VERT_GRP_SIZE(shader->ngg.hw_max_esverts) |
1528 S_03096C_BREAK_WAVE_AT_EOI(break_wave_at_eoi);
1529
1530 shader->ngg.vgt_gs_onchip_cntl =
1531 S_028A44_ES_VERTS_PER_SUBGRP(shader->ngg.hw_max_esverts) |
1532 S_028A44_GS_PRIMS_PER_SUBGRP(shader->ngg.max_gsprims) |
1533 S_028A44_GS_INST_PRIMS_IN_SUBGRP(shader->ngg.max_gsprims * gs_num_invocations);
1534
1535 /* On gfx10, the GE only checks against the maximum number of ES verts after
1536 * allocating a full GS primitive. So we need to ensure that whenever
1537 * this check passes, there is enough space for a full primitive without
1538 * vertex reuse. VERT_GRP_SIZE=256 doesn't need this. We should always get 256
1539 * if we have enough LDS.
1540 *
1541 * Tessellation is unaffected because it always sets GE_CNTL.VERT_GRP_SIZE = 0.
1542 */
1543 if ((sscreen->info.gfx_level == GFX10) &&
1544 (es_stage == MESA_SHADER_VERTEX || gs_stage == MESA_SHADER_VERTEX) && /* = no tess */
1545 shader->ngg.hw_max_esverts != 256 &&
1546 shader->ngg.hw_max_esverts > 5) {
1547 /* This could be based on the input primitive type. 5 is the worst case
1548 * for primitive types with adjacency.
1549 */
1550 shader->ge_cntl &= C_03096C_VERT_GRP_SIZE;
1551 shader->ge_cntl |= S_03096C_VERT_GRP_SIZE(shader->ngg.hw_max_esverts - 5);
1552 }
1553 }
1554
1555 if (window_space) {
1556 shader->ngg.pa_cl_vte_cntl = S_028818_VTX_XY_FMT(1) | S_028818_VTX_Z_FMT(1);
1557 } else {
1558 shader->ngg.pa_cl_vte_cntl = S_028818_VTX_W0_FMT(1) |
1559 S_028818_VPORT_X_SCALE_ENA(1) | S_028818_VPORT_X_OFFSET_ENA(1) |
1560 S_028818_VPORT_Y_SCALE_ENA(1) | S_028818_VPORT_Y_OFFSET_ENA(1) |
1561 S_028818_VPORT_Z_SCALE_ENA(1) | S_028818_VPORT_Z_OFFSET_ENA(1);
1562 }
1563
1564 shader->ngg.vgt_shader_stages_en =
1565 S_028B54_ES_EN(es_stage == MESA_SHADER_TESS_EVAL ?
1566 V_028B54_ES_STAGE_DS : V_028B54_ES_STAGE_REAL) |
1567 S_028B54_GS_EN(gs_stage == MESA_SHADER_GEOMETRY) |
1568 S_028B54_PRIMGEN_EN(1) |
1569 S_028B54_PRIMGEN_PASSTHRU_EN(gfx10_is_ngg_passthrough(shader)) |
1570 S_028B54_PRIMGEN_PASSTHRU_NO_MSG(gfx10_is_ngg_passthrough(shader) &&
1571 sscreen->info.family >= CHIP_NAVI23) |
1572 S_028B54_NGG_WAVE_ID_EN(si_shader_uses_streamout(shader)) |
1573 S_028B54_GS_W32_EN(shader->wave_size == 32) |
1574 S_028B54_MAX_PRIMGRP_IN_WAVE(2);
1575
1576 si_pm4_finalize(pm4);
1577 }
1578
1579 static void si_emit_shader_vs(struct si_context *sctx, unsigned index)
1580 {
1581 struct si_shader *shader = sctx->queued.named.vs;
1582
1583 radeon_begin(&sctx->gfx_cs);
1584 radeon_opt_set_context_reg(sctx, R_028A40_VGT_GS_MODE, SI_TRACKED_VGT_GS_MODE,
1585 shader->vs.vgt_gs_mode);
1586 radeon_opt_set_context_reg(sctx, R_028A84_VGT_PRIMITIVEID_EN, SI_TRACKED_VGT_PRIMITIVEID_EN,
1587 shader->vs.vgt_primitiveid_en);
1588
1589 if (sctx->gfx_level <= GFX8) {
1590 radeon_opt_set_context_reg(sctx, R_028AB4_VGT_REUSE_OFF, SI_TRACKED_VGT_REUSE_OFF,
1591 shader->vs.vgt_reuse_off);
1592 }
1593
1594 radeon_opt_set_context_reg(sctx, R_0286C4_SPI_VS_OUT_CONFIG, SI_TRACKED_SPI_VS_OUT_CONFIG,
1595 shader->vs.spi_vs_out_config);
1596
1597 radeon_opt_set_context_reg(sctx, R_02870C_SPI_SHADER_POS_FORMAT,
1598 SI_TRACKED_SPI_SHADER_POS_FORMAT,
1599 shader->vs.spi_shader_pos_format);
1600
1601 radeon_opt_set_context_reg(sctx, R_028818_PA_CL_VTE_CNTL, SI_TRACKED_PA_CL_VTE_CNTL,
1602 shader->vs.pa_cl_vte_cntl);
1603
1604 if (shader->selector->stage == MESA_SHADER_TESS_EVAL)
1605 radeon_opt_set_context_reg(sctx, R_028B6C_VGT_TF_PARAM, SI_TRACKED_VGT_TF_PARAM,
1606 shader->vgt_tf_param);
1607
1608 if (shader->vgt_vertex_reuse_block_cntl)
1609 radeon_opt_set_context_reg(sctx, R_028C58_VGT_VERTEX_REUSE_BLOCK_CNTL,
1610 SI_TRACKED_VGT_VERTEX_REUSE_BLOCK_CNTL,
1611 shader->vgt_vertex_reuse_block_cntl);
1612
1613 /* Required programming for tessellation. (legacy pipeline only) */
1614 if (sctx->gfx_level >= GFX10 && shader->selector->stage == MESA_SHADER_TESS_EVAL) {
1615 radeon_opt_set_context_reg(sctx, R_028A44_VGT_GS_ONCHIP_CNTL,
1616 SI_TRACKED_VGT_GS_ONCHIP_CNTL,
1617 S_028A44_ES_VERTS_PER_SUBGRP(250) |
1618 S_028A44_GS_PRIMS_PER_SUBGRP(126) |
1619 S_028A44_GS_INST_PRIMS_IN_SUBGRP(126));
1620 }
1621
1622 radeon_end_update_context_roll(sctx);
1623
1624 /* GE_PC_ALLOC is not a context register, so it doesn't cause a context roll. */
1625 if (sctx->gfx_level >= GFX10) {
1626 radeon_begin_again(&sctx->gfx_cs);
1627 radeon_opt_set_uconfig_reg(sctx, R_030980_GE_PC_ALLOC, SI_TRACKED_GE_PC_ALLOC,
1628 shader->vs.ge_pc_alloc);
1629 radeon_end();
1630 }
1631 }
1632
1633 /**
1634 * Compute the state for \p shader, which will run as a vertex shader on the
1635 * hardware.
1636 *
1637 * If \p gs is non-NULL, it points to the geometry shader for which this shader
1638 * is the copy shader.
1639 */
1640 static void si_shader_vs(struct si_screen *sscreen, struct si_shader *shader,
1641 struct si_shader_selector *gs)
1642 {
1643 const struct si_shader_info *info = &shader->selector->info;
1644 struct si_pm4_state *pm4;
1645 unsigned num_user_sgprs, vgpr_comp_cnt;
1646 uint64_t va;
1647 unsigned nparams, oc_lds_en;
1648 bool window_space = shader->selector->stage == MESA_SHADER_VERTEX ?
1649 info->base.vs.window_space_position : 0;
1650 bool enable_prim_id = shader->key.ge.mono.u.vs_export_prim_id || info->uses_primid;
1651
1652 assert(sscreen->info.gfx_level < GFX11);
1653
1654 pm4 = si_get_shader_pm4_state(shader, si_emit_shader_vs);
1655 if (!pm4)
1656 return;
1657
1658 /* We always write VGT_GS_MODE in the VS state, because every switch
1659 * between different shader pipelines involving a different GS or no
1660 * GS at all involves a switch of the VS (different GS use different
1661 * copy shaders). On the other hand, when the API switches from a GS to
1662 * no GS and then back to the same GS used originally, the GS state is
1663 * not sent again.
1664 */
1665 if (!gs) {
1666 unsigned mode = V_028A40_GS_OFF;
1667
1668 /* PrimID needs GS scenario A. */
1669 if (enable_prim_id)
1670 mode = V_028A40_GS_SCENARIO_A;
1671
1672 shader->vs.vgt_gs_mode = S_028A40_MODE(mode);
1673 shader->vs.vgt_primitiveid_en = enable_prim_id;
1674 } else {
1675 shader->vs.vgt_gs_mode =
1676 ac_vgt_gs_mode(gs->info.base.gs.vertices_out, sscreen->info.gfx_level);
1677 shader->vs.vgt_primitiveid_en = 0;
1678 }
1679
1680 if (sscreen->info.gfx_level <= GFX8) {
1681 /* Reuse needs to be set off if we write oViewport. */
1682 shader->vs.vgt_reuse_off = S_028AB4_REUSE_OFF(info->writes_viewport_index);
1683 }
1684
1685 va = shader->bo->gpu_address;
1686
1687 if (gs) {
1688 vgpr_comp_cnt = 0; /* only VertexID is needed for GS-COPY. */
1689 num_user_sgprs = SI_GSCOPY_NUM_USER_SGPR;
1690 } else if (shader->selector->stage == MESA_SHADER_VERTEX) {
1691 vgpr_comp_cnt = si_get_vs_vgpr_comp_cnt(sscreen, shader, enable_prim_id);
1692
1693 if (info->base.vs.blit_sgprs_amd) {
1694 num_user_sgprs = SI_SGPR_VS_BLIT_DATA + info->base.vs.blit_sgprs_amd;
1695 } else {
1696 num_user_sgprs = si_get_num_vs_user_sgprs(shader, SI_VS_NUM_USER_SGPR);
1697 }
1698 } else if (shader->selector->stage == MESA_SHADER_TESS_EVAL) {
1699 vgpr_comp_cnt = enable_prim_id ? 3 : 2;
1700 num_user_sgprs = SI_TES_NUM_USER_SGPR;
1701 } else
1702 unreachable("invalid shader selector type");
1703
1704 /* VS is required to export at least one param. */
1705 nparams = MAX2(shader->info.nr_param_exports, 1);
1706 shader->vs.spi_vs_out_config = S_0286C4_VS_EXPORT_COUNT(nparams - 1);
1707
1708 if (sscreen->info.gfx_level >= GFX10) {
1709 shader->vs.spi_vs_out_config |=
1710 S_0286C4_NO_PC_EXPORT(shader->info.nr_param_exports == 0);
1711 }
1712
1713 shader->vs.spi_shader_pos_format =
1714 S_02870C_POS0_EXPORT_FORMAT(V_02870C_SPI_SHADER_4COMP) |
1715 S_02870C_POS1_EXPORT_FORMAT(shader->info.nr_pos_exports > 1 ? V_02870C_SPI_SHADER_4COMP
1716 : V_02870C_SPI_SHADER_NONE) |
1717 S_02870C_POS2_EXPORT_FORMAT(shader->info.nr_pos_exports > 2 ? V_02870C_SPI_SHADER_4COMP
1718 : V_02870C_SPI_SHADER_NONE) |
1719 S_02870C_POS3_EXPORT_FORMAT(shader->info.nr_pos_exports > 3 ? V_02870C_SPI_SHADER_4COMP
1720 : V_02870C_SPI_SHADER_NONE);
1721 unsigned late_alloc_wave64, cu_mask;
1722 ac_compute_late_alloc(&sscreen->info, false, false,
1723 shader->config.scratch_bytes_per_wave > 0,
1724 &late_alloc_wave64, &cu_mask);
1725
1726 shader->vs.ge_pc_alloc = S_030980_OVERSUB_EN(late_alloc_wave64 > 0) |
1727 S_030980_NUM_PC_LINES(sscreen->info.pc_lines / 4 - 1);
1728 shader->pa_cl_vs_out_cntl = si_get_vs_out_cntl(shader->selector, shader, false);
1729
1730 oc_lds_en = shader->selector->stage == MESA_SHADER_TESS_EVAL ? 1 : 0;
1731
1732 if (sscreen->info.gfx_level >= GFX7) {
1733 si_pm4_set_reg_idx3(pm4, R_00B118_SPI_SHADER_PGM_RSRC3_VS,
1734 ac_apply_cu_en(S_00B118_CU_EN(cu_mask) |
1735 S_00B118_WAVE_LIMIT(0x3F),
1736 C_00B118_CU_EN, 0, &sscreen->info));
1737 si_pm4_set_reg(pm4, R_00B11C_SPI_SHADER_LATE_ALLOC_VS, S_00B11C_LIMIT(late_alloc_wave64));
1738 }
1739
1740 si_pm4_set_reg(pm4, R_00B120_SPI_SHADER_PGM_LO_VS, va >> 8);
1741 si_pm4_set_reg(pm4, R_00B124_SPI_SHADER_PGM_HI_VS,
1742 S_00B124_MEM_BASE(sscreen->info.address32_hi >> 8));
1743
1744 uint32_t rsrc1 =
1745 S_00B128_VGPRS(si_shader_encode_vgprs(shader)) |
1746 S_00B128_SGPRS(si_shader_encode_sgprs(shader)) |
1747 S_00B128_VGPR_COMP_CNT(vgpr_comp_cnt) |
1748 S_00B128_DX10_CLAMP(1) |
1749 S_00B128_MEM_ORDERED(si_shader_mem_ordered(shader)) |
1750 S_00B128_FLOAT_MODE(shader->config.float_mode);
1751 uint32_t rsrc2 = S_00B12C_USER_SGPR(num_user_sgprs) | S_00B12C_OC_LDS_EN(oc_lds_en) |
1752 S_00B12C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0);
1753
1754 if (sscreen->info.gfx_level >= GFX10)
1755 rsrc2 |= S_00B12C_USER_SGPR_MSB_GFX10(num_user_sgprs >> 5);
1756 else if (sscreen->info.gfx_level == GFX9)
1757 rsrc2 |= S_00B12C_USER_SGPR_MSB_GFX9(num_user_sgprs >> 5);
1758
1759 if (si_shader_uses_streamout(shader)) {
1760 rsrc2 |= S_00B12C_SO_BASE0_EN(!!shader->selector->info.base.xfb_stride[0]) |
1761 S_00B12C_SO_BASE1_EN(!!shader->selector->info.base.xfb_stride[1]) |
1762 S_00B12C_SO_BASE2_EN(!!shader->selector->info.base.xfb_stride[2]) |
1763 S_00B12C_SO_BASE3_EN(!!shader->selector->info.base.xfb_stride[3]) |
1764 S_00B12C_SO_EN(1);
1765 }
1766
1767 si_pm4_set_reg(pm4, R_00B128_SPI_SHADER_PGM_RSRC1_VS, rsrc1);
1768 si_pm4_set_reg(pm4, R_00B12C_SPI_SHADER_PGM_RSRC2_VS, rsrc2);
1769
1770 if (window_space)
1771 shader->vs.pa_cl_vte_cntl = S_028818_VTX_XY_FMT(1) | S_028818_VTX_Z_FMT(1);
1772 else
1773 shader->vs.pa_cl_vte_cntl =
1774 S_028818_VTX_W0_FMT(1) | S_028818_VPORT_X_SCALE_ENA(1) | S_028818_VPORT_X_OFFSET_ENA(1) |
1775 S_028818_VPORT_Y_SCALE_ENA(1) | S_028818_VPORT_Y_OFFSET_ENA(1) |
1776 S_028818_VPORT_Z_SCALE_ENA(1) | S_028818_VPORT_Z_OFFSET_ENA(1);
1777
1778 if (shader->selector->stage == MESA_SHADER_TESS_EVAL)
1779 si_set_tesseval_regs(sscreen, shader->selector, shader);
1780
1781 polaris_set_vgt_vertex_reuse(sscreen, shader->selector, shader);
1782 si_pm4_finalize(pm4);
1783 }
1784
1785 static unsigned si_get_spi_shader_col_format(struct si_shader *shader)
1786 {
1787 unsigned spi_shader_col_format = shader->key.ps.part.epilog.spi_shader_col_format;
1788 unsigned value = 0, num_mrts = 0;
1789 unsigned i, num_targets = (util_last_bit(spi_shader_col_format) + 3) / 4;
1790
1791 /* Remove holes in spi_shader_col_format. */
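/* Example: 0x104 (MRT0 = 0x4, MRT1 unused, MRT2 = 0x1) is packed to 0x14. */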
1792 for (i = 0; i < num_targets; i++) {
1793 unsigned spi_format = (spi_shader_col_format >> (i * 4)) & 0xf;
1794
1795 if (spi_format) {
1796 value |= spi_format << (num_mrts * 4);
1797 num_mrts++;
1798 }
1799 }
1800
1801 return value;
1802 }
1803
1804 static void gfx6_emit_shader_ps(struct si_context *sctx, unsigned index)
1805 {
1806 struct si_shader *shader = sctx->queued.named.ps;
1807
1808 radeon_begin(&sctx->gfx_cs);
1809 radeon_opt_set_context_reg2(sctx, R_0286CC_SPI_PS_INPUT_ENA, SI_TRACKED_SPI_PS_INPUT_ENA,
1810 shader->ps.spi_ps_input_ena,
1811 shader->ps.spi_ps_input_addr);
1812 radeon_opt_set_context_reg(sctx, R_0286E0_SPI_BARYC_CNTL, SI_TRACKED_SPI_BARYC_CNTL,
1813 shader->ps.spi_baryc_cntl);
1814 radeon_opt_set_context_reg(sctx, R_0286D8_SPI_PS_IN_CONTROL, SI_TRACKED_SPI_PS_IN_CONTROL,
1815 shader->ps.spi_ps_in_control);
1816 radeon_opt_set_context_reg2(sctx, R_028710_SPI_SHADER_Z_FORMAT, SI_TRACKED_SPI_SHADER_Z_FORMAT,
1817 shader->ps.spi_shader_z_format,
1818 shader->ps.spi_shader_col_format);
1819 radeon_opt_set_context_reg(sctx, R_02823C_CB_SHADER_MASK, SI_TRACKED_CB_SHADER_MASK,
1820 shader->ps.cb_shader_mask);
1821 radeon_end_update_context_roll(sctx);
1822 }
1823
1824 static void gfx11_dgpu_emit_shader_ps(struct si_context *sctx, unsigned index)
1825 {
1826 struct si_shader *shader = sctx->queued.named.ps;
1827
1828 radeon_begin(&sctx->gfx_cs);
1829 gfx11_begin_packed_context_regs();
1830 gfx11_opt_set_context_reg(R_0286CC_SPI_PS_INPUT_ENA, SI_TRACKED_SPI_PS_INPUT_ENA,
1831 shader->ps.spi_ps_input_ena);
1832 gfx11_opt_set_context_reg(R_0286D0_SPI_PS_INPUT_ADDR, SI_TRACKED_SPI_PS_INPUT_ADDR,
1833 shader->ps.spi_ps_input_addr);
1834 gfx11_opt_set_context_reg(R_0286E0_SPI_BARYC_CNTL, SI_TRACKED_SPI_BARYC_CNTL,
1835 shader->ps.spi_baryc_cntl);
1836 gfx11_opt_set_context_reg(R_0286D8_SPI_PS_IN_CONTROL, SI_TRACKED_SPI_PS_IN_CONTROL,
1837 shader->ps.spi_ps_in_control);
1838 gfx11_opt_set_context_reg(R_028710_SPI_SHADER_Z_FORMAT, SI_TRACKED_SPI_SHADER_Z_FORMAT,
1839 shader->ps.spi_shader_z_format);
1840 gfx11_opt_set_context_reg(R_028714_SPI_SHADER_COL_FORMAT, SI_TRACKED_SPI_SHADER_COL_FORMAT,
1841 shader->ps.spi_shader_col_format);
1842 gfx11_opt_set_context_reg(R_02823C_CB_SHADER_MASK, SI_TRACKED_CB_SHADER_MASK,
1843 shader->ps.cb_shader_mask);
1844 gfx11_end_packed_context_regs();
1845 radeon_end(); /* don't track context rolls on GFX11 */
1846 }
1847
1848 static void si_shader_ps(struct si_screen *sscreen, struct si_shader *shader)
1849 {
1850 struct si_shader_info *info = &shader->selector->info;
1851 const unsigned input_ena = shader->config.spi_ps_input_ena;
1852
1853 /* we need to enable at least one of them, otherwise we hang the GPU */
1854 assert(G_0286CC_PERSP_SAMPLE_ENA(input_ena) || G_0286CC_PERSP_CENTER_ENA(input_ena) ||
1855 G_0286CC_PERSP_CENTROID_ENA(input_ena) || G_0286CC_PERSP_PULL_MODEL_ENA(input_ena) ||
1856 G_0286CC_LINEAR_SAMPLE_ENA(input_ena) || G_0286CC_LINEAR_CENTER_ENA(input_ena) ||
1857 G_0286CC_LINEAR_CENTROID_ENA(input_ena) || G_0286CC_LINE_STIPPLE_TEX_ENA(input_ena));
1858 /* POS_W_FLOAT_ENA requires one of the perspective weights. */
1859 assert(!G_0286CC_POS_W_FLOAT_ENA(input_ena) || G_0286CC_PERSP_SAMPLE_ENA(input_ena) ||
1860 G_0286CC_PERSP_CENTER_ENA(input_ena) || G_0286CC_PERSP_CENTROID_ENA(input_ena) ||
1861 G_0286CC_PERSP_PULL_MODEL_ENA(input_ena));
1862
1863 /* Validate interpolation optimization flags (read as implications). */
1864 assert(!shader->key.ps.part.prolog.bc_optimize_for_persp ||
1865 (G_0286CC_PERSP_CENTER_ENA(input_ena) && G_0286CC_PERSP_CENTROID_ENA(input_ena)));
1866 assert(!shader->key.ps.part.prolog.bc_optimize_for_linear ||
1867 (G_0286CC_LINEAR_CENTER_ENA(input_ena) && G_0286CC_LINEAR_CENTROID_ENA(input_ena)));
1868 assert(!shader->key.ps.part.prolog.force_persp_center_interp ||
1869 (!G_0286CC_PERSP_SAMPLE_ENA(input_ena) && !G_0286CC_PERSP_CENTROID_ENA(input_ena)));
1870 assert(!shader->key.ps.part.prolog.force_linear_center_interp ||
1871 (!G_0286CC_LINEAR_SAMPLE_ENA(input_ena) && !G_0286CC_LINEAR_CENTROID_ENA(input_ena)));
1872 assert(!shader->key.ps.part.prolog.force_persp_sample_interp ||
1873 (!G_0286CC_PERSP_CENTER_ENA(input_ena) && !G_0286CC_PERSP_CENTROID_ENA(input_ena)));
1874 assert(!shader->key.ps.part.prolog.force_linear_sample_interp ||
1875 (!G_0286CC_LINEAR_CENTER_ENA(input_ena) && !G_0286CC_LINEAR_CENTROID_ENA(input_ena)));
1876
1877 /* color_two_side always enables FRONT_FACE. Since st/mesa disables two-side colors if the back
1878 * face is culled, the only case when both color_two_side and force_front_face_input can be set
1879 * is when the front face is culled (which means force_front_face_input == -1).
1880 */
1881 assert(!shader->key.ps.opt.force_front_face_input || !G_0286CC_FRONT_FACE_ENA(input_ena) ||
1882 (shader->key.ps.part.prolog.color_two_side &&
1883 shader->key.ps.opt.force_front_face_input == -1));
1884
1885 /* Validate cases when the optimizations are off (read as implications). */
1886 assert(shader->key.ps.part.prolog.bc_optimize_for_persp ||
1887 !G_0286CC_PERSP_CENTER_ENA(input_ena) || !G_0286CC_PERSP_CENTROID_ENA(input_ena));
1888 assert(shader->key.ps.part.prolog.bc_optimize_for_linear ||
1889 !G_0286CC_LINEAR_CENTER_ENA(input_ena) || !G_0286CC_LINEAR_CENTROID_ENA(input_ena));
1890
1891 /* DB_SHADER_CONTROL */
1892 shader->ps.db_shader_control = S_02880C_Z_EXPORT_ENABLE(info->writes_z) |
1893 S_02880C_STENCIL_TEST_VAL_EXPORT_ENABLE(info->writes_stencil) |
1894 S_02880C_MASK_EXPORT_ENABLE(shader->ps.writes_samplemask) |
1895 S_02880C_KILL_ENABLE(si_shader_uses_discard(shader));
1896
1897 switch (info->base.fs.depth_layout) {
1898 case FRAG_DEPTH_LAYOUT_GREATER:
1899 shader->ps.db_shader_control |= S_02880C_CONSERVATIVE_Z_EXPORT(V_02880C_EXPORT_GREATER_THAN_Z);
1900 break;
1901 case FRAG_DEPTH_LAYOUT_LESS:
1902 shader->ps.db_shader_control |= S_02880C_CONSERVATIVE_Z_EXPORT(V_02880C_EXPORT_LESS_THAN_Z);
1903 break;
1904 default:;
1905 }
1906
1907 /* Z_ORDER, EXEC_ON_HIER_FAIL and EXEC_ON_NOOP should be set as following:
1908 *
1909 * | early Z/S | writes_mem | allow_ReZ? | Z_ORDER | EXEC_ON_HIER_FAIL | EXEC_ON_NOOP
1910 * --|-----------|------------|------------|--------------------|-------------------|-------------
1911 * 1a| false | false | true | EarlyZ_Then_ReZ | 0 | 0
1912 * 1b| false | false | false | EarlyZ_Then_LateZ | 0 | 0
1913 * 2 | false | true | n/a | LateZ | 1 | 0
1914 * 3 | true | false | n/a | EarlyZ_Then_LateZ | 0 | 0
1915 * 4 | true | true | n/a | EarlyZ_Then_LateZ | 0 | 1
1916 *
1917 * In cases 3 and 4, HW will force Z_ORDER to EarlyZ regardless of what's set in the register.
1918 * In case 2, NOOP_CULL is a don't care field. In case 2, 3 and 4, ReZ doesn't make sense.
1919 *
1920 * Don't use ReZ without profiling !!!
1921 *
1922 * ReZ decreases performance by 15% in DiRT: Showdown on Ultra settings, which has pretty complex
1923 * shaders.
1924 */
1925 if (info->base.fs.early_fragment_tests) {
1926 /* Cases 3, 4. */
1927 shader->ps.db_shader_control |= S_02880C_DEPTH_BEFORE_SHADER(1) |
1928 S_02880C_Z_ORDER(V_02880C_EARLY_Z_THEN_LATE_Z) |
1929 S_02880C_EXEC_ON_NOOP(info->base.writes_memory);
1930 } else if (info->base.writes_memory) {
1931 /* Case 2. */
1932 shader->ps.db_shader_control |= S_02880C_Z_ORDER(V_02880C_LATE_Z) |
1933 S_02880C_EXEC_ON_HIER_FAIL(1);
1934 } else {
1935 /* Case 1. */
1936 shader->ps.db_shader_control |= S_02880C_Z_ORDER(V_02880C_EARLY_Z_THEN_LATE_Z);
1937 }
1938
1939 if (info->base.fs.post_depth_coverage)
1940 shader->ps.db_shader_control |= S_02880C_PRE_SHADER_DEPTH_COVERAGE_ENABLE(1);
1941
1942 /* Bug workaround for smoothing (overrasterization) on GFX6. */
1943 if (sscreen->info.gfx_level == GFX6 && shader->key.ps.mono.poly_line_smoothing) {
1944 shader->ps.db_shader_control &= C_02880C_Z_ORDER;
1945 shader->ps.db_shader_control |= S_02880C_Z_ORDER(V_02880C_LATE_Z);
1946 }
1947
1948 if (sscreen->info.has_rbplus && !sscreen->info.rbplus_allowed)
1949 shader->ps.db_shader_control |= S_02880C_DUAL_QUAD_DISABLE(1);
1950
1951 /* SPI_BARYC_CNTL.POS_FLOAT_LOCATION
1952 * Possible values:
1953 * 0 -> Position = pixel center
1954 * 1 -> Position = pixel centroid
1955 * 2 -> Position = at sample position
1956 *
1957 * From GLSL 4.5 specification, section 7.1:
1958 * "The variable gl_FragCoord is available as an input variable from
1959 * within fragment shaders and it holds the window relative coordinates
1960 * (x, y, z, 1/w) values for the fragment. If multi-sampling, this
1961 * value can be for any location within the pixel, or one of the
1962 * fragment samples. The use of centroid does not further restrict
1963 * this value to be inside the current primitive."
1964 *
1965 * Meaning that centroid has no effect and we can return anything within
1966 * the pixel. Thus, return the value at sample position, because that's
1967 * the most accurate one shaders can get.
1968 */
1969 shader->ps.spi_baryc_cntl = S_0286E0_POS_FLOAT_LOCATION(2) |
1970 S_0286E0_POS_FLOAT_ULC(info->base.fs.pixel_center_integer) |
1971 S_0286E0_FRONT_FACE_ALL_BITS(1);
1972 shader->ps.spi_shader_col_format = si_get_spi_shader_col_format(shader);
1973 shader->ps.cb_shader_mask = ac_get_cb_shader_mask(shader->key.ps.part.epilog.spi_shader_col_format);
1974 shader->ps.spi_ps_input_ena = shader->config.spi_ps_input_ena;
1975 shader->ps.spi_ps_input_addr = shader->config.spi_ps_input_addr;
1976 shader->ps.num_interp = si_get_ps_num_interp(shader);
1977 shader->ps.spi_shader_z_format =
1978 ac_get_spi_shader_z_format(info->writes_z, info->writes_stencil, shader->ps.writes_samplemask,
1979 shader->key.ps.part.epilog.alpha_to_coverage_via_mrtz);
1980
1981 /* Ensure that some export memory is always allocated, for two reasons:
1982 *
1983 * 1) Correctness: The hardware ignores the EXEC mask if no export
1984 * memory is allocated, so KILL and alpha test do not work correctly
1985 * without this.
1986 * 2) Performance: Every shader needs at least a NULL export, even when
1987 * it writes no color/depth output. The NULL export instruction
1988 * stalls without this setting.
1989 *
1990 * Don't add this to CB_SHADER_MASK.
1991 *
1992 * GFX10 supports pixel shaders without exports by setting both
1993 * the color and Z formats to SPI_SHADER_ZERO. The hw will skip export
1994 * instructions if any are present.
1995 *
1996 * RB+ depth-only rendering requires SPI_SHADER_32_R.
1997 */
1998 bool has_mrtz = info->writes_z || info->writes_stencil || shader->ps.writes_samplemask;
1999
2000 if (!shader->ps.spi_shader_col_format) {
2001 if (shader->key.ps.part.epilog.rbplus_depth_only_opt) {
2002 shader->ps.spi_shader_col_format = V_028714_SPI_SHADER_32_R;
2003 } else if (!has_mrtz) {
2004 if (sscreen->info.gfx_level >= GFX10) {
2005 if (G_02880C_KILL_ENABLE(shader->ps.db_shader_control))
2006 shader->ps.spi_shader_col_format = V_028714_SPI_SHADER_32_R;
2007 } else {
2008 shader->ps.spi_shader_col_format = V_028714_SPI_SHADER_32_R;
2009 }
2010 }
2011 }
2012
2013 /* Enable PARAM_GEN for point smoothing.
2014 * Gfx11 workaround when there are no PS inputs but LDS is used.
2015 */
2016 bool param_gen = shader->key.ps.mono.point_smoothing ||
2017 (sscreen->info.gfx_level == GFX11 && !shader->ps.num_interp &&
2018 shader->config.lds_size);
2019
2020 shader->ps.spi_ps_in_control = S_0286D8_NUM_INTERP(shader->ps.num_interp) |
2021 S_0286D8_PARAM_GEN(param_gen) |
2022 S_0286D8_PS_W32_EN(shader->wave_size == 32);
2023
2024 struct si_pm4_state *pm4 = si_get_shader_pm4_state(shader, NULL);
2025 if (!pm4)
2026 return;
2027
2028 if (sscreen->info.has_set_context_pairs_packed)
2029 pm4->atom.emit = gfx11_dgpu_emit_shader_ps;
2030 else
2031 pm4->atom.emit = gfx6_emit_shader_ps;
2032
2033 /* If multiple state sets are allowed to be in a bin, break the batch on a new PS. */
2034 if (sscreen->dpbb_allowed &&
2035 (sscreen->pbb_context_states_per_bin > 1 ||
2036 sscreen->pbb_persistent_states_per_bin > 1)) {
2037 si_pm4_cmd_add(pm4, PKT3(PKT3_EVENT_WRITE, 0, 0));
2038 si_pm4_cmd_add(pm4, EVENT_TYPE(V_028A90_BREAK_BATCH) | EVENT_INDEX(0));
2039 }
2040
2041 if (sscreen->info.gfx_level >= GFX11) {
2042 unsigned cu_mask_ps = gfx103_get_cu_mask_ps(sscreen);
2043
2044 si_pm4_set_reg_idx3(pm4, R_00B004_SPI_SHADER_PGM_RSRC4_PS,
2045 ac_apply_cu_en(S_00B004_CU_EN(cu_mask_ps >> 16) |
2046 S_00B004_INST_PREF_SIZE(si_get_shader_prefetch_size(shader)),
2047 C_00B004_CU_EN, 16, &sscreen->info));
2048 }
2049
2050 uint64_t va = shader->bo->gpu_address;
2051 si_pm4_set_reg(pm4, R_00B020_SPI_SHADER_PGM_LO_PS, va >> 8);
2052 si_pm4_set_reg(pm4, R_00B024_SPI_SHADER_PGM_HI_PS,
2053 S_00B024_MEM_BASE(sscreen->info.address32_hi >> 8));
2054
2055 si_pm4_set_reg(pm4, R_00B028_SPI_SHADER_PGM_RSRC1_PS,
2056 S_00B028_VGPRS(si_shader_encode_vgprs(shader)) |
2057 S_00B028_SGPRS(si_shader_encode_sgprs(shader)) |
2058 S_00B028_DX10_CLAMP(1) |
2059 S_00B028_MEM_ORDERED(si_shader_mem_ordered(shader)) |
2060 S_00B028_FLOAT_MODE(shader->config.float_mode));
2061 si_pm4_set_reg(pm4, R_00B02C_SPI_SHADER_PGM_RSRC2_PS,
2062 S_00B02C_EXTRA_LDS_SIZE(shader->config.lds_size) |
2063 S_00B02C_USER_SGPR(SI_PS_NUM_USER_SGPR) |
2064 S_00B32C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0));
2065 si_pm4_finalize(pm4);
2066 }
2067
2068 static void si_shader_init_pm4_state(struct si_screen *sscreen, struct si_shader *shader)
2069 {
2070 assert(shader->wave_size);
2071
2072 switch (shader->selector->stage) {
2073 case MESA_SHADER_VERTEX:
2074 if (shader->key.ge.as_ls)
2075 si_shader_ls(sscreen, shader);
2076 else if (shader->key.ge.as_es)
2077 si_shader_es(sscreen, shader);
2078 else if (shader->key.ge.as_ngg)
2079 gfx10_shader_ngg(sscreen, shader);
2080 else
2081 si_shader_vs(sscreen, shader, NULL);
2082 break;
2083 case MESA_SHADER_TESS_CTRL:
2084 si_shader_hs(sscreen, shader);
2085 break;
2086 case MESA_SHADER_TESS_EVAL:
2087 if (shader->key.ge.as_es)
2088 si_shader_es(sscreen, shader);
2089 else if (shader->key.ge.as_ngg)
2090 gfx10_shader_ngg(sscreen, shader);
2091 else
2092 si_shader_vs(sscreen, shader, NULL);
2093 break;
2094 case MESA_SHADER_GEOMETRY:
2095 if (shader->key.ge.as_ngg) {
2096 gfx10_shader_ngg(sscreen, shader);
2097 } else {
2098 /* VS must be initialized first because GS uses its fields. */
2099 si_shader_vs(sscreen, shader->gs_copy_shader, shader->selector);
2100 si_shader_gs(sscreen, shader);
2101 }
2102 break;
2103 case MESA_SHADER_FRAGMENT:
2104 si_shader_ps(sscreen, shader);
2105 break;
2106 default:
2107 assert(0);
2108 }
2109
2110 assert(!(sscreen->debug_flags & DBG(SQTT)) || shader->pm4.spi_shader_pgm_lo_reg != 0);
2111 }
2112
2113 static void si_clear_vs_key_inputs(union si_shader_key *key)
2114 {
2115 key->ge.mono.instance_divisor_is_one = 0;
2116 key->ge.mono.instance_divisor_is_fetched = 0;
2117 key->ge.mono.vs_fetch_opencode = 0;
2118 memset(key->ge.mono.vs_fix_fetch, 0, sizeof(key->ge.mono.vs_fix_fetch));
2119 }
2120
2121 void si_vs_key_update_inputs(struct si_context *sctx)
2122 {
2123 struct si_shader_selector *vs = sctx->shader.vs.cso;
2124 struct si_vertex_elements *elts = sctx->vertex_elements;
2125 union si_shader_key *key = &sctx->shader.vs.key;
2126
2127 if (!vs)
2128 return;
2129
2130 if (vs->info.base.vs.blit_sgprs_amd) {
2131 si_clear_vs_key_inputs(key);
2132 key->ge.opt.prefer_mono = 0;
2133 sctx->uses_nontrivial_vs_inputs = false;
2134 return;
2135 }
2136
2137 bool uses_nontrivial_vs_inputs = false;
2138
2139 if (elts->instance_divisor_is_one || elts->instance_divisor_is_fetched)
2140 uses_nontrivial_vs_inputs = true;
2141
2142 key->ge.mono.instance_divisor_is_one = elts->instance_divisor_is_one;
2143 key->ge.mono.instance_divisor_is_fetched = elts->instance_divisor_is_fetched;
2144 key->ge.opt.prefer_mono = elts->instance_divisor_is_fetched;
2145
2146 unsigned count_mask = (1 << vs->info.num_inputs) - 1;
2147 unsigned fix = elts->fix_fetch_always & count_mask;
2148 unsigned opencode = elts->fix_fetch_opencode & count_mask;
2149
2150 if (sctx->vertex_buffer_unaligned & elts->vb_alignment_check_mask) {
2151 uint32_t mask = elts->fix_fetch_unaligned & count_mask;
2152 while (mask) {
2153 unsigned i = u_bit_scan(&mask);
2154 unsigned log_hw_load_size = 1 + ((elts->hw_load_is_dword >> i) & 1);
2155 unsigned vbidx = elts->vertex_buffer_index[i];
2156 struct pipe_vertex_buffer *vb = &sctx->vertex_buffer[vbidx];
2157 unsigned align_mask = (1 << log_hw_load_size) - 1;
2158 if (vb->buffer_offset & align_mask) {
2159 fix |= 1 << i;
2160 opencode |= 1 << i;
2161 }
2162 }
2163 }
2164
2165 memset(key->ge.mono.vs_fix_fetch, 0, sizeof(key->ge.mono.vs_fix_fetch));
2166
2167 while (fix) {
2168 unsigned i = u_bit_scan(&fix);
2169 uint8_t fix_fetch = elts->fix_fetch[i];
2170
2171 key->ge.mono.vs_fix_fetch[i].bits = fix_fetch;
2172 if (fix_fetch)
2173 uses_nontrivial_vs_inputs = true;
2174 }
2175 key->ge.mono.vs_fetch_opencode = opencode;
2176 if (opencode)
2177 uses_nontrivial_vs_inputs = true;
2178
2179 sctx->uses_nontrivial_vs_inputs = uses_nontrivial_vs_inputs;
2180
2181 /* draw_vertex_state (display lists) requires that all VS input lowering is disabled
2182 * because its vertex elements never need any lowering.
2183 *
2184 * We just computed the key because we needed to set uses_nontrivial_vs_inputs, so that we know
2185 * whether the VS should be updated when we switch from draw_vertex_state to draw_vbo. Now
2186 * clear the VS input bits for draw_vertex_state. This should happen rarely because VS inputs
2187 * don't usually need any lowering.
2188 */
2189 if (uses_nontrivial_vs_inputs && sctx->force_trivial_vs_inputs)
2190 si_clear_vs_key_inputs(key);
2191 }
2192
2193 static void si_get_vs_key_inputs(struct si_context *sctx, union si_shader_key *key)
2194 {
2195 key->ge.mono.instance_divisor_is_one = sctx->shader.vs.key.ge.mono.instance_divisor_is_one;
2196 key->ge.mono.instance_divisor_is_fetched = sctx->shader.vs.key.ge.mono.instance_divisor_is_fetched;
2197 key->ge.mono.vs_fetch_opencode = sctx->shader.vs.key.ge.mono.vs_fetch_opencode;
2198 memcpy(key->ge.mono.vs_fix_fetch, sctx->shader.vs.key.ge.mono.vs_fix_fetch,
2199 sizeof(key->ge.mono.vs_fix_fetch));
2200 }
2201
2202 void si_update_ps_inputs_read_or_disabled(struct si_context *sctx)
2203 {
2204 struct si_shader_selector *ps = sctx->shader.ps.cso;
2205
2206 /* Find out if PS is disabled. */
2207 bool ps_disabled = true;
2208 if (ps) {
2209 bool ps_modifies_zs = ps->info.base.fs.uses_discard ||
2210 ps->info.writes_z ||
2211 ps->info.writes_stencil ||
2212 ps->info.writes_samplemask ||
2213 sctx->queued.named.blend->alpha_to_coverage ||
2214 sctx->queued.named.dsa->alpha_func != PIPE_FUNC_ALWAYS ||
2215 sctx->queued.named.rasterizer->poly_stipple_enable ||
2216 sctx->queued.named.rasterizer->point_smooth;
2217
2218 ps_disabled = sctx->queued.named.rasterizer->rasterizer_discard ||
2219 (!ps_modifies_zs && !ps->info.base.writes_memory &&
2220 !si_any_colorbuffer_written(sctx));
2221 }
2222
2223 uint64_t ps_inputs_read_or_disabled;
2224
2225 if (ps_disabled) {
2226 ps_inputs_read_or_disabled = 0;
2227 } else {
2228 uint64_t inputs_read = ps->info.inputs_read;
2229
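/* With two-sided lighting, a PS that reads COL0/COL1 also needs the back-face colors
 * from the previous stage. */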
2230 if (ps->info.colors_read && sctx->queued.named.rasterizer->two_side) {
2231 if (inputs_read & BITFIELD64_BIT(SI_UNIQUE_SLOT_COL0))
2232 inputs_read |= BITFIELD64_BIT(SI_UNIQUE_SLOT_BFC0);
2233
2234 if (inputs_read & BITFIELD64_BIT(SI_UNIQUE_SLOT_COL1))
2235 inputs_read |= BITFIELD64_BIT(SI_UNIQUE_SLOT_BFC1);
2236 }
2237
2238 ps_inputs_read_or_disabled = inputs_read;
2239 }
2240
2241 if (sctx->ps_inputs_read_or_disabled != ps_inputs_read_or_disabled) {
2242 sctx->ps_inputs_read_or_disabled = ps_inputs_read_or_disabled;
2243 sctx->do_update_shaders = true;
2244 }
2245 }
2246
2247 void si_vs_ps_key_update_rast_prim_smooth_stipple(struct si_context *sctx)
2248 {
2249 struct si_shader_ctx_state *hw_vs = si_get_vs(sctx);
2250 struct si_shader_selector *ps = sctx->shader.ps.cso;
2251
2252 if (!hw_vs->cso || !ps)
2253 return;
2254
2255 struct si_state_rasterizer *rs = sctx->queued.named.rasterizer;
2256 union si_shader_key *vs_key = &hw_vs->key; /* could also be TES or GS before PS */
2257 union si_shader_key *ps_key = &sctx->shader.ps.key;
2258
2259 bool old_kill_pointsize = vs_key->ge.opt.kill_pointsize;
2260 bool old_color_two_side = ps_key->ps.part.prolog.color_two_side;
2261 bool old_poly_stipple = ps_key->ps.part.prolog.poly_stipple;
2262 bool old_poly_line_smoothing = ps_key->ps.mono.poly_line_smoothing;
2263 bool old_point_smoothing = ps_key->ps.mono.point_smoothing;
2264 int old_force_front_face_input = ps_key->ps.opt.force_front_face_input;
2265
2266 if (sctx->current_rast_prim == MESA_PRIM_POINTS) {
2267 vs_key->ge.opt.kill_pointsize = 0;
2268 ps_key->ps.part.prolog.color_two_side = 0;
2269 ps_key->ps.part.prolog.poly_stipple = 0;
2270 ps_key->ps.mono.poly_line_smoothing = 0;
2271 ps_key->ps.mono.point_smoothing = rs->point_smooth;
2272 ps_key->ps.opt.force_front_face_input = ps->info.uses_frontface;
2273 } else if (util_prim_is_lines(sctx->current_rast_prim)) {
2274 vs_key->ge.opt.kill_pointsize = hw_vs->cso->info.writes_psize;
2275 ps_key->ps.part.prolog.color_two_side = 0;
2276 ps_key->ps.part.prolog.poly_stipple = 0;
2277 ps_key->ps.mono.poly_line_smoothing = rs->line_smooth && sctx->framebuffer.nr_samples <= 1;
2278 ps_key->ps.mono.point_smoothing = 0;
2279 ps_key->ps.opt.force_front_face_input = ps->info.uses_frontface;
2280 } else {
2281 /* Triangles. */
2282 vs_key->ge.opt.kill_pointsize = hw_vs->cso->info.writes_psize &&
2283 !rs->polygon_mode_is_points;
2284 ps_key->ps.part.prolog.color_two_side = rs->two_side && ps->info.colors_read;
2285 ps_key->ps.part.prolog.poly_stipple = rs->poly_stipple_enable;
2286 ps_key->ps.mono.poly_line_smoothing = rs->poly_smooth && sctx->framebuffer.nr_samples <= 1;
2287 ps_key->ps.mono.point_smoothing = 0;
2288 ps_key->ps.opt.force_front_face_input = rs->force_front_face_input &&
2289 ps->info.uses_frontface;
2290 }
2291
2292 if (vs_key->ge.opt.kill_pointsize != old_kill_pointsize ||
2293 ps_key->ps.part.prolog.color_two_side != old_color_two_side ||
2294 ps_key->ps.part.prolog.poly_stipple != old_poly_stipple ||
2295 ps_key->ps.mono.poly_line_smoothing != old_poly_line_smoothing ||
2296 ps_key->ps.mono.point_smoothing != old_point_smoothing ||
2297 ps_key->ps.opt.force_front_face_input != old_force_front_face_input)
2298 sctx->do_update_shaders = true;
2299 }
2300
2301 static void si_get_vs_key_outputs(struct si_context *sctx, struct si_shader_selector *vs,
2302 union si_shader_key *key)
2303 {
2304 key->ge.opt.kill_clip_distances = vs->info.clipdist_mask & ~sctx->queued.named.rasterizer->clip_plane_enable;
2305
2306 /* Find out which VS outputs aren't used by the PS. */
2307 uint64_t outputs_written = vs->info.outputs_written_before_ps;
2308 uint64_t linked = outputs_written & sctx->ps_inputs_read_or_disabled;
2309
2310 key->ge.opt.kill_layer = vs->info.writes_layer &&
2311 sctx->framebuffer.state.layers <= 1;
2312 key->ge.opt.kill_outputs = ~linked & outputs_written;
2313 key->ge.opt.ngg_culling = sctx->ngg_culling;
2314 key->ge.mono.u.vs_export_prim_id = vs->stage != MESA_SHADER_GEOMETRY &&
2315 sctx->shader.ps.cso && sctx->shader.ps.cso->info.uses_primid;
2316 key->ge.opt.remove_streamout = vs->info.enabled_streamout_buffer_mask &&
2317 !sctx->streamout.enabled_mask;
2318 }
2319
2320 static void si_clear_vs_key_outputs(struct si_context *sctx, struct si_shader_selector *vs,
2321 union si_shader_key *key)
2322 {
2323 key->ge.opt.kill_clip_distances = 0;
2324 key->ge.opt.kill_outputs = 0;
2325 key->ge.opt.remove_streamout = 0;
2326 key->ge.opt.ngg_culling = 0;
2327 key->ge.mono.u.vs_export_prim_id = 0;
2328 }
2329
2330 void si_ps_key_update_framebuffer(struct si_context *sctx)
2331 {
2332 struct si_shader_selector *sel = sctx->shader.ps.cso;
2333 union si_shader_key *key = &sctx->shader.ps.key;
2334
2335 if (!sel)
2336 return;
2337
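/* gl_FragColor semantics: when only color output 0 is written and it's broadcast to all
 * color buffers, the epilog needs to know the index of the last bound color buffer. */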
2338 if (sel->info.color0_writes_all_cbufs &&
2339 sel->info.colors_written == 0x1)
2340 key->ps.part.epilog.last_cbuf = MAX2(sctx->framebuffer.state.nr_cbufs, 1) - 1;
2341 else
2342 key->ps.part.epilog.last_cbuf = 0;
2343
2344 /* ps_uses_fbfetch is true only if the color buffer is bound. */
2345 if (sctx->ps_uses_fbfetch) {
2346 struct pipe_surface *cb0 = sctx->framebuffer.state.cbufs[0];
2347 struct pipe_resource *tex = cb0->texture;
2348
2349 /* 1D textures are allocated and used as 2D on GFX9. */
2350 key->ps.mono.fbfetch_msaa = sctx->framebuffer.nr_samples > 1;
2351 key->ps.mono.fbfetch_is_1D =
2352 sctx->gfx_level != GFX9 &&
2353 (tex->target == PIPE_TEXTURE_1D || tex->target == PIPE_TEXTURE_1D_ARRAY);
2354 key->ps.mono.fbfetch_layered =
2355 tex->target == PIPE_TEXTURE_1D_ARRAY || tex->target == PIPE_TEXTURE_2D_ARRAY ||
2356 tex->target == PIPE_TEXTURE_CUBE || tex->target == PIPE_TEXTURE_CUBE_ARRAY ||
2357 tex->target == PIPE_TEXTURE_3D;
2358 } else {
2359 key->ps.mono.fbfetch_msaa = 0;
2360 key->ps.mono.fbfetch_is_1D = 0;
2361 key->ps.mono.fbfetch_layered = 0;
2362 }
2363 }
2364
2365 void si_ps_key_update_framebuffer_blend_rasterizer(struct si_context *sctx)
2366 {
2367 struct si_shader_selector *sel = sctx->shader.ps.cso;
2368 if (!sel)
2369 return;
2370
2371 union si_shader_key *key = &sctx->shader.ps.key;
2372 struct si_state_blend *blend = sctx->queued.named.blend;
2373 struct si_state_rasterizer *rs = sctx->queued.named.rasterizer;
2374 bool alpha_to_coverage = blend->alpha_to_coverage && rs->multisample_enable &&
2375 sctx->framebuffer.nr_samples >= 2;
2376 unsigned need_src_alpha_4bit = blend->need_src_alpha_4bit;
2377
2378 /* Old key data for comparison. */
2379 struct si_ps_epilog_bits old_epilog;
2380 memcpy(&old_epilog, &key->ps.part.epilog, sizeof(old_epilog));
2381 bool old_prefer_mono = key->ps.opt.prefer_mono;
2382 #ifndef NDEBUG
2383 struct si_shader_key_ps old_key;
2384 memcpy(&old_key, &key->ps, sizeof(old_key));
2385 #endif
2386
2387 key->ps.part.epilog.alpha_to_one = blend->alpha_to_one && rs->multisample_enable;
2388 key->ps.part.epilog.alpha_to_coverage_via_mrtz =
2389 sctx->gfx_level >= GFX11 && alpha_to_coverage &&
2390 (sel->info.writes_z || sel->info.writes_stencil || sel->info.writes_samplemask);
2391
2392 /* Remove the gl_SampleMask fragment shader output if MSAA is disabled.
2393 * This is required for correctness and it's also an optimization.
2394 */
2395 key->ps.part.epilog.kill_samplemask = sel->info.writes_samplemask &&
2396 (sctx->framebuffer.nr_samples <= 1 ||
2397 !rs->multisample_enable);
2398
2399 /* If alpha-to-coverage isn't exported via MRTZ, set that we need to export alpha
2400 * through MRT0.
2401 */
2402 if (alpha_to_coverage && !key->ps.part.epilog.alpha_to_coverage_via_mrtz)
2403 need_src_alpha_4bit |= 0xf;
2404
2405 /* Select the shader color format based on whether
2406 * blending or alpha are needed.
2407 */
2408 key->ps.part.epilog.spi_shader_col_format =
2409 (blend->blend_enable_4bit & need_src_alpha_4bit &
2410 sctx->framebuffer.spi_shader_col_format_blend_alpha) |
2411 (blend->blend_enable_4bit & ~need_src_alpha_4bit &
2412 sctx->framebuffer.spi_shader_col_format_blend) |
2413 (~blend->blend_enable_4bit & need_src_alpha_4bit &
2414 sctx->framebuffer.spi_shader_col_format_alpha) |
2415 (~blend->blend_enable_4bit & ~need_src_alpha_4bit &
2416 sctx->framebuffer.spi_shader_col_format);
2417 key->ps.part.epilog.spi_shader_col_format &= blend->cb_target_enabled_4bit;
2418
2419 key->ps.part.epilog.dual_src_blend_swizzle = sctx->gfx_level >= GFX11 &&
2420 blend->dual_src_blend &&
2421 (sel->info.colors_written_4bit & 0xff) == 0xff;
2422
2423 /* The output for dual source blending should have
2424 * the same format as the first output.
2425 */
2426 if (blend->dual_src_blend) {
2427 key->ps.part.epilog.spi_shader_col_format |=
2428 (key->ps.part.epilog.spi_shader_col_format & 0xf) << 4;
2429 }
2430
2431 /* If alpha-to-coverage is enabled, we have to export alpha
2432 * even if there is no color buffer.
2433 *
2434 * Gfx11 exports alpha-to-coverage via MRTZ if MRTZ is present.
2435 */
2436 if (!(key->ps.part.epilog.spi_shader_col_format & 0xf) && alpha_to_coverage &&
2437 !key->ps.part.epilog.alpha_to_coverage_via_mrtz)
2438 key->ps.part.epilog.spi_shader_col_format |= V_028710_SPI_SHADER_32_AR;
2439
2440 /* On GFX6 and GFX7 except Hawaii, the CB doesn't clamp outputs
2441 * to the range supported by the type if a channel has less
2442 * than 16 bits and the export format is 16_ABGR.
2443 */
2444 if (sctx->gfx_level <= GFX7 && sctx->family != CHIP_HAWAII) {
2445 key->ps.part.epilog.color_is_int8 = sctx->framebuffer.color_is_int8;
2446 key->ps.part.epilog.color_is_int10 = sctx->framebuffer.color_is_int10;
2447 }
2448
2449 /* Disable unwritten outputs (if WRITE_ALL_CBUFS isn't enabled). */
2450 if (!key->ps.part.epilog.last_cbuf) {
2451 key->ps.part.epilog.spi_shader_col_format &= sel->info.colors_written_4bit;
2452 key->ps.part.epilog.color_is_int8 &= sel->info.colors_written;
2453 key->ps.part.epilog.color_is_int10 &= sel->info.colors_written;
2454 }
2455
2456 /* Enable RB+ for depth-only rendering. Registers must be programmed as follows:
2457 * CB_COLOR_CONTROL.MODE = CB_DISABLE
2458 * CB_COLOR0_INFO.FORMAT = COLOR_32
2459 * CB_COLOR0_INFO.NUMBER_TYPE = NUMBER_FLOAT
2460 * SPI_SHADER_COL_FORMAT.COL0_EXPORT_FORMAT = SPI_SHADER_32_R
2461 * SX_PS_DOWNCONVERT.MRT0 = SX_RT_EXPORT_32_R
2462 *
2463 * Also, the following conditions must be met.
2464 */
2465 key->ps.part.epilog.rbplus_depth_only_opt =
2466 sctx->screen->info.rbplus_allowed &&
2467 blend->cb_target_enabled_4bit == 0 && /* implies CB_DISABLE */
2468 !alpha_to_coverage &&
2469 !sel->info.base.writes_memory &&
2470 !key->ps.part.epilog.spi_shader_col_format;
2471
2472 /* Eliminate shader code computing output values that are unused.
2473 * This enables dead code elimination between shader parts.
2474 * Check if any output is eliminated.
2475 *
2476 * Dual source blending never has color buffer 1 enabled, so ignore it.
2477 *
2478 * On gfx11, pixel shaders that write memory should be compiled with an inlined epilog,
2479 * so that the compiler can see s_endpgm and deallocates VGPRs before memory stores return.
2480 */
2481 if (sel->info.colors_written_4bit &
2482 (blend->dual_src_blend ? 0xffffff0f : 0xffffffff) &
2483 ~(sctx->framebuffer.colorbuf_enabled_4bit & blend->cb_target_enabled_4bit))
2484 key->ps.opt.prefer_mono = 1;
2485 else if (sctx->gfx_level >= GFX11 && sel->info.base.writes_memory)
2486 key->ps.opt.prefer_mono = 1;
2487 else
2488 key->ps.opt.prefer_mono = 0;
2489
2490 /* Update shaders only if the key changed. */
2491 if (memcmp(&key->ps.part.epilog, &old_epilog, sizeof(old_epilog)) ||
2492 key->ps.opt.prefer_mono != old_prefer_mono) {
2493 sctx->do_update_shaders = true;
2494 } else {
2495 assert(memcmp(&key->ps, &old_key, sizeof(old_key)) == 0);
2496 }
2497 }
2498
2499 void si_ps_key_update_rasterizer(struct si_context *sctx)
2500 {
2501 struct si_shader_selector *sel = sctx->shader.ps.cso;
2502 union si_shader_key *key = &sctx->shader.ps.key;
2503 struct si_state_rasterizer *rs = sctx->queued.named.rasterizer;
2504
2505 if (!sel)
2506 return;
2507
2508 bool old_flatshade_colors = key->ps.part.prolog.flatshade_colors;
2509 bool old_clamp_color = key->ps.part.epilog.clamp_color;
2510
2511 key->ps.part.prolog.flatshade_colors = rs->flatshade && sel->info.uses_interp_color;
2512 key->ps.part.epilog.clamp_color = rs->clamp_fragment_color;
2513
2514 if (key->ps.part.prolog.flatshade_colors != old_flatshade_colors ||
2515 key->ps.part.epilog.clamp_color != old_clamp_color)
2516 sctx->do_update_shaders = true;
2517 }
2518
2519 void si_ps_key_update_dsa(struct si_context *sctx)
2520 {
2521 union si_shader_key *key = &sctx->shader.ps.key;
2522
2523 key->ps.part.epilog.alpha_func = sctx->queued.named.dsa->alpha_func;
2524 }
2525
2526 void si_ps_key_update_sample_shading(struct si_context *sctx)
2527 {
2528 struct si_shader_selector *sel = sctx->shader.ps.cso;
2529 union si_shader_key *key = &sctx->shader.ps.key;
2530
2531 if (!sel)
2532 return;
2533
2534 if (sctx->ps_iter_samples > 1 && sel->info.reads_samplemask)
2535 key->ps.part.prolog.samplemask_log_ps_iter = util_logbase2(sctx->ps_iter_samples);
2536 else
2537 key->ps.part.prolog.samplemask_log_ps_iter = 0;
2538 }
2539
2540 void si_ps_key_update_framebuffer_rasterizer_sample_shading(struct si_context *sctx)
2541 {
2542 struct si_shader_selector *sel = sctx->shader.ps.cso;
2543 union si_shader_key *key = &sctx->shader.ps.key;
2544 struct si_state_rasterizer *rs = sctx->queued.named.rasterizer;
2545
2546 if (!sel)
2547 return;
2548
2549 /* Old key data for comparison. */
2550 struct si_ps_prolog_bits old_prolog;
2551 memcpy(&old_prolog, &key->ps.part.prolog, sizeof(old_prolog));
2552 bool old_interpolate_at_sample_force_center = key->ps.mono.interpolate_at_sample_force_center;
2553
2554 bool uses_persp_center = sel->info.uses_persp_center ||
2555 (!rs->flatshade && sel->info.uses_persp_center_color);
2556 bool uses_persp_centroid = sel->info.uses_persp_centroid ||
2557 (!rs->flatshade && sel->info.uses_persp_centroid_color);
2558 bool uses_persp_sample = sel->info.uses_persp_sample ||
2559 (!rs->flatshade && sel->info.uses_persp_sample_color);
2560
2561 if (rs->force_persample_interp && rs->multisample_enable &&
2562 sctx->framebuffer.nr_samples > 1 && sctx->ps_iter_samples > 1) {
2563 key->ps.part.prolog.force_persp_sample_interp =
2564 uses_persp_center || uses_persp_centroid;
2565
2566 key->ps.part.prolog.force_linear_sample_interp =
2567 sel->info.uses_linear_center || sel->info.uses_linear_centroid;
2568
2569 key->ps.part.prolog.force_persp_center_interp = 0;
2570 key->ps.part.prolog.force_linear_center_interp = 0;
2571 key->ps.part.prolog.bc_optimize_for_persp = 0;
2572 key->ps.part.prolog.bc_optimize_for_linear = 0;
2573 key->ps.mono.interpolate_at_sample_force_center = 0;
2574 } else if (rs->multisample_enable && sctx->framebuffer.nr_samples > 1) {
2575 key->ps.part.prolog.force_persp_sample_interp = 0;
2576 key->ps.part.prolog.force_linear_sample_interp = 0;
2577 key->ps.part.prolog.force_persp_center_interp = 0;
2578 key->ps.part.prolog.force_linear_center_interp = 0;
2579 key->ps.part.prolog.bc_optimize_for_persp =
2580 uses_persp_center && uses_persp_centroid;
2581 key->ps.part.prolog.bc_optimize_for_linear =
2582 sel->info.uses_linear_center && sel->info.uses_linear_centroid;
2583 key->ps.mono.interpolate_at_sample_force_center = 0;
2584 } else {
2585 key->ps.part.prolog.force_persp_sample_interp = 0;
2586 key->ps.part.prolog.force_linear_sample_interp = 0;
2587
2588 /* Make sure SPI doesn't compute more than 1 pair
2589 * of (i,j), which is the optimization here. */
2590 key->ps.part.prolog.force_persp_center_interp = uses_persp_center +
2591 uses_persp_centroid +
2592 uses_persp_sample > 1;
2593
2594 key->ps.part.prolog.force_linear_center_interp = sel->info.uses_linear_center +
2595 sel->info.uses_linear_centroid +
2596 sel->info.uses_linear_sample > 1;
2597 key->ps.part.prolog.bc_optimize_for_persp = 0;
2598 key->ps.part.prolog.bc_optimize_for_linear = 0;
2599 key->ps.mono.interpolate_at_sample_force_center = sel->info.uses_interp_at_sample;
2600 }
2601
2602 /* Update shaders only if the key changed. */
2603 if (memcmp(&key->ps.part.prolog, &old_prolog, sizeof(old_prolog)) ||
2604 key->ps.mono.interpolate_at_sample_force_center != old_interpolate_at_sample_force_center)
2605 sctx->do_update_shaders = true;
2606 }
2607
2608 /* Compute the key for the hw shader variant */
2609 static inline void si_shader_selector_key(struct pipe_context *ctx, struct si_shader_selector *sel,
2610 union si_shader_key *key)
2611 {
2612 struct si_context *sctx = (struct si_context *)ctx;
2613
2614 switch (sel->stage) {
2615 case MESA_SHADER_VERTEX:
2616 if (!sctx->shader.tes.cso && !sctx->shader.gs.cso)
2617 si_get_vs_key_outputs(sctx, sel, key);
2618 else
2619 si_clear_vs_key_outputs(sctx, sel, key);
2620 break;
2621 case MESA_SHADER_TESS_CTRL:
2622 if (sctx->gfx_level >= GFX9) {
2623 si_get_vs_key_inputs(sctx, key);
2624 key->ge.part.tcs.ls = sctx->shader.vs.cso;
2625 }
2626 break;
2627 case MESA_SHADER_TESS_EVAL:
2628 if (!sctx->shader.gs.cso)
2629 si_get_vs_key_outputs(sctx, sel, key);
2630 else
2631 si_clear_vs_key_outputs(sctx, sel, key);
2632 break;
2633 case MESA_SHADER_GEOMETRY:
2634 if (sctx->gfx_level >= GFX9) {
2635 if (sctx->shader.tes.cso) {
2636 si_clear_vs_key_inputs(key);
2637 key->ge.part.gs.es = sctx->shader.tes.cso;
2638 } else {
2639 si_get_vs_key_inputs(sctx, key);
2640 key->ge.part.gs.es = sctx->shader.vs.cso;
2641 }
2642
2643 /* Only NGG can eliminate GS outputs, because the code is shared with VS. */
2644 if (sctx->ngg)
2645 si_get_vs_key_outputs(sctx, sel, key);
2646 else
2647 si_clear_vs_key_outputs(sctx, sel, key);
2648 }
2649 break;
2650 case MESA_SHADER_FRAGMENT:
2651 break;
2652 default:
2653 assert(0);
2654 }
2655 }
2656
2657 static void si_build_shader_variant(struct si_shader *shader, int thread_index, bool low_priority)
2658 {
2659 struct si_shader_selector *sel = shader->selector;
2660 struct si_screen *sscreen = sel->screen;
2661 struct ac_llvm_compiler **compiler;
2662 struct util_debug_callback *debug = &shader->compiler_ctx_state.debug;
2663
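/* thread_index >= 0 means we were called from one of the screen's compiler queues, so use
 * that thread's per-screen compiler; otherwise use the compiler owned by the context. */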
2664 if (thread_index >= 0) {
2665 if (low_priority) {
2666 assert(thread_index < (int)ARRAY_SIZE(sscreen->compiler_lowp));
2667 compiler = &sscreen->compiler_lowp[thread_index];
2668 } else {
2669 assert(thread_index < (int)ARRAY_SIZE(sscreen->compiler));
2670 compiler = &sscreen->compiler[thread_index];
2671 }
2672 if (!debug->async)
2673 debug = NULL;
2674 } else {
2675 assert(!low_priority);
2676 compiler = &shader->compiler_ctx_state.compiler;
2677 }
2678
2679 if (!sscreen->use_aco && !*compiler)
2680 *compiler = si_create_llvm_compiler(sscreen);
2681
2682 if (unlikely(!si_create_shader_variant(sscreen, *compiler, shader, debug))) {
2683 PRINT_ERR("Failed to build shader variant (type=%u)\n", sel->stage);
2684 shader->compilation_failed = true;
2685 return;
2686 }
2687
2688 if (shader->compiler_ctx_state.is_debug_context) {
2689 FILE *f = open_memstream(&shader->shader_log, &shader->shader_log_size);
2690 if (f) {
2691 si_shader_dump(sscreen, shader, NULL, f, false);
2692 fclose(f);
2693 }
2694 }
2695
2696 si_shader_init_pm4_state(sscreen, shader);
2697 }
2698
2699 static void si_build_shader_variant_low_priority(void *job, void *gdata, int thread_index)
2700 {
2701 struct si_shader *shader = (struct si_shader *)job;
2702
2703 assert(thread_index >= 0);
2704
2705 si_build_shader_variant(shader, thread_index, true);
2706 }
2707
2708 /* This should be const, but C++ doesn't allow implicit zero-initialization with const. */
2709 static union si_shader_key zeroed;
2710
2711 static bool si_check_missing_main_part(struct si_screen *sscreen, struct si_shader_selector *sel,
2712 struct si_compiler_ctx_state *compiler_state,
2713 const union si_shader_key *key)
2714 {
2715 struct si_shader **mainp = si_get_main_shader_part(sel, key);
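/* si_get_main_shader_part picks the slot matching the key's as_ls/as_es/as_ngg bits
 * (main_shader_part, _ls, _es, _ngg, _ngg_es in si_shader_selector), so at most one
 * main part is compiled per variant kind of a selector.
 */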
2716
2717 if (!*mainp) {
2718 struct si_shader *main_part = CALLOC_STRUCT(si_shader);
2719
2720 if (!main_part)
2721 return false;
2722
2723 /* We can leave the fence as permanently signaled because the
2724 * main part becomes visible globally only after it has been
2725 * compiled. */
2726 util_queue_fence_init(&main_part->ready);
2727
2728 main_part->selector = sel;
2729 if (sel->stage <= MESA_SHADER_GEOMETRY) {
2730 main_part->key.ge.as_es = key->ge.as_es;
2731 main_part->key.ge.as_ls = key->ge.as_ls;
2732 main_part->key.ge.as_ngg = key->ge.as_ngg;
2733 }
2734 main_part->is_monolithic = false;
2735 main_part->wave_size = si_determine_wave_size(sscreen, main_part);
2736
2737 if (!si_compile_shader(sscreen, compiler_state->compiler, main_part,
2738 &compiler_state->debug)) {
2739 FREE(main_part);
2740 return false;
2741 }
2742 *mainp = main_part;
2743 }
2744 return true;
2745 }
2746
2747 /* A helper to copy *key to *local_key and return local_key. */
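/* Typical call pattern (illustrative; it mirrors the uses in si_shader_select_with_key):
 *
 *    key = use_local_key_copy(key, &local_key, key_size);
 *    memset(&local_key.opt, 0, key_opt_size);
 *
 * The first call copies the caller's key into the on-stack local_key and makes 'key'
 * point at it; subsequent calls are no-ops because key == &local_key already.
 */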
2748 template<typename SHADER_KEY_TYPE>
2749 static ALWAYS_INLINE const SHADER_KEY_TYPE *
2750 use_local_key_copy(const SHADER_KEY_TYPE *key, SHADER_KEY_TYPE *local_key, unsigned key_size)
2751 {
2752 if (key != local_key)
2753 memcpy(local_key, key, key_size);
2754
2755 return local_key;
2756 }
2757
2758 #define NO_INLINE_UNIFORMS false
2759
2760 /**
2761 * Select a shader variant according to the shader key.
2762 *
2763 * This uses a C++ template to compute the optimal memcmp size at compile time, which is important
2764 * for getting inlined memcmp. The memcmp size depends on the shader key type and whether inlined
2765 * uniforms are enabled.
2766 */
2767 template<bool INLINE_UNIFORMS = true, typename SHADER_KEY_TYPE>
2768 static int si_shader_select_with_key(struct si_context *sctx, struct si_shader_ctx_state *state,
2769 const SHADER_KEY_TYPE *key)
2770 {
2771 struct si_screen *sscreen = sctx->screen;
2772 struct si_shader_selector *sel = state->cso;
2773 struct si_shader_selector *previous_stage_sel = NULL;
2774 struct si_shader *current = state->current;
2775 struct si_shader *shader = NULL;
2776 const SHADER_KEY_TYPE *zeroed_key = (SHADER_KEY_TYPE*)&zeroed;
2777
2778 /* "opt" must be the last field and "inlined_uniform_values" must be the last field inside opt.
2779 * If there is padding, insert the padding manually before opt or inside opt.
2780 */
2781 STATIC_ASSERT(offsetof(SHADER_KEY_TYPE, opt) + sizeof(key->opt) == sizeof(*key));
2782 STATIC_ASSERT(offsetof(SHADER_KEY_TYPE, opt.inlined_uniform_values) +
2783 sizeof(key->opt.inlined_uniform_values) == sizeof(*key));
2784
2785 const unsigned key_size_no_uniforms = sizeof(*key) - sizeof(key->opt.inlined_uniform_values);
2786 /* Don't compare inlined_uniform_values if uniform inlining is disabled. */
2787 const unsigned key_size = INLINE_UNIFORMS ? sizeof(*key) : key_size_no_uniforms;
2788 const unsigned key_opt_size =
2789 INLINE_UNIFORMS ? sizeof(key->opt) :
2790 sizeof(key->opt) - sizeof(key->opt.inlined_uniform_values);
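/* Example (illustrative): with INLINE_UNIFORMS == false, the memcmps below stop right
 * before opt.inlined_uniform_values, so keys that differ only in inlined uniform values
 * compare equal and reuse the same variant.
 */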
2791
2792 /* si_shader_select_with_key must not modify 'key' because it would affect future shaders.
2793 * If we need to modify it for this specific shader (e.g. to disable optimizations), we
2794 * use a copy.
2795 */
2796 SHADER_KEY_TYPE local_key;
2797
2798 if (unlikely(sscreen->debug_flags & DBG(NO_OPT_VARIANT))) {
2799 /* Disable shader variant optimizations. */
2800 key = use_local_key_copy<SHADER_KEY_TYPE>(key, &local_key, key_size);
2801 memset(&local_key.opt, 0, key_opt_size);
2802 }
2803
2804 again:
2805 /* Check if we don't need to change anything.
2806 * This path is also used by most shaders that don't need multiple
2807 * variants; it costs just the key computation and this
2808 * test. */
2809 if (likely(current && memcmp(&current->key, key, key_size) == 0)) {
2810 if (unlikely(!util_queue_fence_is_signalled(&current->ready))) {
2811 if (current->is_optimized) {
2812 key = use_local_key_copy(key, &local_key, key_size);
2813 memset(&local_key.opt, 0, key_opt_size);
2814 goto current_not_ready;
2815 }
2816
2817 util_queue_fence_wait(&current->ready);
2818 }
2819
2820 return current->compilation_failed ? -1 : 0;
2821 }
2822 current_not_ready:
2823
2824 /* This must be done before the mutex is locked, because async GS
2825 * compilation calls this function too, and therefore must enter
2826 * the mutex first.
2827 */
2828 util_queue_fence_wait(&sel->ready);
2829
2830 simple_mtx_lock(&sel->mutex);
2831
2832 int variant_count = 0;
2833 const int max_inline_uniforms_variants = 5;
2834
2835 /* Find the shader variant. */
2836 const unsigned cnt = sel->variants_count;
2837 for (unsigned i = 0; i < cnt; i++) {
2838 const SHADER_KEY_TYPE *iter_key = (const SHADER_KEY_TYPE *)&sel->keys[i];
2839
2840 if (memcmp(iter_key, key, key_size_no_uniforms) == 0) {
2841 struct si_shader *iter = sel->variants[i];
2842
2843 /* Check the inlined uniform values separately, and count
2844 * the number of variants based on them.
2845 */
2846 if (key->opt.inline_uniforms &&
2847 memcmp(iter_key->opt.inlined_uniform_values,
2848 key->opt.inlined_uniform_values,
2849 MAX_INLINABLE_UNIFORMS * 4) != 0) {
2850 if (variant_count++ > max_inline_uniforms_variants) {
2851 key = use_local_key_copy(key, &local_key, key_size);
2852 /* Too many variants. Disable inlining for this shader. */
2853 local_key.opt.inline_uniforms = 0;
2854 memset(local_key.opt.inlined_uniform_values, 0, MAX_INLINABLE_UNIFORMS * 4);
2855 simple_mtx_unlock(&sel->mutex);
2856 goto again;
2857 }
2858 continue;
2859 }
2860
2861 simple_mtx_unlock(&sel->mutex);
2862
2863 if (unlikely(!util_queue_fence_is_signalled(&iter->ready))) {
2864 /* If it's an optimized shader and its compilation has
2865 * been started but isn't done, use the unoptimized
2866 * shader so as not to cause a stall due to compilation.
2867 */
2868 if (iter->is_optimized) {
2869 key = use_local_key_copy(key, &local_key, key_size);
2870 memset(&local_key.opt, 0, key_opt_size);
2871 goto again;
2872 }
2873
2874 util_queue_fence_wait(&iter->ready);
2875 }
2876
2877 if (iter->compilation_failed) {
2878 return -1; /* skip the draw call */
2879 }
2880
2881 state->current = sel->variants[i];
2882 return 0;
2883 }
2884 }
2885
2886 /* Build a new shader. */
2887 shader = CALLOC_STRUCT(si_shader);
2888 if (!shader) {
2889 simple_mtx_unlock(&sel->mutex);
2890 return -ENOMEM;
2891 }
2892
2893 util_queue_fence_init(&shader->ready);
2894
2895 if (!sscreen->use_aco && !sctx->compiler)
2896 sctx->compiler = si_create_llvm_compiler(sctx->screen);
2897
2898 shader->selector = sel;
2899 *((SHADER_KEY_TYPE*)&shader->key) = *key;
2900 shader->wave_size = si_determine_wave_size(sscreen, shader);
2901 shader->compiler_ctx_state.compiler = sctx->compiler;
2902 shader->compiler_ctx_state.debug = sctx->debug;
2903 shader->compiler_ctx_state.is_debug_context = sctx->is_debug;
2904
2905 /* If this is a merged shader, get the first shader's selector. */
2906 if (sscreen->info.gfx_level >= GFX9) {
2907 if (sel->stage == MESA_SHADER_TESS_CTRL)
2908 previous_stage_sel = ((struct si_shader_key_ge*)key)->part.tcs.ls;
2909 else if (sel->stage == MESA_SHADER_GEOMETRY)
2910 previous_stage_sel = ((struct si_shader_key_ge*)key)->part.gs.es;
2911
2912 /* We need to wait for the previous shader. */
2913 if (previous_stage_sel)
2914 util_queue_fence_wait(&previous_stage_sel->ready);
2915 }
2916
2917 bool is_pure_monolithic =
2918 sscreen->use_monolithic_shaders || memcmp(&key->mono, &zeroed_key->mono, sizeof(key->mono)) != 0;
2919
2920 /* Compile the main shader part if it doesn't exist. This can happen
2921 * if the initial guess was wrong.
2922 */
2923 if (!is_pure_monolithic) {
2924 bool ok = true;
2925
2926 /* Make sure the main shader part is present. This is needed
2927 * for shaders that can be compiled as VS, LS, or ES, and only
2928 * one of them is compiled at creation.
2929 *
2930 * It is also needed for GS, which can be compiled as non-NGG
2931 * and NGG.
2932 *
2933 * For merged shaders, check that the starting shader's main
2934 * part is present.
2935 */
2936 if (previous_stage_sel) {
2937 union si_shader_key shader1_key = zeroed;
2938
2939 if (sel->stage == MESA_SHADER_TESS_CTRL) {
2940 shader1_key.ge.as_ls = 1;
2941 } else if (sel->stage == MESA_SHADER_GEOMETRY) {
2942 shader1_key.ge.as_es = 1;
2943 shader1_key.ge.as_ngg = ((struct si_shader_key_ge*)key)->as_ngg; /* for Wave32 vs Wave64 */
2944 } else {
2945 assert(0);
2946 }
2947
2948 simple_mtx_lock(&previous_stage_sel->mutex);
2949 ok = si_check_missing_main_part(sscreen, previous_stage_sel, &shader->compiler_ctx_state,
2950 &shader1_key);
2951 simple_mtx_unlock(&previous_stage_sel->mutex);
2952 }
2953
2954 if (ok) {
2955 ok = si_check_missing_main_part(sscreen, sel, &shader->compiler_ctx_state,
2956 (union si_shader_key*)key);
2957 }
2958
2959 if (!ok) {
2960 FREE(shader);
2961 simple_mtx_unlock(&sel->mutex);
2962 return -ENOMEM; /* skip the draw call */
2963 }
2964 }
2965
2966 if (sel->variants_count == sel->variants_max_count) {
2967 sel->variants_max_count += 2;
2968 sel->variants = (struct si_shader**)
2969 realloc(sel->variants, sel->variants_max_count * sizeof(struct si_shader*));
2970 sel->keys = (union si_shader_key*)
2971 realloc(sel->keys, sel->variants_max_count * sizeof(union si_shader_key));
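/* Grow by 2 at a time: variants_max_count starts at 2 in si_create_shader_selector,
 * which presumably covers most selectors, while still amortizing reallocation for
 * the rare selectors that accumulate many variants (e.g. with inlined uniforms).
 */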
2972 }
2973
2974 /* Keep the reference to the 1st shader of merged shaders, so that
2975 * Gallium can't destroy it before we destroy the 2nd shader.
2976 *
2977 * Set sctx = NULL, because it's unused if we're not releasing
2978 * the shader, and we don't have any sctx here.
2979 */
2980 si_shader_selector_reference(NULL, &shader->previous_stage_sel, previous_stage_sel);
2981
2982 /* Monolithic-only shaders don't make a distinction between optimized
2983 * and unoptimized. */
2984 shader->is_monolithic =
2985 is_pure_monolithic || memcmp(&key->opt, &zeroed_key->opt, key_opt_size) != 0;
2986
2987 shader->is_optimized = !is_pure_monolithic &&
2988 memcmp(&key->opt, &zeroed_key->opt, key_opt_size) != 0;
2989
2990 /* If it's an optimized shader, compile it asynchronously. */
2991 if (shader->is_optimized) {
2992 /* Compile it asynchronously. */
2993 util_queue_add_job(&sscreen->shader_compiler_queue_opt_variants, shader, &shader->ready,
2994 si_build_shader_variant_low_priority, NULL, 0);
2995
2996 /* Add only after the ready fence was reset, to guard against a
2997 * race with si_bind_XX_shader. */
2998 sel->variants[sel->variants_count] = shader;
2999 sel->keys[sel->variants_count] = shader->key;
3000 sel->variants_count++;
3001
3002 /* Use the default (unoptimized) shader for now. */
3003 key = use_local_key_copy(key, &local_key, key_size);
3004 memset(&local_key.opt, 0, key_opt_size);
3005 simple_mtx_unlock(&sel->mutex);
3006
3007 if (sscreen->options.sync_compile)
3008 util_queue_fence_wait(&shader->ready);
3009
3010 goto again;
3011 }
3012
3013 /* Reset the fence before adding to the variant list. */
3014 util_queue_fence_reset(&shader->ready);
3015
3016 sel->variants[sel->variants_count] = shader;
3017 sel->keys[sel->variants_count] = shader->key;
3018 sel->variants_count++;
3019
3020 simple_mtx_unlock(&sel->mutex);
3021
3022 assert(!shader->is_optimized);
3023 si_build_shader_variant(shader, -1, false);
3024
3025 util_queue_fence_signal(&shader->ready);
3026
3027 if (!shader->compilation_failed)
3028 state->current = shader;
3029
3030 return shader->compilation_failed ? -1 : 0;
3031 }
3032
3033 int si_shader_select(struct pipe_context *ctx, struct si_shader_ctx_state *state)
3034 {
3035 struct si_context *sctx = (struct si_context *)ctx;
3036
3037 si_shader_selector_key(ctx, state->cso, &state->key);
3038
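/* Each branch below instantiates the template with a compile-time constant key type
 * and INLINE_UNIFORMS value, so the key memcmp size is a constant in every
 * instantiation and the compare can be inlined (see si_shader_select_with_key).
 */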
3039 if (state->cso->stage == MESA_SHADER_FRAGMENT) {
3040 if (state->key.ps.opt.inline_uniforms)
3041 return si_shader_select_with_key(sctx, state, &state->key.ps);
3042 else
3043 return si_shader_select_with_key<NO_INLINE_UNIFORMS>(sctx, state, &state->key.ps);
3044 } else {
3045 if (state->key.ge.opt.inline_uniforms) {
3046 return si_shader_select_with_key(sctx, state, &state->key.ge);
3047 } else {
3048 return si_shader_select_with_key<NO_INLINE_UNIFORMS>(sctx, state, &state->key.ge);
3049 }
3050 }
3051 }
3052
3053 static void si_parse_next_shader_property(const struct si_shader_info *info,
3054 union si_shader_key *key)
3055 {
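/* Guess the LS/ES variant bits from the declared next stage. E.g. (illustrative):
 * VS followed by TCS/TES is compiled as LS, VS followed by GS as ES, TES followed
 * by GS as ES; otherwise the shader stays a HW VS.
 */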
3056 gl_shader_stage next_shader = info->base.next_stage;
3057
3058 switch (info->base.stage) {
3059 case MESA_SHADER_VERTEX:
3060 switch (next_shader) {
3061 case MESA_SHADER_GEOMETRY:
3062 key->ge.as_es = 1;
3063 break;
3064 case MESA_SHADER_TESS_CTRL:
3065 case MESA_SHADER_TESS_EVAL:
3066 key->ge.as_ls = 1;
3067 break;
3068 default:
3069 /* If POSITION isn't written, it can only be a HW VS
3070 * if streamout is used. If streamout isn't used,
3071 * assume that it's a HW LS (i.e. the next shader is TCS).
3072 * This heuristic is needed for separate shader objects.
3073 */
3074 if (!info->writes_position && !info->enabled_streamout_buffer_mask)
3075 key->ge.as_ls = 1;
3076 }
3077 break;
3078
3079 case MESA_SHADER_TESS_EVAL:
3080 if (next_shader == MESA_SHADER_GEOMETRY || !info->writes_position)
3081 key->ge.as_es = 1;
3082 break;
3083
3084 default:;
3085 }
3086 }
3087
3088 /**
3089 * Compile the main shader part or the monolithic shader as part of
3090 * si_shader_selector initialization. Since it can be done asynchronously,
3091 * there is no way to report compile failures to applications.
3092 */
3093 static void si_init_shader_selector_async(void *job, void *gdata, int thread_index)
3094 {
3095 struct si_shader_selector *sel = (struct si_shader_selector *)job;
3096 struct si_screen *sscreen = sel->screen;
3097 struct ac_llvm_compiler **compiler;
3098 struct util_debug_callback *debug = &sel->compiler_ctx_state.debug;
3099
3100 assert(!debug->debug_message || debug->async);
3101 assert(thread_index >= 0);
3102 assert(thread_index < (int)ARRAY_SIZE(sscreen->compiler));
3103 compiler = &sscreen->compiler[thread_index];
3104
3105 if (!sscreen->use_aco && !*compiler)
3106 *compiler = si_create_llvm_compiler(sscreen);
3107
3108 /* Serialize NIR to save memory. Monolithic shader variants
3109 * have to deserialize NIR before compilation.
3110 */
3111 if (sel->nir) {
3112 struct blob blob;
3113 size_t size;
3114
3115 blob_init(&blob);
3116 /* true = remove optional debugging data to increase
3117 * the likelihood of getting more shader cache hits.
3118 * It also drops variable names, so we'll save more memory.
3119 * If NIR debug prints are used, we don't strip, so that the
3120 * logs stay useful.
3121 */
3122 nir_serialize(&blob, sel->nir, NIR_DEBUG(PRINT) == 0);
3123 blob_finish_get_buffer(&blob, &sel->nir_binary, &size);
3124 sel->nir_size = size;
3125 }
3126
3127 /* Compile the main shader part for use with a prolog and/or epilog.
3128 * If this fails, the driver will try to compile a monolithic shader
3129 * on demand.
3130 */
3131 if (!sscreen->use_monolithic_shaders) {
3132 struct si_shader *shader = CALLOC_STRUCT(si_shader);
3133 unsigned char ir_sha1_cache_key[20];
3134
3135 if (!shader) {
3136 fprintf(stderr, "radeonsi: can't allocate a main shader part\n");
3137 return;
3138 }
3139
3140 /* We can leave the fence signaled because use of the default
3141 * main part is guarded by the selector's ready fence. */
3142 util_queue_fence_init(&shader->ready);
3143
3144 shader->selector = sel;
3145 shader->is_monolithic = false;
3146 si_parse_next_shader_property(&sel->info, &shader->key);
3147
3148 if (sel->stage <= MESA_SHADER_GEOMETRY &&
3149 sscreen->use_ngg && (!sel->info.enabled_streamout_buffer_mask ||
3150 sscreen->info.gfx_level >= GFX11) &&
3151 ((sel->stage == MESA_SHADER_VERTEX && !shader->key.ge.as_ls) ||
3152 sel->stage == MESA_SHADER_TESS_EVAL || sel->stage == MESA_SHADER_GEOMETRY))
3153 shader->key.ge.as_ngg = 1;
3154
3155 shader->wave_size = si_determine_wave_size(sscreen, shader);
3156
3157 if (sel->nir) {
3158 if (sel->stage <= MESA_SHADER_GEOMETRY) {
3159 si_get_ir_cache_key(sel, shader->key.ge.as_ngg, shader->key.ge.as_es,
3160 shader->wave_size, ir_sha1_cache_key);
3161 } else {
3162 si_get_ir_cache_key(sel, false, false, shader->wave_size, ir_sha1_cache_key);
3163 }
3164 }
3165
3166 /* Try to load the shader from the shader cache. */
3167 simple_mtx_lock(&sscreen->shader_cache_mutex);
3168
3169 if (si_shader_cache_load_shader(sscreen, ir_sha1_cache_key, shader)) {
3170 simple_mtx_unlock(&sscreen->shader_cache_mutex);
3171 si_shader_dump_stats_for_shader_db(sscreen, shader, debug);
3172 } else {
3173 simple_mtx_unlock(&sscreen->shader_cache_mutex);
3174
3175 /* Compile the shader if it hasn't been loaded from the cache. */
3176 if (!si_compile_shader(sscreen, *compiler, shader, debug)) {
3177 fprintf(stderr,
3178 "radeonsi: can't compile a main shader part (type: %s, name: %s).\n"
3179 "This is probably a driver bug, please report "
3180 "it to https://gitlab.freedesktop.org/mesa/mesa/-/issues.\n",
3181 gl_shader_stage_name(shader->selector->stage),
3182 shader->selector->info.base.name);
3183 FREE(shader);
3184 return;
3185 }
3186
3187 simple_mtx_lock(&sscreen->shader_cache_mutex);
3188 si_shader_cache_insert_shader(sscreen, ir_sha1_cache_key, shader, true);
3189 simple_mtx_unlock(&sscreen->shader_cache_mutex);
3190 }
3191
3192 *si_get_main_shader_part(sel, &shader->key) = shader;
3193
3194 /* Unset "outputs_written" flags for outputs converted to
3195 * DEFAULT_VAL, so that later inter-shader optimizations don't
3196 * try to eliminate outputs that don't exist in the final
3197 * shader.
3198 *
3199 * This is only done if non-monolithic shaders are enabled.
3200 */
3201 if ((sel->stage == MESA_SHADER_VERTEX ||
3202 sel->stage == MESA_SHADER_TESS_EVAL ||
3203 sel->stage == MESA_SHADER_GEOMETRY) &&
3204 !shader->key.ge.as_ls && !shader->key.ge.as_es) {
3205 unsigned i;
3206
3207 for (i = 0; i < sel->info.num_outputs; i++) {
3208 unsigned semantic = sel->info.output_semantic[i];
3209 unsigned ps_input_cntl = shader->info.vs_output_ps_input_cntl[semantic];
3210
3211 /* OFFSET=0x20 means DEFAULT_VAL, which means VS doesn't export it. */
3212 if (G_028644_OFFSET(ps_input_cntl) != 0x20)
3213 continue;
3214
3215 unsigned id;
3216
3217 /* Remove the output from the mask. */
3218 if ((semantic <= VARYING_SLOT_VAR31 || semantic >= VARYING_SLOT_VAR0_16BIT) &&
3219 semantic != VARYING_SLOT_POS &&
3220 semantic != VARYING_SLOT_PSIZ &&
3221 semantic != VARYING_SLOT_CLIP_VERTEX &&
3222 semantic != VARYING_SLOT_EDGE &&
3223 semantic != VARYING_SLOT_LAYER) {
3224 id = si_shader_io_get_unique_index(semantic);
3225 sel->info.outputs_written_before_ps &= ~(1ull << id);
3226 }
3227 }
3228 }
3229 }
3230
3231 /* Free NIR. We only keep serialized NIR after this point. */
3232 if (sel->nir) {
3233 ralloc_free(sel->nir);
3234 sel->nir = NULL;
3235 }
3236 }
3237
3238 void si_schedule_initial_compile(struct si_context *sctx, gl_shader_stage stage,
3239 struct util_queue_fence *ready_fence,
3240 struct si_compiler_ctx_state *compiler_ctx_state, void *job,
3241 util_queue_execute_func execute)
3242 {
3243 util_queue_fence_init(ready_fence);
3244
3245 struct util_async_debug_callback async_debug;
3246 bool debug = (sctx->debug.debug_message && !sctx->debug.async) || sctx->is_debug ||
3247 si_can_dump_shader(sctx->screen, stage, SI_DUMP_ALWAYS);
3248
3249 if (debug) {
3250 u_async_debug_init(&async_debug);
3251 compiler_ctx_state->debug = async_debug.base;
3252 }
3253
3254 util_queue_add_job(&sctx->screen->shader_compiler_queue, job, ready_fence, execute, NULL, 0);
3255
3256 if (debug) {
3257 util_queue_fence_wait(ready_fence);
3258 u_async_debug_drain(&async_debug, &sctx->debug);
3259 u_async_debug_cleanup(&async_debug);
3260 }
3261
3262 if (sctx->screen->options.sync_compile)
3263 util_queue_fence_wait(ready_fence);
3264 }
3265
3266 /* Return descriptor slot usage masks from the given shader info. */
3267 void si_get_active_slot_masks(struct si_screen *sscreen, const struct si_shader_info *info,
3268 uint64_t *const_and_shader_buffers, uint64_t *samplers_and_images)
3269 {
3270 unsigned start, num_shaderbufs, num_constbufs, num_images, num_msaa_images, num_samplers;
3271
3272 num_shaderbufs = info->base.num_ssbos;
3273 num_constbufs = info->base.num_ubos;
3274 /* two 8-byte images share one 16-byte slot */
3275 num_images = align(info->base.num_images, 2);
3276 num_msaa_images = align(BITSET_LAST_BIT(info->base.msaa_images), 2);
3277 num_samplers = BITSET_LAST_BIT(info->base.textures_used);
3278
3279 /* The layout is: sb[last] ... sb[0], cb[0] ... cb[last] */
3280 start = si_get_shaderbuf_slot(num_shaderbufs - 1);
3281 *const_and_shader_buffers = u_bit_consecutive64(start, num_shaderbufs + num_constbufs);
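/* Illustrative example: 2 SSBOs and 3 UBOs produce a run of 5 consecutive bits
 * starting at sb[1]'s slot, matching the sb[last]..sb[0], cb[0]..cb[last] layout
 * described above.
 */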
3282
3283 /* The layout is:
3284 * - fmask[last] ... fmask[0] go to [15-last .. 15]
3285 * - image[last] ... image[0] go to [31-last .. 31]
3286 * - sampler[0] ... sampler[last] go to [32 .. 32+last*2]
3287 *
3288 * FMASKs for images are placed separately, because MSAA images are rare,
3289 * and so we can benefit from a better cache hit rate if we keep image
3290 * descriptors together.
3291 */
3292 if (sscreen->info.gfx_level < GFX11 && num_msaa_images)
3293 num_images = SI_NUM_IMAGES + num_msaa_images; /* add FMASK descriptors */
3294
3295 start = si_get_image_slot(num_images - 1) / 2;
3296 *samplers_and_images = u_bit_consecutive64(start, num_images / 2 + num_samplers);
3297 }
3298
3299 static void *si_create_shader_selector(struct pipe_context *ctx,
3300 const struct pipe_shader_state *state)
3301 {
3302 struct si_screen *sscreen = (struct si_screen *)ctx->screen;
3303 struct si_context *sctx = (struct si_context *)ctx;
3304 struct si_shader_selector *sel = CALLOC_STRUCT(si_shader_selector);
3305
3306 if (!sel)
3307 return NULL;
3308
3309 sel->screen = sscreen;
3310 sel->compiler_ctx_state.debug = sctx->debug;
3311 sel->compiler_ctx_state.is_debug_context = sctx->is_debug;
3312 sel->variants_max_count = 2;
3313 sel->keys = (union si_shader_key *)
3314 realloc(NULL, sel->variants_max_count * sizeof(union si_shader_key));
3315 sel->variants = (struct si_shader **)
3316 realloc(NULL, sel->variants_max_count * sizeof(struct si_shader *));
3317
3318 if (state->type == PIPE_SHADER_IR_TGSI) {
3319 sel->nir = tgsi_to_nir(state->tokens, ctx->screen, true);
3320 } else {
3321 assert(state->type == PIPE_SHADER_IR_NIR);
3322 sel->nir = (nir_shader*)state->ir.nir;
3323 }
3324
3325 si_nir_scan_shader(sscreen, sel->nir, &sel->info);
3326
3327 sel->stage = sel->nir->info.stage;
3328 const enum pipe_shader_type type = pipe_shader_type_from_mesa(sel->stage);
3329 sel->pipe_shader_type = type;
3330 sel->const_and_shader_buf_descriptors_index =
3331 si_const_and_shader_buffer_descriptors_idx(type);
3332 sel->sampler_and_images_descriptors_index =
3333 si_sampler_and_image_descriptors_idx(type);
3334
3335 if (si_can_dump_shader(sscreen, sel->stage, SI_DUMP_INIT_NIR))
3336 nir_print_shader(sel->nir, stderr);
3337
3338 p_atomic_inc(&sscreen->num_shaders_created);
3339 si_get_active_slot_masks(sscreen, &sel->info, &sel->active_const_and_shader_buffers,
3340 &sel->active_samplers_and_images);
3341
3342 switch (sel->stage) {
3343 case MESA_SHADER_GEOMETRY:
3344 /* Only possibilities: POINTS, LINE_STRIP, TRIANGLES */
3345 sel->rast_prim = (enum mesa_prim)sel->info.base.gs.output_primitive;
3346 if (util_rast_prim_is_triangles(sel->rast_prim))
3347 sel->rast_prim = MESA_PRIM_TRIANGLES;
3348
3349 /* EN_MAX_VERT_OUT_PER_GS_INSTANCE does not work with tessellation, so
3350 * we can't split workgroups. Disable NGG if any of the following conditions is true:
3351 * - num_invocations * gs.vertices_out > 256
3352 * - LDS usage is too high
3353 */
3354 sel->tess_turns_off_ngg = sscreen->info.gfx_level >= GFX10 &&
3355 sscreen->info.gfx_level <= GFX10_3 &&
3356 (sel->info.base.gs.invocations * sel->info.base.gs.vertices_out > 256 ||
3357 sel->info.base.gs.invocations * sel->info.base.gs.vertices_out *
3358 (sel->info.num_outputs * 4 + 1) > 6500 /* max dw per GS primitive */);
3359 break;
3360
3361 case MESA_SHADER_VERTEX:
3362 case MESA_SHADER_TESS_EVAL:
3363 if (sel->stage == MESA_SHADER_TESS_EVAL) {
3364 if (sel->info.base.tess.point_mode)
3365 sel->rast_prim = MESA_PRIM_POINTS;
3366 else if (sel->info.base.tess._primitive_mode == TESS_PRIMITIVE_ISOLINES)
3367 sel->rast_prim = MESA_PRIM_LINE_STRIP;
3368 else
3369 sel->rast_prim = MESA_PRIM_TRIANGLES;
3370 } else {
3371 sel->rast_prim = MESA_PRIM_TRIANGLES;
3372 }
3373 break;
3374 default:;
3375 }
3376
3377 bool ngg_culling_allowed =
3378 sscreen->info.gfx_level >= GFX10 &&
3379 sscreen->use_ngg_culling &&
3380 sel->info.writes_position &&
3381 !sel->info.writes_viewport_index && /* cull only against viewport 0 */
3382 !sel->info.base.writes_memory &&
3383 /* NGG GS supports culling with streamout because it culls after streamout. */
3384 (sel->stage == MESA_SHADER_GEOMETRY || !sel->info.enabled_streamout_buffer_mask) &&
3385 (sel->stage != MESA_SHADER_GEOMETRY || sel->info.num_stream_output_components[0]) &&
3386 (sel->stage != MESA_SHADER_VERTEX ||
3387 (!sel->info.base.vs.blit_sgprs_amd &&
3388 !sel->info.base.vs.window_space_position));
3389
3390 sel->ngg_cull_vert_threshold = UINT_MAX; /* disabled (changed below) */
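/* The threshold is compared against the draw's vertex count when draw-time shader
 * keys are computed (outside this file): 0 means "always cull", UINT_MAX means
 * "never cull", and 128 below enables culling only for reasonably large VS draws.
 */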
3391
3392 if (ngg_culling_allowed) {
3393 if (sel->stage == MESA_SHADER_VERTEX) {
3394 if (sscreen->debug_flags & DBG(ALWAYS_NGG_CULLING_ALL))
3395 sel->ngg_cull_vert_threshold = 0; /* always enabled */
3396 else
3397 sel->ngg_cull_vert_threshold = 128;
3398 } else if (sel->stage == MESA_SHADER_TESS_EVAL ||
3399 sel->stage == MESA_SHADER_GEOMETRY) {
3400 if (sel->rast_prim != MESA_PRIM_POINTS)
3401 sel->ngg_cull_vert_threshold = 0; /* always enabled */
3402 }
3403 }
3404
3405 (void)simple_mtx_init(&sel->mutex, mtx_plain);
3406
3407 si_schedule_initial_compile(sctx, sel->stage, &sel->ready, &sel->compiler_ctx_state,
3408 sel, si_init_shader_selector_async);
3409 return sel;
3410 }
3411
3412 static void *si_create_shader(struct pipe_context *ctx, const struct pipe_shader_state *state)
3413 {
3414 struct si_context *sctx = (struct si_context *)ctx;
3415 struct si_screen *sscreen = (struct si_screen *)ctx->screen;
3416 bool cache_hit;
3417 struct si_shader_selector *sel = (struct si_shader_selector *)util_live_shader_cache_get(
3418 ctx, &sscreen->live_shader_cache, state, &cache_hit);
3419
3420 if (sel && cache_hit && sctx->debug.debug_message) {
3421 if (sel->main_shader_part)
3422 si_shader_dump_stats_for_shader_db(sscreen, sel->main_shader_part, &sctx->debug);
3423 if (sel->main_shader_part_ls)
3424 si_shader_dump_stats_for_shader_db(sscreen, sel->main_shader_part_ls, &sctx->debug);
3425 if (sel->main_shader_part_es)
3426 si_shader_dump_stats_for_shader_db(sscreen, sel->main_shader_part_es, &sctx->debug);
3427 if (sel->main_shader_part_ngg)
3428 si_shader_dump_stats_for_shader_db(sscreen, sel->main_shader_part_ngg, &sctx->debug);
3429 if (sel->main_shader_part_ngg_es)
3430 si_shader_dump_stats_for_shader_db(sscreen, sel->main_shader_part_ngg_es, &sctx->debug);
3431 }
3432 return sel;
3433 }
3434
3435 static void si_update_streamout_state(struct si_context *sctx)
3436 {
3437 struct si_shader_selector *shader_with_so = si_get_vs(sctx)->cso;
3438
3439 if (!shader_with_so)
3440 return;
3441
3442 sctx->streamout.enabled_stream_buffers_mask = shader_with_so->info.enabled_streamout_buffer_mask;
3443 sctx->streamout.stride_in_dw = shader_with_so->info.base.xfb_stride;
3444
3445 /* GDS must be allocated when any GDS instructions are used, otherwise it hangs. */
3446 if (sctx->gfx_level >= GFX11 && shader_with_so->info.enabled_streamout_buffer_mask &&
3447 !sctx->screen->gds_oa) {
3448 /* Gfx11 only uses GDS OA, not GDS memory. */
3449 simple_mtx_lock(&sctx->screen->gds_mutex);
3450 if (!sctx->screen->gds_oa) {
3451 sctx->screen->gds_oa = sctx->ws->buffer_create(sctx->ws, 1, 1, RADEON_DOMAIN_OA,
3452 RADEON_FLAG_DRIVER_INTERNAL);
3453 assert(sctx->screen->gds_oa);
3454 }
3455 simple_mtx_unlock(&sctx->screen->gds_mutex);
3456
3457 if (sctx->screen->gds_oa)
3458 sctx->ws->cs_add_buffer(&sctx->gfx_cs, sctx->screen->gds_oa, RADEON_USAGE_READWRITE,
3459 (enum radeon_bo_domain)0);
3460 }
3461 }
3462
3463 static void si_update_clip_regs(struct si_context *sctx, struct si_shader_selector *old_hw_vs,
3464 struct si_shader *old_hw_vs_variant,
3465 struct si_shader_selector *next_hw_vs,
3466 struct si_shader *next_hw_vs_variant)
3467 {
3468 if (next_hw_vs &&
3469 (!old_hw_vs ||
3470 (old_hw_vs->stage == MESA_SHADER_VERTEX && old_hw_vs->info.base.vs.window_space_position) !=
3471 (next_hw_vs->stage == MESA_SHADER_VERTEX && next_hw_vs->info.base.vs.window_space_position) ||
3472 old_hw_vs->info.clipdist_mask != next_hw_vs->info.clipdist_mask ||
3473 old_hw_vs->info.culldist_mask != next_hw_vs->info.culldist_mask || !old_hw_vs_variant ||
3474 !next_hw_vs_variant ||
3475 old_hw_vs_variant->pa_cl_vs_out_cntl != next_hw_vs_variant->pa_cl_vs_out_cntl))
3476 si_mark_atom_dirty(sctx, &sctx->atoms.s.clip_regs);
3477 }
3478
3479 static void si_update_rasterized_prim(struct si_context *sctx)
3480 {
3481 struct si_shader *hw_vs = si_get_vs(sctx)->current;
3482
3483 if (sctx->shader.gs.cso) {
3484 /* Only possibilities: POINTS, LINE_STRIP, TRIANGLES */
3485 si_set_rasterized_prim(sctx, sctx->shader.gs.cso->rast_prim, hw_vs, sctx->ngg);
3486 } else if (sctx->shader.tes.cso) {
3487 /* Only possibilities: POINTS, LINE_STRIP, TRIANGLES */
3488 si_set_rasterized_prim(sctx, sctx->shader.tes.cso->rast_prim, hw_vs, sctx->ngg);
3489 } else {
3490 /* The rasterized prim is determined by draw calls. */
3491 }
3492
3493 /* This must be done unconditionally because it also depends on si_shader fields. */
3494 si_update_ngg_sgpr_state_out_prim(sctx, hw_vs, sctx->ngg);
3495 }
3496
3497 static void si_update_common_shader_state(struct si_context *sctx, struct si_shader_selector *sel,
3498 enum pipe_shader_type type)
3499 {
3500 si_set_active_descriptors_for_shader(sctx, sel);
3501
3502 sctx->uses_bindless_samplers = si_shader_uses_bindless_samplers(sctx->shader.vs.cso) ||
3503 si_shader_uses_bindless_samplers(sctx->shader.gs.cso) ||
3504 si_shader_uses_bindless_samplers(sctx->shader.ps.cso) ||
3505 si_shader_uses_bindless_samplers(sctx->shader.tcs.cso) ||
3506 si_shader_uses_bindless_samplers(sctx->shader.tes.cso);
3507 sctx->uses_bindless_images = si_shader_uses_bindless_images(sctx->shader.vs.cso) ||
3508 si_shader_uses_bindless_images(sctx->shader.gs.cso) ||
3509 si_shader_uses_bindless_images(sctx->shader.ps.cso) ||
3510 si_shader_uses_bindless_images(sctx->shader.tcs.cso) ||
3511 si_shader_uses_bindless_images(sctx->shader.tes.cso);
3512
3513 if (type == PIPE_SHADER_VERTEX || type == PIPE_SHADER_TESS_EVAL || type == PIPE_SHADER_GEOMETRY)
3514 sctx->ngg_culling = 0; /* this will be enabled on the first draw if needed */
3515
3516 si_invalidate_inlinable_uniforms(sctx, type);
3517 sctx->do_update_shaders = true;
3518 }
3519
3520 static void si_update_last_vgt_stage_state(struct si_context *sctx,
3521 /* hw_vs refers to the last VGT stage */
3522 struct si_shader_selector *old_hw_vs,
3523 struct si_shader *old_hw_vs_variant)
3524 {
3525 struct si_shader_ctx_state *hw_vs = si_get_vs(sctx);
3526
3527 si_update_vs_viewport_state(sctx);
3528 si_update_streamout_state(sctx);
3529 si_update_clip_regs(sctx, old_hw_vs, old_hw_vs_variant, hw_vs->cso, hw_vs->current);
3530 si_update_rasterized_prim(sctx);
3531
3532 /* Clear kill_pointsize because we only want it to be set in the last shader before PS. */
3533 sctx->shader.vs.key.ge.opt.kill_pointsize = 0;
3534 sctx->shader.tes.key.ge.opt.kill_pointsize = 0;
3535 sctx->shader.gs.key.ge.opt.kill_pointsize = 0;
3536 si_vs_ps_key_update_rast_prim_smooth_stipple(sctx);
3537 }
3538
3539 static void si_bind_vs_shader(struct pipe_context *ctx, void *state)
3540 {
3541 struct si_context *sctx = (struct si_context *)ctx;
3542 struct si_shader_selector *old_hw_vs = si_get_vs(sctx)->cso;
3543 struct si_shader *old_hw_vs_variant = si_get_vs(sctx)->current;
3544 struct si_shader_selector *sel = (struct si_shader_selector*)state;
3545
3546 if (sctx->shader.vs.cso == sel)
3547 return;
3548
3549 sctx->shader.vs.cso = sel;
3550 sctx->shader.vs.current = (sel && sel->variants_count) ? sel->variants[0] : NULL;
3551 sctx->num_vs_blit_sgprs = sel ? sel->info.base.vs.blit_sgprs_amd : 0;
3552 sctx->vs_uses_draw_id = sel ? sel->info.uses_drawid : false;
3553
3554 if (si_update_ngg(sctx))
3555 si_shader_change_notify(sctx);
3556
3557 si_update_common_shader_state(sctx, sel, PIPE_SHADER_VERTEX);
3558 si_select_draw_vbo(sctx);
3559 si_update_last_vgt_stage_state(sctx, old_hw_vs, old_hw_vs_variant);
3560 si_vs_key_update_inputs(sctx);
3561
3562 if (sctx->screen->dpbb_allowed) {
3563 bool force_off = sel && sel->info.options & SI_PROFILE_VS_NO_BINNING;
3564
3565 if (force_off != sctx->dpbb_force_off_profile_vs) {
3566 sctx->dpbb_force_off_profile_vs = force_off;
3567 si_mark_atom_dirty(sctx, &sctx->atoms.s.dpbb_state);
3568 }
3569 }
3570 }
3571
3572 static void si_update_tess_uses_prim_id(struct si_context *sctx)
3573 {
3574 sctx->ia_multi_vgt_param_key.u.tess_uses_prim_id =
3575 (sctx->shader.tes.cso && sctx->shader.tes.cso->info.uses_primid) ||
3576 (sctx->shader.tcs.cso && sctx->shader.tcs.cso->info.uses_primid) ||
3577 (sctx->shader.gs.cso && sctx->shader.gs.cso->info.uses_primid) ||
3578 (sctx->shader.ps.cso && !sctx->shader.gs.cso && sctx->shader.ps.cso->info.uses_primid);
3579 }
3580
3581 bool si_update_ngg(struct si_context *sctx)
3582 {
3583 if (!sctx->screen->use_ngg) {
3584 assert(!sctx->ngg);
3585 return false;
3586 }
3587
3588 bool new_ngg = true;
3589
3590 if (sctx->shader.gs.cso && sctx->shader.tes.cso && sctx->shader.gs.cso->tess_turns_off_ngg) {
3591 new_ngg = false;
3592 } else if (sctx->gfx_level < GFX11) {
3593 struct si_shader_selector *last = si_get_vs(sctx)->cso;
3594
3595 if ((last && last->info.enabled_streamout_buffer_mask) ||
3596 sctx->streamout.prims_gen_query_enabled)
3597 new_ngg = false;
3598 }
3599
3600 if (new_ngg != sctx->ngg) {
3601 /* Transitioning from NGG to legacy GS requires VGT_FLUSH on Navi10-14.
3602 * VGT_FLUSH is also emitted at the beginning of IBs when legacy GS ring
3603 * pointers are set.
3604 */
3605 if (sctx->screen->info.has_vgt_flush_ngg_legacy_bug && !new_ngg) {
3606 sctx->flags |= SI_CONTEXT_VGT_FLUSH;
3607 si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);
3608
3609 if (sctx->gfx_level == GFX10) {
3610 /* Workaround for https://gitlab.freedesktop.org/mesa/mesa/-/issues/2941 */
3611 si_flush_gfx_cs(sctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
3612 }
3613 }
3614
3615 sctx->ngg = new_ngg;
3616 si_select_draw_vbo(sctx);
3617 return true;
3618 }
3619 return false;
3620 }
3621
3622 static void si_bind_gs_shader(struct pipe_context *ctx, void *state)
3623 {
3624 struct si_context *sctx = (struct si_context *)ctx;
3625 struct si_shader_selector *old_hw_vs = si_get_vs(sctx)->cso;
3626 struct si_shader *old_hw_vs_variant = si_get_vs(sctx)->current;
3627 struct si_shader_selector *sel = (struct si_shader_selector*)state;
3628 bool enable_changed = !!sctx->shader.gs.cso != !!sel;
3629 bool ngg_changed;
3630
3631 if (sctx->shader.gs.cso == sel)
3632 return;
3633
3634 sctx->shader.gs.cso = sel;
3635 sctx->shader.gs.current = (sel && sel->variants_count) ? sel->variants[0] : NULL;
3636 sctx->ia_multi_vgt_param_key.u.uses_gs = sel != NULL;
3637
3638 si_update_common_shader_state(sctx, sel, PIPE_SHADER_GEOMETRY);
3639 si_select_draw_vbo(sctx);
3640
3641 ngg_changed = si_update_ngg(sctx);
3642 if (ngg_changed || enable_changed)
3643 si_shader_change_notify(sctx);
3644 if (enable_changed) {
3645 if (sctx->ia_multi_vgt_param_key.u.uses_tess)
3646 si_update_tess_uses_prim_id(sctx);
3647 }
3648 si_update_last_vgt_stage_state(sctx, old_hw_vs, old_hw_vs_variant);
3649 }
3650
3651 static void si_bind_tcs_shader(struct pipe_context *ctx, void *state)
3652 {
3653 struct si_context *sctx = (struct si_context *)ctx;
3654 struct si_shader_selector *sel = (struct si_shader_selector*)state;
3655 bool enable_changed = !!sctx->shader.tcs.cso != !!sel;
3656
3657 /* Note that the user shader sel can be the same as the fixed-function shader,
3658 * so we should update this field even if sctx->shader.tcs.cso == sel.
3659 */
3660 sctx->is_user_tcs = !!sel;
3661
3662 if (sctx->shader.tcs.cso == sel)
3663 return;
3664
3665 sctx->shader.tcs.cso = sel;
3666 sctx->shader.tcs.current = (sel && sel->variants_count) ? sel->variants[0] : NULL;
3667 sctx->shader.tcs.key.ge.part.tcs.epilog.invoc0_tess_factors_are_def =
3668 sel ? sel->info.tessfactors_are_def_in_all_invocs : 0;
3669 si_update_tess_uses_prim_id(sctx);
3670 si_update_tess_in_out_patch_vertices(sctx);
3671
3672 si_update_common_shader_state(sctx, sel, PIPE_SHADER_TESS_CTRL);
3673
3674 if (enable_changed)
3675 sctx->last_tcs = NULL; /* invalidate derived tess state */
3676 }
3677
3678 static void si_bind_tes_shader(struct pipe_context *ctx, void *state)
3679 {
3680 struct si_context *sctx = (struct si_context *)ctx;
3681 struct si_shader_selector *old_hw_vs = si_get_vs(sctx)->cso;
3682 struct si_shader *old_hw_vs_variant = si_get_vs(sctx)->current;
3683 struct si_shader_selector *sel = (struct si_shader_selector*)state;
3684 bool enable_changed = !!sctx->shader.tes.cso != !!sel;
3685
3686 if (sctx->shader.tes.cso == sel)
3687 return;
3688
3689 sctx->shader.tes.cso = sel;
3690 sctx->shader.tes.current = (sel && sel->variants_count) ? sel->variants[0] : NULL;
3691 sctx->ia_multi_vgt_param_key.u.uses_tess = sel != NULL;
3692 si_update_tess_uses_prim_id(sctx);
3693
3694 sctx->shader.tcs.key.ge.part.tcs.epilog.prim_mode =
3695 sel ? sel->info.base.tess._primitive_mode : 0;
3696
3697 sctx->shader.tcs.key.ge.part.tcs.epilog.tes_reads_tess_factors =
3698 sel ? sel->info.reads_tess_factors : 0;
3699
3700 si_update_common_shader_state(sctx, sel, PIPE_SHADER_TESS_EVAL);
3701 si_select_draw_vbo(sctx);
3702
3703 bool ngg_changed = si_update_ngg(sctx);
3704 if (ngg_changed || enable_changed)
3705 si_shader_change_notify(sctx);
3706 if (enable_changed)
3707 sctx->last_tes_sh_base = -1; /* invalidate derived tess state */
3708 si_update_last_vgt_stage_state(sctx, old_hw_vs, old_hw_vs_variant);
3709 }
3710
3711 void si_update_vrs_flat_shading(struct si_context *sctx)
3712 {
3713 if (sctx->gfx_level >= GFX10_3 && sctx->shader.ps.cso) {
3714 struct si_state_rasterizer *rs = sctx->queued.named.rasterizer;
3715 struct si_shader_info *info = &sctx->shader.ps.cso->info;
3716 bool allow_flat_shading = info->allow_flat_shading;
3717
3718 if (allow_flat_shading &&
3719 (rs->line_smooth || rs->poly_smooth || rs->poly_stipple_enable ||
3720 rs->point_smooth || (!rs->flatshade && info->uses_interp_color)))
3721 allow_flat_shading = false;
3722
3723 if (sctx->allow_flat_shading != allow_flat_shading) {
3724 sctx->allow_flat_shading = allow_flat_shading;
3725 si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state);
3726 }
3727 }
3728 }
3729
3730 static void si_bind_ps_shader(struct pipe_context *ctx, void *state)
3731 {
3732 struct si_context *sctx = (struct si_context *)ctx;
3733 struct si_shader_selector *old_sel = sctx->shader.ps.cso;
3734 struct si_shader_selector *sel = (struct si_shader_selector*)state;
3735
3736 /* Skip if the supplied shader is the one already in use. */
3737 if (old_sel == sel)
3738 return;
3739
3740 sctx->shader.ps.cso = sel;
3741 sctx->shader.ps.current = (sel && sel->variants_count) ? sel->variants[0] : NULL;
3742
3743 si_update_common_shader_state(sctx, sel, PIPE_SHADER_FRAGMENT);
3744 if (sel) {
3745 if (sctx->ia_multi_vgt_param_key.u.uses_tess)
3746 si_update_tess_uses_prim_id(sctx);
3747
3748 if (!old_sel || old_sel->info.colors_written != sel->info.colors_written)
3749 si_mark_atom_dirty(sctx, &sctx->atoms.s.cb_render_state);
3750
3751 if (sctx->screen->info.has_out_of_order_rast &&
3752 (!old_sel || old_sel->info.base.writes_memory != sel->info.base.writes_memory ||
3753 old_sel->info.base.fs.early_fragment_tests !=
3754 sel->info.base.fs.early_fragment_tests))
3755 si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_config);
3756 }
3757 si_update_ps_colorbuf0_slot(sctx);
3758
3759 si_ps_key_update_framebuffer(sctx);
3760 si_ps_key_update_framebuffer_blend_rasterizer(sctx);
3761 si_ps_key_update_rasterizer(sctx);
3762 si_ps_key_update_dsa(sctx);
3763 si_ps_key_update_sample_shading(sctx);
3764 si_ps_key_update_framebuffer_rasterizer_sample_shading(sctx);
3765 si_update_ps_inputs_read_or_disabled(sctx);
3766 si_update_vrs_flat_shading(sctx);
3767
3768 if (sctx->screen->dpbb_allowed) {
3769 bool force_off = sel && sel->info.options & SI_PROFILE_GFX9_GFX10_PS_NO_BINNING &&
3770 (sctx->gfx_level >= GFX9 && sctx->gfx_level <= GFX10_3);
3771
3772 if (force_off != sctx->dpbb_force_off_profile_ps) {
3773 sctx->dpbb_force_off_profile_ps = force_off;
3774 si_mark_atom_dirty(sctx, &sctx->atoms.s.dpbb_state);
3775 }
3776 }
3777 }
3778
3779 static void si_delete_shader(struct si_context *sctx, struct si_shader *shader)
3780 {
3781 if (shader->is_optimized) {
3782 util_queue_drop_job(&sctx->screen->shader_compiler_queue_opt_variants, &shader->ready);
3783 }
3784
3785 util_queue_fence_destroy(&shader->ready);
3786
3787 /* If destroyed shaders were not unbound, the next compiled
3788 * shader variant could get the same pointer address and so
3789 * binding it to the same shader stage would be considered
3790 * a no-op, causing random behavior.
3791 */
3792 int state_index = -1;
3793
3794 switch (shader->selector->stage) {
3795 case MESA_SHADER_VERTEX:
3796 if (shader->key.ge.as_ls) {
3797 if (sctx->gfx_level <= GFX8)
3798 state_index = SI_STATE_IDX(ls);
3799 } else if (shader->key.ge.as_es) {
3800 if (sctx->gfx_level <= GFX8)
3801 state_index = SI_STATE_IDX(es);
3802 } else if (shader->key.ge.as_ngg) {
3803 state_index = SI_STATE_IDX(gs);
3804 } else {
3805 state_index = SI_STATE_IDX(vs);
3806 }
3807 break;
3808 case MESA_SHADER_TESS_CTRL:
3809 state_index = SI_STATE_IDX(hs);
3810 break;
3811 case MESA_SHADER_TESS_EVAL:
3812 if (shader->key.ge.as_es) {
3813 if (sctx->gfx_level <= GFX8)
3814 state_index = SI_STATE_IDX(es);
3815 } else if (shader->key.ge.as_ngg) {
3816 state_index = SI_STATE_IDX(gs);
3817 } else {
3818 state_index = SI_STATE_IDX(vs);
3819 }
3820 break;
3821 case MESA_SHADER_GEOMETRY:
3822 if (shader->is_gs_copy_shader)
3823 state_index = SI_STATE_IDX(vs);
3824 else
3825 state_index = SI_STATE_IDX(gs);
3826 break;
3827 case MESA_SHADER_FRAGMENT:
3828 state_index = SI_STATE_IDX(ps);
3829 break;
3830 default:;
3831 }
3832
3833 if (shader->gs_copy_shader)
3834 si_delete_shader(sctx, shader->gs_copy_shader);
3835
3836 si_shader_selector_reference(sctx, &shader->previous_stage_sel, NULL);
3837 si_shader_destroy(shader);
3838 si_pm4_free_state(sctx, &shader->pm4, state_index);
3839 }
3840
3841 static void si_destroy_shader_selector(struct pipe_context *ctx, void *cso)
3842 {
3843 struct si_context *sctx = (struct si_context *)ctx;
3844 struct si_shader_selector *sel = (struct si_shader_selector *)cso;
3845 enum pipe_shader_type type = pipe_shader_type_from_mesa(sel->stage);
3846
3847 util_queue_drop_job(&sctx->screen->shader_compiler_queue, &sel->ready);
3848
3849 if (sctx->shaders[type].cso == sel) {
3850 sctx->shaders[type].cso = NULL;
3851 sctx->shaders[type].current = NULL;
3852 }
3853
3854 for (unsigned i = 0; i < sel->variants_count; i++) {
3855 si_delete_shader(sctx, sel->variants[i]);
3856 }
3857
3858 if (sel->main_shader_part)
3859 si_delete_shader(sctx, sel->main_shader_part);
3860 if (sel->main_shader_part_ls)
3861 si_delete_shader(sctx, sel->main_shader_part_ls);
3862 if (sel->main_shader_part_es)
3863 si_delete_shader(sctx, sel->main_shader_part_es);
3864 if (sel->main_shader_part_ngg)
3865 si_delete_shader(sctx, sel->main_shader_part_ngg);
3866
3867 free(sel->keys);
3868 free(sel->variants);
3869
3870 util_queue_fence_destroy(&sel->ready);
3871 simple_mtx_destroy(&sel->mutex);
3872 ralloc_free(sel->nir);
3873 free(sel->nir_binary);
3874 free(sel);
3875 }
3876
3877 static void si_delete_shader_selector(struct pipe_context *ctx, void *state)
3878 {
3879 struct si_context *sctx = (struct si_context *)ctx;
3880 struct si_shader_selector *sel = (struct si_shader_selector *)state;
3881
3882 si_shader_selector_reference(sctx, &sel, NULL);
3883 }
3884
3885 /**
3886 * Writing CONFIG or UCONFIG VGT registers requires VGT_FLUSH before that.
3887 */
3888 static void si_cs_preamble_add_vgt_flush(struct si_context *sctx, bool tmz)
3889 {
3890 struct si_pm4_state *pm4 = tmz ? sctx->cs_preamble_state_tmz : sctx->cs_preamble_state;
3891 bool *has_vgt_flush = tmz ? &sctx->cs_preamble_has_vgt_flush_tmz :
3892 &sctx->cs_preamble_has_vgt_flush;
3893
3894 /* We shouldn't get here if registers are shadowed. */
3895 assert(!sctx->shadowing.registers);
3896
3897 if (*has_vgt_flush)
3898 return;
3899
3900 /* Done by Vulkan before VGT_FLUSH. */
3901 si_pm4_cmd_add(pm4, PKT3(PKT3_EVENT_WRITE, 0, 0));
3902 si_pm4_cmd_add(pm4, EVENT_TYPE(V_028A90_VS_PARTIAL_FLUSH) | EVENT_INDEX(4));
3903
3904 /* VGT_FLUSH is required even if VGT is idle. It resets VGT pointers. */
3905 si_pm4_cmd_add(pm4, PKT3(PKT3_EVENT_WRITE, 0, 0));
3906 si_pm4_cmd_add(pm4, EVENT_TYPE(V_028A90_VGT_FLUSH) | EVENT_INDEX(0));
3907 si_pm4_finalize(pm4);
3908
3909 *has_vgt_flush = true;
3910 }
3911
3912 /**
3913 * Writing CONFIG or UCONFIG VGT registers requires VGT_FLUSH before that.
3914 */
3915 static void si_emit_vgt_flush(struct radeon_cmdbuf *cs)
3916 {
3917 radeon_begin(cs);
3918
3919 /* This is required before VGT_FLUSH. */
3920 radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0));
3921 radeon_emit(EVENT_TYPE(V_028A90_VS_PARTIAL_FLUSH) | EVENT_INDEX(4));
3922
3923 /* VGT_FLUSH is required even if VGT is idle. It resets VGT pointers. */
3924 radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0));
3925 radeon_emit(EVENT_TYPE(V_028A90_VGT_FLUSH) | EVENT_INDEX(0));
3926 radeon_end();
3927 }
3928
3929 /* Initialize state related to ESGS / GSVS ring buffers */
3930 bool si_update_gs_ring_buffers(struct si_context *sctx)
3931 {
3932 assert(sctx->gfx_level < GFX11);
3933
3934 struct si_shader_selector *es =
3935 sctx->shader.tes.cso ? sctx->shader.tes.cso : sctx->shader.vs.cso;
3936 struct si_shader_selector *gs = sctx->shader.gs.cso;
3937
3938 /* Chip constants. */
3939 unsigned num_se = sctx->screen->info.max_se;
3940 unsigned wave_size = 64;
3941 unsigned max_gs_waves = 32 * num_se; /* max 32 per SE on GCN */
3942 /* On GFX6-GFX7, the value comes from VGT_GS_VERTEX_REUSE = 16.
3943 * On GFX8+, the value comes from VGT_VERTEX_REUSE_BLOCK_CNTL = 30 (+2).
3944 */
3945 unsigned gs_vertex_reuse = (sctx->gfx_level >= GFX8 ? 32 : 16) * num_se;
3946 unsigned alignment = 256 * num_se;
3947 /* The maximum size is 63.999 MB per SE. */
3948 unsigned max_size = ((unsigned)(63.999 * 1024 * 1024) & ~255) * num_se;
3949
3950 /* Calculate the minimum size. */
3951 unsigned min_esgs_ring_size = align(es->info.esgs_vertex_stride * gs_vertex_reuse * wave_size, alignment);
3952
3953 /* These are recommended sizes, not minimum sizes. */
3954 unsigned esgs_ring_size =
3955 max_gs_waves * 2 * wave_size * es->info.esgs_vertex_stride * gs->info.gs_input_verts_per_prim;
3956 unsigned gsvs_ring_size = max_gs_waves * 2 * wave_size * gs->info.max_gsvs_emit_size;
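/* Worked example (illustrative numbers, GFX8+): with 4 SEs, a 16-byte ES vertex
 * stride and 3 GS input vertices per primitive:
 *   min_esgs_ring_size = align(16 * (32 * 4) * 64, 1024) = 128 KB
 *   esgs_ring_size     = (32 * 4) * 2 * 64 * 16 * 3      = 768 KB
 * Both values are then aligned and clamped below.
 */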
3957
3958 min_esgs_ring_size = align(min_esgs_ring_size, alignment);
3959 esgs_ring_size = align(esgs_ring_size, alignment);
3960 gsvs_ring_size = align(gsvs_ring_size, alignment);
3961
3962 esgs_ring_size = CLAMP(esgs_ring_size, min_esgs_ring_size, max_size);
3963 gsvs_ring_size = MIN2(gsvs_ring_size, max_size);
3964
3965 /* Some rings don't have to be allocated if shaders don't use them.
3966 * (e.g. no varyings between ES and GS or GS and VS)
3967 *
3968 * GFX9 doesn't have the ESGS ring.
3969 */
3970 bool update_esgs = sctx->gfx_level <= GFX8 && esgs_ring_size &&
3971 (!sctx->esgs_ring || sctx->esgs_ring->width0 < esgs_ring_size);
3972 bool update_gsvs =
3973 gsvs_ring_size && (!sctx->gsvs_ring || sctx->gsvs_ring->width0 < gsvs_ring_size);
3974
3975 if (!update_esgs && !update_gsvs)
3976 return true;
3977
3978 if (update_esgs) {
3979 pipe_resource_reference(&sctx->esgs_ring, NULL);
3980 sctx->esgs_ring =
3981 pipe_aligned_buffer_create(sctx->b.screen,
3982 PIPE_RESOURCE_FLAG_UNMAPPABLE | SI_RESOURCE_FLAG_DRIVER_INTERNAL |
3983 SI_RESOURCE_FLAG_DISCARDABLE,
3984 PIPE_USAGE_DEFAULT,
3985 esgs_ring_size, sctx->screen->info.pte_fragment_size);
3986 if (!sctx->esgs_ring)
3987 return false;
3988 }
3989
3990 if (update_gsvs) {
3991 pipe_resource_reference(&sctx->gsvs_ring, NULL);
3992 sctx->gsvs_ring =
3993 pipe_aligned_buffer_create(sctx->b.screen,
3994 PIPE_RESOURCE_FLAG_UNMAPPABLE | SI_RESOURCE_FLAG_DRIVER_INTERNAL |
3995 SI_RESOURCE_FLAG_DISCARDABLE,
3996 PIPE_USAGE_DEFAULT,
3997 gsvs_ring_size, sctx->screen->info.pte_fragment_size);
3998 if (!sctx->gsvs_ring)
3999 return false;
4000 }
4001
4002 /* Set ring bindings. */
4003 if (sctx->esgs_ring) {
4004 assert(sctx->gfx_level <= GFX8);
4005 si_set_ring_buffer(sctx, SI_RING_ESGS, sctx->esgs_ring, 0, sctx->esgs_ring->width0, false,
4006 false, 0, 0, 0);
4007 }
4008 if (sctx->gsvs_ring) {
4009 si_set_ring_buffer(sctx, SI_RING_GSVS, sctx->gsvs_ring, 0, sctx->gsvs_ring->width0, false,
4010 false, 0, 0, 0);
4011 }
4012
4013 if (sctx->shadowing.registers) {
4014 /* These registers will be shadowed, so set them only once. */
4015 struct radeon_cmdbuf *cs = &sctx->gfx_cs;
4016
4017 assert(sctx->gfx_level >= GFX7);
4018
4019 si_emit_vgt_flush(cs);
4020
4021 radeon_begin(cs);
4022
4023 /* Set the GS registers. */
4024 if (sctx->esgs_ring) {
4025 assert(sctx->gfx_level <= GFX8);
4026 radeon_set_uconfig_reg(R_030900_VGT_ESGS_RING_SIZE,
4027 sctx->esgs_ring->width0 / 256);
4028 }
4029 if (sctx->gsvs_ring) {
4030 radeon_set_uconfig_reg(R_030904_VGT_GSVS_RING_SIZE,
4031 sctx->gsvs_ring->width0 / 256);
4032 }
4033 radeon_end();
4034 return true;
4035 }
4036
4037 /* The codepath without register shadowing. */
4038 for (unsigned tmz = 0; tmz <= 1; tmz++) {
4039 struct si_pm4_state *pm4 = tmz ? sctx->cs_preamble_state_tmz : sctx->cs_preamble_state;
4040 uint16_t *gs_ring_state_dw_offset = tmz ? &sctx->gs_ring_state_dw_offset_tmz :
4041 &sctx->gs_ring_state_dw_offset;
4042 unsigned old_ndw = 0;
4043
4044 si_cs_preamble_add_vgt_flush(sctx, tmz);
4045
4046 if (!*gs_ring_state_dw_offset) {
4047 /* We are here for the first time. The packets will be added. */
4048 *gs_ring_state_dw_offset = pm4->ndw;
4049 } else {
4050 /* We have been here before. Overwrite the previous packets. */
4051 old_ndw = pm4->ndw;
4052 pm4->ndw = *gs_ring_state_dw_offset;
4053 }
4054
4055 /* Unallocated rings are written to reserve the space in the pm4
4056 * (to be able to overwrite them later). */
4057 if (sctx->gfx_level >= GFX7) {
4058 if (sctx->gfx_level <= GFX8)
4059 si_pm4_set_reg(pm4, R_030900_VGT_ESGS_RING_SIZE,
4060 sctx->esgs_ring ? sctx->esgs_ring->width0 / 256 : 0);
4061 si_pm4_set_reg(pm4, R_030904_VGT_GSVS_RING_SIZE,
4062 sctx->gsvs_ring ? sctx->gsvs_ring->width0 / 256 : 0);
4063 } else {
4064 si_pm4_set_reg(pm4, R_0088C8_VGT_ESGS_RING_SIZE,
4065 sctx->esgs_ring ? sctx->esgs_ring->width0 / 256 : 0);
4066 si_pm4_set_reg(pm4, R_0088CC_VGT_GSVS_RING_SIZE,
4067 sctx->gsvs_ring ? sctx->gsvs_ring->width0 / 256 : 0);
4068 }
4069 si_pm4_finalize(pm4);
4070
4071 if (old_ndw) {
4072 pm4->ndw = old_ndw;
4073 pm4->last_opcode = 255; /* invalid opcode (we don't save the last opcode) */
4074 }
4075 }
4076
4077 /* Flush the context to re-emit both cs_preamble states. */
4078 sctx->initial_gfx_cs_size = 0; /* force flush */
4079 si_flush_gfx_cs(sctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
4080
4081 return true;
4082 }
4083
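/* Lock helpers for replacing a shader binary in place. They take the shader's own
 * selector mutex first and the previous-stage selector mutex second, which matches
 * the order used in si_shader_select_with_key and thus avoids lock-order inversions
 * (at least within this file).
 */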
4084 static void si_shader_lock(struct si_shader *shader)
4085 {
4086 simple_mtx_lock(&shader->selector->mutex);
4087 if (shader->previous_stage_sel) {
4088 assert(shader->previous_stage_sel != shader->selector);
4089 simple_mtx_lock(&shader->previous_stage_sel->mutex);
4090 }
4091 }
4092
4093 static void si_shader_unlock(struct si_shader *shader)
4094 {
4095 if (shader->previous_stage_sel)
4096 simple_mtx_unlock(&shader->previous_stage_sel->mutex);
4097 simple_mtx_unlock(&shader->selector->mutex);
4098 }
4099
4100 /**
4101 * @returns 1 if \p shader has been updated to use a new scratch buffer
4102 * 0 if not
4103 * < 0 if there was a failure
4104 */
4105 static int si_update_scratch_buffer(struct si_context *sctx, struct si_shader *shader)
4106 {
4107 uint64_t scratch_va = sctx->scratch_buffer->gpu_address;
4108
4109 if (!shader)
4110 return 0;
4111
4112 /* This shader doesn't need a scratch buffer */
4113 if (shader->config.scratch_bytes_per_wave == 0)
4114 return 0;
4115
4116 /* Prevent race conditions when updating:
4117 * - si_shader::scratch_bo
4118 * - si_shader::binary::code
4119 * - si_shader::previous_stage::binary::code.
4120 */
4121 si_shader_lock(shader);
4122
4123 /* This shader is already configured to use the current
4124 * scratch buffer. */
4125 if (shader->scratch_bo == sctx->scratch_buffer) {
4126 si_shader_unlock(shader);
4127 return 0;
4128 }
4129
4130 assert(sctx->scratch_buffer);
4131
4132 /* Replace the shader bo with a new bo that has the relocs applied. */
4133 if (!si_shader_binary_upload(sctx->screen, shader, scratch_va)) {
4134 si_shader_unlock(shader);
4135 return -1;
4136 }
4137
4138 /* Update the shader state to use the new shader bo. */
4139 si_shader_init_pm4_state(sctx->screen, shader);
4140
4141 si_resource_reference(&shader->scratch_bo, sctx->scratch_buffer);
4142
4143 si_shader_unlock(shader);
4144 return 1;
4145 }
4146
4147 static bool si_update_scratch_relocs(struct si_context *sctx)
4148 {
4149 int r;
4150
4151 /* Update the shaders, so that they are using the latest scratch.
4152 * The scratch buffer may have been changed since these shaders were
4153 * last used, so we still need to try to update them, even if they
4154 * require scratch buffers smaller than the current size.
4155 */
4156 r = si_update_scratch_buffer(sctx, sctx->shader.ps.current);
4157 if (r < 0)
4158 return false;
4159 if (r == 1)
4160 si_pm4_bind_state(sctx, ps, sctx->shader.ps.current);
4161
4162 r = si_update_scratch_buffer(sctx, sctx->shader.gs.current);
4163 if (r < 0)
4164 return false;
4165 if (r == 1)
4166 si_pm4_bind_state(sctx, gs, sctx->shader.gs.current);
4167
4168 r = si_update_scratch_buffer(sctx, sctx->shader.tcs.current);
4169 if (r < 0)
4170 return false;
4171 if (r == 1)
4172 si_pm4_bind_state(sctx, hs, sctx->shader.tcs.current);
4173
4174 /* VS can be bound as LS, ES, or VS. */
4175 r = si_update_scratch_buffer(sctx, sctx->shader.vs.current);
4176 if (r < 0)
4177 return false;
4178 if (r == 1) {
4179 if (sctx->shader.vs.current->key.ge.as_ls)
4180 si_pm4_bind_state(sctx, ls, sctx->shader.vs.current);
4181 else if (sctx->shader.vs.current->key.ge.as_es)
4182 si_pm4_bind_state(sctx, es, sctx->shader.vs.current);
4183 else if (sctx->shader.vs.current->key.ge.as_ngg)
4184 si_pm4_bind_state(sctx, gs, sctx->shader.vs.current);
4185 else
4186 si_pm4_bind_state(sctx, vs, sctx->shader.vs.current);
4187 }
4188
4189 /* TES can be bound as ES or VS. */
4190 r = si_update_scratch_buffer(sctx, sctx->shader.tes.current);
4191 if (r < 0)
4192 return false;
4193 if (r == 1) {
4194 if (sctx->shader.tes.current->key.ge.as_es)
4195 si_pm4_bind_state(sctx, es, sctx->shader.tes.current);
4196 else if (sctx->shader.tes.current->key.ge.as_ngg)
4197 si_pm4_bind_state(sctx, gs, sctx->shader.tes.current);
4198 else
4199 si_pm4_bind_state(sctx, vs, sctx->shader.tes.current);
4200 }
4201
4202 return true;
4203 }
4204
4205 bool si_update_spi_tmpring_size(struct si_context *sctx, unsigned bytes)
4206 {
4207 unsigned spi_tmpring_size;
4208 ac_get_scratch_tmpring_size(&sctx->screen->info, bytes,
4209 &sctx->max_seen_scratch_bytes_per_wave, &spi_tmpring_size);
4210
4211 unsigned scratch_needed_size = sctx->max_seen_scratch_bytes_per_wave *
4212 sctx->screen->info.max_scratch_waves;
4213
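   /* Sizing illustration (hypothetical numbers): if the largest per-wave scratch request
    * seen so far is 8192 bytes and the device supports 1024 scratch waves, the buffer
    * must hold 8192 * 1024 = 8 MiB. max_seen_scratch_bytes_per_wave tracks the maximum,
    * so later, smaller requests keep reusing the same buffer. */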
4214 if (scratch_needed_size > 0) {
4215 if (!sctx->scratch_buffer || scratch_needed_size > sctx->scratch_buffer->b.b.width0) {
4216 /* Create a bigger scratch buffer */
4217 si_resource_reference(&sctx->scratch_buffer, NULL);
4218
4219 sctx->scratch_buffer = si_aligned_buffer_create(
4220 &sctx->screen->b,
4221 PIPE_RESOURCE_FLAG_UNMAPPABLE | SI_RESOURCE_FLAG_DRIVER_INTERNAL |
4222 SI_RESOURCE_FLAG_DISCARDABLE,
4223 PIPE_USAGE_DEFAULT, scratch_needed_size,
4224 sctx->screen->info.pte_fragment_size);
4225 if (!sctx->scratch_buffer)
4226 return false;
4227 }
4228
4229 if (sctx->gfx_level < GFX11 && !si_update_scratch_relocs(sctx))
4230 return false;
4231 }
4232
4233 if (spi_tmpring_size != sctx->spi_tmpring_size) {
4234 sctx->spi_tmpring_size = spi_tmpring_size;
4235 si_mark_atom_dirty(sctx, &sctx->atoms.s.scratch_state);
4236 }
4237 return true;
4238 }
4239
4240 void si_init_tess_factor_ring(struct si_context *sctx)
4241 {
4242 assert(!sctx->tess_rings);
4243
4244 /* The address must be aligned to 2^19, because the shader only
4245 * receives the high 13 bits. Align it to 2MB to match the GPU page size.
4246 */
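   /* In other words: 2^19 = 512 KiB, so only bits [31:19] (13 bits) of the 32-bit ring
    * address (SI_RESOURCE_FLAG_32BIT below) have to reach the shader. The 2 MiB (2^21)
    * alignment used here is a multiple of 2^19, so it satisfies that requirement while
    * also matching the GPU page size. */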
4247 sctx->tess_rings = pipe_aligned_buffer_create(sctx->b.screen,
4248 PIPE_RESOURCE_FLAG_UNMAPPABLE |
4249 SI_RESOURCE_FLAG_32BIT |
4250 SI_RESOURCE_FLAG_DRIVER_INTERNAL |
4251 SI_RESOURCE_FLAG_DISCARDABLE,
4252 PIPE_USAGE_DEFAULT,
4253 sctx->screen->hs.tess_offchip_ring_size +
4254 sctx->screen->hs.tess_factor_ring_size,
4255 2 * 1024 * 1024);
4256 if (!sctx->tess_rings)
4257 return;
4258
4259 if (sctx->screen->info.has_tmz_support) {
4260 sctx->tess_rings_tmz = pipe_aligned_buffer_create(sctx->b.screen,
4261 PIPE_RESOURCE_FLAG_UNMAPPABLE |
4262 PIPE_RESOURCE_FLAG_ENCRYPTED |
4263 SI_RESOURCE_FLAG_32BIT |
4264 SI_RESOURCE_FLAG_DRIVER_INTERNAL |
4265 SI_RESOURCE_FLAG_DISCARDABLE,
4266 PIPE_USAGE_DEFAULT,
4267 sctx->screen->hs.tess_offchip_ring_size +
4268 sctx->screen->hs.tess_factor_ring_size,
4269 2 * 1024 * 1024);
4270 }
4271
4272 uint64_t factor_va =
4273 si_resource(sctx->tess_rings)->gpu_address + sctx->screen->hs.tess_offchip_ring_size;
4274
4275 unsigned tf_ring_size_field = sctx->screen->hs.tess_factor_ring_size / 4;
4276 if (sctx->gfx_level >= GFX11)
4277 tf_ring_size_field /= sctx->screen->info.max_se;
4278
4279 assert((tf_ring_size_field & C_030938_SIZE) == 0);
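   /* tess_factor_ring_size is in bytes while the SIZE field counts dwords, hence the
    * division by 4 above; the additional division by max_se on GFX11 suggests the field
    * is programmed per shader engine there. */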
4280
4281 if (sctx->shadowing.registers) {
4282 /* These registers will be shadowed, so set them only once. */
4283 /* TODO: tmz + shadowed_regs support */
4284 struct radeon_cmdbuf *cs = &sctx->gfx_cs;
4285
4286 assert(sctx->gfx_level >= GFX7);
4287
4288 radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, si_resource(sctx->tess_rings),
4289 RADEON_USAGE_READWRITE | RADEON_PRIO_SHADER_RINGS);
4290 si_emit_vgt_flush(cs);
4291
4292 /* Set tessellation registers. */
4293 radeon_begin(cs);
4294 radeon_set_uconfig_reg(R_030938_VGT_TF_RING_SIZE,
4295 S_030938_SIZE(tf_ring_size_field));
4296 radeon_set_uconfig_reg(R_030940_VGT_TF_MEMORY_BASE, factor_va >> 8);
4297 if (sctx->gfx_level >= GFX10) {
4298 radeon_set_uconfig_reg(R_030984_VGT_TF_MEMORY_BASE_HI,
4299 S_030984_BASE_HI(factor_va >> 40));
4300 } else if (sctx->gfx_level == GFX9) {
4301 radeon_set_uconfig_reg(R_030944_VGT_TF_MEMORY_BASE_HI,
4302 S_030944_BASE_HI(factor_va >> 40));
4303 }
4304 radeon_set_uconfig_reg(R_03093C_VGT_HS_OFFCHIP_PARAM,
4305 sctx->screen->hs.hs_offchip_param);
4306 radeon_end();
4307 return;
4308 }
4309
4310 /* The codepath without register shadowing is below. */
4311 /* Add these registers to cs_preamble_state. */
4312 for (unsigned tmz = 0; tmz <= 1; tmz++) {
4313 struct si_pm4_state *pm4 = tmz ? sctx->cs_preamble_state_tmz : sctx->cs_preamble_state;
4314 struct pipe_resource *tf_ring = tmz ? sctx->tess_rings_tmz : sctx->tess_rings;
4315
4316 if (!tf_ring)
4317 continue; /* TMZ not supported */
4318
4319 uint64_t va = si_resource(tf_ring)->gpu_address + sctx->screen->hs.tess_offchip_ring_size;
4320
4321 si_cs_preamble_add_vgt_flush(sctx, tmz);
4322
4323 if (sctx->gfx_level >= GFX7) {
4324 si_pm4_set_reg(pm4, R_030938_VGT_TF_RING_SIZE, S_030938_SIZE(tf_ring_size_field));
4325 si_pm4_set_reg(pm4, R_03093C_VGT_HS_OFFCHIP_PARAM, sctx->screen->hs.hs_offchip_param);
4326 si_pm4_set_reg(pm4, R_030940_VGT_TF_MEMORY_BASE, va >> 8);
4327 if (sctx->gfx_level >= GFX10)
4328 si_pm4_set_reg(pm4, R_030984_VGT_TF_MEMORY_BASE_HI, S_030984_BASE_HI(va >> 40));
4329 else if (sctx->gfx_level == GFX9)
4330 si_pm4_set_reg(pm4, R_030944_VGT_TF_MEMORY_BASE_HI, S_030944_BASE_HI(va >> 40));
4331 } else {
4332 si_pm4_set_reg(pm4, R_008988_VGT_TF_RING_SIZE, S_008988_SIZE(tf_ring_size_field));
4333          si_pm4_set_reg(pm4, R_0089B8_VGT_TF_MEMORY_BASE, va >> 8); /* use the per-ring address, like the GFX7+ path above */
4334 si_pm4_set_reg(pm4, R_0089B0_VGT_HS_OFFCHIP_PARAM, sctx->screen->hs.hs_offchip_param);
4335 }
4336 si_pm4_finalize(pm4);
4337 }
4338
4339 /* Flush the context to re-emit the cs_preamble state.
4340     * This is done only once in the lifetime of a context.
4341 */
4342 sctx->initial_gfx_cs_size = 0; /* force flush */
4343 si_flush_gfx_cs(sctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
4344 }
4345
4346 static void si_emit_vgt_pipeline_state(struct si_context *sctx, unsigned index)
4347 {
4348 struct radeon_cmdbuf *cs = &sctx->gfx_cs;
4349
4350 radeon_begin(cs);
4351 radeon_opt_set_context_reg(sctx, R_028B54_VGT_SHADER_STAGES_EN, SI_TRACKED_VGT_SHADER_STAGES_EN,
4352 sctx->vgt_shader_stages_en);
4353 radeon_end_update_context_roll(sctx);
4354
4355 if (sctx->gfx_level >= GFX10) {
4356 uint32_t ge_cntl = sctx->ge_cntl;
4357
4358 if (sctx->gfx_level < GFX11 && sctx->shader.tes.cso) {
4359 /* This must be a multiple of VGT_LS_HS_CONFIG.NUM_PATCHES. */
4360 ge_cntl |= S_03096C_PRIM_GRP_SIZE_GFX10(sctx->num_patches_per_workgroup);
4361 }
4362
4363 radeon_begin_again(cs);
4364 radeon_opt_set_uconfig_reg(sctx, R_03096C_GE_CNTL, SI_TRACKED_GE_CNTL, ge_cntl);
4365 radeon_end();
4366 }
4367 }
4368
4369 static void si_emit_scratch_state(struct si_context *sctx, unsigned index)
4370 {
4371 struct radeon_cmdbuf *cs = &sctx->gfx_cs;
4372
4373 radeon_begin(cs);
4374 if (sctx->gfx_level >= GFX11) {
4375 radeon_set_context_reg_seq(R_0286E8_SPI_TMPRING_SIZE, 3);
4376 radeon_emit(sctx->spi_tmpring_size); /* SPI_TMPRING_SIZE */
4377 radeon_emit(sctx->scratch_buffer->gpu_address >> 8); /* SPI_GFX_SCRATCH_BASE_LO */
4378 radeon_emit(sctx->scratch_buffer->gpu_address >> 40); /* SPI_GFX_SCRATCH_BASE_HI */
4379 } else {
4380 radeon_set_context_reg(R_0286E8_SPI_TMPRING_SIZE, sctx->spi_tmpring_size);
4381 }
4382 radeon_end();
4383
4384 if (sctx->scratch_buffer) {
4385 radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, sctx->scratch_buffer,
4386 RADEON_USAGE_READWRITE | RADEON_PRIO_SCRATCH_BUFFER);
4387 }
4388 }
4389
4390 struct si_fixed_func_tcs_shader_key {
4391 uint64_t outputs_written;
4392 uint8_t vertices_out;
4393 };
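/* Note: a uint64_t followed by a uint8_t leaves 7 bytes of padding, and the key is hashed
 * and memcmp'd as raw memory below, so it must be fully zero-initialized before use. */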
4394
4395 static uint32_t si_fixed_func_tcs_shader_key_hash(const void *key)
4396 {
4397 return _mesa_hash_data(key, sizeof(struct si_fixed_func_tcs_shader_key));
4398 }
4399
4400 static bool si_fixed_func_tcs_shader_key_equals(const void *a, const void *b)
4401 {
4402 return memcmp(a, b, sizeof(struct si_fixed_func_tcs_shader_key)) == 0;
4403 }
4404
4405 bool si_set_tcs_to_fixed_func_shader(struct si_context *sctx)
4406 {
4407 if (!sctx->fixed_func_tcs_shader_cache) {
4408 sctx->fixed_func_tcs_shader_cache = _mesa_hash_table_create(
4409 NULL, si_fixed_func_tcs_shader_key_hash,
4410 si_fixed_func_tcs_shader_key_equals);
4411 }
4412
4413    struct si_fixed_func_tcs_shader_key key;
   memset(&key, 0, sizeof(key)); /* zero the padding bytes: the key is hashed and memcmp'd as raw memory */
4414    key.outputs_written = sctx->shader.vs.cso->info.outputs_written_before_tes_gs;
4415    key.vertices_out = sctx->patch_vertices;
4416
4417 struct hash_entry *entry = _mesa_hash_table_search(
4418 sctx->fixed_func_tcs_shader_cache, &key);
4419
4420 struct si_shader_selector *tcs;
4421 if (entry)
4422 tcs = (struct si_shader_selector *)entry->data;
4423 else {
4424 tcs = (struct si_shader_selector *)si_create_passthrough_tcs(sctx);
4425 if (!tcs)
4426 return false;
      /* The hash table stores the key by pointer, so insert a heap copy rather than the
       * address of this stack variable, which goes out of scope when we return. */
      struct si_fixed_func_tcs_shader_key *heap_key = MALLOC_STRUCT(si_fixed_func_tcs_shader_key);
      if (heap_key) {
         *heap_key = key;
         _mesa_hash_table_insert(sctx->fixed_func_tcs_shader_cache, heap_key, (void *)tcs);
      }
4428 }
4429
4430 sctx->shader.tcs.cso = tcs;
4431 sctx->shader.tcs.key.ge.part.tcs.epilog.invoc0_tess_factors_are_def =
4432 tcs->info.tessfactors_are_def_in_all_invocs;
4433
4434 return true;
4435 }
4436
4437 static void si_update_tess_in_out_patch_vertices(struct si_context *sctx)
4438 {
4439 if (sctx->is_user_tcs) {
4440 struct si_shader_selector *tcs = sctx->shader.tcs.cso;
4441
4442 bool same_patch_vertices =
4443 sctx->gfx_level >= GFX9 &&
4444 sctx->patch_vertices == tcs->info.base.tess.tcs_vertices_out;
4445
4446 if (sctx->shader.tcs.key.ge.opt.same_patch_vertices != same_patch_vertices) {
4447 sctx->shader.tcs.key.ge.opt.same_patch_vertices = same_patch_vertices;
4448 sctx->do_update_shaders = true;
4449 }
4450 } else {
4451       /* These fields are static for the fixed-function TCS, so there is no need to set
4452        * do_update_shaders between fixed-function TCS draws. When switching between the
4453        * fixed-function TCS and a user TCS, do_update_shaders is already set by the bind state.
4454        */
4455 sctx->shader.tcs.key.ge.opt.same_patch_vertices = sctx->gfx_level >= GFX9;
4456
4457       /* If only patch_vertices changed (without a shader bind), the fixed-function TCS still needs to be updated to match. */
4458 if (sctx->shader.tcs.cso &&
4459 sctx->shader.tcs.cso->info.base.tess.tcs_vertices_out != sctx->patch_vertices)
4460 sctx->do_update_shaders = true;
4461 }
4462 }
4463
4464 static void si_set_patch_vertices(struct pipe_context *ctx, uint8_t patch_vertices)
4465 {
4466 struct si_context *sctx = (struct si_context *)ctx;
4467
4468 if (sctx->patch_vertices != patch_vertices) {
4469 sctx->patch_vertices = patch_vertices;
4470 si_update_tess_in_out_patch_vertices(sctx);
4471 if (sctx->shader.tcs.current) {
4472 /* Update the io layout now if possible,
4473 * otherwise make sure it's done by si_update_shaders.
4474 */
4475 if (sctx->tess_rings)
4476 si_update_tess_io_layout_state(sctx);
4477 else
4478 sctx->do_update_shaders = true;
4479 }
4480
4481 }
4482 }
4483
4484 /**
4485 * This calculates the LDS size for tessellation shaders (VS, TCS, TES).
4486 * LS.LDS_SIZE is shared by all 3 shader stages.
4487 *
4488 * The information about LDS and other non-compile-time parameters is then
4489 * written to userdata SGPRs.
4490 *
4491 * This depends on:
4492 * - patch_vertices
4493 * - VS and the currently selected shader variant (called by si_update_shaders)
4494 * - TCS and the currently selected shader variant (called by si_update_shaders)
4495 * - tess_uses_prim_id (called by si_update_shaders)
4496 * - sh_base[TESS_EVAL] depending on GS on/off (called by si_update_shaders)
4497 */
4498 void si_update_tess_io_layout_state(struct si_context *sctx)
4499 {
4500 struct si_shader *ls_current;
4501 struct si_shader_selector *ls;
4502 struct si_shader_selector *tcs = sctx->shader.tcs.cso;
4503 unsigned tess_uses_primid = sctx->ia_multi_vgt_param_key.u.tess_uses_prim_id;
4504 bool has_primid_instancing_bug = sctx->gfx_level == GFX6 && sctx->screen->info.max_se == 1;
4505 unsigned tes_sh_base = sctx->shader_pointers.sh_base[PIPE_SHADER_TESS_EVAL];
4506 uint8_t num_tcs_input_cp = sctx->patch_vertices;
4507
4508 assert(sctx->shader.tcs.current);
4509
4510 /* Since GFX9 has merged LS-HS in the TCS state, set LS = TCS. */
4511 if (sctx->gfx_level >= GFX9) {
4512 ls_current = sctx->shader.tcs.current;
4513 ls = ls_current->key.ge.part.tcs.ls;
4514 } else {
4515 ls_current = sctx->shader.vs.current;
4516 ls = sctx->shader.vs.cso;
4517 }
4518
4519 if (sctx->last_ls == ls_current && sctx->last_tcs == tcs &&
4520 sctx->last_tes_sh_base == tes_sh_base && sctx->last_num_tcs_input_cp == num_tcs_input_cp &&
4521 (!has_primid_instancing_bug || (sctx->last_tess_uses_primid == tess_uses_primid)))
4522 return;
4523
4524 sctx->last_ls = ls_current;
4525 sctx->last_tcs = tcs;
4526 sctx->last_tes_sh_base = tes_sh_base;
4527 sctx->last_num_tcs_input_cp = num_tcs_input_cp;
4528 sctx->last_tess_uses_primid = tess_uses_primid;
4529
4530 /* This calculates how shader inputs and outputs among VS, TCS, and TES
4531 * are laid out in LDS. */
4532 unsigned num_tcs_outputs = util_last_bit64(tcs->info.outputs_written_before_tes_gs);
4533 unsigned num_tcs_output_cp = tcs->info.base.tess.tcs_vertices_out;
4534 unsigned num_tcs_patch_outputs = util_last_bit64(tcs->info.patch_outputs_written);
4535
4536 unsigned input_vertex_size = ls->info.lshs_vertex_stride;
4537 unsigned output_vertex_size = num_tcs_outputs * 16;
4538 unsigned input_patch_size;
4539
4540 /* Allocate LDS for TCS inputs only if it's used. */
4541 if (!ls_current->key.ge.opt.same_patch_vertices ||
4542 tcs->info.base.inputs_read & ~tcs->info.tcs_vgpr_only_inputs)
4543 input_patch_size = num_tcs_input_cp * input_vertex_size;
4544 else
4545 input_patch_size = 0;
4546
4547 unsigned pervertex_output_patch_size = num_tcs_output_cp * output_vertex_size;
4548 unsigned output_patch_size = pervertex_output_patch_size + num_tcs_patch_outputs * 16;
4549 unsigned lds_per_patch;
4550
4551 /* Compute the LDS size per patch.
4552 *
4553 * LDS is used to store TCS outputs if they are read, and to store tess
4554 * factors if they are not defined in all invocations.
4555 */
4556 if (tcs->info.base.outputs_read ||
4557 tcs->info.base.patch_outputs_read ||
4558 !tcs->info.tessfactors_are_def_in_all_invocs) {
4559 lds_per_patch = input_patch_size + output_patch_size;
4560 } else {
4561 /* LDS will only store TCS inputs. The offchip buffer will only store TCS outputs. */
4562 lds_per_patch = MAX2(input_patch_size, output_patch_size);
4563 }
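   /* Illustrative sizes (hypothetical shader): 3 input control points with a 64-byte LS
    * vertex stride and 4 output control points writing 8 vec4 outputs plus 2 per-patch
    * vec4s give input_patch_size = 3 * 64 = 192 bytes and output_patch_size =
    * 4 * 8 * 16 + 2 * 16 = 544 bytes, so lds_per_patch is 736 bytes when TCS outputs are
    * read back from LDS and 544 bytes otherwise. */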
4564
4565 /* Ensure that we only need 4 waves per CU, so that we don't need to check
4566 * resource usage (such as whether we have enough VGPRs to fit the whole
4567  * threadgroup into the CU). It also ensures that the number of TCS input and output
4568  * vertices per threadgroup is at most 256, which is the hw limit.
4569 */
4570 unsigned max_verts_per_patch = MAX2(num_tcs_input_cp, num_tcs_output_cp);
4571 unsigned num_patches = 256 / max_verts_per_patch;
4572
4573 /* Not necessary for correctness, but higher numbers are slower.
4574 * The hardware can do more, but the radeonsi shader constant is
4575 * limited to 6 bits.
4576 */
4577 num_patches = MIN2(num_patches, 64); /* e.g. 64 triangles in exactly 3 waves */
4578
4579 /* When distributed tessellation is unsupported, switch between SEs
4580 * at a higher frequency to manually balance the workload between SEs.
4581 */
4582 if (!sctx->screen->info.has_distributed_tess && sctx->screen->info.max_se > 1)
4583 num_patches = MIN2(num_patches, 16); /* recommended */
4584
4585 /* Make sure the output data fits in the offchip buffer */
4586 num_patches =
4587 MIN2(num_patches, (sctx->screen->hs.tess_offchip_block_dw_size * 4) / output_patch_size);
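   /* Illustrative: with a hypothetical 8192-dword offchip block (32 KiB) and a 2048-byte
    * output_patch_size, this clamp limits num_patches to 16. */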
4588
4589 /* Make sure that the data fits in LDS. This assumes the shaders only
4590 * use LDS for the inputs and outputs.
4591 *
4592 * The maximum allowed LDS size is 32K. Higher numbers can hang.
4593 * Use 16K as the maximum, so that we can fit 2 workgroups on the same CU.
4594 */
4595 ASSERTED unsigned max_lds_size = 32 * 1024; /* hw limit */
4596 unsigned target_lds_size = 16 * 1024; /* target at least 2 workgroups per CU, 16K each */
4597 num_patches = MIN2(num_patches, target_lds_size / lds_per_patch);
4598 num_patches = MAX2(num_patches, 1);
4599 assert(num_patches * lds_per_patch <= max_lds_size);
4600
4601 /* Make sure that vector lanes are fully occupied by cutting off the last wave
4602 * if it's only partially filled.
4603 */
4604 unsigned temp_verts_per_tg = num_patches * max_verts_per_patch;
4605 unsigned wave_size = ls_current->wave_size;
4606
4607 if (temp_verts_per_tg > wave_size &&
4608 (wave_size - temp_verts_per_tg % wave_size >= MAX2(max_verts_per_patch, 8)))
4609 num_patches = (temp_verts_per_tg & ~(wave_size - 1)) / max_verts_per_patch;
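   /* Illustrative: if the clamps above left num_patches = 30 with 3 vertices per patch and
    * Wave64, the threadgroup would have 90 vertices, i.e. a second wave with only 26 active
    * lanes. Since 64 - 26 = 38 >= 8, num_patches is trimmed to (90 & ~63) / 3 = 21 patches,
    * i.e. 63 vertices in a single, nearly full wave. */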
4610
4611 if (sctx->gfx_level == GFX6) {
4612 /* GFX6 bug workaround, related to power management. Limit LS-HS
4613 * threadgroups to only one wave.
4614 */
4615 unsigned one_wave = wave_size / max_verts_per_patch;
4616 num_patches = MIN2(num_patches, one_wave);
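      /* e.g. with a wave size of 64 and 4 vertices per patch, this limits a threadgroup
       * to 16 patches. */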
4617 }
4618
4619 /* The VGT HS block increments the patch ID unconditionally
4620 * within a single threadgroup. This results in incorrect
4621 * patch IDs when instanced draws are used.
4622 *
4623 * The intended solution is to restrict threadgroups to
4624 * a single instance by setting SWITCH_ON_EOI, which
4625 * should cause IA to split instances up. However, this
4626 * doesn't work correctly on GFX6 when there is no other
4627 * SE to switch to.
4628 */
4629 if (has_primid_instancing_bug && tess_uses_primid)
4630 num_patches = 1;
4631
4632 if (sctx->num_patches_per_workgroup != num_patches) {
4633 sctx->num_patches_per_workgroup = num_patches;
4634 si_mark_atom_dirty(sctx, &sctx->atoms.s.vgt_pipeline_state);
4635 }
4636
4637 unsigned output_patch0_offset = input_patch_size * num_patches;
4638 unsigned perpatch_output_offset = output_patch0_offset + pervertex_output_patch_size;
4639
4640 /* Compute userdata SGPRs. */
4641 assert(((input_vertex_size / 4) & ~0xff) == 0);
4642 assert(((perpatch_output_offset / 4) & ~0xffff) == 0);
4643 assert(num_tcs_input_cp <= 32);
4644 assert(num_tcs_output_cp <= 32);
4645 assert(num_patches <= 64);
4646 assert(((pervertex_output_patch_size * num_patches) & ~0xffff) == 0);
4647
4648 uint64_t ring_va = (unlikely(sctx->ws->cs_is_secure(&sctx->gfx_cs)) ?
4649 si_resource(sctx->tess_rings_tmz) : si_resource(sctx->tess_rings))->gpu_address;
4650 assert((ring_va & u_bit_consecutive(0, 19)) == 0);
4651
4652 sctx->tes_offchip_ring_va_sgpr = ring_va;
4653 sctx->tcs_offchip_layout =
4654 (num_patches - 1) | ((num_tcs_output_cp - 1) << 6) | ((num_tcs_input_cp - 1) << 11) |
4655 ((pervertex_output_patch_size * num_patches) << 16);
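   /* Resulting layout of tcs_offchip_layout (matches the shifts above):
    *   bits [5:0]   num_patches - 1
    *   bits [10:6]  num_tcs_output_cp - 1
    *   bits [15:11] num_tcs_input_cp - 1
    *   bits [31:16] pervertex_output_patch_size * num_patches (bytes)
    */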
4656
4657 /* Compute the LDS size. */
4658 unsigned lds_size = lds_per_patch * num_patches;
4659
4660 if (sctx->gfx_level >= GFX7) {
4661 assert(lds_size <= 65536);
4662 lds_size = align(lds_size, 512) / 512;
4663 } else {
4664 assert(lds_size <= 32768);
4665 lds_size = align(lds_size, 256) / 256;
4666 }
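   /* Example: lds_per_patch = 1500 bytes and num_patches = 10 give lds_size = 15000 bytes,
    * which is encoded as align(15000, 512) / 512 = 30 allocation granules (15360 bytes)
    * on GFX7+. */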
4667
4668 /* Set SI_SGPR_VS_STATE_BITS. */
4669 SET_FIELD(sctx->current_vs_state, VS_STATE_LS_OUT_VERTEX_SIZE, input_vertex_size / 4);
4670 SET_FIELD(sctx->current_vs_state, VS_STATE_TCS_OUT_PATCH0_OFFSET, perpatch_output_offset / 4);
4671
4672 /* We should be able to support in-shader LDS use with LLVM >= 9
4673 * by just adding the lds_sizes together, but it has never
4674 * been tested. */
4675 assert(ls_current->config.lds_size == 0);
4676
4677 unsigned ls_hs_rsrc2;
4678
4679 if (sctx->gfx_level >= GFX9) {
4680 ls_hs_rsrc2 = sctx->shader.tcs.current->config.rsrc2;
4681
4682 if (sctx->gfx_level >= GFX10)
4683 ls_hs_rsrc2 |= S_00B42C_LDS_SIZE_GFX10(lds_size);
4684 else
4685 ls_hs_rsrc2 |= S_00B42C_LDS_SIZE_GFX9(lds_size);
4686 } else {
4687 ls_hs_rsrc2 = sctx->shader.vs.current->config.rsrc2;
4688
4689 si_multiwave_lds_size_workaround(sctx->screen, &lds_size);
4690 ls_hs_rsrc2 |= S_00B52C_LDS_SIZE(lds_size);
4691 }
4692
4693 sctx->ls_hs_rsrc2 = ls_hs_rsrc2;
4694 sctx->ls_hs_config =
4695 S_028B58_NUM_PATCHES(sctx->num_patches_per_workgroup) |
4696 S_028B58_HS_NUM_INPUT_CP(num_tcs_input_cp) |
4697 S_028B58_HS_NUM_OUTPUT_CP(num_tcs_output_cp);
4698
4699 si_mark_atom_dirty(sctx, &sctx->atoms.s.tess_io_layout);
4700 }
4701
4702 static void si_emit_tess_io_layout_state(struct si_context *sctx, unsigned index)
4703 {
4704 struct radeon_cmdbuf *cs = &sctx->gfx_cs;
4705
4706 if (!sctx->shader.tes.cso || !sctx->shader.tcs.current)
4707 return;
4708
4709 radeon_begin(cs);
4710 if (sctx->screen->info.has_set_sh_pairs_packed) {
4711 gfx11_opt_push_gfx_sh_reg(R_00B42C_SPI_SHADER_PGM_RSRC2_HS,
4712 SI_TRACKED_SPI_SHADER_PGM_RSRC2_HS, sctx->ls_hs_rsrc2);
4713
4714 /* Set userdata SGPRs for merged LS-HS. */
4715 gfx11_opt_push_gfx_sh_reg(R_00B430_SPI_SHADER_USER_DATA_HS_0 +
4716 GFX9_SGPR_TCS_OFFCHIP_LAYOUT * 4,
4717 SI_TRACKED_SPI_SHADER_USER_DATA_HS__TCS_OFFCHIP_LAYOUT,
4718 sctx->tcs_offchip_layout);
4719 gfx11_opt_push_gfx_sh_reg(R_00B430_SPI_SHADER_USER_DATA_HS_0 +
4720 GFX9_SGPR_TCS_OFFCHIP_ADDR * 4,
4721 SI_TRACKED_SPI_SHADER_USER_DATA_HS__TCS_OFFCHIP_ADDR,
4722 sctx->tes_offchip_ring_va_sgpr);
4723 } else if (sctx->gfx_level >= GFX9) {
4724 radeon_opt_set_sh_reg(sctx, R_00B42C_SPI_SHADER_PGM_RSRC2_HS,
4725 SI_TRACKED_SPI_SHADER_PGM_RSRC2_HS, sctx->ls_hs_rsrc2);
4726
4727 /* Set userdata SGPRs for merged LS-HS. */
4728 radeon_opt_set_sh_reg2(sctx,
4729 R_00B430_SPI_SHADER_USER_DATA_HS_0 +
4730 GFX9_SGPR_TCS_OFFCHIP_LAYOUT * 4,
4731 SI_TRACKED_SPI_SHADER_USER_DATA_HS__TCS_OFFCHIP_LAYOUT,
4732 sctx->tcs_offchip_layout, sctx->tes_offchip_ring_va_sgpr);
4733 } else {
4734 /* Due to a hw bug, RSRC2_LS must be written twice with another
4735 * LS register written in between. */
4736 if (sctx->gfx_level == GFX7 && sctx->family != CHIP_HAWAII)
4737 radeon_set_sh_reg(R_00B52C_SPI_SHADER_PGM_RSRC2_LS, sctx->ls_hs_rsrc2);
4738 radeon_set_sh_reg_seq(R_00B528_SPI_SHADER_PGM_RSRC1_LS, 2);
4739 radeon_emit(sctx->shader.vs.current->config.rsrc1);
4740 radeon_emit(sctx->ls_hs_rsrc2);
4741
4742 /* Set userdata SGPRs for TCS. */
4743 radeon_opt_set_sh_reg3(sctx,
4744 R_00B430_SPI_SHADER_USER_DATA_HS_0 +
4745 GFX6_SGPR_TCS_OFFCHIP_LAYOUT * 4,
4746 SI_TRACKED_SPI_SHADER_USER_DATA_HS__TCS_OFFCHIP_LAYOUT,
4747 sctx->tcs_offchip_layout, sctx->tes_offchip_ring_va_sgpr,
4748 sctx->current_vs_state);
4749 }
4750
4751 /* Set userdata SGPRs for TES. */
4752 unsigned tes_sh_base = sctx->shader_pointers.sh_base[PIPE_SHADER_TESS_EVAL];
4753 assert(tes_sh_base);
4754
4755 /* TES (as ES or VS) reuses the BaseVertex and DrawID user SGPRs that are used when
4756 * tessellation is disabled. We can do that because those user SGPRs are only set in LS
4757 * for tessellation and are unused in TES.
4758 */
4759 if (sctx->screen->info.has_set_sh_pairs_packed) {
4760 gfx11_opt_push_gfx_sh_reg(tes_sh_base + SI_SGPR_TES_OFFCHIP_LAYOUT * 4,
4761 SI_TRACKED_SPI_SHADER_USER_DATA_ES__BASE_VERTEX,
4762 sctx->tcs_offchip_layout);
4763 gfx11_opt_push_gfx_sh_reg(tes_sh_base + SI_SGPR_TES_OFFCHIP_ADDR * 4,
4764 SI_TRACKED_SPI_SHADER_USER_DATA_ES__DRAWID,
4765 sctx->tes_offchip_ring_va_sgpr);
4766 } else {
4767 bool has_gs = sctx->ngg || sctx->shader.gs.cso;
4768
4769 radeon_opt_set_sh_reg2(sctx, tes_sh_base + SI_SGPR_TES_OFFCHIP_LAYOUT * 4,
4770 has_gs ? SI_TRACKED_SPI_SHADER_USER_DATA_ES__BASE_VERTEX
4771 : SI_TRACKED_SPI_SHADER_USER_DATA_VS__BASE_VERTEX,
4772 sctx->tcs_offchip_layout, sctx->tes_offchip_ring_va_sgpr);
4773 }
4774 radeon_end();
4775
4776 radeon_begin_again(cs);
4777 if (sctx->gfx_level >= GFX7) {
4778 radeon_opt_set_context_reg_idx(sctx, R_028B58_VGT_LS_HS_CONFIG,
4779 SI_TRACKED_VGT_LS_HS_CONFIG, 2, sctx->ls_hs_config);
4780 } else {
4781 radeon_opt_set_context_reg(sctx, R_028B58_VGT_LS_HS_CONFIG,
4782 SI_TRACKED_VGT_LS_HS_CONFIG, sctx->ls_hs_config);
4783 }
4784 radeon_end_update_context_roll(sctx);
4785 }
4786
4787 void si_init_screen_live_shader_cache(struct si_screen *sscreen)
4788 {
4789 util_live_shader_cache_init(&sscreen->live_shader_cache, si_create_shader_selector,
4790 si_destroy_shader_selector);
4791 }
4792
4793 template<int NUM_INTERP>
4794 static void si_emit_spi_map(struct si_context *sctx, unsigned index)
4795 {
4796 struct si_shader *ps = sctx->shader.ps.current;
4797 unsigned spi_ps_input_cntl[NUM_INTERP];
4798
4799 STATIC_ASSERT(NUM_INTERP >= 0 && NUM_INTERP <= 32);
4800
4801 if (!NUM_INTERP)
4802 return;
4803
4804 struct si_shader *vs = si_get_vs(sctx)->current;
4805 struct si_state_rasterizer *rs = sctx->queued.named.rasterizer;
4806
4807 for (unsigned i = 0; i < NUM_INTERP; i++) {
4808 union si_input_info input = ps->info.ps_inputs[i];
4809 unsigned ps_input_cntl = vs->info.vs_output_ps_input_cntl[input.semantic];
4810 bool non_default_val = G_028644_OFFSET(ps_input_cntl) != 0x20;
4811
4812 if (non_default_val) {
4813 if (input.interpolate == INTERP_MODE_FLAT ||
4814 (input.interpolate == INTERP_MODE_COLOR && rs->flatshade))
4815 ps_input_cntl |= S_028644_FLAT_SHADE(1);
4816
4817 if (input.fp16_lo_hi_valid) {
4818 ps_input_cntl |= S_028644_FP16_INTERP_MODE(1) |
4819 S_028644_ATTR0_VALID(1) | /* this must be set if FP16_INTERP_MODE is set */
4820 S_028644_ATTR1_VALID(!!(input.fp16_lo_hi_valid & 0x2));
4821 }
4822 }
4823
4824 if (input.semantic == VARYING_SLOT_PNTC ||
4825 (input.semantic >= VARYING_SLOT_TEX0 && input.semantic <= VARYING_SLOT_TEX7 &&
4826 rs->sprite_coord_enable & (1 << (input.semantic - VARYING_SLOT_TEX0)))) {
4827 /* Overwrite the whole value (except OFFSET) for sprite coordinates. */
4828 ps_input_cntl &= ~C_028644_OFFSET;
4829 ps_input_cntl |= S_028644_PT_SPRITE_TEX(1);
4830 if (input.fp16_lo_hi_valid & 0x1) {
4831 ps_input_cntl |= S_028644_FP16_INTERP_MODE(1) |
4832 S_028644_ATTR0_VALID(1);
4833 }
4834 }
4835
4836 spi_ps_input_cntl[i] = ps_input_cntl;
4837 }
4838
4839 /* Performance notes:
4840 * Dota 2: Only ~16% of SPI map updates set different values.
4841 * Talos: Only ~9% of SPI map updates set different values.
4842 */
4843 radeon_begin(&sctx->gfx_cs);
4844 radeon_opt_set_context_regn(sctx, R_028644_SPI_PS_INPUT_CNTL_0, spi_ps_input_cntl,
4845 sctx->tracked_regs.spi_ps_input_cntl, NUM_INTERP);
4846 radeon_end_update_context_roll(sctx);
4847 }
4848
4849 void si_init_shader_functions(struct si_context *sctx)
4850 {
4851 sctx->atoms.s.vgt_pipeline_state.emit = si_emit_vgt_pipeline_state;
4852 sctx->atoms.s.scratch_state.emit = si_emit_scratch_state;
4853 sctx->atoms.s.tess_io_layout.emit = si_emit_tess_io_layout_state;
4854
4855 sctx->b.create_vs_state = si_create_shader;
4856 sctx->b.create_tcs_state = si_create_shader;
4857 sctx->b.create_tes_state = si_create_shader;
4858 sctx->b.create_gs_state = si_create_shader;
4859 sctx->b.create_fs_state = si_create_shader;
4860
4861 sctx->b.bind_vs_state = si_bind_vs_shader;
4862 sctx->b.bind_tcs_state = si_bind_tcs_shader;
4863 sctx->b.bind_tes_state = si_bind_tes_shader;
4864 sctx->b.bind_gs_state = si_bind_gs_shader;
4865 sctx->b.bind_fs_state = si_bind_ps_shader;
4866
4867 sctx->b.delete_vs_state = si_delete_shader_selector;
4868 sctx->b.delete_tcs_state = si_delete_shader_selector;
4869 sctx->b.delete_tes_state = si_delete_shader_selector;
4870 sctx->b.delete_gs_state = si_delete_shader_selector;
4871 sctx->b.delete_fs_state = si_delete_shader_selector;
4872
4873 sctx->b.set_patch_vertices = si_set_patch_vertices;
4874
4875 /* This unrolls the loops in si_emit_spi_map and inlines memcmp and memcpys.
4876 * It improves performance for viewperf/snx.
4877 */
4878 sctx->emit_spi_map[0] = si_emit_spi_map<0>;
4879 sctx->emit_spi_map[1] = si_emit_spi_map<1>;
4880 sctx->emit_spi_map[2] = si_emit_spi_map<2>;
4881 sctx->emit_spi_map[3] = si_emit_spi_map<3>;
4882 sctx->emit_spi_map[4] = si_emit_spi_map<4>;
4883 sctx->emit_spi_map[5] = si_emit_spi_map<5>;
4884 sctx->emit_spi_map[6] = si_emit_spi_map<6>;
4885 sctx->emit_spi_map[7] = si_emit_spi_map<7>;
4886 sctx->emit_spi_map[8] = si_emit_spi_map<8>;
4887 sctx->emit_spi_map[9] = si_emit_spi_map<9>;
4888 sctx->emit_spi_map[10] = si_emit_spi_map<10>;
4889 sctx->emit_spi_map[11] = si_emit_spi_map<11>;
4890 sctx->emit_spi_map[12] = si_emit_spi_map<12>;
4891 sctx->emit_spi_map[13] = si_emit_spi_map<13>;
4892 sctx->emit_spi_map[14] = si_emit_spi_map<14>;
4893 sctx->emit_spi_map[15] = si_emit_spi_map<15>;
4894 sctx->emit_spi_map[16] = si_emit_spi_map<16>;
4895 sctx->emit_spi_map[17] = si_emit_spi_map<17>;
4896 sctx->emit_spi_map[18] = si_emit_spi_map<18>;
4897 sctx->emit_spi_map[19] = si_emit_spi_map<19>;
4898 sctx->emit_spi_map[20] = si_emit_spi_map<20>;
4899 sctx->emit_spi_map[21] = si_emit_spi_map<21>;
4900 sctx->emit_spi_map[22] = si_emit_spi_map<22>;
4901 sctx->emit_spi_map[23] = si_emit_spi_map<23>;
4902 sctx->emit_spi_map[24] = si_emit_spi_map<24>;
4903 sctx->emit_spi_map[25] = si_emit_spi_map<25>;
4904 sctx->emit_spi_map[26] = si_emit_spi_map<26>;
4905 sctx->emit_spi_map[27] = si_emit_spi_map<27>;
4906 sctx->emit_spi_map[28] = si_emit_spi_map<28>;
4907 sctx->emit_spi_map[29] = si_emit_spi_map<29>;
4908 sctx->emit_spi_map[30] = si_emit_spi_map<30>;
4909 sctx->emit_spi_map[31] = si_emit_spi_map<31>;
4910 sctx->emit_spi_map[32] = si_emit_spi_map<32>;
4911 }
4912