1 /*
2 * Copyright © 2016 Red Hat.
3 * Copyright © 2016 Bas Nieuwenhuizen
4 *
5 * based in part on anv driver which is:
6 * Copyright © 2015 Intel Corporation
7 *
8 * Permission is hereby granted, free of charge, to any person obtaining a
9 * copy of this software and associated documentation files (the "Software"),
10 * to deal in the Software without restriction, including without limitation
11 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
12 * and/or sell copies of the Software, and to permit persons to whom the
13 * Software is furnished to do so, subject to the following conditions:
14 *
15 * The above copyright notice and this permission notice (including the next
16 * paragraph) shall be included in all copies or substantial portions of the
17 * Software.
18 *
19 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
20 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
22 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
24 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
25 * IN THE SOFTWARE.
26 */
27
28 #include "nir/nir.h"
29 #include "nir/nir_builder.h"
30 #include "nir/nir_xfb_info.h"
31 #include "spirv/nir_spirv.h"
32 #include "util/disk_cache.h"
33 #include "util/mesa-sha1.h"
34 #include "util/u_atomic.h"
35 #include "radv_cs.h"
36 #include "radv_debug.h"
37 #include "radv_private.h"
38 #include "radv_shader.h"
39 #include "vk_util.h"
40
41 #include "util/debug.h"
42 #include "ac_binary.h"
43 #include "ac_exp_param.h"
44 #include "ac_nir.h"
45 #include "ac_shader_util.h"
46 #include "aco_interface.h"
47 #include "sid.h"
48 #include "vk_format.h"
49
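/* Bit-packing conventions used below (derived from how the fields are filled
 * in radv_pipeline_init_blend_state): fields with a "_4bit" suffix hold one
 * 4-bit nibble per color target (bits 0..2 = R/G/B, bit 3 = alpha), while
 * per-target registers such as sx_mrt_blend_opt and cb_blend_control have one
 * 32-bit entry per MRT.
 */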
50 struct radv_blend_state {
51 uint32_t blend_enable_4bit;
52 uint32_t need_src_alpha;
53
54 uint32_t cb_target_mask;
55 uint32_t cb_target_enabled_4bit;
56 uint32_t sx_mrt_blend_opt[8];
57 uint32_t cb_blend_control[8];
58
59 uint32_t spi_shader_col_format;
60 uint32_t col_format_is_int8;
61 uint32_t col_format_is_int10;
62 uint32_t cb_shader_mask;
63 uint32_t db_alpha_to_mask;
64
65 uint32_t commutative_4bit;
66
67 bool single_cb_enable;
68 bool mrt0_is_dual_src;
69 };
70
71 struct radv_dsa_order_invariance {
72 /* Whether the final result in Z/S buffers is guaranteed to be
73 * invariant under changes to the order in which fragments arrive.
74 */
75 bool zs;
76
77 /* Whether the set of fragments that pass the combined Z/S test is
78 * guaranteed to be invariant under changes to the order in which
79 * fragments arrive.
80 */
81 bool pass_set;
82 };
83
84 static bool
85 radv_is_state_dynamic(const VkGraphicsPipelineCreateInfo *pCreateInfo, VkDynamicState state)
86 {
87 if (pCreateInfo->pDynamicState) {
88 uint32_t count = pCreateInfo->pDynamicState->dynamicStateCount;
89 for (uint32_t i = 0; i < count; i++) {
90 if (pCreateInfo->pDynamicState->pDynamicStates[i] == state)
91 return true;
92 }
93 }
94
95 return false;
96 }
97
98 static const VkPipelineMultisampleStateCreateInfo *
99 radv_pipeline_get_multisample_state(const VkGraphicsPipelineCreateInfo *pCreateInfo)
100 {
101 if (!pCreateInfo->pRasterizationState->rasterizerDiscardEnable ||
102 radv_is_state_dynamic(pCreateInfo, VK_DYNAMIC_STATE_RASTERIZER_DISCARD_ENABLE_EXT))
103 return pCreateInfo->pMultisampleState;
104 return NULL;
105 }
106
107 static const VkPipelineTessellationStateCreateInfo *
108 radv_pipeline_get_tessellation_state(const VkGraphicsPipelineCreateInfo *pCreateInfo)
109 {
110 for (uint32_t i = 0; i < pCreateInfo->stageCount; i++) {
111 if (pCreateInfo->pStages[i].stage == VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT ||
112 pCreateInfo->pStages[i].stage == VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT) {
113 return pCreateInfo->pTessellationState;
114 }
115 }
116 return NULL;
117 }
118
119 static const VkPipelineDepthStencilStateCreateInfo *
120 radv_pipeline_get_depth_stencil_state(const VkGraphicsPipelineCreateInfo *pCreateInfo)
121 {
122 RADV_FROM_HANDLE(radv_render_pass, pass, pCreateInfo->renderPass);
123 struct radv_subpass *subpass = pass->subpasses + pCreateInfo->subpass;
124
125 if ((!pCreateInfo->pRasterizationState->rasterizerDiscardEnable &&
126 subpass->depth_stencil_attachment) ||
127 radv_is_state_dynamic(pCreateInfo, VK_DYNAMIC_STATE_RASTERIZER_DISCARD_ENABLE_EXT))
128 return pCreateInfo->pDepthStencilState;
129 return NULL;
130 }
131
132 static const VkPipelineColorBlendStateCreateInfo *
133 radv_pipeline_get_color_blend_state(const VkGraphicsPipelineCreateInfo *pCreateInfo)
134 {
135 RADV_FROM_HANDLE(radv_render_pass, pass, pCreateInfo->renderPass);
136 struct radv_subpass *subpass = pass->subpasses + pCreateInfo->subpass;
137
138 if ((!pCreateInfo->pRasterizationState->rasterizerDiscardEnable && subpass->has_color_att) ||
139 radv_is_state_dynamic(pCreateInfo, VK_DYNAMIC_STATE_RASTERIZER_DISCARD_ENABLE_EXT))
140 return pCreateInfo->pColorBlendState;
141 return NULL;
142 }
143
144 static bool
145 radv_pipeline_has_ngg(const struct radv_pipeline *pipeline)
146 {
147 if (pipeline->graphics.last_vgt_api_stage == MESA_SHADER_NONE)
148 return false;
149
150 struct radv_shader_variant *variant =
151 pipeline->shaders[pipeline->graphics.last_vgt_api_stage];
152
153 return variant->info.is_ngg;
154 }
155
156 bool
157 radv_pipeline_has_ngg_passthrough(const struct radv_pipeline *pipeline)
158 {
159 if (pipeline->graphics.last_vgt_api_stage == MESA_SHADER_NONE)
160 return false;
161
162 assert(radv_pipeline_has_ngg(pipeline));
163
164 struct radv_shader_variant *variant =
165 pipeline->shaders[pipeline->graphics.last_vgt_api_stage];
166
167 return variant->info.is_ngg_passthrough;
168 }
169
170 bool
171 radv_pipeline_has_gs_copy_shader(const struct radv_pipeline *pipeline)
172 {
173 return !!pipeline->gs_copy_shader;
174 }
175
176 void
177 radv_pipeline_destroy(struct radv_device *device, struct radv_pipeline *pipeline,
178 const VkAllocationCallbacks *allocator)
179 {
180 if (pipeline->type == RADV_PIPELINE_COMPUTE) {
181 free(pipeline->compute.rt_group_handles);
182 free(pipeline->compute.rt_stack_sizes);
183 } else if (pipeline->type == RADV_PIPELINE_LIBRARY) {
184 free(pipeline->library.groups);
185 free(pipeline->library.stages);
186 }
187
188 for (unsigned i = 0; i < MESA_SHADER_STAGES; ++i)
189 if (pipeline->shaders[i])
190 radv_shader_variant_destroy(device, pipeline->shaders[i]);
191
192 if (pipeline->gs_copy_shader)
193 radv_shader_variant_destroy(device, pipeline->gs_copy_shader);
194
195 if (pipeline->cs.buf)
196 free(pipeline->cs.buf);
197
198 vk_object_base_finish(&pipeline->base);
199 vk_free2(&device->vk.alloc, allocator, pipeline);
200 }
201
202 void
203 radv_DestroyPipeline(VkDevice _device, VkPipeline _pipeline,
204 const VkAllocationCallbacks *pAllocator)
205 {
206 RADV_FROM_HANDLE(radv_device, device, _device);
207 RADV_FROM_HANDLE(radv_pipeline, pipeline, _pipeline);
208
209 if (!_pipeline)
210 return;
211
212 radv_pipeline_destroy(device, pipeline, pAllocator);
213 }
214
215 uint32_t
216 radv_get_hash_flags(const struct radv_device *device, bool stats)
217 {
218 uint32_t hash_flags = 0;
219
220 if (device->physical_device->use_ngg_culling)
221 hash_flags |= RADV_HASH_SHADER_USE_NGG_CULLING;
222 if (device->instance->perftest_flags & RADV_PERFTEST_FORCE_EMULATE_RT)
223 hash_flags |= RADV_HASH_SHADER_FORCE_EMULATE_RT;
224 if (device->physical_device->cs_wave_size == 32)
225 hash_flags |= RADV_HASH_SHADER_CS_WAVE32;
226 if (device->physical_device->ps_wave_size == 32)
227 hash_flags |= RADV_HASH_SHADER_PS_WAVE32;
228 if (device->physical_device->ge_wave_size == 32)
229 hash_flags |= RADV_HASH_SHADER_GE_WAVE32;
230 if (device->physical_device->use_llvm)
231 hash_flags |= RADV_HASH_SHADER_LLVM;
232 if (stats)
233 hash_flags |= RADV_HASH_SHADER_KEEP_STATISTICS;
234 if (device->robust_buffer_access) /* forces per-attribute vertex descriptors */
235 hash_flags |= RADV_HASH_SHADER_ROBUST_BUFFER_ACCESS;
236 if (device->robust_buffer_access2) /* affects load/store vectorizer */
237 hash_flags |= RADV_HASH_SHADER_ROBUST_BUFFER_ACCESS2;
238 return hash_flags;
239 }
240
241 static void
242 radv_pipeline_init_scratch(const struct radv_device *device, struct radv_pipeline *pipeline)
243 {
244 unsigned scratch_bytes_per_wave = 0;
245 unsigned max_waves = 0;
246
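   /* Take the maximum scratch size across stages and a conservative upper
    * bound on in-flight waves (radv_get_max_waves() presumably returns waves
    * per SIMD, scaled here by 4 SIMDs per CU and the CU count), so the
    * scratch buffer can later be sized from scratch_bytes_per_wave * max_waves.
    */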
247 for (int i = 0; i < MESA_SHADER_STAGES; ++i) {
248 if (pipeline->shaders[i] && pipeline->shaders[i]->config.scratch_bytes_per_wave) {
249 unsigned max_stage_waves = device->scratch_waves;
250
251 scratch_bytes_per_wave =
252 MAX2(scratch_bytes_per_wave, pipeline->shaders[i]->config.scratch_bytes_per_wave);
253
254 max_stage_waves =
255 MIN2(max_stage_waves, 4 * device->physical_device->rad_info.num_good_compute_units *
256 radv_get_max_waves(device, pipeline->shaders[i], i));
257 max_waves = MAX2(max_waves, max_stage_waves);
258 }
259 }
260
261 pipeline->scratch_bytes_per_wave = scratch_bytes_per_wave;
262 pipeline->max_waves = max_waves;
263 }
264
265 static uint32_t
266 si_translate_blend_function(VkBlendOp op)
267 {
268 switch (op) {
269 case VK_BLEND_OP_ADD:
270 return V_028780_COMB_DST_PLUS_SRC;
271 case VK_BLEND_OP_SUBTRACT:
272 return V_028780_COMB_SRC_MINUS_DST;
273 case VK_BLEND_OP_REVERSE_SUBTRACT:
274 return V_028780_COMB_DST_MINUS_SRC;
275 case VK_BLEND_OP_MIN:
276 return V_028780_COMB_MIN_DST_SRC;
277 case VK_BLEND_OP_MAX:
278 return V_028780_COMB_MAX_DST_SRC;
279 default:
280 return 0;
281 }
282 }
283
284 static uint32_t
285 si_translate_blend_factor(VkBlendFactor factor)
286 {
287 switch (factor) {
288 case VK_BLEND_FACTOR_ZERO:
289 return V_028780_BLEND_ZERO;
290 case VK_BLEND_FACTOR_ONE:
291 return V_028780_BLEND_ONE;
292 case VK_BLEND_FACTOR_SRC_COLOR:
293 return V_028780_BLEND_SRC_COLOR;
294 case VK_BLEND_FACTOR_ONE_MINUS_SRC_COLOR:
295 return V_028780_BLEND_ONE_MINUS_SRC_COLOR;
296 case VK_BLEND_FACTOR_DST_COLOR:
297 return V_028780_BLEND_DST_COLOR;
298 case VK_BLEND_FACTOR_ONE_MINUS_DST_COLOR:
299 return V_028780_BLEND_ONE_MINUS_DST_COLOR;
300 case VK_BLEND_FACTOR_SRC_ALPHA:
301 return V_028780_BLEND_SRC_ALPHA;
302 case VK_BLEND_FACTOR_ONE_MINUS_SRC_ALPHA:
303 return V_028780_BLEND_ONE_MINUS_SRC_ALPHA;
304 case VK_BLEND_FACTOR_DST_ALPHA:
305 return V_028780_BLEND_DST_ALPHA;
306 case VK_BLEND_FACTOR_ONE_MINUS_DST_ALPHA:
307 return V_028780_BLEND_ONE_MINUS_DST_ALPHA;
308 case VK_BLEND_FACTOR_CONSTANT_COLOR:
309 return V_028780_BLEND_CONSTANT_COLOR;
310 case VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_COLOR:
311 return V_028780_BLEND_ONE_MINUS_CONSTANT_COLOR;
312 case VK_BLEND_FACTOR_CONSTANT_ALPHA:
313 return V_028780_BLEND_CONSTANT_ALPHA;
314 case VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_ALPHA:
315 return V_028780_BLEND_ONE_MINUS_CONSTANT_ALPHA;
316 case VK_BLEND_FACTOR_SRC_ALPHA_SATURATE:
317 return V_028780_BLEND_SRC_ALPHA_SATURATE;
318 case VK_BLEND_FACTOR_SRC1_COLOR:
319 return V_028780_BLEND_SRC1_COLOR;
320 case VK_BLEND_FACTOR_ONE_MINUS_SRC1_COLOR:
321 return V_028780_BLEND_INV_SRC1_COLOR;
322 case VK_BLEND_FACTOR_SRC1_ALPHA:
323 return V_028780_BLEND_SRC1_ALPHA;
324 case VK_BLEND_FACTOR_ONE_MINUS_SRC1_ALPHA:
325 return V_028780_BLEND_INV_SRC1_ALPHA;
326 default:
327 return 0;
328 }
329 }
330
331 static uint32_t
332 si_translate_blend_opt_function(VkBlendOp op)
333 {
334 switch (op) {
335 case VK_BLEND_OP_ADD:
336 return V_028760_OPT_COMB_ADD;
337 case VK_BLEND_OP_SUBTRACT:
338 return V_028760_OPT_COMB_SUBTRACT;
339 case VK_BLEND_OP_REVERSE_SUBTRACT:
340 return V_028760_OPT_COMB_REVSUBTRACT;
341 case VK_BLEND_OP_MIN:
342 return V_028760_OPT_COMB_MIN;
343 case VK_BLEND_OP_MAX:
344 return V_028760_OPT_COMB_MAX;
345 default:
346 return V_028760_OPT_COMB_BLEND_DISABLED;
347 }
348 }
349
350 static uint32_t
351 si_translate_blend_opt_factor(VkBlendFactor factor, bool is_alpha)
352 {
353 switch (factor) {
354 case VK_BLEND_FACTOR_ZERO:
355 return V_028760_BLEND_OPT_PRESERVE_NONE_IGNORE_ALL;
356 case VK_BLEND_FACTOR_ONE:
357 return V_028760_BLEND_OPT_PRESERVE_ALL_IGNORE_NONE;
358 case VK_BLEND_FACTOR_SRC_COLOR:
359 return is_alpha ? V_028760_BLEND_OPT_PRESERVE_A1_IGNORE_A0
360 : V_028760_BLEND_OPT_PRESERVE_C1_IGNORE_C0;
361 case VK_BLEND_FACTOR_ONE_MINUS_SRC_COLOR:
362 return is_alpha ? V_028760_BLEND_OPT_PRESERVE_A0_IGNORE_A1
363 : V_028760_BLEND_OPT_PRESERVE_C0_IGNORE_C1;
364 case VK_BLEND_FACTOR_SRC_ALPHA:
365 return V_028760_BLEND_OPT_PRESERVE_A1_IGNORE_A0;
366 case VK_BLEND_FACTOR_ONE_MINUS_SRC_ALPHA:
367 return V_028760_BLEND_OPT_PRESERVE_A0_IGNORE_A1;
368 case VK_BLEND_FACTOR_SRC_ALPHA_SATURATE:
369 return is_alpha ? V_028760_BLEND_OPT_PRESERVE_ALL_IGNORE_NONE
370 : V_028760_BLEND_OPT_PRESERVE_NONE_IGNORE_A0;
371 default:
372 return V_028760_BLEND_OPT_PRESERVE_NONE_IGNORE_NONE;
373 }
374 }
375
376 /**
377 * Get rid of DST in the blend factors by commuting the operands:
378 * func(src * DST, dst * 0) ---> func(src * 0, dst * SRC)
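 *
 * E.g. ADD(src * DST_COLOR, dst * ZERO) and ADD(src * ZERO, dst * SRC_COLOR)
 * both evaluate to src * dst, so the rewrite preserves the blend result.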
379 */
380 static void
381 si_blend_remove_dst(VkBlendOp *func, VkBlendFactor *src_factor, VkBlendFactor *dst_factor,
382 VkBlendFactor expected_dst, VkBlendFactor replacement_src)
383 {
384 if (*src_factor == expected_dst && *dst_factor == VK_BLEND_FACTOR_ZERO) {
385 *src_factor = VK_BLEND_FACTOR_ZERO;
386 *dst_factor = replacement_src;
387
388 /* Commuting the operands requires reversing subtractions. */
389 if (*func == VK_BLEND_OP_SUBTRACT)
390 *func = VK_BLEND_OP_REVERSE_SUBTRACT;
391 else if (*func == VK_BLEND_OP_REVERSE_SUBTRACT)
392 *func = VK_BLEND_OP_SUBTRACT;
393 }
394 }
395
396 static bool
397 si_blend_factor_uses_dst(VkBlendFactor factor)
398 {
399 return factor == VK_BLEND_FACTOR_DST_COLOR || factor == VK_BLEND_FACTOR_DST_ALPHA ||
400 factor == VK_BLEND_FACTOR_SRC_ALPHA_SATURATE ||
401 factor == VK_BLEND_FACTOR_ONE_MINUS_DST_ALPHA ||
402 factor == VK_BLEND_FACTOR_ONE_MINUS_DST_COLOR;
403 }
404
405 static bool
406 is_dual_src(VkBlendFactor factor)
407 {
408 switch (factor) {
409 case VK_BLEND_FACTOR_SRC1_COLOR:
410 case VK_BLEND_FACTOR_ONE_MINUS_SRC1_COLOR:
411 case VK_BLEND_FACTOR_SRC1_ALPHA:
412 case VK_BLEND_FACTOR_ONE_MINUS_SRC1_ALPHA:
413 return true;
414 default:
415 return false;
416 }
417 }
418
419 static unsigned
420 radv_choose_spi_color_format(const struct radv_device *device, VkFormat vk_format,
421 bool blend_enable, bool blend_need_alpha)
422 {
423 const struct util_format_description *desc = vk_format_description(vk_format);
424 bool use_rbplus = device->physical_device->rad_info.rbplus_allowed;
425 struct ac_spi_color_formats formats = {0};
426 unsigned format, ntype, swap;
427
428 format = radv_translate_colorformat(vk_format);
429 ntype = radv_translate_color_numformat(vk_format, desc,
430 vk_format_get_first_non_void_channel(vk_format));
431 swap = radv_translate_colorswap(vk_format, false);
432
433 ac_choose_spi_color_formats(format, swap, ntype, false, use_rbplus, &formats);
434
435 if (blend_enable && blend_need_alpha)
436 return formats.blend_alpha;
437 else if (blend_need_alpha)
438 return formats.alpha;
439 else if (blend_enable)
440 return formats.blend;
441 else
442 return formats.normal;
443 }
444
445 static bool
446 format_is_int8(VkFormat format)
447 {
448 const struct util_format_description *desc = vk_format_description(format);
449 int channel = vk_format_get_first_non_void_channel(format);
450
451 return channel >= 0 && desc->channel[channel].pure_integer && desc->channel[channel].size == 8;
452 }
453
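/* True for formats with 10-bit pure-integer channels, e.g.
 * VK_FORMAT_A2B10G10R10_UINT_PACK32.
 */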
454 static bool
455 format_is_int10(VkFormat format)
456 {
457 const struct util_format_description *desc = vk_format_description(format);
458
459 if (desc->nr_channels != 4)
460 return false;
461 for (unsigned i = 0; i < 4; i++) {
462 if (desc->channel[i].pure_integer && desc->channel[i].size == 10)
463 return true;
464 }
465 return false;
466 }
467
468 static void
469 radv_pipeline_compute_spi_color_formats(const struct radv_pipeline *pipeline,
470 const VkGraphicsPipelineCreateInfo *pCreateInfo,
471 struct radv_blend_state *blend)
472 {
473 RADV_FROM_HANDLE(radv_render_pass, pass, pCreateInfo->renderPass);
474 struct radv_subpass *subpass = pass->subpasses + pCreateInfo->subpass;
475 unsigned col_format = 0, is_int8 = 0, is_int10 = 0;
476 unsigned num_targets;
477
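   /* col_format packs one 4-bit SPI_SHADER_* export format per color target,
    * in the layout that is ultimately written to SPI_SHADER_COL_FORMAT.
    */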
478 for (unsigned i = 0; i < (blend->single_cb_enable ? 1 : subpass->color_count); ++i) {
479 unsigned cf;
480
481 if (subpass->color_attachments[i].attachment == VK_ATTACHMENT_UNUSED ||
482 !(blend->cb_target_mask & (0xfu << (i * 4)))) {
483 cf = V_028714_SPI_SHADER_ZERO;
484 } else {
485 struct radv_render_pass_attachment *attachment =
486 pass->attachments + subpass->color_attachments[i].attachment;
487 bool blend_enable = blend->blend_enable_4bit & (0xfu << (i * 4));
488
489 cf = radv_choose_spi_color_format(pipeline->device, attachment->format, blend_enable,
490 blend->need_src_alpha & (1 << i));
491
492 if (format_is_int8(attachment->format))
493 is_int8 |= 1 << i;
494 if (format_is_int10(attachment->format))
495 is_int10 |= 1 << i;
496 }
497
498 col_format |= cf << (4 * i);
499 }
500
501 if (!(col_format & 0xf) && blend->need_src_alpha & (1 << 0)) {
502 /* When a subpass doesn't have any color attachments, write the
503 * alpha channel of MRT0 when alpha coverage is enabled because
504 * the depth attachment needs it.
505 */
506 col_format |= V_028714_SPI_SHADER_32_AR;
507 }
508
509 /* If the i-th target format is set, all previous target formats must
510 * be non-zero to avoid hangs.
511 */
512 num_targets = (util_last_bit(col_format) + 3) / 4;
513 for (unsigned i = 0; i < num_targets; i++) {
514 if (!(col_format & (0xfu << (i * 4)))) {
515 col_format |= V_028714_SPI_SHADER_32_R << (i * 4);
516 }
517 }
518
519 /* The output for dual source blending should have the same format as
520 * the first output.
521 */
522 if (blend->mrt0_is_dual_src) {
523 assert(!(col_format >> 4));
524 col_format |= (col_format & 0xf) << 4;
525 }
526
527 blend->cb_shader_mask = ac_get_cb_shader_mask(col_format);
528 blend->spi_shader_col_format = col_format;
529 blend->col_format_is_int8 = is_int8;
530 blend->col_format_is_int10 = is_int10;
531 }
532
533 /*
534 * Ordered so that for each i,
535 * radv_format_meta_fs_key(radv_fs_key_format_exemplars[i]) == i.
536 */
537 const VkFormat radv_fs_key_format_exemplars[NUM_META_FS_KEYS] = {
538 VK_FORMAT_R32_SFLOAT,
539 VK_FORMAT_R32G32_SFLOAT,
540 VK_FORMAT_R8G8B8A8_UNORM,
541 VK_FORMAT_R16G16B16A16_UNORM,
542 VK_FORMAT_R16G16B16A16_SNORM,
543 VK_FORMAT_R16G16B16A16_UINT,
544 VK_FORMAT_R16G16B16A16_SINT,
545 VK_FORMAT_R32G32B32A32_SFLOAT,
546 VK_FORMAT_R8G8B8A8_UINT,
547 VK_FORMAT_R8G8B8A8_SINT,
548 VK_FORMAT_A2R10G10B10_UINT_PACK32,
549 VK_FORMAT_A2R10G10B10_SINT_PACK32,
550 };
551
552 unsigned
553 radv_format_meta_fs_key(struct radv_device *device, VkFormat format)
554 {
555 unsigned col_format = radv_choose_spi_color_format(device, format, false, false);
556 assert(col_format != V_028714_SPI_SHADER_32_AR);
557
558 bool is_int8 = format_is_int8(format);
559 bool is_int10 = format_is_int10(format);
560
561 if (col_format == V_028714_SPI_SHADER_UINT16_ABGR && is_int8)
562 return 8;
563 else if (col_format == V_028714_SPI_SHADER_SINT16_ABGR && is_int8)
564 return 9;
565 else if (col_format == V_028714_SPI_SHADER_UINT16_ABGR && is_int10)
566 return 10;
567 else if (col_format == V_028714_SPI_SHADER_SINT16_ABGR && is_int10)
568 return 11;
569 else {
570 if (col_format >= V_028714_SPI_SHADER_32_AR)
571 --col_format; /* Skip V_028714_SPI_SHADER_32_AR since there is no such VkFormat */
572
573 --col_format; /* Skip V_028714_SPI_SHADER_ZERO */
574 return col_format;
575 }
576 }
577
578 static void
579 radv_blend_check_commutativity(struct radv_blend_state *blend, VkBlendOp op, VkBlendFactor src,
580 VkBlendFactor dst, unsigned chanmask)
581 {
582 /* Src factor is allowed when it does not depend on Dst. */
583 static const uint32_t src_allowed =
584 (1u << VK_BLEND_FACTOR_ONE) | (1u << VK_BLEND_FACTOR_SRC_COLOR) |
585 (1u << VK_BLEND_FACTOR_SRC_ALPHA) | (1u << VK_BLEND_FACTOR_SRC_ALPHA_SATURATE) |
586 (1u << VK_BLEND_FACTOR_CONSTANT_COLOR) | (1u << VK_BLEND_FACTOR_CONSTANT_ALPHA) |
587 (1u << VK_BLEND_FACTOR_SRC1_COLOR) | (1u << VK_BLEND_FACTOR_SRC1_ALPHA) |
588 (1u << VK_BLEND_FACTOR_ZERO) | (1u << VK_BLEND_FACTOR_ONE_MINUS_SRC_COLOR) |
589 (1u << VK_BLEND_FACTOR_ONE_MINUS_SRC_ALPHA) |
590 (1u << VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_COLOR) |
591 (1u << VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_ALPHA) |
592 (1u << VK_BLEND_FACTOR_ONE_MINUS_SRC1_COLOR) | (1u << VK_BLEND_FACTOR_ONE_MINUS_SRC1_ALPHA);
593
594 if (dst == VK_BLEND_FACTOR_ONE && (src_allowed & (1u << src))) {
595 /* Addition is commutative, but floating point addition isn't
596 * associative: subtle changes can be introduced via different
597 * rounding. Be conservative, only enable for min and max.
598 */
599 if (op == VK_BLEND_OP_MAX || op == VK_BLEND_OP_MIN)
600 blend->commutative_4bit |= chanmask;
601 }
602 }
603
604 static struct radv_blend_state
605 radv_pipeline_init_blend_state(struct radv_pipeline *pipeline,
606 const VkGraphicsPipelineCreateInfo *pCreateInfo,
607 const struct radv_graphics_pipeline_create_info *extra)
608 {
609 const VkPipelineColorBlendStateCreateInfo *vkblend =
610 radv_pipeline_get_color_blend_state(pCreateInfo);
611 const VkPipelineMultisampleStateCreateInfo *vkms =
612 radv_pipeline_get_multisample_state(pCreateInfo);
613 struct radv_blend_state blend = {0};
614 unsigned mode = V_028808_CB_NORMAL;
615 unsigned cb_color_control = 0;
616 int i;
617
618 if (extra && extra->custom_blend_mode) {
619 blend.single_cb_enable = true;
620 mode = extra->custom_blend_mode;
621 }
622
623 if (vkblend) {
624 if (vkblend->logicOpEnable)
625 cb_color_control |= S_028808_ROP3(si_translate_blend_logic_op(vkblend->logicOp));
626 else
627 cb_color_control |= S_028808_ROP3(V_028808_ROP3_COPY);
628 }
629
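   /* Alpha-to-coverage dithering: the default path staggers the per-sample
    * mask offsets to dither the coverage mask, while RADV_DEBUG_NO_ATOC_DITHERING
    * presumably forces a uniform offset so coverage is not dithered.
    */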
630 if (pipeline->device->instance->debug_flags & RADV_DEBUG_NO_ATOC_DITHERING)
631 {
632 blend.db_alpha_to_mask = S_028B70_ALPHA_TO_MASK_OFFSET0(2) | S_028B70_ALPHA_TO_MASK_OFFSET1(2) |
633 S_028B70_ALPHA_TO_MASK_OFFSET2(2) | S_028B70_ALPHA_TO_MASK_OFFSET3(2) |
634 S_028B70_OFFSET_ROUND(0);
635 }
636 else
637 {
638 blend.db_alpha_to_mask = S_028B70_ALPHA_TO_MASK_OFFSET0(3) | S_028B70_ALPHA_TO_MASK_OFFSET1(1) |
639 S_028B70_ALPHA_TO_MASK_OFFSET2(0) | S_028B70_ALPHA_TO_MASK_OFFSET3(2) |
640 S_028B70_OFFSET_ROUND(1);
641 }
642
643 if (vkms && vkms->alphaToCoverageEnable) {
644 blend.db_alpha_to_mask |= S_028B70_ALPHA_TO_MASK_ENABLE(1);
645 blend.need_src_alpha |= 0x1;
646 }
647
648 blend.cb_target_mask = 0;
649 if (vkblend) {
650 for (i = 0; i < vkblend->attachmentCount; i++) {
651 const VkPipelineColorBlendAttachmentState *att = &vkblend->pAttachments[i];
652 unsigned blend_cntl = 0;
653 unsigned srcRGB_opt, dstRGB_opt, srcA_opt, dstA_opt;
654 VkBlendOp eqRGB = att->colorBlendOp;
655 VkBlendFactor srcRGB = att->srcColorBlendFactor;
656 VkBlendFactor dstRGB = att->dstColorBlendFactor;
657 VkBlendOp eqA = att->alphaBlendOp;
658 VkBlendFactor srcA = att->srcAlphaBlendFactor;
659 VkBlendFactor dstA = att->dstAlphaBlendFactor;
660
661 blend.sx_mrt_blend_opt[i] = S_028760_COLOR_COMB_FCN(V_028760_OPT_COMB_BLEND_DISABLED) |
662 S_028760_ALPHA_COMB_FCN(V_028760_OPT_COMB_BLEND_DISABLED);
663
664 if (!att->colorWriteMask)
665 continue;
666
667 /* Ignore other blend targets if dual-source blending
668 * is enabled to prevent wrong behaviour.
669 */
670 if (blend.mrt0_is_dual_src)
671 continue;
672
673 blend.cb_target_mask |= (unsigned)att->colorWriteMask << (4 * i);
674 blend.cb_target_enabled_4bit |= 0xfu << (4 * i);
675 if (!att->blendEnable) {
676 blend.cb_blend_control[i] = blend_cntl;
677 continue;
678 }
679
680 if (is_dual_src(srcRGB) || is_dual_src(dstRGB) || is_dual_src(srcA) || is_dual_src(dstA))
681 if (i == 0)
682 blend.mrt0_is_dual_src = true;
683
684 if (eqRGB == VK_BLEND_OP_MIN || eqRGB == VK_BLEND_OP_MAX) {
685 srcRGB = VK_BLEND_FACTOR_ONE;
686 dstRGB = VK_BLEND_FACTOR_ONE;
687 }
688 if (eqA == VK_BLEND_OP_MIN || eqA == VK_BLEND_OP_MAX) {
689 srcA = VK_BLEND_FACTOR_ONE;
690 dstA = VK_BLEND_FACTOR_ONE;
691 }
692
693 radv_blend_check_commutativity(&blend, eqRGB, srcRGB, dstRGB, 0x7u << (4 * i));
694 radv_blend_check_commutativity(&blend, eqA, srcA, dstA, 0x8u << (4 * i));
695
696 /* Blending optimizations for RB+.
697 * These transformations don't change the behavior.
698 *
699 * First, get rid of DST in the blend factors:
700 * func(src * DST, dst * 0) ---> func(src * 0, dst * SRC)
701 */
702 si_blend_remove_dst(&eqRGB, &srcRGB, &dstRGB, VK_BLEND_FACTOR_DST_COLOR,
703 VK_BLEND_FACTOR_SRC_COLOR);
704
705 si_blend_remove_dst(&eqA, &srcA, &dstA, VK_BLEND_FACTOR_DST_COLOR,
706 VK_BLEND_FACTOR_SRC_COLOR);
707
708 si_blend_remove_dst(&eqA, &srcA, &dstA, VK_BLEND_FACTOR_DST_ALPHA,
709 VK_BLEND_FACTOR_SRC_ALPHA);
710
711 /* Look up the ideal settings from tables. */
712 srcRGB_opt = si_translate_blend_opt_factor(srcRGB, false);
713 dstRGB_opt = si_translate_blend_opt_factor(dstRGB, false);
714 srcA_opt = si_translate_blend_opt_factor(srcA, true);
715 dstA_opt = si_translate_blend_opt_factor(dstA, true);
716
717 /* Handle interdependencies. */
718 if (si_blend_factor_uses_dst(srcRGB))
719 dstRGB_opt = V_028760_BLEND_OPT_PRESERVE_NONE_IGNORE_NONE;
720 if (si_blend_factor_uses_dst(srcA))
721 dstA_opt = V_028760_BLEND_OPT_PRESERVE_NONE_IGNORE_NONE;
722
723 if (srcRGB == VK_BLEND_FACTOR_SRC_ALPHA_SATURATE &&
724 (dstRGB == VK_BLEND_FACTOR_ZERO || dstRGB == VK_BLEND_FACTOR_SRC_ALPHA ||
725 dstRGB == VK_BLEND_FACTOR_SRC_ALPHA_SATURATE))
726 dstRGB_opt = V_028760_BLEND_OPT_PRESERVE_NONE_IGNORE_A0;
727
728 /* Set the final value. */
729 blend.sx_mrt_blend_opt[i] =
730 S_028760_COLOR_SRC_OPT(srcRGB_opt) | S_028760_COLOR_DST_OPT(dstRGB_opt) |
731 S_028760_COLOR_COMB_FCN(si_translate_blend_opt_function(eqRGB)) |
732 S_028760_ALPHA_SRC_OPT(srcA_opt) | S_028760_ALPHA_DST_OPT(dstA_opt) |
733 S_028760_ALPHA_COMB_FCN(si_translate_blend_opt_function(eqA));
734 blend_cntl |= S_028780_ENABLE(1);
735
736 blend_cntl |= S_028780_COLOR_COMB_FCN(si_translate_blend_function(eqRGB));
737 blend_cntl |= S_028780_COLOR_SRCBLEND(si_translate_blend_factor(srcRGB));
738 blend_cntl |= S_028780_COLOR_DESTBLEND(si_translate_blend_factor(dstRGB));
739 if (srcA != srcRGB || dstA != dstRGB || eqA != eqRGB) {
740 blend_cntl |= S_028780_SEPARATE_ALPHA_BLEND(1);
741 blend_cntl |= S_028780_ALPHA_COMB_FCN(si_translate_blend_function(eqA));
742 blend_cntl |= S_028780_ALPHA_SRCBLEND(si_translate_blend_factor(srcA));
743 blend_cntl |= S_028780_ALPHA_DESTBLEND(si_translate_blend_factor(dstA));
744 }
745 blend.cb_blend_control[i] = blend_cntl;
746
747 blend.blend_enable_4bit |= 0xfu << (i * 4);
748
749 if (srcRGB == VK_BLEND_FACTOR_SRC_ALPHA || dstRGB == VK_BLEND_FACTOR_SRC_ALPHA ||
750 srcRGB == VK_BLEND_FACTOR_SRC_ALPHA_SATURATE ||
751 dstRGB == VK_BLEND_FACTOR_SRC_ALPHA_SATURATE ||
752 srcRGB == VK_BLEND_FACTOR_ONE_MINUS_SRC_ALPHA ||
753 dstRGB == VK_BLEND_FACTOR_ONE_MINUS_SRC_ALPHA)
754 blend.need_src_alpha |= 1 << i;
755 }
756 for (i = vkblend->attachmentCount; i < 8; i++) {
757 blend.cb_blend_control[i] = 0;
758 blend.sx_mrt_blend_opt[i] = S_028760_COLOR_COMB_FCN(V_028760_OPT_COMB_BLEND_DISABLED) |
759 S_028760_ALPHA_COMB_FCN(V_028760_OPT_COMB_BLEND_DISABLED);
760 }
761 }
762
763 if (pipeline->device->physical_device->rad_info.has_rbplus) {
764 /* Disable RB+ blend optimizations for dual source blending. */
765 if (blend.mrt0_is_dual_src) {
766 for (i = 0; i < 8; i++) {
767 blend.sx_mrt_blend_opt[i] = S_028760_COLOR_COMB_FCN(V_028760_OPT_COMB_NONE) |
768 S_028760_ALPHA_COMB_FCN(V_028760_OPT_COMB_NONE);
769 }
770 }
771
772 /* RB+ doesn't work with dual source blending, logic op and
773 * RESOLVE.
774 */
775 if (blend.mrt0_is_dual_src || (vkblend && vkblend->logicOpEnable) ||
776 mode == V_028808_CB_RESOLVE)
777 cb_color_control |= S_028808_DISABLE_DUAL_QUAD(1);
778 }
779
780 if (blend.cb_target_mask)
781 cb_color_control |= S_028808_MODE(mode);
782 else
783 cb_color_control |= S_028808_MODE(V_028808_CB_DISABLE);
784
785 radv_pipeline_compute_spi_color_formats(pipeline, pCreateInfo, &blend);
786
787 pipeline->graphics.cb_color_control = cb_color_control;
788
789 return blend;
790 }
791
792 static uint32_t
793 si_translate_fill(VkPolygonMode func)
794 {
795 switch (func) {
796 case VK_POLYGON_MODE_FILL:
797 return V_028814_X_DRAW_TRIANGLES;
798 case VK_POLYGON_MODE_LINE:
799 return V_028814_X_DRAW_LINES;
800 case VK_POLYGON_MODE_POINT:
801 return V_028814_X_DRAW_POINTS;
802 default:
803 assert(0);
804 return V_028814_X_DRAW_POINTS;
805 }
806 }
807
808 static uint8_t
809 radv_pipeline_get_ps_iter_samples(const VkGraphicsPipelineCreateInfo *pCreateInfo)
810 {
811 const VkPipelineMultisampleStateCreateInfo *vkms = pCreateInfo->pMultisampleState;
812 RADV_FROM_HANDLE(radv_render_pass, pass, pCreateInfo->renderPass);
813 struct radv_subpass *subpass = &pass->subpasses[pCreateInfo->subpass];
814 uint32_t ps_iter_samples = 1;
815 uint32_t num_samples;
816
817 /* From the Vulkan 1.1.129 spec, 26.7. Sample Shading:
818 *
819 * "If the VK_AMD_mixed_attachment_samples extension is enabled and the
820 * subpass uses color attachments, totalSamples is the number of
821 * samples of the color attachments. Otherwise, totalSamples is the
822 * value of VkPipelineMultisampleStateCreateInfo::rasterizationSamples
823 * specified at pipeline creation time."
824 */
825 if (subpass->has_color_att) {
826 num_samples = subpass->color_sample_count;
827 } else {
828 num_samples = vkms->rasterizationSamples;
829 }
830
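   /* Round up to a power of two, e.g. minSampleShading = 0.5 with 8 samples
    * gives ceil(4) = 4, and 0.3 with 4 samples gives ceil(1.2) = 2.
    */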
831 if (vkms->sampleShadingEnable) {
832 ps_iter_samples = ceilf(vkms->minSampleShading * num_samples);
833 ps_iter_samples = util_next_power_of_two(ps_iter_samples);
834 }
835 return ps_iter_samples;
836 }
837
838 static bool
839 radv_is_depth_write_enabled(const VkPipelineDepthStencilStateCreateInfo *pCreateInfo)
840 {
841 return pCreateInfo->depthTestEnable && pCreateInfo->depthWriteEnable &&
842 pCreateInfo->depthCompareOp != VK_COMPARE_OP_NEVER;
843 }
844
845 static bool
846 radv_writes_stencil(const VkStencilOpState *state)
847 {
848 return state->writeMask &&
849 (state->failOp != VK_STENCIL_OP_KEEP || state->passOp != VK_STENCIL_OP_KEEP ||
850 state->depthFailOp != VK_STENCIL_OP_KEEP);
851 }
852
853 static bool
854 radv_is_stencil_write_enabled(const VkPipelineDepthStencilStateCreateInfo *pCreateInfo)
855 {
856 return pCreateInfo->stencilTestEnable &&
857 (radv_writes_stencil(&pCreateInfo->front) || radv_writes_stencil(&pCreateInfo->back));
858 }
859
860 static bool
861 radv_is_ds_write_enabled(const VkPipelineDepthStencilStateCreateInfo *pCreateInfo)
862 {
863 return radv_is_depth_write_enabled(pCreateInfo) || radv_is_stencil_write_enabled(pCreateInfo);
864 }
865
866 static bool
867 radv_order_invariant_stencil_op(VkStencilOp op)
868 {
869 /* REPLACE is normally order invariant, except when the stencil
870 * reference value is written by the fragment shader. Tracking this
871 * interaction does not seem worth the effort, so be conservative.
872 */
873 return op != VK_STENCIL_OP_INCREMENT_AND_CLAMP && op != VK_STENCIL_OP_DECREMENT_AND_CLAMP &&
874 op != VK_STENCIL_OP_REPLACE;
875 }
876
877 static bool
878 radv_order_invariant_stencil_state(const VkStencilOpState *state)
879 {
880 /* Compute whether, assuming Z writes are disabled, this stencil state
881 * is order invariant in the sense that the set of passing fragments as
882 * well as the final stencil buffer result does not depend on the order
883 * of fragments.
884 */
885 return !state->writeMask ||
886 /* The following assumes that Z writes are disabled. */
887 (state->compareOp == VK_COMPARE_OP_ALWAYS &&
888 radv_order_invariant_stencil_op(state->passOp) &&
889 radv_order_invariant_stencil_op(state->depthFailOp)) ||
890 (state->compareOp == VK_COMPARE_OP_NEVER &&
891 radv_order_invariant_stencil_op(state->failOp));
892 }
893
894 static bool
895 radv_pipeline_has_dynamic_ds_states(const VkGraphicsPipelineCreateInfo *pCreateInfo)
896 {
897 VkDynamicState ds_states[] = {
898 VK_DYNAMIC_STATE_DEPTH_TEST_ENABLE_EXT, VK_DYNAMIC_STATE_DEPTH_WRITE_ENABLE_EXT,
899 VK_DYNAMIC_STATE_DEPTH_COMPARE_OP_EXT, VK_DYNAMIC_STATE_STENCIL_TEST_ENABLE_EXT,
900 VK_DYNAMIC_STATE_STENCIL_OP_EXT,
901 };
902
903 for (uint32_t i = 0; i < ARRAY_SIZE(ds_states); i++) {
904 if (radv_is_state_dynamic(pCreateInfo, ds_states[i]))
905 return true;
906 }
907
908 return false;
909 }
910
911 static bool
912 radv_pipeline_out_of_order_rast(struct radv_pipeline *pipeline,
913 const struct radv_blend_state *blend,
914 const VkGraphicsPipelineCreateInfo *pCreateInfo)
915 {
916 RADV_FROM_HANDLE(radv_render_pass, pass, pCreateInfo->renderPass);
917 struct radv_subpass *subpass = pass->subpasses + pCreateInfo->subpass;
918 const VkPipelineDepthStencilStateCreateInfo *vkds =
919 radv_pipeline_get_depth_stencil_state(pCreateInfo);
920 const VkPipelineColorBlendStateCreateInfo *vkblend =
921 radv_pipeline_get_color_blend_state(pCreateInfo);
922 unsigned colormask = blend->cb_target_enabled_4bit;
923
924 if (!pipeline->device->physical_device->out_of_order_rast_allowed)
925 return false;
926
927 /* Be conservative if a logic operation is enabled with color buffers. */
928 if (colormask && vkblend && vkblend->logicOpEnable)
929 return false;
930
931 /* Be conservative if an extended dynamic depth/stencil state is
932 * enabled because the driver can't update out-of-order rasterization
933 * dynamically.
934 */
935 if (radv_pipeline_has_dynamic_ds_states(pCreateInfo))
936 return false;
937
938 /* Default depth/stencil invariance when no attachment is bound. */
939 struct radv_dsa_order_invariance dsa_order_invariant = {.zs = true, .pass_set = true};
940
941 if (vkds) {
942 struct radv_render_pass_attachment *attachment =
943 pass->attachments + subpass->depth_stencil_attachment->attachment;
944 bool has_stencil = vk_format_has_stencil(attachment->format);
945 struct radv_dsa_order_invariance order_invariance[2];
946 struct radv_shader_variant *ps = pipeline->shaders[MESA_SHADER_FRAGMENT];
947
948 /* Compute depth/stencil order invariance in order to know if
949 * it's safe to enable out-of-order.
950 */
951 bool zfunc_is_ordered = vkds->depthCompareOp == VK_COMPARE_OP_NEVER ||
952 vkds->depthCompareOp == VK_COMPARE_OP_LESS ||
953 vkds->depthCompareOp == VK_COMPARE_OP_LESS_OR_EQUAL ||
954 vkds->depthCompareOp == VK_COMPARE_OP_GREATER ||
955 vkds->depthCompareOp == VK_COMPARE_OP_GREATER_OR_EQUAL;
956
957 bool nozwrite_and_order_invariant_stencil =
958 !radv_is_ds_write_enabled(vkds) ||
959 (!radv_is_depth_write_enabled(vkds) && radv_order_invariant_stencil_state(&vkds->front) &&
960 radv_order_invariant_stencil_state(&vkds->back));
961
962 order_invariance[1].zs = nozwrite_and_order_invariant_stencil ||
963 (!radv_is_stencil_write_enabled(vkds) && zfunc_is_ordered);
964 order_invariance[0].zs = !radv_is_depth_write_enabled(vkds) || zfunc_is_ordered;
965
966 order_invariance[1].pass_set =
967 nozwrite_and_order_invariant_stencil ||
968 (!radv_is_stencil_write_enabled(vkds) && (vkds->depthCompareOp == VK_COMPARE_OP_ALWAYS ||
969 vkds->depthCompareOp == VK_COMPARE_OP_NEVER));
970 order_invariance[0].pass_set =
971 !radv_is_depth_write_enabled(vkds) || (vkds->depthCompareOp == VK_COMPARE_OP_ALWAYS ||
972 vkds->depthCompareOp == VK_COMPARE_OP_NEVER);
973
974 dsa_order_invariant = order_invariance[has_stencil];
975 if (!dsa_order_invariant.zs)
976 return false;
977
978 /* The set of PS invocations is always order invariant,
979 * except when early Z/S tests are requested.
980 */
981 if (ps && ps->info.ps.writes_memory && ps->info.ps.early_fragment_test &&
982 !dsa_order_invariant.pass_set)
983 return false;
984
985 /* Determine if out-of-order rasterization should be disabled
986 * when occlusion queries are used.
987 */
988 pipeline->graphics.disable_out_of_order_rast_for_occlusion = !dsa_order_invariant.pass_set;
989 }
990
991 /* No color buffers are enabled for writing. */
992 if (!colormask)
993 return true;
994
995 unsigned blendmask = colormask & blend->blend_enable_4bit;
996
997 if (blendmask) {
998 /* Only commutative blending. */
999 if (blendmask & ~blend->commutative_4bit)
1000 return false;
1001
1002 if (!dsa_order_invariant.pass_set)
1003 return false;
1004 }
1005
1006 if (colormask & ~blendmask)
1007 return false;
1008
1009 return true;
1010 }
1011
1012 static const VkConservativeRasterizationModeEXT
1013 radv_get_conservative_raster_mode(const VkPipelineRasterizationStateCreateInfo *pCreateInfo)
1014 {
1015 const VkPipelineRasterizationConservativeStateCreateInfoEXT *conservative_raster =
1016 vk_find_struct_const(pCreateInfo->pNext,
1017 PIPELINE_RASTERIZATION_CONSERVATIVE_STATE_CREATE_INFO_EXT);
1018
1019 if (!conservative_raster)
1020 return VK_CONSERVATIVE_RASTERIZATION_MODE_DISABLED_EXT;
1021 return conservative_raster->conservativeRasterizationMode;
1022 }
1023
1024 static void
1025 radv_pipeline_init_multisample_state(struct radv_pipeline *pipeline,
1026 const struct radv_blend_state *blend,
1027 const VkGraphicsPipelineCreateInfo *pCreateInfo)
1028 {
1029 const VkPipelineMultisampleStateCreateInfo *vkms =
1030 radv_pipeline_get_multisample_state(pCreateInfo);
1031 struct radv_multisample_state *ms = &pipeline->graphics.ms;
1032 unsigned num_tile_pipes = pipeline->device->physical_device->rad_info.num_tile_pipes;
1033 const VkConservativeRasterizationModeEXT mode =
1034 radv_get_conservative_raster_mode(pCreateInfo->pRasterizationState);
1035 bool out_of_order_rast = false;
1036 int ps_iter_samples = 1;
1037 uint32_t mask = 0xffff;
1038
1039 if (vkms) {
1040 ms->num_samples = vkms->rasterizationSamples;
1041
1042 /* From the Vulkan 1.1.129 spec, 26.7. Sample Shading:
1043 *
1044 * "Sample shading is enabled for a graphics pipeline:
1045 *
1046 * - If the interface of the fragment shader entry point of the
1047 * graphics pipeline includes an input variable decorated
1048 * with SampleId or SamplePosition. In this case
1049 * minSampleShadingFactor takes the value 1.0.
1050 * - Else if the sampleShadingEnable member of the
1051 * VkPipelineMultisampleStateCreateInfo structure specified
1052 * when creating the graphics pipeline is set to VK_TRUE. In
1053 * this case minSampleShadingFactor takes the value of
1054 * VkPipelineMultisampleStateCreateInfo::minSampleShading.
1055 *
1056 * Otherwise, sample shading is considered disabled."
1057 */
1058 if (pipeline->shaders[MESA_SHADER_FRAGMENT]->info.ps.uses_sample_shading) {
1059 ps_iter_samples = ms->num_samples;
1060 } else {
1061 ps_iter_samples = radv_pipeline_get_ps_iter_samples(pCreateInfo);
1062 }
1063 } else {
1064 ms->num_samples = 1;
1065 }
1066
1067 const struct VkPipelineRasterizationStateRasterizationOrderAMD *raster_order =
1068 vk_find_struct_const(pCreateInfo->pRasterizationState->pNext,
1069 PIPELINE_RASTERIZATION_STATE_RASTERIZATION_ORDER_AMD);
1070 if (raster_order && raster_order->rasterizationOrder == VK_RASTERIZATION_ORDER_RELAXED_AMD) {
1071 /* Out-of-order rasterization is explicitly enabled by the
1072 * application.
1073 */
1074 out_of_order_rast = true;
1075 } else {
1076 /* Determine if the driver can enable out-of-order
1077 * rasterization internally.
1078 */
1079 out_of_order_rast = radv_pipeline_out_of_order_rast(pipeline, blend, pCreateInfo);
1080 }
1081
1082 ms->pa_sc_aa_config = 0;
1083 ms->db_eqaa = S_028804_HIGH_QUALITY_INTERSECTIONS(1) | S_028804_INCOHERENT_EQAA_READS(1) |
1084 S_028804_INTERPOLATE_COMP_Z(1) | S_028804_STATIC_ANCHOR_ASSOCIATIONS(1);
1085
1086 /* Adjust MSAA state if conservative rasterization is enabled. */
1087 if (mode != VK_CONSERVATIVE_RASTERIZATION_MODE_DISABLED_EXT) {
1088 ms->pa_sc_aa_config |= S_028BE0_AA_MASK_CENTROID_DTMN(1);
1089
1090 ms->db_eqaa |=
1091 S_028804_ENABLE_POSTZ_OVERRASTERIZATION(1) | S_028804_OVERRASTERIZATION_AMOUNT(4);
1092 }
1093
1094 ms->pa_sc_mode_cntl_1 =
1095 S_028A4C_WALK_FENCE_ENABLE(1) | // TODO linear dst fixes
1096 S_028A4C_WALK_FENCE_SIZE(num_tile_pipes == 2 ? 2 : 3) |
1097 S_028A4C_OUT_OF_ORDER_PRIMITIVE_ENABLE(out_of_order_rast) |
1098 S_028A4C_OUT_OF_ORDER_WATER_MARK(0x7) |
1099 /* always 1: */
1100 S_028A4C_WALK_ALIGN8_PRIM_FITS_ST(1) | S_028A4C_SUPERTILE_WALK_ORDER_ENABLE(1) |
1101 S_028A4C_TILE_WALK_ORDER_ENABLE(1) | S_028A4C_MULTI_SHADER_ENGINE_PRIM_DISCARD_ENABLE(1) |
1102 S_028A4C_FORCE_EOV_CNTDWN_ENABLE(1) | S_028A4C_FORCE_EOV_REZ_ENABLE(1);
1103 ms->pa_sc_mode_cntl_0 = S_028A48_ALTERNATE_RBS_PER_TILE(
1104 pipeline->device->physical_device->rad_info.chip_class >= GFX9) |
1105 S_028A48_VPORT_SCISSOR_ENABLE(1);
1106
1107 const VkPipelineRasterizationLineStateCreateInfoEXT *rast_line = vk_find_struct_const(
1108 pCreateInfo->pRasterizationState->pNext, PIPELINE_RASTERIZATION_LINE_STATE_CREATE_INFO_EXT);
1109 if (rast_line) {
1110 ms->pa_sc_mode_cntl_0 |= S_028A48_LINE_STIPPLE_ENABLE(rast_line->stippledLineEnable);
1111 if (rast_line->lineRasterizationMode == VK_LINE_RASTERIZATION_MODE_BRESENHAM_EXT) {
1112 /* From the Vulkan spec 1.1.129:
1113 *
1114 * "When VK_LINE_RASTERIZATION_MODE_BRESENHAM_EXT lines
1115 * are being rasterized, sample locations may all be
1116 * treated as being at the pixel center (this may
1117 * affect attribute and depth interpolation)."
1118 */
1119 ms->num_samples = 1;
1120 }
1121 }
1122
1123 if (ms->num_samples > 1) {
1124 RADV_FROM_HANDLE(radv_render_pass, pass, pCreateInfo->renderPass);
1125 struct radv_subpass *subpass = &pass->subpasses[pCreateInfo->subpass];
1126 uint32_t z_samples =
1127 subpass->depth_stencil_attachment ? subpass->depth_sample_count : ms->num_samples;
1128 unsigned log_samples = util_logbase2(ms->num_samples);
1129 unsigned log_z_samples = util_logbase2(z_samples);
1130 unsigned log_ps_iter_samples = util_logbase2(ps_iter_samples);
1131 ms->pa_sc_mode_cntl_0 |= S_028A48_MSAA_ENABLE(1);
1132 ms->db_eqaa |= S_028804_MAX_ANCHOR_SAMPLES(log_z_samples) |
1133 S_028804_PS_ITER_SAMPLES(log_ps_iter_samples) |
1134 S_028804_MASK_EXPORT_NUM_SAMPLES(log_samples) |
1135 S_028804_ALPHA_TO_MASK_NUM_SAMPLES(log_samples);
1136 ms->pa_sc_aa_config |=
1137 S_028BE0_MSAA_NUM_SAMPLES(log_samples) |
1138 S_028BE0_MAX_SAMPLE_DIST(radv_get_default_max_sample_dist(log_samples)) |
1139 S_028BE0_MSAA_EXPOSED_SAMPLES(log_samples) | /* CM_R_028BE0_PA_SC_AA_CONFIG */
1140 S_028BE0_COVERED_CENTROID_IS_CENTER(
1141 pipeline->device->physical_device->rad_info.chip_class >= GFX10_3);
1142 ms->pa_sc_mode_cntl_1 |= S_028A4C_PS_ITER_SAMPLE(ps_iter_samples > 1);
1143 if (ps_iter_samples > 1)
1144 pipeline->graphics.spi_baryc_cntl |= S_0286E0_POS_FLOAT_LOCATION(2);
1145 }
1146
1147 if (vkms && vkms->pSampleMask) {
1148 mask = vkms->pSampleMask[0] & 0xffff;
1149 }
1150
1151 ms->pa_sc_aa_mask[0] = mask | (mask << 16);
1152 ms->pa_sc_aa_mask[1] = mask | (mask << 16);
1153 }
1154
1155 static void
1156 gfx103_pipeline_init_vrs_state(struct radv_pipeline *pipeline,
1157 const VkGraphicsPipelineCreateInfo *pCreateInfo)
1158 {
1159 const VkPipelineMultisampleStateCreateInfo *vkms =
1160 radv_pipeline_get_multisample_state(pCreateInfo);
1161 struct radv_shader_variant *ps = pipeline->shaders[MESA_SHADER_FRAGMENT];
1162 struct radv_multisample_state *ms = &pipeline->graphics.ms;
1163 struct radv_vrs_state *vrs = &pipeline->graphics.vrs;
1164
1165 if (vkms && (vkms->sampleShadingEnable || ps->info.ps.uses_sample_shading ||
1166 ps->info.ps.reads_sample_mask_in)) {
1167 /* Disable VRS and use the rates from PS_ITER_SAMPLES if:
1168 *
1169 * 1) sample shading is enabled or per-sample interpolation is
1170 * used by the fragment shader
1171 * 2) the fragment shader reads gl_SampleMaskIn, because the
1172 * 16-bit sample coverage mask isn't enough for MSAA8x
1173 * combined with 2x2 coarse shading.
1174 */
1175 vrs->pa_cl_vrs_cntl = S_028848_SAMPLE_ITER_COMBINER_MODE(V_028848_VRS_COMB_MODE_OVERRIDE);
1176
1177 /* Make sure sample shading is enabled even if only MSAA1x is
1178 * used because the SAMPLE_ITER combiner is in passthrough
1179 * mode if PS_ITER_SAMPLE is 0, and it uses the per-draw rate.
1180 * The default VRS rate when sample shading is enabled is 1x1.
1181 */
1182 if (!G_028A4C_PS_ITER_SAMPLE(ms->pa_sc_mode_cntl_1))
1183 ms->pa_sc_mode_cntl_1 |= S_028A4C_PS_ITER_SAMPLE(1);
1184 } else {
1185 vrs->pa_cl_vrs_cntl = S_028848_SAMPLE_ITER_COMBINER_MODE(V_028848_VRS_COMB_MODE_PASSTHRU);
1186 }
1187
1188 /* The primitive combiner is always passthrough. */
1189 vrs->pa_cl_vrs_cntl |= S_028848_PRIMITIVE_RATE_COMBINER_MODE(V_028848_VRS_COMB_MODE_PASSTHRU);
1190 }
1191
1192 static bool
1193 radv_prim_can_use_guardband(enum VkPrimitiveTopology topology)
1194 {
1195 switch (topology) {
1196 case VK_PRIMITIVE_TOPOLOGY_POINT_LIST:
1197 case VK_PRIMITIVE_TOPOLOGY_LINE_LIST:
1198 case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP:
1199 case VK_PRIMITIVE_TOPOLOGY_LINE_LIST_WITH_ADJACENCY:
1200 case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP_WITH_ADJACENCY:
1201 return false;
1202 case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST:
1203 case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP:
1204 case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_FAN:
1205 case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST_WITH_ADJACENCY:
1206 case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP_WITH_ADJACENCY:
1207 case VK_PRIMITIVE_TOPOLOGY_PATCH_LIST:
1208 return true;
1209 default:
1210 unreachable("unhandled primitive type");
1211 }
1212 }
1213
1214 static uint32_t
1215 si_conv_gl_prim_to_gs_out(unsigned gl_prim)
1216 {
1217 switch (gl_prim) {
1218 case 0: /* GL_POINTS */
1219 return V_028A6C_POINTLIST;
1220 case 1: /* GL_LINES */
1221 case 3: /* GL_LINE_STRIP */
1222 case 0xA: /* GL_LINE_STRIP_ADJACENCY_ARB */
1223 case 0x8E7A: /* GL_ISOLINES */
1224 return V_028A6C_LINESTRIP;
1225
1226 case 4: /* GL_TRIANGLES */
1227 case 0xc: /* GL_TRIANGLES_ADJACENCY_ARB */
1228 case 5: /* GL_TRIANGLE_STRIP */
1229 case 7: /* GL_QUADS */
1230 return V_028A6C_TRISTRIP;
1231 default:
1232 assert(0);
1233 return 0;
1234 }
1235 }
1236
1237 static uint64_t
1238 radv_dynamic_state_mask(VkDynamicState state)
1239 {
1240 switch (state) {
1241 case VK_DYNAMIC_STATE_VIEWPORT:
1242 case VK_DYNAMIC_STATE_VIEWPORT_WITH_COUNT_EXT:
1243 return RADV_DYNAMIC_VIEWPORT;
1244 case VK_DYNAMIC_STATE_SCISSOR:
1245 case VK_DYNAMIC_STATE_SCISSOR_WITH_COUNT_EXT:
1246 return RADV_DYNAMIC_SCISSOR;
1247 case VK_DYNAMIC_STATE_LINE_WIDTH:
1248 return RADV_DYNAMIC_LINE_WIDTH;
1249 case VK_DYNAMIC_STATE_DEPTH_BIAS:
1250 return RADV_DYNAMIC_DEPTH_BIAS;
1251 case VK_DYNAMIC_STATE_BLEND_CONSTANTS:
1252 return RADV_DYNAMIC_BLEND_CONSTANTS;
1253 case VK_DYNAMIC_STATE_DEPTH_BOUNDS:
1254 return RADV_DYNAMIC_DEPTH_BOUNDS;
1255 case VK_DYNAMIC_STATE_STENCIL_COMPARE_MASK:
1256 return RADV_DYNAMIC_STENCIL_COMPARE_MASK;
1257 case VK_DYNAMIC_STATE_STENCIL_WRITE_MASK:
1258 return RADV_DYNAMIC_STENCIL_WRITE_MASK;
1259 case VK_DYNAMIC_STATE_STENCIL_REFERENCE:
1260 return RADV_DYNAMIC_STENCIL_REFERENCE;
1261 case VK_DYNAMIC_STATE_DISCARD_RECTANGLE_EXT:
1262 return RADV_DYNAMIC_DISCARD_RECTANGLE;
1263 case VK_DYNAMIC_STATE_SAMPLE_LOCATIONS_EXT:
1264 return RADV_DYNAMIC_SAMPLE_LOCATIONS;
1265 case VK_DYNAMIC_STATE_LINE_STIPPLE_EXT:
1266 return RADV_DYNAMIC_LINE_STIPPLE;
1267 case VK_DYNAMIC_STATE_CULL_MODE_EXT:
1268 return RADV_DYNAMIC_CULL_MODE;
1269 case VK_DYNAMIC_STATE_FRONT_FACE_EXT:
1270 return RADV_DYNAMIC_FRONT_FACE;
1271 case VK_DYNAMIC_STATE_PRIMITIVE_TOPOLOGY_EXT:
1272 return RADV_DYNAMIC_PRIMITIVE_TOPOLOGY;
1273 case VK_DYNAMIC_STATE_DEPTH_TEST_ENABLE_EXT:
1274 return RADV_DYNAMIC_DEPTH_TEST_ENABLE;
1275 case VK_DYNAMIC_STATE_DEPTH_WRITE_ENABLE_EXT:
1276 return RADV_DYNAMIC_DEPTH_WRITE_ENABLE;
1277 case VK_DYNAMIC_STATE_DEPTH_COMPARE_OP_EXT:
1278 return RADV_DYNAMIC_DEPTH_COMPARE_OP;
1279 case VK_DYNAMIC_STATE_DEPTH_BOUNDS_TEST_ENABLE_EXT:
1280 return RADV_DYNAMIC_DEPTH_BOUNDS_TEST_ENABLE;
1281 case VK_DYNAMIC_STATE_STENCIL_TEST_ENABLE_EXT:
1282 return RADV_DYNAMIC_STENCIL_TEST_ENABLE;
1283 case VK_DYNAMIC_STATE_STENCIL_OP_EXT:
1284 return RADV_DYNAMIC_STENCIL_OP;
1285 case VK_DYNAMIC_STATE_VERTEX_INPUT_BINDING_STRIDE_EXT:
1286 return RADV_DYNAMIC_VERTEX_INPUT_BINDING_STRIDE;
1287 case VK_DYNAMIC_STATE_FRAGMENT_SHADING_RATE_KHR:
1288 return RADV_DYNAMIC_FRAGMENT_SHADING_RATE;
1289 case VK_DYNAMIC_STATE_PATCH_CONTROL_POINTS_EXT:
1290 return RADV_DYNAMIC_PATCH_CONTROL_POINTS;
1291 case VK_DYNAMIC_STATE_RASTERIZER_DISCARD_ENABLE_EXT:
1292 return RADV_DYNAMIC_RASTERIZER_DISCARD_ENABLE;
1293 case VK_DYNAMIC_STATE_DEPTH_BIAS_ENABLE_EXT:
1294 return RADV_DYNAMIC_DEPTH_BIAS_ENABLE;
1295 case VK_DYNAMIC_STATE_LOGIC_OP_EXT:
1296 return RADV_DYNAMIC_LOGIC_OP;
1297 case VK_DYNAMIC_STATE_PRIMITIVE_RESTART_ENABLE_EXT:
1298 return RADV_DYNAMIC_PRIMITIVE_RESTART_ENABLE;
1299 case VK_DYNAMIC_STATE_COLOR_WRITE_ENABLE_EXT:
1300 return RADV_DYNAMIC_COLOR_WRITE_ENABLE;
1301 case VK_DYNAMIC_STATE_VERTEX_INPUT_EXT:
1302 return RADV_DYNAMIC_VERTEX_INPUT;
1303 default:
1304 unreachable("Unhandled dynamic state");
1305 }
1306 }
1307
1308 static bool
1309 radv_pipeline_is_blend_enabled(const VkGraphicsPipelineCreateInfo *pCreateInfo)
1310 {
1311 const VkPipelineColorBlendStateCreateInfo *vkblend =
1312 radv_pipeline_get_color_blend_state(pCreateInfo);
1313
1314 assert(vkblend);
1315
1316 for (uint32_t i = 0; i < vkblend->attachmentCount; i++) {
1317 const VkPipelineColorBlendAttachmentState *att = &vkblend->pAttachments[i];
1318 if (att->colorWriteMask && att->blendEnable)
1319 return true;
1320 }
1321 return false;
1322 }
1323
1324 static uint64_t
1325 radv_pipeline_needed_dynamic_state(const VkGraphicsPipelineCreateInfo *pCreateInfo)
1326 {
1327 RADV_FROM_HANDLE(radv_render_pass, pass, pCreateInfo->renderPass);
1328 struct radv_subpass *subpass = &pass->subpasses[pCreateInfo->subpass];
1329 uint64_t states = RADV_DYNAMIC_ALL;
1330
1331 /* If rasterization is disabled we do not care about any of the
1332 * dynamic states, since they are all rasterization related only,
1333 * except primitive topology, primitive restart enable, vertex
1334 * binding stride and rasterization discard itself.
1335 */
1336 if (pCreateInfo->pRasterizationState->rasterizerDiscardEnable &&
1337 !radv_is_state_dynamic(pCreateInfo, VK_DYNAMIC_STATE_RASTERIZER_DISCARD_ENABLE_EXT)) {
1338 return RADV_DYNAMIC_PRIMITIVE_TOPOLOGY | RADV_DYNAMIC_VERTEX_INPUT_BINDING_STRIDE |
1339 RADV_DYNAMIC_PRIMITIVE_RESTART_ENABLE | RADV_DYNAMIC_RASTERIZER_DISCARD_ENABLE |
1340 RADV_DYNAMIC_VERTEX_INPUT;
1341 }
1342
1343 if (!pCreateInfo->pRasterizationState->depthBiasEnable &&
1344 !radv_is_state_dynamic(pCreateInfo, VK_DYNAMIC_STATE_DEPTH_BIAS_ENABLE_EXT))
1345 states &= ~RADV_DYNAMIC_DEPTH_BIAS;
1346
1347 if (!pCreateInfo->pDepthStencilState ||
1348 (!pCreateInfo->pDepthStencilState->depthBoundsTestEnable &&
1349 !radv_is_state_dynamic(pCreateInfo, VK_DYNAMIC_STATE_DEPTH_BOUNDS_TEST_ENABLE_EXT)))
1350 states &= ~RADV_DYNAMIC_DEPTH_BOUNDS;
1351
1352 if (!pCreateInfo->pDepthStencilState ||
1353 (!pCreateInfo->pDepthStencilState->stencilTestEnable &&
1354 !radv_is_state_dynamic(pCreateInfo, VK_DYNAMIC_STATE_STENCIL_TEST_ENABLE_EXT)))
1355 states &= ~(RADV_DYNAMIC_STENCIL_COMPARE_MASK | RADV_DYNAMIC_STENCIL_WRITE_MASK |
1356 RADV_DYNAMIC_STENCIL_REFERENCE);
1357
1358 if (!vk_find_struct_const(pCreateInfo->pNext, PIPELINE_DISCARD_RECTANGLE_STATE_CREATE_INFO_EXT))
1359 states &= ~RADV_DYNAMIC_DISCARD_RECTANGLE;
1360
1361 if (!pCreateInfo->pMultisampleState ||
1362 !vk_find_struct_const(pCreateInfo->pMultisampleState->pNext,
1363 PIPELINE_SAMPLE_LOCATIONS_STATE_CREATE_INFO_EXT))
1364 states &= ~RADV_DYNAMIC_SAMPLE_LOCATIONS;
1365
1366 if (!pCreateInfo->pRasterizationState)
1367 states &= ~RADV_DYNAMIC_LINE_STIPPLE;
1368 else {
1369 const VkPipelineRasterizationLineStateCreateInfoEXT *rast_line_info = vk_find_struct_const(pCreateInfo->pRasterizationState->pNext,
1370 PIPELINE_RASTERIZATION_LINE_STATE_CREATE_INFO_EXT);
1371 if (!rast_line_info || !rast_line_info->stippledLineEnable)
1372 states &= ~RADV_DYNAMIC_LINE_STIPPLE;
1373 }
1374
1375 if (!vk_find_struct_const(pCreateInfo->pNext,
1376 PIPELINE_FRAGMENT_SHADING_RATE_STATE_CREATE_INFO_KHR) &&
1377 !radv_is_state_dynamic(pCreateInfo, VK_DYNAMIC_STATE_FRAGMENT_SHADING_RATE_KHR))
1378 states &= ~RADV_DYNAMIC_FRAGMENT_SHADING_RATE;
1379
1380 if (!subpass->has_color_att ||
1381 !radv_pipeline_is_blend_enabled(pCreateInfo))
1382 states &= ~RADV_DYNAMIC_BLEND_CONSTANTS;
1383
1384 if (!subpass->has_color_att)
1385 states &= ~RADV_DYNAMIC_COLOR_WRITE_ENABLE;
1386
1387 return states;
1388 }
1389
1390 static struct radv_ia_multi_vgt_param_helpers
1391 radv_compute_ia_multi_vgt_param_helpers(struct radv_pipeline *pipeline)
1392 {
1393 struct radv_ia_multi_vgt_param_helpers ia_multi_vgt_param = {0};
1394 const struct radv_device *device = pipeline->device;
1395
1396 if (radv_pipeline_has_tess(pipeline))
1397 ia_multi_vgt_param.primgroup_size =
1398 pipeline->shaders[MESA_SHADER_TESS_CTRL]->info.num_tess_patches;
1399 else if (radv_pipeline_has_gs(pipeline))
1400 ia_multi_vgt_param.primgroup_size = 64;
1401 else
1402 ia_multi_vgt_param.primgroup_size = 128; /* recommended without a GS */
1403
1404 /* GS requirement. */
1405 ia_multi_vgt_param.partial_es_wave = false;
1406 if (radv_pipeline_has_gs(pipeline) && device->physical_device->rad_info.chip_class <= GFX8)
1407 if (SI_GS_PER_ES / ia_multi_vgt_param.primgroup_size >= pipeline->device->gs_table_depth - 3)
1408 ia_multi_vgt_param.partial_es_wave = true;
1409
1410 ia_multi_vgt_param.ia_switch_on_eoi = false;
1411 if (pipeline->shaders[MESA_SHADER_FRAGMENT]->info.ps.prim_id_input)
1412 ia_multi_vgt_param.ia_switch_on_eoi = true;
1413 if (radv_pipeline_has_gs(pipeline) && pipeline->shaders[MESA_SHADER_GEOMETRY]->info.uses_prim_id)
1414 ia_multi_vgt_param.ia_switch_on_eoi = true;
1415 if (radv_pipeline_has_tess(pipeline)) {
1416 /* SWITCH_ON_EOI must be set if PrimID is used. */
1417 if (pipeline->shaders[MESA_SHADER_TESS_CTRL]->info.uses_prim_id ||
1418 radv_get_shader(pipeline, MESA_SHADER_TESS_EVAL)->info.uses_prim_id)
1419 ia_multi_vgt_param.ia_switch_on_eoi = true;
1420 }
1421
1422 ia_multi_vgt_param.partial_vs_wave = false;
1423 if (radv_pipeline_has_tess(pipeline)) {
1424 /* Bug with tessellation and GS on Bonaire and older 2 SE chips. */
1425 if ((device->physical_device->rad_info.family == CHIP_TAHITI ||
1426 device->physical_device->rad_info.family == CHIP_PITCAIRN ||
1427 device->physical_device->rad_info.family == CHIP_BONAIRE) &&
1428 radv_pipeline_has_gs(pipeline))
1429 ia_multi_vgt_param.partial_vs_wave = true;
1430 /* Needed for 028B6C_DISTRIBUTION_MODE != 0 */
1431 if (device->physical_device->rad_info.has_distributed_tess) {
1432 if (radv_pipeline_has_gs(pipeline)) {
1433 if (device->physical_device->rad_info.chip_class <= GFX8)
1434 ia_multi_vgt_param.partial_es_wave = true;
1435 } else {
1436 ia_multi_vgt_param.partial_vs_wave = true;
1437 }
1438 }
1439 }
1440
1441 if (radv_pipeline_has_gs(pipeline)) {
1442 /* On these chips there is the possibility of a hang if the
1443 * pipeline uses a GS and partial_vs_wave is not set.
1444 *
1445 * This mostly does not hit 4-SE chips, as those typically set
1446 * ia_switch_on_eoi and then partial_vs_wave is set for pipelines
1447 * with GS due to another workaround.
1448 *
1449 * Reproducer: https://bugs.freedesktop.org/show_bug.cgi?id=109242
1450 */
1451 if (device->physical_device->rad_info.family == CHIP_TONGA ||
1452 device->physical_device->rad_info.family == CHIP_FIJI ||
1453 device->physical_device->rad_info.family == CHIP_POLARIS10 ||
1454 device->physical_device->rad_info.family == CHIP_POLARIS11 ||
1455 device->physical_device->rad_info.family == CHIP_POLARIS12 ||
1456 device->physical_device->rad_info.family == CHIP_VEGAM) {
1457 ia_multi_vgt_param.partial_vs_wave = true;
1458 }
1459 }
1460
1461 ia_multi_vgt_param.base =
1462 S_028AA8_PRIMGROUP_SIZE(ia_multi_vgt_param.primgroup_size - 1) |
1463 /* The following field was moved to VGT_SHADER_STAGES_EN in GFX9. */
1464 S_028AA8_MAX_PRIMGRP_IN_WAVE(device->physical_device->rad_info.chip_class == GFX8 ? 2 : 0) |
1465 S_030960_EN_INST_OPT_BASIC(device->physical_device->rad_info.chip_class >= GFX9) |
1466 S_030960_EN_INST_OPT_ADV(device->physical_device->rad_info.chip_class >= GFX9);
1467
1468 return ia_multi_vgt_param;
1469 }
1470
1471 static void
1472 radv_pipeline_init_input_assembly_state(struct radv_pipeline *pipeline,
1473 const VkGraphicsPipelineCreateInfo *pCreateInfo,
1474 const struct radv_graphics_pipeline_create_info *extra)
1475 {
1476 const VkPipelineInputAssemblyStateCreateInfo *ia_state = pCreateInfo->pInputAssemblyState;
1477 struct radv_shader_variant *tes = pipeline->shaders[MESA_SHADER_TESS_EVAL];
1478 struct radv_shader_variant *gs = pipeline->shaders[MESA_SHADER_GEOMETRY];
1479
1480 pipeline->graphics.can_use_guardband = radv_prim_can_use_guardband(ia_state->topology);
1481
1482 if (radv_pipeline_has_gs(pipeline)) {
1483 if (si_conv_gl_prim_to_gs_out(gs->info.gs.output_prim) == V_028A6C_TRISTRIP)
1484 pipeline->graphics.can_use_guardband = true;
1485 } else if (radv_pipeline_has_tess(pipeline)) {
1486 if (!tes->info.tes.point_mode &&
1487 si_conv_gl_prim_to_gs_out(tes->info.tes.primitive_mode) == V_028A6C_TRISTRIP)
1488 pipeline->graphics.can_use_guardband = true;
1489 }
1490
1491 if (extra && extra->use_rectlist) {
1492 pipeline->graphics.can_use_guardband = true;
1493 }
1494
1495 pipeline->graphics.ia_multi_vgt_param = radv_compute_ia_multi_vgt_param_helpers(pipeline);
1496 }
1497
1498 static void
1499 radv_pipeline_init_dynamic_state(struct radv_pipeline *pipeline,
1500 const VkGraphicsPipelineCreateInfo *pCreateInfo,
1501 const struct radv_graphics_pipeline_create_info *extra)
1502 {
1503 uint64_t needed_states = radv_pipeline_needed_dynamic_state(pCreateInfo);
1504 uint64_t states = needed_states;
1505 RADV_FROM_HANDLE(radv_render_pass, pass, pCreateInfo->renderPass);
1506 struct radv_subpass *subpass = &pass->subpasses[pCreateInfo->subpass];
1507
1508 pipeline->dynamic_state = default_dynamic_state;
1509 pipeline->graphics.needed_dynamic_state = needed_states;
1510
1511 if (pCreateInfo->pDynamicState) {
1512 /* Remove all of the states that are marked as dynamic */
1513 uint32_t count = pCreateInfo->pDynamicState->dynamicStateCount;
1514 for (uint32_t s = 0; s < count; s++)
1515 states &= ~radv_dynamic_state_mask(pCreateInfo->pDynamicState->pDynamicStates[s]);
1516 }
1517
1518 struct radv_dynamic_state *dynamic = &pipeline->dynamic_state;
1519
1520 if (needed_states & RADV_DYNAMIC_VIEWPORT) {
1521 assert(pCreateInfo->pViewportState);
1522
1523 dynamic->viewport.count = pCreateInfo->pViewportState->viewportCount;
1524 if (states & RADV_DYNAMIC_VIEWPORT) {
1525 typed_memcpy(dynamic->viewport.viewports, pCreateInfo->pViewportState->pViewports,
1526 pCreateInfo->pViewportState->viewportCount);
1527 for (unsigned i = 0; i < dynamic->viewport.count; i++)
1528 radv_get_viewport_xform(&dynamic->viewport.viewports[i],
1529 dynamic->viewport.xform[i].scale, dynamic->viewport.xform[i].translate);
1530 }
1531 }
1532
1533 if (needed_states & RADV_DYNAMIC_SCISSOR) {
1534 dynamic->scissor.count = pCreateInfo->pViewportState->scissorCount;
1535 if (states & RADV_DYNAMIC_SCISSOR) {
1536 typed_memcpy(dynamic->scissor.scissors, pCreateInfo->pViewportState->pScissors,
1537 pCreateInfo->pViewportState->scissorCount);
1538 }
1539 }
1540
1541 if (states & RADV_DYNAMIC_LINE_WIDTH) {
1542 assert(pCreateInfo->pRasterizationState);
1543 dynamic->line_width = pCreateInfo->pRasterizationState->lineWidth;
1544 }
1545
1546 if (states & RADV_DYNAMIC_DEPTH_BIAS) {
1547 assert(pCreateInfo->pRasterizationState);
1548 dynamic->depth_bias.bias = pCreateInfo->pRasterizationState->depthBiasConstantFactor;
1549 dynamic->depth_bias.clamp = pCreateInfo->pRasterizationState->depthBiasClamp;
1550 dynamic->depth_bias.slope = pCreateInfo->pRasterizationState->depthBiasSlopeFactor;
1551 }
1552
1553 /* Section 9.2 of the Vulkan 1.0.15 spec says:
1554 *
1555 * pColorBlendState is [...] NULL if the pipeline has rasterization
1556 * disabled or if the subpass of the render pass the pipeline is
1557 * created against does not use any color attachments.
1558 */
1559 if (states & RADV_DYNAMIC_BLEND_CONSTANTS) {
1560 assert(pCreateInfo->pColorBlendState);
1561 typed_memcpy(dynamic->blend_constants, pCreateInfo->pColorBlendState->blendConstants, 4);
1562 }
1563
1564 if (states & RADV_DYNAMIC_CULL_MODE) {
1565 dynamic->cull_mode = pCreateInfo->pRasterizationState->cullMode;
1566 }
1567
1568 if (states & RADV_DYNAMIC_FRONT_FACE) {
1569 dynamic->front_face = pCreateInfo->pRasterizationState->frontFace;
1570 }
1571
1572 if (states & RADV_DYNAMIC_PRIMITIVE_TOPOLOGY) {
1573 dynamic->primitive_topology = si_translate_prim(pCreateInfo->pInputAssemblyState->topology);
1574 if (extra && extra->use_rectlist) {
1575 dynamic->primitive_topology = V_008958_DI_PT_RECTLIST;
1576 }
1577 }
1578
1579 /* If there is no depthstencil attachment, then don't read
1580 * pDepthStencilState. The Vulkan spec states that pDepthStencilState may
1581 * be NULL in this case. Even if pDepthStencilState is non-NULL, there is
1582 * no need to override the depthstencil defaults in
1583 * radv_pipeline::dynamic_state when there is no depthstencil attachment.
1584 *
1585 * Section 9.2 of the Vulkan 1.0.15 spec says:
1586 *
1587 * pDepthStencilState is [...] NULL if the pipeline has rasterization
1588 * disabled or if the subpass of the render pass the pipeline is created
1589 * against does not use a depth/stencil attachment.
1590 */
1591 if (needed_states && subpass->depth_stencil_attachment) {
1592 if (states & RADV_DYNAMIC_DEPTH_BOUNDS) {
1593 dynamic->depth_bounds.min = pCreateInfo->pDepthStencilState->minDepthBounds;
1594 dynamic->depth_bounds.max = pCreateInfo->pDepthStencilState->maxDepthBounds;
1595 }
1596
1597 if (states & RADV_DYNAMIC_STENCIL_COMPARE_MASK) {
1598 dynamic->stencil_compare_mask.front = pCreateInfo->pDepthStencilState->front.compareMask;
1599 dynamic->stencil_compare_mask.back = pCreateInfo->pDepthStencilState->back.compareMask;
1600 }
1601
1602 if (states & RADV_DYNAMIC_STENCIL_WRITE_MASK) {
1603 dynamic->stencil_write_mask.front = pCreateInfo->pDepthStencilState->front.writeMask;
1604 dynamic->stencil_write_mask.back = pCreateInfo->pDepthStencilState->back.writeMask;
1605 }
1606
1607 if (states & RADV_DYNAMIC_STENCIL_REFERENCE) {
1608 dynamic->stencil_reference.front = pCreateInfo->pDepthStencilState->front.reference;
1609 dynamic->stencil_reference.back = pCreateInfo->pDepthStencilState->back.reference;
1610 }
1611
1612 if (states & RADV_DYNAMIC_DEPTH_TEST_ENABLE) {
1613 dynamic->depth_test_enable = pCreateInfo->pDepthStencilState->depthTestEnable;
1614 }
1615
1616 if (states & RADV_DYNAMIC_DEPTH_WRITE_ENABLE) {
1617 dynamic->depth_write_enable = pCreateInfo->pDepthStencilState->depthWriteEnable;
1618 }
1619
1620 if (states & RADV_DYNAMIC_DEPTH_COMPARE_OP) {
1621 dynamic->depth_compare_op = pCreateInfo->pDepthStencilState->depthCompareOp;
1622 }
1623
1624 if (states & RADV_DYNAMIC_DEPTH_BOUNDS_TEST_ENABLE) {
1625 dynamic->depth_bounds_test_enable = pCreateInfo->pDepthStencilState->depthBoundsTestEnable;
1626 }
1627
1628 if (states & RADV_DYNAMIC_STENCIL_TEST_ENABLE) {
1629 dynamic->stencil_test_enable = pCreateInfo->pDepthStencilState->stencilTestEnable;
1630 }
1631
1632 if (states & RADV_DYNAMIC_STENCIL_OP) {
1633 dynamic->stencil_op.front.compare_op = pCreateInfo->pDepthStencilState->front.compareOp;
1634 dynamic->stencil_op.front.fail_op = pCreateInfo->pDepthStencilState->front.failOp;
1635 dynamic->stencil_op.front.pass_op = pCreateInfo->pDepthStencilState->front.passOp;
1636 dynamic->stencil_op.front.depth_fail_op =
1637 pCreateInfo->pDepthStencilState->front.depthFailOp;
1638
1639 dynamic->stencil_op.back.compare_op = pCreateInfo->pDepthStencilState->back.compareOp;
1640 dynamic->stencil_op.back.fail_op = pCreateInfo->pDepthStencilState->back.failOp;
1641 dynamic->stencil_op.back.pass_op = pCreateInfo->pDepthStencilState->back.passOp;
1642 dynamic->stencil_op.back.depth_fail_op = pCreateInfo->pDepthStencilState->back.depthFailOp;
1643 }
1644 }
1645
1646 const VkPipelineDiscardRectangleStateCreateInfoEXT *discard_rectangle_info =
1647 vk_find_struct_const(pCreateInfo->pNext, PIPELINE_DISCARD_RECTANGLE_STATE_CREATE_INFO_EXT);
1648 if (needed_states & RADV_DYNAMIC_DISCARD_RECTANGLE) {
1649 dynamic->discard_rectangle.count = discard_rectangle_info->discardRectangleCount;
1650 if (states & RADV_DYNAMIC_DISCARD_RECTANGLE) {
1651 typed_memcpy(dynamic->discard_rectangle.rectangles,
1652 discard_rectangle_info->pDiscardRectangles,
1653 discard_rectangle_info->discardRectangleCount);
1654 }
1655 }
1656
1657 if (needed_states & RADV_DYNAMIC_SAMPLE_LOCATIONS) {
1658 const VkPipelineSampleLocationsStateCreateInfoEXT *sample_location_info =
1659 vk_find_struct_const(pCreateInfo->pMultisampleState->pNext,
1660 PIPELINE_SAMPLE_LOCATIONS_STATE_CREATE_INFO_EXT);
1661 /* If sampleLocationsEnable is VK_FALSE, the default sample
1662 * locations are used and the values specified in
1663 * sampleLocationsInfo are ignored.
1664 */
1665 if (sample_location_info->sampleLocationsEnable) {
1666 const VkSampleLocationsInfoEXT *pSampleLocationsInfo =
1667 &sample_location_info->sampleLocationsInfo;
1668
1669 assert(pSampleLocationsInfo->sampleLocationsCount <= MAX_SAMPLE_LOCATIONS);
1670
1671 dynamic->sample_location.per_pixel = pSampleLocationsInfo->sampleLocationsPerPixel;
1672 dynamic->sample_location.grid_size = pSampleLocationsInfo->sampleLocationGridSize;
1673 dynamic->sample_location.count = pSampleLocationsInfo->sampleLocationsCount;
1674 typed_memcpy(&dynamic->sample_location.locations[0],
1675 pSampleLocationsInfo->pSampleLocations,
1676 pSampleLocationsInfo->sampleLocationsCount);
1677 }
1678 }
1679
1680 const VkPipelineRasterizationLineStateCreateInfoEXT *rast_line_info = vk_find_struct_const(
1681 pCreateInfo->pRasterizationState->pNext, PIPELINE_RASTERIZATION_LINE_STATE_CREATE_INFO_EXT);
1682 if (needed_states & RADV_DYNAMIC_LINE_STIPPLE) {
1683 dynamic->line_stipple.factor = rast_line_info->lineStippleFactor;
1684 dynamic->line_stipple.pattern = rast_line_info->lineStipplePattern;
1685 }
1686
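/* A note on the check below: at this point `states` only contains state that
 * is baked into the pipeline (bits for states declared dynamic were cleared
 * above), so a missing stride or vertex-input bit means the binding stride
 * will be supplied at draw time.
 */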
1687 if (!(states & RADV_DYNAMIC_VERTEX_INPUT_BINDING_STRIDE) ||
1688 !(states & RADV_DYNAMIC_VERTEX_INPUT))
1689 pipeline->graphics.uses_dynamic_stride = true;
1690
1691 const VkPipelineFragmentShadingRateStateCreateInfoKHR *shading_rate = vk_find_struct_const(
1692 pCreateInfo->pNext, PIPELINE_FRAGMENT_SHADING_RATE_STATE_CREATE_INFO_KHR);
1693 if (states & RADV_DYNAMIC_FRAGMENT_SHADING_RATE) {
1694 dynamic->fragment_shading_rate.size = shading_rate->fragmentSize;
1695 for (int i = 0; i < 2; i++)
1696 dynamic->fragment_shading_rate.combiner_ops[i] = shading_rate->combinerOps[i];
1697 }
1698
1699 if (states & RADV_DYNAMIC_DEPTH_BIAS_ENABLE) {
1700 dynamic->depth_bias_enable = pCreateInfo->pRasterizationState->depthBiasEnable;
1701 }
1702
1703 if (states & RADV_DYNAMIC_PRIMITIVE_RESTART_ENABLE) {
1704 dynamic->primitive_restart_enable =
1705 !!pCreateInfo->pInputAssemblyState->primitiveRestartEnable;
1706 }
1707
1708 if (states & RADV_DYNAMIC_RASTERIZER_DISCARD_ENABLE) {
1709 dynamic->rasterizer_discard_enable =
1710 pCreateInfo->pRasterizationState->rasterizerDiscardEnable;
1711 }
1712
1713 if (subpass->has_color_att && states & RADV_DYNAMIC_LOGIC_OP) {
1714 if (pCreateInfo->pColorBlendState->logicOpEnable) {
1715 dynamic->logic_op = si_translate_blend_logic_op(pCreateInfo->pColorBlendState->logicOp);
1716 } else {
1717 dynamic->logic_op = V_028808_ROP3_COPY;
1718 }
1719 }
1720
1721 if (states & RADV_DYNAMIC_COLOR_WRITE_ENABLE) {
1722 const VkPipelineColorWriteCreateInfoEXT *color_write_info = vk_find_struct_const(
1723 pCreateInfo->pColorBlendState->pNext, PIPELINE_COLOR_WRITE_CREATE_INFO_EXT);
1724 if (color_write_info) {
1725 dynamic->color_write_enable = 0;
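/* Expand each VkBool32 into 4 bits (one per channel) so the mask matches the
 * 4-bit-per-target layout used elsewhere in this file (e.g.
 * cb_target_enabled_4bit).
 */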
1726 for (uint32_t i = 0; i < color_write_info->attachmentCount; i++) {
1727 dynamic->color_write_enable |=
1728 color_write_info->pColorWriteEnables[i] ? (0xfu << (i * 4)) : 0;
1729 }
1730 }
1731 }
1732
1733 pipeline->dynamic_state.mask = states;
1734 }
1735
1736 static void
1737 radv_pipeline_init_raster_state(struct radv_pipeline *pipeline,
1738 const VkGraphicsPipelineCreateInfo *pCreateInfo)
1739 {
1740 const VkPipelineRasterizationStateCreateInfo *raster_info = pCreateInfo->pRasterizationState;
1741 const VkPipelineRasterizationProvokingVertexStateCreateInfoEXT *provoking_vtx_info =
1742 vk_find_struct_const(raster_info->pNext,
1743 PIPELINE_RASTERIZATION_PROVOKING_VERTEX_STATE_CREATE_INFO_EXT);
1744 bool provoking_vtx_last = false;
1745
1746 if (provoking_vtx_info &&
1747 provoking_vtx_info->provokingVertexMode == VK_PROVOKING_VERTEX_MODE_LAST_VERTEX_EXT) {
1748 provoking_vtx_last = true;
1749 }
1750
1751 pipeline->graphics.pa_su_sc_mode_cntl =
1752 S_028814_FACE(raster_info->frontFace) |
1753 S_028814_CULL_FRONT(!!(raster_info->cullMode & VK_CULL_MODE_FRONT_BIT)) |
1754 S_028814_CULL_BACK(!!(raster_info->cullMode & VK_CULL_MODE_BACK_BIT)) |
1755 S_028814_POLY_MODE(raster_info->polygonMode != VK_POLYGON_MODE_FILL) |
1756 S_028814_POLYMODE_FRONT_PTYPE(si_translate_fill(raster_info->polygonMode)) |
1757 S_028814_POLYMODE_BACK_PTYPE(si_translate_fill(raster_info->polygonMode)) |
1758 S_028814_POLY_OFFSET_FRONT_ENABLE(raster_info->depthBiasEnable ? 1 : 0) |
1759 S_028814_POLY_OFFSET_BACK_ENABLE(raster_info->depthBiasEnable ? 1 : 0) |
1760 S_028814_POLY_OFFSET_PARA_ENABLE(raster_info->depthBiasEnable ? 1 : 0) |
1761 S_028814_PROVOKING_VTX_LAST(provoking_vtx_last);
1762
1763 if (pipeline->device->physical_device->rad_info.chip_class >= GFX10) {
1764 /* It should also be set if PERPENDICULAR_ENDCAP_ENA is set. */
1765 pipeline->graphics.pa_su_sc_mode_cntl |=
1766 S_028814_KEEP_TOGETHER_ENABLE(raster_info->polygonMode != VK_POLYGON_MODE_FILL);
1767 }
1768
1769 bool depth_clip_disable = raster_info->depthClampEnable;
1770 const VkPipelineRasterizationDepthClipStateCreateInfoEXT *depth_clip_state =
1771 vk_find_struct_const(raster_info->pNext,
1772 PIPELINE_RASTERIZATION_DEPTH_CLIP_STATE_CREATE_INFO_EXT);
1773 if (depth_clip_state) {
1774 depth_clip_disable = !depth_clip_state->depthClipEnable;
1775 }
1776
1777 pipeline->graphics.pa_cl_clip_cntl =
1778 S_028810_DX_CLIP_SPACE_DEF(1) | // vulkan uses DX conventions.
1779 S_028810_ZCLIP_NEAR_DISABLE(depth_clip_disable ? 1 : 0) |
1780 S_028810_ZCLIP_FAR_DISABLE(depth_clip_disable ? 1 : 0) |
1781 S_028810_DX_RASTERIZATION_KILL(raster_info->rasterizerDiscardEnable ? 1 : 0) |
1782 S_028810_DX_LINEAR_ATTR_CLIP_ENA(1);
1783
1784 pipeline->graphics.uses_conservative_overestimate =
1785 radv_get_conservative_raster_mode(pCreateInfo->pRasterizationState) ==
1786 VK_CONSERVATIVE_RASTERIZATION_MODE_OVERESTIMATE_EXT;
1787 }
1788
1789 static void
1790 radv_pipeline_init_depth_stencil_state(struct radv_pipeline *pipeline,
1791 const VkGraphicsPipelineCreateInfo *pCreateInfo)
1792 {
1793 const VkPipelineDepthStencilStateCreateInfo *ds_info =
1794 radv_pipeline_get_depth_stencil_state(pCreateInfo);
1795 RADV_FROM_HANDLE(radv_render_pass, pass, pCreateInfo->renderPass);
1796 struct radv_subpass *subpass = pass->subpasses + pCreateInfo->subpass;
1797 struct radv_render_pass_attachment *attachment = NULL;
1798 uint32_t db_depth_control = 0;
1799
1800 if (subpass->depth_stencil_attachment)
1801 attachment = pass->attachments + subpass->depth_stencil_attachment->attachment;
1802
1803 bool has_depth_attachment = attachment && vk_format_has_depth(attachment->format);
1804 bool has_stencil_attachment = attachment && vk_format_has_stencil(attachment->format);
1805
1806 if (ds_info) {
1807 if (has_depth_attachment) {
1808 db_depth_control = S_028800_Z_ENABLE(ds_info->depthTestEnable ? 1 : 0) |
1809 S_028800_Z_WRITE_ENABLE(ds_info->depthWriteEnable ? 1 : 0) |
1810 S_028800_ZFUNC(ds_info->depthCompareOp) |
1811 S_028800_DEPTH_BOUNDS_ENABLE(ds_info->depthBoundsTestEnable ? 1 : 0);
1812 }
1813
1814 if (has_stencil_attachment && ds_info->stencilTestEnable) {
1815 db_depth_control |= S_028800_STENCIL_ENABLE(1) | S_028800_BACKFACE_ENABLE(1);
1816 db_depth_control |= S_028800_STENCILFUNC(ds_info->front.compareOp);
1817 db_depth_control |= S_028800_STENCILFUNC_BF(ds_info->back.compareOp);
1818 }
1819 }
1820
1821 pipeline->graphics.db_depth_control = db_depth_control;
1822 }
1823
1824 static void
1825 gfx9_get_gs_info(const struct radv_pipeline_key *key, const struct radv_pipeline *pipeline,
1826 nir_shader **nir, struct radv_shader_info *infos, struct gfx9_gs_info *out)
1827 {
1828 struct radv_shader_info *gs_info = &infos[MESA_SHADER_GEOMETRY];
1829 struct radv_es_output_info *es_info;
1830 bool has_tess = !!nir[MESA_SHADER_TESS_CTRL];
1831 if (pipeline->device->physical_device->rad_info.chip_class >= GFX9)
1832 es_info = has_tess ? &gs_info->tes.es_info : &gs_info->vs.es_info;
1833 else
1834 es_info = has_tess ? &infos[MESA_SHADER_TESS_EVAL].tes.es_info
1835 : &infos[MESA_SHADER_VERTEX].vs.es_info;
1836
1837 unsigned gs_num_invocations = MAX2(gs_info->gs.invocations, 1);
1838 bool uses_adjacency;
1839 switch (key->vs.topology) {
1840 case VK_PRIMITIVE_TOPOLOGY_LINE_LIST_WITH_ADJACENCY:
1841 case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP_WITH_ADJACENCY:
1842 case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST_WITH_ADJACENCY:
1843 case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP_WITH_ADJACENCY:
1844 uses_adjacency = true;
1845 break;
1846 default:
1847 uses_adjacency = false;
1848 break;
1849 }
1850
1851 /* All these are in dwords: */
1852 /* We can't allow using the whole LDS, because GS waves compete with
1853 * other shader stages for LDS space. */
1854 const unsigned max_lds_size = 8 * 1024;
1855 const unsigned esgs_itemsize = es_info->esgs_itemsize / 4;
1856 unsigned esgs_lds_size;
1857
1858 /* All these are per subgroup: */
1859 const unsigned max_out_prims = 32 * 1024;
1860 const unsigned max_es_verts = 255;
1861 const unsigned ideal_gs_prims = 64;
1862 unsigned max_gs_prims, gs_prims;
1863 unsigned min_es_verts, es_verts, worst_case_es_verts;
1864
1865 if (uses_adjacency || gs_num_invocations > 1)
1866 max_gs_prims = 127 / gs_num_invocations;
1867 else
1868 max_gs_prims = 255;
1869
1870 /* MAX_PRIMS_PER_SUBGROUP = gs_prims * max_vert_out * gs_invocations.
1871 * Make sure we don't go over the maximum value.
1872 */
1873 if (gs_info->gs.vertices_out > 0) {
1874 max_gs_prims =
1875 MIN2(max_gs_prims, max_out_prims / (gs_info->gs.vertices_out * gs_num_invocations));
1876 }
1877 assert(max_gs_prims > 0);
1878
1879 /* If the primitive has adjacency, halve the number of vertices
1880 * that will be reused in multiple primitives.
1881 */
1882 min_es_verts = gs_info->gs.vertices_in / (uses_adjacency ? 2 : 1);
1883
1884 gs_prims = MIN2(ideal_gs_prims, max_gs_prims);
1885 worst_case_es_verts = MIN2(min_es_verts * gs_prims, max_es_verts);
1886
1887 /* Compute ESGS LDS size based on the worst case number of ES vertices
1888 * needed to create the target number of GS prims per subgroup.
1889 */
1890 esgs_lds_size = esgs_itemsize * worst_case_es_verts;
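/* Illustrative numbers (not taken from a real pipeline): with an ESGS
 * itemsize of 4 dwords, triangles (min_es_verts = 3) and gs_prims = 64,
 * worst_case_es_verts = MIN2(3 * 64, 255) = 192 and esgs_lds_size =
 * 4 * 192 = 768 dwords, well under the 8K-dword budget, so no rebalancing
 * is needed.
 */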
1891
1892 /* If total LDS usage is too big, refactor partitions based on ratio
1893 * of ESGS item sizes.
1894 */
1895 if (esgs_lds_size > max_lds_size) {
1896 /* Our target GS Prims Per Subgroup was too large. Calculate
1897 * the maximum number of GS Prims Per Subgroup that will fit
1898 * into LDS, capped by the maximum that the hardware can support.
1899 */
1900 gs_prims = MIN2((max_lds_size / (esgs_itemsize * min_es_verts)), max_gs_prims);
1901 assert(gs_prims > 0);
1902 worst_case_es_verts = MIN2(min_es_verts * gs_prims, max_es_verts);
1903
1904 esgs_lds_size = esgs_itemsize * worst_case_es_verts;
1905 assert(esgs_lds_size <= max_lds_size);
1906 }
1907
1908 /* Now calculate remaining ESGS information. */
1909 if (esgs_lds_size)
1910 es_verts = MIN2(esgs_lds_size / esgs_itemsize, max_es_verts);
1911 else
1912 es_verts = max_es_verts;
1913
1914 /* Vertices for adjacency primitives are not always reused, so restore
1915 * it for ES_VERTS_PER_SUBGRP.
1916 */
1917 min_es_verts = gs_info->gs.vertices_in;
1918
1919 /* For normal primitives, the VGT only checks if they are past the ES
1920 * verts per subgroup after allocating a full GS primitive and if they
1921 * are, kick off a new subgroup. But if those additional ES verts are
1922 * unique (e.g. not reused) we need to make sure there is enough LDS
1923 * space to account for those ES verts beyond ES_VERTS_PER_SUBGRP.
1924 */
1925 es_verts -= min_es_verts - 1;
1926
1927 uint32_t es_verts_per_subgroup = es_verts;
1928 uint32_t gs_prims_per_subgroup = gs_prims;
1929 uint32_t gs_inst_prims_in_subgroup = gs_prims * gs_num_invocations;
1930 uint32_t max_prims_per_subgroup = gs_inst_prims_in_subgroup * gs_info->gs.vertices_out;
1931 out->lds_size = align(esgs_lds_size, 128) / 128;
1932 out->vgt_gs_onchip_cntl = S_028A44_ES_VERTS_PER_SUBGRP(es_verts_per_subgroup) |
1933 S_028A44_GS_PRIMS_PER_SUBGRP(gs_prims_per_subgroup) |
1934 S_028A44_GS_INST_PRIMS_IN_SUBGRP(gs_inst_prims_in_subgroup);
1935 out->vgt_gs_max_prims_per_subgroup = S_028A94_MAX_PRIMS_PER_SUBGROUP(max_prims_per_subgroup);
1936 out->vgt_esgs_ring_itemsize = esgs_itemsize;
1937 assert(max_prims_per_subgroup <= max_out_prims);
1938
1939 gl_shader_stage es_stage = has_tess ? MESA_SHADER_TESS_EVAL : MESA_SHADER_VERTEX;
1940 unsigned workgroup_size =
1941 ac_compute_esgs_workgroup_size(
1942 pipeline->device->physical_device->rad_info.chip_class, infos[es_stage].wave_size,
1943 es_verts_per_subgroup, gs_inst_prims_in_subgroup);
1944 infos[es_stage].workgroup_size = workgroup_size;
1945 infos[MESA_SHADER_GEOMETRY].workgroup_size = workgroup_size;
1946 }
1947
1948 static void
1949 clamp_gsprims_to_esverts(unsigned *max_gsprims, unsigned max_esverts, unsigned min_verts_per_prim,
1950 bool use_adjacency)
1951 {
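/* Rationale: the first primitive consumes min_verts_per_prim ES vertices and,
 * with full vertex reuse, each further primitive needs at least one new
 * vertex (two when adjacency halves the reuse), so at most 1 + max_reuse
 * primitives fit into a subgroup of max_esverts vertices.
 */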
1952 unsigned max_reuse = max_esverts - min_verts_per_prim;
1953 if (use_adjacency)
1954 max_reuse /= 2;
1955 *max_gsprims = MIN2(*max_gsprims, 1 + max_reuse);
1956 }
1957
1958 static unsigned
1959 radv_get_num_input_vertices(nir_shader **nir)
1960 {
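/* Vertices per input primitive as seen by the ES/GS stage: taken from the GS
 * if present, otherwise derived from the tessellation domain, otherwise
 * triangles are assumed.
 */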
1961 if (nir[MESA_SHADER_GEOMETRY]) {
1962 nir_shader *gs = nir[MESA_SHADER_GEOMETRY];
1963
1964 return gs->info.gs.vertices_in;
1965 }
1966
1967 if (nir[MESA_SHADER_TESS_CTRL]) {
1968 nir_shader *tes = nir[MESA_SHADER_TESS_EVAL];
1969
1970 if (tes->info.tess.point_mode)
1971 return 1;
1972 if (tes->info.tess.primitive_mode == GL_ISOLINES)
1973 return 2;
1974 return 3;
1975 }
1976
1977 return 3;
1978 }
1979
1980 static void
1981 gfx10_emit_ge_pc_alloc(struct radeon_cmdbuf *cs, enum chip_class chip_class, uint32_t oversub_pc_lines)
1982 {
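/* GE_PC_ALLOC controls parameter-cache oversubscription. When
 * oversub_pc_lines is 0, OVERSUB_EN is 0 and the NUM_PC_LINES field is
 * presumably ignored by the hardware, so the wrap-around from the -1 below
 * is assumed to be harmless.
 */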
1983 radeon_set_uconfig_reg(
1984 cs, R_030980_GE_PC_ALLOC,
1985 S_030980_OVERSUB_EN(oversub_pc_lines > 0) | S_030980_NUM_PC_LINES(oversub_pc_lines - 1));
1986 }
1987
1988 static void
1989 gfx10_get_ngg_info(const struct radv_pipeline_key *key, struct radv_pipeline *pipeline,
1990 nir_shader **nir, struct radv_shader_info *infos, struct gfx10_ngg_info *ngg)
1991 {
1992 struct radv_shader_info *gs_info = &infos[MESA_SHADER_GEOMETRY];
1993 struct radv_es_output_info *es_info =
1994 nir[MESA_SHADER_TESS_CTRL] ? &gs_info->tes.es_info : &gs_info->vs.es_info;
1995 unsigned gs_type = nir[MESA_SHADER_GEOMETRY] ? MESA_SHADER_GEOMETRY : MESA_SHADER_VERTEX;
1996 unsigned max_verts_per_prim = radv_get_num_input_vertices(nir);
1997 unsigned min_verts_per_prim = gs_type == MESA_SHADER_GEOMETRY ? max_verts_per_prim : 1;
1998 unsigned gs_num_invocations = nir[MESA_SHADER_GEOMETRY] ? MAX2(gs_info->gs.invocations, 1) : 1;
1999 bool uses_adjacency;
2000 switch (key->vs.topology) {
2001 case VK_PRIMITIVE_TOPOLOGY_LINE_LIST_WITH_ADJACENCY:
2002 case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP_WITH_ADJACENCY:
2003 case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST_WITH_ADJACENCY:
2004 case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP_WITH_ADJACENCY:
2005 uses_adjacency = true;
2006 break;
2007 default:
2008 uses_adjacency = false;
2009 break;
2010 }
2011
2012 /* All these are in dwords: */
2013 /* We can't allow using the whole LDS, because GS waves compete with
2014 * other shader stages for LDS space.
2015 *
2016 * TODO: We should really take the shader's internal LDS use into
2017 * account. The linker will fail if the size is greater than
2018 * 8K dwords.
2019 */
2020 const unsigned max_lds_size = 8 * 1024 - 768;
2021 const unsigned target_lds_size = max_lds_size;
2022 unsigned esvert_lds_size = 0;
2023 unsigned gsprim_lds_size = 0;
2024
2025 /* All these are per subgroup: */
2026 const unsigned min_esverts =
2027 pipeline->device->physical_device->rad_info.chip_class >= GFX10_3 ? 29 : 24;
2028 bool max_vert_out_per_gs_instance = false;
2029 unsigned max_esverts_base = 128;
2030 unsigned max_gsprims_base = 128; /* default prim group size clamp */
2031
2032 /* Hardware has the following non-natural restrictions on the value
2033 * of GE_CNTL.VERT_GRP_SIZE based on the primitive type of
2034 * the draw:
2035 * - at most 252 for any line input primitive type
2036 * - at most 251 for any quad input primitive type
2037 * - at most 251 for triangle strips with adjacency (this happens to
2038 * be the natural limit for triangle *lists* with adjacency)
2039 */
2040 max_esverts_base = MIN2(max_esverts_base, 251 + max_verts_per_prim - 1);
2041
2042 if (gs_type == MESA_SHADER_GEOMETRY) {
2043 unsigned max_out_verts_per_gsprim = gs_info->gs.vertices_out * gs_num_invocations;
2044
2045 if (max_out_verts_per_gsprim <= 256) {
2046 if (max_out_verts_per_gsprim) {
2047 max_gsprims_base = MIN2(max_gsprims_base, 256 / max_out_verts_per_gsprim);
2048 }
2049 } else {
2050 /* Use special multi-cycling mode in which each GS
2051 * instance gets its own subgroup. Does not work with
2052 * tessellation. */
2053 max_vert_out_per_gs_instance = true;
2054 max_gsprims_base = 1;
2055 max_out_verts_per_gsprim = gs_info->gs.vertices_out;
2056 }
2057
2058 esvert_lds_size = es_info->esgs_itemsize / 4;
2059 gsprim_lds_size = (gs_info->gs.gsvs_vertex_size / 4 + 1) * max_out_verts_per_gsprim;
2060 } else {
2061 /* VS and TES. */
2062 /* LDS size for passing data from GS to ES. */
2063 struct radv_streamout_info *so_info = nir[MESA_SHADER_TESS_CTRL]
2064 ? &infos[MESA_SHADER_TESS_EVAL].so
2065 : &infos[MESA_SHADER_VERTEX].so;
2066
2067 if (so_info->num_outputs)
2068 esvert_lds_size = 4 * so_info->num_outputs + 1;
2069
2070 /* GS stores Primitive IDs (one DWORD) into LDS at the address
2071 * corresponding to the ES thread of the provoking vertex. All
2072 * ES threads load and export PrimitiveID for their thread.
2073 */
2074 if (!nir[MESA_SHADER_TESS_CTRL] && infos[MESA_SHADER_VERTEX].vs.outinfo.export_prim_id)
2075 esvert_lds_size = MAX2(esvert_lds_size, 1);
2076 }
2077
2078 unsigned max_gsprims = max_gsprims_base;
2079 unsigned max_esverts = max_esverts_base;
2080
2081 if (esvert_lds_size)
2082 max_esverts = MIN2(max_esverts, target_lds_size / esvert_lds_size);
2083 if (gsprim_lds_size)
2084 max_gsprims = MIN2(max_gsprims, target_lds_size / gsprim_lds_size);
2085
2086 max_esverts = MIN2(max_esverts, max_gsprims * max_verts_per_prim);
2087 clamp_gsprims_to_esverts(&max_gsprims, max_esverts, min_verts_per_prim, uses_adjacency);
2088 assert(max_esverts >= max_verts_per_prim && max_gsprims >= 1);
2089
2090 if (esvert_lds_size || gsprim_lds_size) {
2091 /* Now that we have a rough proportionality between esverts
2092 * and gsprims based on the primitive type, scale both of them
2093 * down simultaneously based on required LDS space.
2094 *
2095 * We could be smarter about this if we knew how much vertex
2096 * reuse to expect.
2097 */
2098 unsigned lds_total = max_esverts * esvert_lds_size + max_gsprims * gsprim_lds_size;
2099 if (lds_total > target_lds_size) {
2100 max_esverts = max_esverts * target_lds_size / lds_total;
2101 max_gsprims = max_gsprims * target_lds_size / lds_total;
2102
2103 max_esverts = MIN2(max_esverts, max_gsprims * max_verts_per_prim);
2104 clamp_gsprims_to_esverts(&max_gsprims, max_esverts, min_verts_per_prim, uses_adjacency);
2105 assert(max_esverts >= max_verts_per_prim && max_gsprims >= 1);
2106 }
2107 }
2108
2109 /* Round up towards full wave sizes for better ALU utilization. */
2110 if (!max_vert_out_per_gs_instance) {
2111 unsigned orig_max_esverts;
2112 unsigned orig_max_gsprims;
2113 unsigned wavesize;
2114
2115 if (gs_type == MESA_SHADER_GEOMETRY) {
2116 wavesize = gs_info->wave_size;
2117 } else {
2118 wavesize = nir[MESA_SHADER_TESS_CTRL] ? infos[MESA_SHADER_TESS_EVAL].wave_size
2119 : infos[MESA_SHADER_VERTEX].wave_size;
2120 }
2121
2122 do {
2123 orig_max_esverts = max_esverts;
2124 orig_max_gsprims = max_gsprims;
2125
2126 max_esverts = align(max_esverts, wavesize);
2127 max_esverts = MIN2(max_esverts, max_esverts_base);
2128 if (esvert_lds_size)
2129 max_esverts =
2130 MIN2(max_esverts, (max_lds_size - max_gsprims * gsprim_lds_size) / esvert_lds_size);
2131 max_esverts = MIN2(max_esverts, max_gsprims * max_verts_per_prim);
2132
2133 /* Hardware restriction: minimum value of max_esverts */
2134 if (pipeline->device->physical_device->rad_info.chip_class == GFX10)
2135 max_esverts = MAX2(max_esverts, min_esverts - 1 + max_verts_per_prim);
2136 else
2137 max_esverts = MAX2(max_esverts, min_esverts);
2138
2139 max_gsprims = align(max_gsprims, wavesize);
2140 max_gsprims = MIN2(max_gsprims, max_gsprims_base);
2141 if (gsprim_lds_size) {
2142 /* Don't count unusable vertices toward the LDS
2143 * size. Those are vertices above the maximum
2144 * number of vertices that can occur in the
2145 * workgroup, which is e.g. max_gsprims * 3
2146 * for triangles.
2147 */
2148 unsigned usable_esverts = MIN2(max_esverts, max_gsprims * max_verts_per_prim);
2149 max_gsprims = MIN2(max_gsprims,
2150 (max_lds_size - usable_esverts * esvert_lds_size) / gsprim_lds_size);
2151 }
2152 clamp_gsprims_to_esverts(&max_gsprims, max_esverts, min_verts_per_prim, uses_adjacency);
2153 assert(max_esverts >= max_verts_per_prim && max_gsprims >= 1);
2154 } while (orig_max_esverts != max_esverts || orig_max_gsprims != max_gsprims);
2155
2156 /* Verify the restriction. */
2157 if (pipeline->device->physical_device->rad_info.chip_class == GFX10)
2158 assert(max_esverts >= min_esverts - 1 + max_verts_per_prim);
2159 else
2160 assert(max_esverts >= min_esverts);
2161 } else {
2162 /* Hardware restriction: minimum value of max_esverts */
2163 if (pipeline->device->physical_device->rad_info.chip_class == GFX10)
2164 max_esverts = MAX2(max_esverts, min_esverts - 1 + max_verts_per_prim);
2165 else
2166 max_esverts = MAX2(max_esverts, min_esverts);
2167 }
2168
2169 unsigned max_out_vertices = max_vert_out_per_gs_instance ? gs_info->gs.vertices_out
2170 : gs_type == MESA_SHADER_GEOMETRY
2171 ? max_gsprims * gs_num_invocations * gs_info->gs.vertices_out
2172 : max_esverts;
2173 assert(max_out_vertices <= 256);
2174
2175 unsigned prim_amp_factor = 1;
2176 if (gs_type == MESA_SHADER_GEOMETRY) {
2177 /* Number of output primitives per GS input primitive after
2178 * GS instancing. */
2179 prim_amp_factor = gs_info->gs.vertices_out;
2180 }
2181
2182 /* On Gfx10, the GE only checks against the maximum number of ES verts
2183 * after allocating a full GS primitive. So we need to ensure that
2184 * whenever this check passes, there is enough space for a full
2185 * primitive without vertex reuse.
2186 */
2187 if (pipeline->device->physical_device->rad_info.chip_class == GFX10)
2188 ngg->hw_max_esverts = max_esverts - max_verts_per_prim + 1;
2189 else
2190 ngg->hw_max_esverts = max_esverts;
2191
2192 ngg->max_gsprims = max_gsprims;
2193 ngg->max_out_verts = max_out_vertices;
2194 ngg->prim_amp_factor = prim_amp_factor;
2195 ngg->max_vert_out_per_gs_instance = max_vert_out_per_gs_instance;
2196 ngg->ngg_emit_size = max_gsprims * gsprim_lds_size;
2197 ngg->enable_vertex_grouping = true;
2198
2199 /* Don't count unusable vertices. */
2200 ngg->esgs_ring_size = MIN2(max_esverts, max_gsprims * max_verts_per_prim) * esvert_lds_size * 4;
2201
2202 if (gs_type == MESA_SHADER_GEOMETRY) {
2203 ngg->vgt_esgs_ring_itemsize = es_info->esgs_itemsize / 4;
2204 } else {
2205 ngg->vgt_esgs_ring_itemsize = 1;
2206 }
2207
2208 assert(ngg->hw_max_esverts >= min_esverts); /* HW limitation */
2209
2210 gl_shader_stage es_stage = nir[MESA_SHADER_TESS_CTRL] ? MESA_SHADER_TESS_EVAL : MESA_SHADER_VERTEX;
2211 unsigned workgroup_size =
2212 ac_compute_ngg_workgroup_size(
2213 max_esverts, max_gsprims * gs_num_invocations, max_out_vertices, prim_amp_factor);
2214 infos[MESA_SHADER_GEOMETRY].workgroup_size = workgroup_size;
2215 infos[es_stage].workgroup_size = workgroup_size;
2216 }
2217
2218 static void
2219 radv_pipeline_init_gs_ring_state(struct radv_pipeline *pipeline, const struct gfx9_gs_info *gs)
2220 {
2221 struct radv_device *device = pipeline->device;
2222 unsigned num_se = device->physical_device->rad_info.max_se;
2223 unsigned wave_size = 64;
2224 unsigned max_gs_waves = 32 * num_se; /* max 32 per SE on GCN */
2225 /* On GFX6-GFX7, the value comes from VGT_GS_VERTEX_REUSE = 16.
2226 * On GFX8+, the value comes from VGT_VERTEX_REUSE_BLOCK_CNTL = 30 (+2).
2227 */
2228 unsigned gs_vertex_reuse =
2229 (device->physical_device->rad_info.chip_class >= GFX8 ? 32 : 16) * num_se;
2230 unsigned alignment = 256 * num_se;
2231 /* The maximum size is 63.999 MB per SE. */
2232 unsigned max_size = ((unsigned)(63.999 * 1024 * 1024) & ~255) * num_se;
2233 struct radv_shader_info *gs_info = &pipeline->shaders[MESA_SHADER_GEOMETRY]->info;
2234
2235 /* Calculate the minimum size. */
2236 unsigned min_esgs_ring_size =
2237 align(gs->vgt_esgs_ring_itemsize * 4 * gs_vertex_reuse * wave_size, alignment);
2238 /* These are recommended sizes, not minimum sizes. */
2239 unsigned esgs_ring_size =
2240 max_gs_waves * 2 * wave_size * gs->vgt_esgs_ring_itemsize * 4 * gs_info->gs.vertices_in;
2241 unsigned gsvs_ring_size = max_gs_waves * 2 * wave_size * gs_info->gs.max_gsvs_emit_size;
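/* Rough illustration (made-up values): with num_se = 4 (max_gs_waves = 128),
 * wave_size = 64, an ESGS itemsize of 4 dwords (16 bytes) and 3 vertices in,
 * esgs_ring_size is 128 * 2 * 64 * 16 * 3 = 768 KiB before alignment and
 * clamping.
 */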
2242
2243 min_esgs_ring_size = align(min_esgs_ring_size, alignment);
2244 esgs_ring_size = align(esgs_ring_size, alignment);
2245 gsvs_ring_size = align(gsvs_ring_size, alignment);
2246
2247 if (pipeline->device->physical_device->rad_info.chip_class <= GFX8)
2248 pipeline->graphics.esgs_ring_size = CLAMP(esgs_ring_size, min_esgs_ring_size, max_size);
2249
2250 pipeline->graphics.gsvs_ring_size = MIN2(gsvs_ring_size, max_size);
2251 }
2252
2253 struct radv_shader_variant *
2254 radv_get_shader(const struct radv_pipeline *pipeline, gl_shader_stage stage)
2255 {
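/* On GFX9+ the VS can be merged into the TCS or GS, and the TES into the GS,
 * so fall back to the shader that absorbed the requested stage when that
 * stage has no standalone binary.
 */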
2256 if (stage == MESA_SHADER_VERTEX) {
2257 if (pipeline->shaders[MESA_SHADER_VERTEX])
2258 return pipeline->shaders[MESA_SHADER_VERTEX];
2259 if (pipeline->shaders[MESA_SHADER_TESS_CTRL])
2260 return pipeline->shaders[MESA_SHADER_TESS_CTRL];
2261 if (pipeline->shaders[MESA_SHADER_GEOMETRY])
2262 return pipeline->shaders[MESA_SHADER_GEOMETRY];
2263 } else if (stage == MESA_SHADER_TESS_EVAL) {
2264 if (!radv_pipeline_has_tess(pipeline))
2265 return NULL;
2266 if (pipeline->shaders[MESA_SHADER_TESS_EVAL])
2267 return pipeline->shaders[MESA_SHADER_TESS_EVAL];
2268 if (pipeline->shaders[MESA_SHADER_GEOMETRY])
2269 return pipeline->shaders[MESA_SHADER_GEOMETRY];
2270 }
2271 return pipeline->shaders[stage];
2272 }
2273
2274 static const struct radv_vs_output_info *
2275 get_vs_output_info(const struct radv_pipeline *pipeline)
2276 {
2277 if (radv_pipeline_has_gs(pipeline))
2278 if (radv_pipeline_has_ngg(pipeline))
2279 return &pipeline->shaders[MESA_SHADER_GEOMETRY]->info.vs.outinfo;
2280 else
2281 return &pipeline->gs_copy_shader->info.vs.outinfo;
2282 else if (radv_pipeline_has_tess(pipeline))
2283 return &pipeline->shaders[MESA_SHADER_TESS_EVAL]->info.tes.outinfo;
2284 else
2285 return &pipeline->shaders[MESA_SHADER_VERTEX]->info.vs.outinfo;
2286 }
2287
2288 static bool
2289 radv_nir_stage_uses_xfb(const nir_shader *nir)
2290 {
2291 nir_xfb_info *xfb = nir_gather_xfb_info(nir, NULL);
2292 bool uses_xfb = !!xfb;
2293
2294 ralloc_free(xfb);
2295 return uses_xfb;
2296 }
2297
2298 static void
2299 radv_link_shaders(struct radv_pipeline *pipeline,
2300 const struct radv_pipeline_key *pipeline_key,
2301 nir_shader **shaders,
2302 bool optimize_conservatively)
2303 {
2304 nir_shader *ordered_shaders[MESA_SHADER_STAGES];
2305 int shader_count = 0;
2306
2307 if (shaders[MESA_SHADER_FRAGMENT]) {
2308 ordered_shaders[shader_count++] = shaders[MESA_SHADER_FRAGMENT];
2309 }
2310 if (shaders[MESA_SHADER_GEOMETRY]) {
2311 ordered_shaders[shader_count++] = shaders[MESA_SHADER_GEOMETRY];
2312 }
2313 if (shaders[MESA_SHADER_TESS_EVAL]) {
2314 ordered_shaders[shader_count++] = shaders[MESA_SHADER_TESS_EVAL];
2315 }
2316 if (shaders[MESA_SHADER_TESS_CTRL]) {
2317 ordered_shaders[shader_count++] = shaders[MESA_SHADER_TESS_CTRL];
2318 }
2319 if (shaders[MESA_SHADER_VERTEX]) {
2320 ordered_shaders[shader_count++] = shaders[MESA_SHADER_VERTEX];
2321 }
2322 if (shaders[MESA_SHADER_COMPUTE]) {
2323 ordered_shaders[shader_count++] = shaders[MESA_SHADER_COMPUTE];
2324 }
2325
2326 bool has_geom_tess = shaders[MESA_SHADER_GEOMETRY] || shaders[MESA_SHADER_TESS_CTRL];
2327 bool merged_gs = shaders[MESA_SHADER_GEOMETRY] &&
2328 pipeline->device->physical_device->rad_info.chip_class >= GFX9;
2329
2330 if (!optimize_conservatively && shader_count > 1) {
2331 unsigned first = ordered_shaders[shader_count - 1]->info.stage;
2332 unsigned last = ordered_shaders[0]->info.stage;
2333
2334 if (ordered_shaders[0]->info.stage == MESA_SHADER_FRAGMENT &&
2335 ordered_shaders[1]->info.has_transform_feedback_varyings)
2336 nir_link_xfb_varyings(ordered_shaders[1], ordered_shaders[0]);
2337
2338 for (int i = 1; i < shader_count; ++i) {
2339 nir_lower_io_arrays_to_elements(ordered_shaders[i], ordered_shaders[i - 1]);
2340 }
2341
2342 for (int i = 0; i < shader_count; ++i) {
2343 nir_variable_mode mask = 0;
2344
2345 if (ordered_shaders[i]->info.stage != first)
2346 mask = mask | nir_var_shader_in;
2347
2348 if (ordered_shaders[i]->info.stage != last)
2349 mask = mask | nir_var_shader_out;
2350
2351 if (nir_lower_io_to_scalar_early(ordered_shaders[i], mask)) {
2352 /* Optimize the new vector code and then remove dead vars */
2353 nir_copy_prop(ordered_shaders[i]);
2354 nir_opt_shrink_vectors(ordered_shaders[i],
2355 !pipeline->device->instance->disable_shrink_image_store);
2356
2357 if (ordered_shaders[i]->info.stage != last) {
2358 /* Optimize swizzled movs of load_const for
2359 * nir_link_opt_varyings's constant propagation
2360 */
2361 nir_opt_constant_folding(ordered_shaders[i]);
2362 /* For nir_link_opt_varyings's duplicate input opt */
2363 nir_opt_cse(ordered_shaders[i]);
2364 }
2365
2366 /* Run copy-propagation to help remove dead
2367 * output variables (some shaders have useless
2368 * copies to/from an output), so compaction
2369 * later will be more effective.
2370 *
2371 * This will have been done earlier but it might
2372 * not have worked because the outputs were vector.
2373 */
2374 if (ordered_shaders[i]->info.stage == MESA_SHADER_TESS_CTRL)
2375 nir_opt_copy_prop_vars(ordered_shaders[i]);
2376
2377 nir_opt_dce(ordered_shaders[i]);
2378 nir_remove_dead_variables(
2379 ordered_shaders[i], nir_var_function_temp | nir_var_shader_in | nir_var_shader_out,
2380 NULL);
2381 }
2382 }
2383 }
2384
2385 bool uses_xfb = pipeline->graphics.last_vgt_api_stage != MESA_SHADER_NONE &&
2386 radv_nir_stage_uses_xfb(shaders[pipeline->graphics.last_vgt_api_stage]);
2387 if (!uses_xfb && !optimize_conservatively) {
2388 /* Remove PSIZ from shaders when it's not needed.
2389 * This is typically produced by translation layers like Zink or D9VK.
2390 */
2391 for (unsigned i = 0; i < shader_count; ++i) {
2392 shader_info *info = &ordered_shaders[i]->info;
2393 if (!(info->outputs_written & VARYING_BIT_PSIZ))
2394 continue;
2395
2396 bool next_stage_needs_psiz =
2397 i != 0 && /* ordered_shaders is backwards, so next stage is: i - 1 */
2398 ordered_shaders[i - 1]->info.inputs_read & VARYING_BIT_PSIZ;
2399 bool topology_uses_psiz =
2400 info->stage == pipeline->graphics.last_vgt_api_stage &&
2401 ((info->stage == MESA_SHADER_VERTEX && pipeline_key->vs.topology == VK_PRIMITIVE_TOPOLOGY_POINT_LIST) ||
2402 (info->stage == MESA_SHADER_TESS_EVAL && info->tess.point_mode) ||
2403 (info->stage == MESA_SHADER_GEOMETRY && info->gs.output_primitive == GL_POINTS));
2404
2405 nir_variable *psiz_var =
2406 nir_find_variable_with_location(ordered_shaders[i], nir_var_shader_out, VARYING_SLOT_PSIZ);
2407
2408 if (!next_stage_needs_psiz && !topology_uses_psiz && psiz_var) {
2409 /* Change PSIZ to a global variable which allows it to be DCE'd. */
2410 psiz_var->data.location = 0;
2411 psiz_var->data.mode = nir_var_shader_temp;
2412
2413 info->outputs_written &= ~VARYING_BIT_PSIZ;
2414 nir_fixup_deref_modes(ordered_shaders[i]);
2415 nir_remove_dead_variables(ordered_shaders[i], nir_var_shader_temp, NULL);
2416 nir_opt_dce(ordered_shaders[i]);
2417 }
2418 }
2419 }
2420
2421 for (int i = 1; !optimize_conservatively && (i < shader_count); ++i) {
2422 if (nir_link_opt_varyings(ordered_shaders[i], ordered_shaders[i - 1])) {
2423 nir_opt_constant_folding(ordered_shaders[i - 1]);
2424 nir_opt_algebraic(ordered_shaders[i - 1]);
2425 nir_opt_dce(ordered_shaders[i - 1]);
2426 }
2427
2428 nir_remove_dead_variables(ordered_shaders[i], nir_var_shader_out, NULL);
2429 nir_remove_dead_variables(ordered_shaders[i - 1], nir_var_shader_in, NULL);
2430
2431 bool progress = nir_remove_unused_varyings(ordered_shaders[i], ordered_shaders[i - 1]);
2432
2433 nir_compact_varyings(ordered_shaders[i], ordered_shaders[i - 1], true);
2434
2435 if (ordered_shaders[i]->info.stage == MESA_SHADER_TESS_CTRL ||
2436 (ordered_shaders[i]->info.stage == MESA_SHADER_VERTEX && has_geom_tess) ||
2437 (ordered_shaders[i]->info.stage == MESA_SHADER_TESS_EVAL && merged_gs)) {
2438 nir_lower_io_to_vector(ordered_shaders[i], nir_var_shader_out);
2439 if (ordered_shaders[i]->info.stage == MESA_SHADER_TESS_CTRL)
2440 nir_vectorize_tess_levels(ordered_shaders[i]);
2441 nir_opt_combine_stores(ordered_shaders[i], nir_var_shader_out);
2442 }
2443 if (ordered_shaders[i - 1]->info.stage == MESA_SHADER_GEOMETRY ||
2444 ordered_shaders[i - 1]->info.stage == MESA_SHADER_TESS_CTRL ||
2445 ordered_shaders[i - 1]->info.stage == MESA_SHADER_TESS_EVAL) {
2446 nir_lower_io_to_vector(ordered_shaders[i - 1], nir_var_shader_in);
2447 }
2448
2449 if (progress) {
2450 if (nir_lower_global_vars_to_local(ordered_shaders[i])) {
2451 ac_nir_lower_indirect_derefs(ordered_shaders[i],
2452 pipeline->device->physical_device->rad_info.chip_class);
2453 /* remove dead writes, which can remove input loads */
2454 nir_lower_vars_to_ssa(ordered_shaders[i]);
2455 nir_opt_dce(ordered_shaders[i]);
2456 }
2457
2458 if (nir_lower_global_vars_to_local(ordered_shaders[i - 1])) {
2459 ac_nir_lower_indirect_derefs(ordered_shaders[i - 1],
2460 pipeline->device->physical_device->rad_info.chip_class);
2461 }
2462 }
2463 }
2464 }
2465
2466 static void
2467 radv_set_driver_locations(struct radv_pipeline *pipeline, nir_shader **shaders,
2468 struct radv_shader_info infos[MESA_SHADER_STAGES])
2469 {
2470 if (shaders[MESA_SHADER_FRAGMENT]) {
2471 nir_foreach_shader_out_variable(var, shaders[MESA_SHADER_FRAGMENT])
2472 {
2473 var->data.driver_location = var->data.location + var->data.index;
2474 }
2475 }
2476
2477 if (!shaders[MESA_SHADER_VERTEX])
2478 return;
2479
2480 bool has_tess = shaders[MESA_SHADER_TESS_CTRL];
2481 bool has_gs = shaders[MESA_SHADER_GEOMETRY];
2482
2483 /* Merged stage for VS and TES */
2484 unsigned vs_info_idx = MESA_SHADER_VERTEX;
2485 unsigned tes_info_idx = MESA_SHADER_TESS_EVAL;
2486
2487 if (pipeline->device->physical_device->rad_info.chip_class >= GFX9) {
2488 /* These are merged into the next stage */
2489 vs_info_idx = has_tess ? MESA_SHADER_TESS_CTRL : MESA_SHADER_GEOMETRY;
2490 tes_info_idx = has_gs ? MESA_SHADER_GEOMETRY : MESA_SHADER_TESS_EVAL;
2491 }
2492
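/* Vertex shader inputs keep their API attribute location as the driver
 * location; the per-location tables in the pipeline key (formats, offsets,
 * strides) are indexed the same way.
 */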
2493 nir_foreach_shader_in_variable (var, shaders[MESA_SHADER_VERTEX]) {
2494 var->data.driver_location = var->data.location;
2495 }
2496
2497 if (has_tess) {
2498 nir_linked_io_var_info vs2tcs = nir_assign_linked_io_var_locations(
2499 shaders[MESA_SHADER_VERTEX], shaders[MESA_SHADER_TESS_CTRL]);
2500 nir_linked_io_var_info tcs2tes = nir_assign_linked_io_var_locations(
2501 shaders[MESA_SHADER_TESS_CTRL], shaders[MESA_SHADER_TESS_EVAL]);
2502
2503 infos[MESA_SHADER_VERTEX].vs.num_linked_outputs = vs2tcs.num_linked_io_vars;
2504 infos[MESA_SHADER_TESS_CTRL].tcs.num_linked_inputs = vs2tcs.num_linked_io_vars;
2505 infos[MESA_SHADER_TESS_CTRL].tcs.num_linked_outputs = tcs2tes.num_linked_io_vars;
2506 infos[MESA_SHADER_TESS_CTRL].tcs.num_linked_patch_outputs = tcs2tes.num_linked_patch_io_vars;
2507 infos[MESA_SHADER_TESS_EVAL].tes.num_linked_inputs = tcs2tes.num_linked_io_vars;
2508 infos[MESA_SHADER_TESS_EVAL].tes.num_linked_patch_inputs = tcs2tes.num_linked_patch_io_vars;
2509
2510 /* Copy data to merged stage */
2511 infos[vs_info_idx].vs.num_linked_outputs = vs2tcs.num_linked_io_vars;
2512 infos[tes_info_idx].tes.num_linked_inputs = tcs2tes.num_linked_io_vars;
2513 infos[tes_info_idx].tes.num_linked_patch_inputs = tcs2tes.num_linked_patch_io_vars;
2514
2515 if (has_gs) {
2516 nir_linked_io_var_info tes2gs = nir_assign_linked_io_var_locations(
2517 shaders[MESA_SHADER_TESS_EVAL], shaders[MESA_SHADER_GEOMETRY]);
2518
2519 infos[MESA_SHADER_TESS_EVAL].tes.num_linked_outputs = tes2gs.num_linked_io_vars;
2520 infos[MESA_SHADER_GEOMETRY].gs.num_linked_inputs = tes2gs.num_linked_io_vars;
2521
2522 /* Copy data to merged stage */
2523 infos[tes_info_idx].tes.num_linked_outputs = tes2gs.num_linked_io_vars;
2524 }
2525 } else if (has_gs) {
2526 nir_linked_io_var_info vs2gs = nir_assign_linked_io_var_locations(
2527 shaders[MESA_SHADER_VERTEX], shaders[MESA_SHADER_GEOMETRY]);
2528
2529 infos[MESA_SHADER_VERTEX].vs.num_linked_outputs = vs2gs.num_linked_io_vars;
2530 infos[MESA_SHADER_GEOMETRY].gs.num_linked_inputs = vs2gs.num_linked_io_vars;
2531
2532 /* Copy data to merged stage */
2533 infos[vs_info_idx].vs.num_linked_outputs = vs2gs.num_linked_io_vars;
2534 }
2535
2536 assert(pipeline->graphics.last_vgt_api_stage != MESA_SHADER_NONE);
2537 nir_foreach_shader_out_variable(var, shaders[pipeline->graphics.last_vgt_api_stage])
2538 {
2539 var->data.driver_location = var->data.location;
2540 }
2541 }
2542
2543 static uint32_t
2544 radv_get_attrib_stride(const VkPipelineVertexInputStateCreateInfo *input_state,
2545 uint32_t attrib_binding)
2546 {
2547 for (uint32_t i = 0; i < input_state->vertexBindingDescriptionCount; i++) {
2548 const VkVertexInputBindingDescription *input_binding =
2549 &input_state->pVertexBindingDescriptions[i];
2550
2551 if (input_binding->binding == attrib_binding)
2552 return input_binding->stride;
2553 }
2554
2555 return 0;
2556 }
2557
2558 static struct radv_pipeline_key
2559 radv_generate_graphics_pipeline_key(const struct radv_pipeline *pipeline,
2560 const VkGraphicsPipelineCreateInfo *pCreateInfo,
2561 const struct radv_blend_state *blend)
2562 {
2563 RADV_FROM_HANDLE(radv_render_pass, pass, pCreateInfo->renderPass);
2564 struct radv_subpass *subpass = pass->subpasses + pCreateInfo->subpass;
2565 bool uses_dynamic_stride = false;
2566
2567 struct radv_pipeline_key key;
2568 memset(&key, 0, sizeof(key));
2569
2570 if (pCreateInfo->flags & VK_PIPELINE_CREATE_DISABLE_OPTIMIZATION_BIT)
2571 key.optimisations_disabled = 1;
2572
2573 key.has_multiview_view_index = !!subpass->view_mask;
2574
2575 if (pCreateInfo->pDynamicState) {
2576 uint32_t count = pCreateInfo->pDynamicState->dynamicStateCount;
2577 for (uint32_t i = 0; i < count; i++) {
2578 if (pCreateInfo->pDynamicState->pDynamicStates[i] == VK_DYNAMIC_STATE_VERTEX_INPUT_EXT) {
2579 key.vs.dynamic_input_state = true;
2580 /* we don't care about uses_dynamic_stride in this case */
2581 break;
2582 } else if (pCreateInfo->pDynamicState->pDynamicStates[i] ==
2583 VK_DYNAMIC_STATE_VERTEX_INPUT_BINDING_STRIDE_EXT) {
2584 uses_dynamic_stride = true;
2585 }
2586 }
2587 }
2588
2589 if (!key.vs.dynamic_input_state) {
2590 const VkPipelineVertexInputStateCreateInfo *input_state = pCreateInfo->pVertexInputState;
2591 const VkPipelineVertexInputDivisorStateCreateInfoEXT *divisor_state = vk_find_struct_const(
2592 input_state->pNext, PIPELINE_VERTEX_INPUT_DIVISOR_STATE_CREATE_INFO_EXT);
2593
2594 uint32_t binding_input_rate = 0;
2595 uint32_t instance_rate_divisors[MAX_VERTEX_ATTRIBS];
2596 for (unsigned i = 0; i < input_state->vertexBindingDescriptionCount; ++i) {
2597 if (input_state->pVertexBindingDescriptions[i].inputRate) {
2598 unsigned binding = input_state->pVertexBindingDescriptions[i].binding;
2599 binding_input_rate |= 1u << binding;
2600 instance_rate_divisors[binding] = 1;
2601 }
2602 }
2603 if (divisor_state) {
2604 for (unsigned i = 0; i < divisor_state->vertexBindingDivisorCount; ++i) {
2605 instance_rate_divisors[divisor_state->pVertexBindingDivisors[i].binding] =
2606 divisor_state->pVertexBindingDivisors[i].divisor;
2607 }
2608 }
2609
2610 for (unsigned i = 0; i < input_state->vertexAttributeDescriptionCount; ++i) {
2611 const VkVertexInputAttributeDescription *desc =
2612 &input_state->pVertexAttributeDescriptions[i];
2613 const struct util_format_description *format_desc;
2614 unsigned location = desc->location;
2615 unsigned binding = desc->binding;
2616 unsigned num_format, data_format;
2617 bool post_shuffle;
2618
2619 if (binding_input_rate & (1u << binding)) {
2620 key.vs.instance_rate_inputs |= 1u << location;
2621 key.vs.instance_rate_divisors[location] = instance_rate_divisors[binding];
2622 }
2623
2624 format_desc = vk_format_description(desc->format);
2625 radv_translate_vertex_format(pipeline->device->physical_device, desc->format, format_desc,
2626 &data_format, &num_format, &post_shuffle,
2627 &key.vs.vertex_alpha_adjust[location]);
2628
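/* Pack the hardware data format in the low 4 bits and the numeric format
 * above it; consumers of the key are expected to unpack it the same way.
 */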
2629 key.vs.vertex_attribute_formats[location] = data_format | (num_format << 4);
2630 key.vs.vertex_attribute_bindings[location] = desc->binding;
2631 key.vs.vertex_attribute_offsets[location] = desc->offset;
2632
2633 const struct ac_data_format_info *dfmt_info = ac_get_data_format_info(data_format);
2634 unsigned attrib_align =
2635 dfmt_info->chan_byte_size ? dfmt_info->chan_byte_size : dfmt_info->element_size;
2636
2637 /* If desc->offset is misaligned, then the buffer offset must be too. Just
2638 * skip updating vertex_binding_align in this case.
2639 */
2640 if (desc->offset % attrib_align == 0)
2641 key.vs.vertex_binding_align[desc->binding] =
2642 MAX2(key.vs.vertex_binding_align[desc->binding], attrib_align);
2643
2644 if (!uses_dynamic_stride) {
2645 /* From the Vulkan spec 1.2.157:
2646 *
2647 * "If the bound pipeline state object was created
2648 * with the
2649 * VK_DYNAMIC_STATE_VERTEX_INPUT_BINDING_STRIDE_EXT
2650 * dynamic state enabled then pStrides[i] specifies
2651 * the distance in bytes between two consecutive
2652 * elements within the corresponding buffer. In this
2653 * case the VkVertexInputBindingDescription::stride
2654 * state from the pipeline state object is ignored."
2655 *
2656 * Make sure the vertex attribute stride is zero to
2657 * avoid computing a wrong offset if it's initialized
2658 * to something other than zero.
2659 */
2660 key.vs.vertex_attribute_strides[location] =
2661 radv_get_attrib_stride(input_state, desc->binding);
2662 }
2663
2664 if (post_shuffle)
2665 key.vs.vertex_post_shuffle |= 1 << location;
2666 }
2667 }
2668
2669 const VkPipelineTessellationStateCreateInfo *tess =
2670 radv_pipeline_get_tessellation_state(pCreateInfo);
2671 if (tess)
2672 key.tcs.tess_input_vertices = tess->patchControlPoints;
2673
2674 const VkPipelineMultisampleStateCreateInfo *vkms =
2675 radv_pipeline_get_multisample_state(pCreateInfo);
2676 if (vkms && vkms->rasterizationSamples > 1) {
2677 uint32_t num_samples = vkms->rasterizationSamples;
2678 uint32_t ps_iter_samples = radv_pipeline_get_ps_iter_samples(pCreateInfo);
2679 key.ps.num_samples = num_samples;
2680 key.ps.log2_ps_iter_samples = util_logbase2(ps_iter_samples);
2681 }
2682
2683 key.ps.col_format = blend->spi_shader_col_format;
2684 if (pipeline->device->physical_device->rad_info.chip_class < GFX8) {
2685 key.ps.is_int8 = blend->col_format_is_int8;
2686 key.ps.is_int10 = blend->col_format_is_int10;
2687 }
2688
2689 if (pipeline->device->physical_device->rad_info.chip_class >= GFX10) {
2690 key.vs.topology = pCreateInfo->pInputAssemblyState->topology;
2691
2692 const VkPipelineRasterizationStateCreateInfo *raster_info = pCreateInfo->pRasterizationState;
2693 const VkPipelineRasterizationProvokingVertexStateCreateInfoEXT *provoking_vtx_info =
2694 vk_find_struct_const(raster_info->pNext,
2695 PIPELINE_RASTERIZATION_PROVOKING_VERTEX_STATE_CREATE_INFO_EXT);
2696 if (provoking_vtx_info &&
2697 provoking_vtx_info->provokingVertexMode == VK_PROVOKING_VERTEX_MODE_LAST_VERTEX_EXT) {
2698 key.vs.provoking_vtx_last = true;
2699 }
2700 }
2701
2702 if (pipeline->device->instance->debug_flags & RADV_DEBUG_DISCARD_TO_DEMOTE)
2703 key.ps.lower_discard_to_demote = true;
2704
2705 if (pipeline->device->instance->enable_mrt_output_nan_fixup)
2706 key.ps.enable_mrt_output_nan_fixup = true;
2707
2708 key.ps.force_vrs = pipeline->device->force_vrs;
2709
2710 if (pipeline->device->instance->debug_flags & RADV_DEBUG_INVARIANT_GEOM)
2711 key.invariant_geom = true;
2712
2713 key.use_ngg = pipeline->device->physical_device->use_ngg;
2714
2715 return key;
2716 }
2717
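     /* Pick the wave size for a stage: legacy (non-NGG) GS always runs in wave64,
      * compute uses the subgroup size selected in radv_fill_shader_info(), and the
      * remaining stages use the per-device PS/GE wave size settings.
      */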
2718 static uint8_t
2719 radv_get_wave_size(struct radv_device *device, const VkPipelineShaderStageCreateInfo *pStage,
2720 gl_shader_stage stage, const struct radv_shader_info *info)
2721 {
2722 if (stage == MESA_SHADER_GEOMETRY && !info->is_ngg)
2723 return 64;
2724 else if (stage == MESA_SHADER_COMPUTE) {
2725 return info->cs.subgroup_size;
2726 } else if (stage == MESA_SHADER_FRAGMENT)
2727 return device->physical_device->ps_wave_size;
2728 else
2729 return device->physical_device->ge_wave_size;
2730 }
2731
2732 static uint8_t
2733 radv_get_ballot_bit_size(struct radv_device *device, const VkPipelineShaderStageCreateInfo *pStage,
2734 gl_shader_stage stage, const struct radv_shader_info *info)
2735 {
2736 if (stage == MESA_SHADER_COMPUTE && info->cs.subgroup_size)
2737 return info->cs.subgroup_size;
2738 return 64;
2739 }
2740
2741 static void
2742 radv_determine_ngg_settings(struct radv_pipeline *pipeline,
2743 const struct radv_pipeline_key *pipeline_key,
2744 struct radv_shader_info *infos, nir_shader **nir)
2745 {
2746 struct radv_device *device = pipeline->device;
2747
2748 if (!nir[MESA_SHADER_GEOMETRY] && pipeline->graphics.last_vgt_api_stage != MESA_SHADER_NONE) {
2749 uint64_t ps_inputs_read =
2750 nir[MESA_SHADER_FRAGMENT] ? nir[MESA_SHADER_FRAGMENT]->info.inputs_read : 0;
2751 gl_shader_stage es_stage = pipeline->graphics.last_vgt_api_stage;
2752
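         /* si_conv_prim_to_gs_out() returns the output primitive type
          * (points = 0, lines = 1, triangles = 2), so adding 1 gives the number
          * of vertices per primitive; tessellation overrides this below.
          */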
2753 unsigned num_vertices_per_prim = si_conv_prim_to_gs_out(pipeline_key->vs.topology) + 1;
2754 if (es_stage == MESA_SHADER_TESS_EVAL)
2755 num_vertices_per_prim = nir[es_stage]->info.tess.point_mode ? 1
2756 : nir[es_stage]->info.tess.primitive_mode == GL_ISOLINES ? 2
2757 : 3;
2758
2759 infos[es_stage].has_ngg_culling = radv_consider_culling(
2760 device, nir[es_stage], ps_inputs_read, num_vertices_per_prim, &infos[es_stage]);
2761
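         /* Only use early primitive export when the entry point consists of a
          * single block (ie. no control flow).
          */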
2762 nir_function_impl *impl = nir_shader_get_entrypoint(nir[es_stage]);
2763 infos[es_stage].has_ngg_early_prim_export = exec_list_is_singular(&impl->body);
2764
2765 /* Invocations that process an input vertex */
2766 const struct gfx10_ngg_info *ngg_info = &infos[es_stage].ngg_info;
2767 unsigned max_vtx_in = MIN2(256, ngg_info->enable_vertex_grouping ? ngg_info->hw_max_esverts : num_vertices_per_prim * ngg_info->max_gsprims);
2768
2769 unsigned lds_bytes_if_culling_off = 0;
2770 /* We need LDS space when VS needs to export the primitive ID. */
2771 if (es_stage == MESA_SHADER_VERTEX && infos[es_stage].vs.outinfo.export_prim_id)
2772 lds_bytes_if_culling_off = max_vtx_in * 4u;
2773 infos[es_stage].num_lds_blocks_when_not_culling =
2774 DIV_ROUND_UP(lds_bytes_if_culling_off,
2775 device->physical_device->rad_info.lds_encode_granularity);
2776
2777       /* NGG passthrough mode should be disabled when culling is used and when
2778        * the vertex shader exports the primitive ID.
2779 */
2780 infos[es_stage].is_ngg_passthrough = infos[es_stage].is_ngg_passthrough &&
2781 !infos[es_stage].has_ngg_culling &&
2782 !(es_stage == MESA_SHADER_VERTEX &&
2783 infos[es_stage].vs.outinfo.export_prim_id);
2784 }
2785 }
2786
2787 static void
2788 radv_fill_shader_info(struct radv_pipeline *pipeline,
2789 struct radv_pipeline_layout *pipeline_layout,
2790 const VkPipelineShaderStageCreateInfo **pStages,
2791 const struct radv_pipeline_key *pipeline_key,
2792 struct radv_shader_info *infos, nir_shader **nir)
2793 {
2794 struct radv_device *device = pipeline->device;
2795 unsigned active_stages = 0;
2796 unsigned filled_stages = 0;
2797
2798 for (int i = 0; i < MESA_SHADER_STAGES; i++) {
2799 if (nir[i])
2800 active_stages |= (1 << i);
2801 }
2802
2803 if (nir[MESA_SHADER_TESS_CTRL]) {
2804 infos[MESA_SHADER_VERTEX].vs.as_ls = true;
2805 }
2806
2807 if (nir[MESA_SHADER_GEOMETRY]) {
2808 if (nir[MESA_SHADER_TESS_CTRL])
2809 infos[MESA_SHADER_TESS_EVAL].tes.as_es = true;
2810 else
2811 infos[MESA_SHADER_VERTEX].vs.as_es = true;
2812 }
2813
2814 if (pipeline_key->use_ngg) {
2815 if (nir[MESA_SHADER_TESS_CTRL]) {
2816 infos[MESA_SHADER_TESS_EVAL].is_ngg = true;
2817 } else {
2818 infos[MESA_SHADER_VERTEX].is_ngg = true;
2819 }
2820
2821 if (nir[MESA_SHADER_TESS_CTRL] && nir[MESA_SHADER_GEOMETRY] &&
2822 nir[MESA_SHADER_GEOMETRY]->info.gs.invocations *
2823 nir[MESA_SHADER_GEOMETRY]->info.gs.vertices_out >
2824 256) {
2825          /* Fall back to the legacy path if tessellation is
2826 * enabled with extreme geometry because
2827 * EN_MAX_VERT_OUT_PER_GS_INSTANCE doesn't work and it
2828 * might hang.
2829 */
2830 infos[MESA_SHADER_TESS_EVAL].is_ngg = false;
2831 }
2832
2833 gl_shader_stage last_xfb_stage = MESA_SHADER_VERTEX;
2834
2835 for (int i = MESA_SHADER_VERTEX; i <= MESA_SHADER_GEOMETRY; i++) {
2836 if (nir[i])
2837 last_xfb_stage = i;
2838 }
2839
2840 bool uses_xfb = nir[last_xfb_stage] && radv_nir_stage_uses_xfb(nir[last_xfb_stage]);
2841
2842 if (!device->physical_device->use_ngg_streamout && uses_xfb) {
2843 if (nir[MESA_SHADER_TESS_CTRL])
2844 infos[MESA_SHADER_TESS_EVAL].is_ngg = false;
2845 else
2846 infos[MESA_SHADER_VERTEX].is_ngg = false;
2847 }
2848
2849 /* Determine if the pipeline is eligible for the NGG passthrough
2850 * mode. It can't be enabled for geometry shaders, for NGG
2851 * streamout or for vertex shaders that export the primitive ID
2852        * (this is checked later because we don't have the info here).
2853 */
2854 if (!nir[MESA_SHADER_GEOMETRY] && !uses_xfb) {
2855 if (nir[MESA_SHADER_TESS_CTRL] && infos[MESA_SHADER_TESS_EVAL].is_ngg) {
2856 infos[MESA_SHADER_TESS_EVAL].is_ngg_passthrough = true;
2857 } else if (nir[MESA_SHADER_VERTEX] && infos[MESA_SHADER_VERTEX].is_ngg) {
2858 infos[MESA_SHADER_VERTEX].is_ngg_passthrough = true;
2859 }
2860 }
2861 }
2862
2863 if (nir[MESA_SHADER_FRAGMENT]) {
2864 radv_nir_shader_info_init(&infos[MESA_SHADER_FRAGMENT]);
2865 radv_nir_shader_info_pass(pipeline->device, nir[MESA_SHADER_FRAGMENT], pipeline_layout,
2866 pipeline_key, &infos[MESA_SHADER_FRAGMENT]);
2867
2868 assert(pipeline->graphics.last_vgt_api_stage != MESA_SHADER_NONE);
2869 if (infos[MESA_SHADER_FRAGMENT].ps.prim_id_input) {
2870 if (pipeline->graphics.last_vgt_api_stage == MESA_SHADER_VERTEX) {
2871 infos[MESA_SHADER_VERTEX].vs.outinfo.export_prim_id = true;
2872 } else if (pipeline->graphics.last_vgt_api_stage == MESA_SHADER_TESS_EVAL) {
2873 infos[MESA_SHADER_TESS_EVAL].tes.outinfo.export_prim_id = true;
2874 } else {
2875 assert(pipeline->graphics.last_vgt_api_stage == MESA_SHADER_GEOMETRY);
2876 }
2877 }
2878
2879 if (!!infos[MESA_SHADER_FRAGMENT].ps.num_input_clips_culls) {
2880 if (pipeline->graphics.last_vgt_api_stage == MESA_SHADER_VERTEX) {
2881 infos[MESA_SHADER_VERTEX].vs.outinfo.export_clip_dists = true;
2882 } else if (pipeline->graphics.last_vgt_api_stage == MESA_SHADER_TESS_EVAL) {
2883 infos[MESA_SHADER_TESS_EVAL].tes.outinfo.export_clip_dists = true;
2884 } else {
2885 assert(pipeline->graphics.last_vgt_api_stage == MESA_SHADER_GEOMETRY);
2886 infos[MESA_SHADER_GEOMETRY].vs.outinfo.export_clip_dists = true;
2887 }
2888 }
2889
2890 filled_stages |= (1 << MESA_SHADER_FRAGMENT);
2891 }
2892
2893 if (pipeline->device->physical_device->rad_info.chip_class >= GFX9 &&
2894 nir[MESA_SHADER_TESS_CTRL]) {
2895 struct nir_shader *combined_nir[] = {nir[MESA_SHADER_VERTEX], nir[MESA_SHADER_TESS_CTRL]};
2896
2897 radv_nir_shader_info_init(&infos[MESA_SHADER_TESS_CTRL]);
2898
2899 /* Copy data to merged stage. */
2900 infos[MESA_SHADER_TESS_CTRL].vs.as_ls = true;
2901
2902 for (int i = 0; i < 2; i++) {
2903 radv_nir_shader_info_pass(pipeline->device, combined_nir[i], pipeline_layout, pipeline_key,
2904 &infos[MESA_SHADER_TESS_CTRL]);
2905 }
2906
2907 filled_stages |= (1 << MESA_SHADER_VERTEX);
2908 filled_stages |= (1 << MESA_SHADER_TESS_CTRL);
2909 }
2910
2911 if (pipeline->device->physical_device->rad_info.chip_class >= GFX9 &&
2912 nir[MESA_SHADER_GEOMETRY]) {
2913 gl_shader_stage pre_stage =
2914 nir[MESA_SHADER_TESS_EVAL] ? MESA_SHADER_TESS_EVAL : MESA_SHADER_VERTEX;
2915 struct nir_shader *combined_nir[] = {nir[pre_stage], nir[MESA_SHADER_GEOMETRY]};
2916
2917 radv_nir_shader_info_init(&infos[MESA_SHADER_GEOMETRY]);
2918
2919 /* Copy data to merged stage. */
2920 if (pre_stage == MESA_SHADER_VERTEX) {
2921 infos[MESA_SHADER_GEOMETRY].vs.as_es = infos[MESA_SHADER_VERTEX].vs.as_es;
2922 } else {
2923 infos[MESA_SHADER_GEOMETRY].tes.as_es = infos[MESA_SHADER_TESS_EVAL].tes.as_es;
2924 }
2925 infos[MESA_SHADER_GEOMETRY].is_ngg = infos[pre_stage].is_ngg;
2926 infos[MESA_SHADER_GEOMETRY].gs.es_type = pre_stage;
2927
2928 for (int i = 0; i < 2; i++) {
2929 radv_nir_shader_info_pass(pipeline->device, combined_nir[i], pipeline_layout, pipeline_key,
2930 &infos[MESA_SHADER_GEOMETRY]);
2931 }
2932
2933 filled_stages |= (1 << pre_stage);
2934 filled_stages |= (1 << MESA_SHADER_GEOMETRY);
2935 }
2936
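      /* Run the shader info pass on whatever stages were not already handled by
       * the merged VS+TCS / (VS|TES)+GS paths above.
       */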
2937 active_stages ^= filled_stages;
2938 while (active_stages) {
2939 int i = u_bit_scan(&active_stages);
2940 radv_nir_shader_info_init(&infos[i]);
2941 radv_nir_shader_info_pass(pipeline->device, nir[i], pipeline_layout, pipeline_key, &infos[i]);
2942 }
2943
2944 if (nir[MESA_SHADER_COMPUTE]) {
2945 unsigned subgroup_size = pipeline_key->cs.compute_subgroup_size;
2946 unsigned req_subgroup_size = subgroup_size;
2947 bool require_full_subgroups = pipeline_key->cs.require_full_subgroups;
2948
2949 if (!subgroup_size)
2950 subgroup_size = device->physical_device->cs_wave_size;
2951
2952 unsigned local_size = nir[MESA_SHADER_COMPUTE]->info.workgroup_size[0] *
2953 nir[MESA_SHADER_COMPUTE]->info.workgroup_size[1] *
2954 nir[MESA_SHADER_COMPUTE]->info.workgroup_size[2];
2955
2956 /* Games don't always request full subgroups when they should,
2957 * which can cause bugs if cswave32 is enabled.
2958 */
2959 if (device->physical_device->cs_wave_size == 32 &&
2960 nir[MESA_SHADER_COMPUTE]->info.cs.uses_wide_subgroup_intrinsics && !req_subgroup_size &&
2961 local_size % RADV_SUBGROUP_SIZE == 0)
2962 require_full_subgroups = true;
2963
2964 if (require_full_subgroups && !req_subgroup_size) {
2965 /* don't use wave32 pretending to be wave64 */
2966 subgroup_size = RADV_SUBGROUP_SIZE;
2967 }
2968
2969 infos[MESA_SHADER_COMPUTE].cs.subgroup_size = subgroup_size;
2970 }
2971
2972 for (int i = 0; i < MESA_SHADER_STAGES; i++) {
2973 if (nir[i]) {
2974 infos[i].wave_size = radv_get_wave_size(pipeline->device, pStages[i], i, &infos[i]);
2975 infos[i].ballot_bit_size =
2976 radv_get_ballot_bit_size(pipeline->device, pStages[i], i, &infos[i]);
2977 }
2978 }
2979
2980 /* PS always operates without workgroups. */
2981 if (nir[MESA_SHADER_FRAGMENT])
2982 infos[MESA_SHADER_FRAGMENT].workgroup_size = infos[MESA_SHADER_FRAGMENT].wave_size;
2983
2984 if (nir[MESA_SHADER_COMPUTE]) {
2985 /* Variable workgroup size is not supported by Vulkan. */
2986 assert(!nir[MESA_SHADER_COMPUTE]->info.workgroup_size_variable);
2987
2988 infos[MESA_SHADER_COMPUTE].workgroup_size =
2989 ac_compute_cs_workgroup_size(
2990 nir[MESA_SHADER_COMPUTE]->info.workgroup_size, false, UINT32_MAX);
2991 }
2992 }
2993
2994 static void
2995 merge_tess_info(struct shader_info *tes_info, struct shader_info *tcs_info)
2996 {
2997 /* The Vulkan 1.0.38 spec, section 21.1 Tessellator says:
2998 *
2999 * "PointMode. Controls generation of points rather than triangles
3000 * or lines. This functionality defaults to disabled, and is
3001        * enabled if either shader stage includes the execution mode."
3002 *
3003 * and about Triangles, Quads, IsoLines, VertexOrderCw, VertexOrderCcw,
3004 * PointMode, SpacingEqual, SpacingFractionalEven, SpacingFractionalOdd,
3005 * and OutputVertices, it says:
3006 *
3007 * "One mode must be set in at least one of the tessellation
3008 * shader stages."
3009 *
3010 * So, the fields can be set in either the TCS or TES, but they must
3011 * agree if set in both. Our backend looks at TES, so bitwise-or in
3012 * the values from the TCS.
3013 */
3014 assert(tcs_info->tess.tcs_vertices_out == 0 || tes_info->tess.tcs_vertices_out == 0 ||
3015 tcs_info->tess.tcs_vertices_out == tes_info->tess.tcs_vertices_out);
3016 tes_info->tess.tcs_vertices_out |= tcs_info->tess.tcs_vertices_out;
3017
3018 assert(tcs_info->tess.spacing == TESS_SPACING_UNSPECIFIED ||
3019 tes_info->tess.spacing == TESS_SPACING_UNSPECIFIED ||
3020 tcs_info->tess.spacing == tes_info->tess.spacing);
3021 tes_info->tess.spacing |= tcs_info->tess.spacing;
3022
3023 assert(tcs_info->tess.primitive_mode == 0 || tes_info->tess.primitive_mode == 0 ||
3024 tcs_info->tess.primitive_mode == tes_info->tess.primitive_mode);
3025 tes_info->tess.primitive_mode |= tcs_info->tess.primitive_mode;
3026 tes_info->tess.ccw |= tcs_info->tess.ccw;
3027 tes_info->tess.point_mode |= tcs_info->tess.point_mode;
3028
3029 /* Copy the merged info back to the TCS */
3030 tcs_info->tess.tcs_vertices_out = tes_info->tess.tcs_vertices_out;
3031 tcs_info->tess.spacing = tes_info->tess.spacing;
3032 tcs_info->tess.primitive_mode = tes_info->tess.primitive_mode;
3033 tcs_info->tess.ccw = tes_info->tess.ccw;
3034 tcs_info->tess.point_mode = tes_info->tess.point_mode;
3035 }
3036
3037 static void
3038 gather_tess_info(struct radv_device *device, nir_shader **nir, struct radv_shader_info *infos,
3039 const struct radv_pipeline_key *pipeline_key)
3040 {
3041 merge_tess_info(&nir[MESA_SHADER_TESS_EVAL]->info, &nir[MESA_SHADER_TESS_CTRL]->info);
3042
3043 unsigned tess_in_patch_size = pipeline_key->tcs.tess_input_vertices;
3044 unsigned tess_out_patch_size = nir[MESA_SHADER_TESS_CTRL]->info.tess.tcs_vertices_out;
3045
3046 /* Number of tessellation patches per workgroup processed by the current pipeline. */
3047 unsigned num_patches = get_tcs_num_patches(
3048 tess_in_patch_size, tess_out_patch_size,
3049 infos[MESA_SHADER_TESS_CTRL].tcs.num_linked_inputs,
3050 infos[MESA_SHADER_TESS_CTRL].tcs.num_linked_outputs,
3051 infos[MESA_SHADER_TESS_CTRL].tcs.num_linked_patch_outputs, device->tess_offchip_block_dw_size,
3052 device->physical_device->rad_info.chip_class, device->physical_device->rad_info.family);
3053
3054 /* LDS size used by VS+TCS for storing TCS inputs and outputs. */
3055 unsigned tcs_lds_size = calculate_tess_lds_size(
3056 device->physical_device->rad_info.chip_class, tess_in_patch_size, tess_out_patch_size,
3057 infos[MESA_SHADER_TESS_CTRL].tcs.num_linked_inputs, num_patches,
3058 infos[MESA_SHADER_TESS_CTRL].tcs.num_linked_outputs,
3059 infos[MESA_SHADER_TESS_CTRL].tcs.num_linked_patch_outputs);
3060
3061 infos[MESA_SHADER_TESS_CTRL].num_tess_patches = num_patches;
3062 infos[MESA_SHADER_TESS_CTRL].tcs.num_lds_blocks = tcs_lds_size;
3063 infos[MESA_SHADER_TESS_CTRL].tcs.tes_reads_tess_factors =
3064 !!(nir[MESA_SHADER_TESS_EVAL]->info.inputs_read &
3065 (VARYING_BIT_TESS_LEVEL_INNER | VARYING_BIT_TESS_LEVEL_OUTER));
3066 infos[MESA_SHADER_TESS_CTRL].tcs.tes_inputs_read = nir[MESA_SHADER_TESS_EVAL]->info.inputs_read;
3067 infos[MESA_SHADER_TESS_CTRL].tcs.tes_patch_inputs_read =
3068 nir[MESA_SHADER_TESS_EVAL]->info.patch_inputs_read;
3069
3070 infos[MESA_SHADER_TESS_EVAL].num_tess_patches = num_patches;
3071 infos[MESA_SHADER_GEOMETRY].num_tess_patches = num_patches;
3072 infos[MESA_SHADER_VERTEX].num_tess_patches = num_patches;
3073 infos[MESA_SHADER_TESS_CTRL].tcs.tcs_vertices_out = tess_out_patch_size;
3074 infos[MESA_SHADER_VERTEX].tcs.tcs_vertices_out = tess_out_patch_size;
3075
3076 if (!radv_use_llvm_for_stage(device, MESA_SHADER_VERTEX)) {
3077       /* When the numbers of TCS input and output vertices are the same (typically 3):
3078 * - There is an equal amount of LS and HS invocations
3079 * - In case of merged LSHS shaders, the LS and HS halves of the shader
3080 * always process the exact same vertex. We can use this knowledge to optimize them.
3081 *
3082 * We don't set tcs_in_out_eq if the float controls differ because that might
3083 * involve different float modes for the same block and our optimizer
3084        * doesn't handle an instruction dominating another with a different mode.
3085 */
3086 infos[MESA_SHADER_VERTEX].vs.tcs_in_out_eq =
3087 device->physical_device->rad_info.chip_class >= GFX9 &&
3088 tess_in_patch_size == tess_out_patch_size &&
3089 nir[MESA_SHADER_VERTEX]->info.float_controls_execution_mode ==
3090 nir[MESA_SHADER_TESS_CTRL]->info.float_controls_execution_mode;
3091
3092 if (infos[MESA_SHADER_VERTEX].vs.tcs_in_out_eq)
3093 infos[MESA_SHADER_VERTEX].vs.tcs_temp_only_input_mask =
3094 nir[MESA_SHADER_TESS_CTRL]->info.inputs_read &
3095 nir[MESA_SHADER_VERTEX]->info.outputs_written &
3096 ~nir[MESA_SHADER_TESS_CTRL]->info.tess.tcs_cross_invocation_inputs_read &
3097 ~nir[MESA_SHADER_TESS_CTRL]->info.inputs_read_indirectly &
3098 ~nir[MESA_SHADER_VERTEX]->info.outputs_accessed_indirectly;
3099
3100 /* Copy data to TCS so it can be accessed by the backend if they are merged. */
3101 infos[MESA_SHADER_TESS_CTRL].vs.tcs_in_out_eq = infos[MESA_SHADER_VERTEX].vs.tcs_in_out_eq;
3102 infos[MESA_SHADER_TESS_CTRL].vs.tcs_temp_only_input_mask =
3103 infos[MESA_SHADER_VERTEX].vs.tcs_temp_only_input_mask;
3104 }
3105
3106 for (gl_shader_stage s = MESA_SHADER_VERTEX; s <= MESA_SHADER_TESS_CTRL; ++s)
3107 infos[s].workgroup_size =
3108 ac_compute_lshs_workgroup_size(
3109 device->physical_device->rad_info.chip_class, s,
3110 num_patches, tess_in_patch_size, tess_out_patch_size);
3111 }
3112
3113 static void
3114 radv_init_feedback(const VkPipelineCreationFeedbackCreateInfoEXT *ext)
3115 {
3116 if (!ext)
3117 return;
3118
3119 if (ext->pPipelineCreationFeedback) {
3120 ext->pPipelineCreationFeedback->flags = 0;
3121 ext->pPipelineCreationFeedback->duration = 0;
3122 }
3123
3124 for (unsigned i = 0; i < ext->pipelineStageCreationFeedbackCount; ++i) {
3125 ext->pPipelineStageCreationFeedbacks[i].flags = 0;
3126 ext->pPipelineStageCreationFeedbacks[i].duration = 0;
3127 }
3128 }
3129
3130 static void
3131 radv_start_feedback(VkPipelineCreationFeedbackEXT *feedback)
3132 {
3133 if (!feedback)
3134 return;
3135
3136 feedback->duration -= radv_get_current_time();
3137 feedback->flags = VK_PIPELINE_CREATION_FEEDBACK_VALID_BIT_EXT;
3138 }
3139
3140 static void
3141 radv_stop_feedback(VkPipelineCreationFeedbackEXT *feedback, bool cache_hit)
3142 {
3143 if (!feedback)
3144 return;
3145
3146 feedback->duration += radv_get_current_time();
3147 feedback->flags =
3148 VK_PIPELINE_CREATION_FEEDBACK_VALID_BIT_EXT |
3149 (cache_hit ? VK_PIPELINE_CREATION_FEEDBACK_APPLICATION_PIPELINE_CACHE_HIT_BIT_EXT : 0);
3150 }
3151
3152 static bool
3153 mem_vectorize_callback(unsigned align_mul, unsigned align_offset, unsigned bit_size,
3154 unsigned num_components, nir_intrinsic_instr *low, nir_intrinsic_instr *high,
3155 void *data)
3156 {
3157 if (num_components > 4)
3158 return false;
3159
3160 /* >128 bit loads are split except with SMEM */
3161 if (bit_size * num_components > 128)
3162 return false;
3163
3164 uint32_t align;
3165 if (align_offset)
3166 align = 1 << (ffs(align_offset) - 1);
3167 else
3168 align = align_mul;
3169
3170 switch (low->intrinsic) {
3171 case nir_intrinsic_load_global:
3172 case nir_intrinsic_store_global:
3173 case nir_intrinsic_store_ssbo:
3174 case nir_intrinsic_load_ssbo:
3175 case nir_intrinsic_load_ubo:
3176 case nir_intrinsic_load_push_constant: {
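         /* Dword-aligned accesses can use the full vector width; with only
          * 2-byte or 1-byte alignment, limit the access to 16 or 8 bits total.
          */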
3177 unsigned max_components;
3178 if (align % 4 == 0)
3179 max_components = NIR_MAX_VEC_COMPONENTS;
3180 else if (align % 2 == 0)
3181 max_components = 16u / bit_size;
3182 else
3183 max_components = 8u / bit_size;
3184 return (align % (bit_size / 8u)) == 0 && num_components <= max_components;
3185 }
3186 case nir_intrinsic_load_deref:
3187 case nir_intrinsic_store_deref:
3188 assert(nir_deref_mode_is(nir_src_as_deref(low->src[0]), nir_var_mem_shared));
3189 FALLTHROUGH;
3190 case nir_intrinsic_load_shared:
3191 case nir_intrinsic_store_shared:
3192 if (bit_size * num_components ==
3193 96) { /* 96 bit loads require 128 bit alignment and are split otherwise */
3194 return align % 16 == 0;
3195 } else if (bit_size == 16 && (align % 4)) {
3196 /* AMD hardware can't do 2-byte aligned f16vec2 loads, but they are useful for ALU
3197 * vectorization, because our vectorizer requires the scalar IR to already contain vectors.
3198 */
3199 return (align % 2 == 0) && num_components <= 2;
3200 } else {
3201 if (num_components == 3) {
3202 /* AMD hardware can't do 3-component loads except for 96-bit loads, handled above. */
3203 return false;
3204 }
3205 unsigned req = bit_size * num_components;
3206 if (req == 64 || req == 128) /* 64-bit and 128-bit loads can use ds_read2_b{32,64} */
3207 req /= 2u;
3208 return align % (req / 8u) == 0;
3209 }
3210 default:
3211 return false;
3212 }
3213 return false;
3214 }
3215
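     /* Callback for nir_lower_bit_size(): return the bit size an 8/16-bit ALU
      * instruction should be lowered to (0 keeps the instruction as-is).
      */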
3216 static unsigned
3217 lower_bit_size_callback(const nir_instr *instr, void *_)
3218 {
3219 struct radv_device *device = _;
3220 enum chip_class chip = device->physical_device->rad_info.chip_class;
3221
3222 if (instr->type != nir_instr_type_alu)
3223 return 0;
3224 nir_alu_instr *alu = nir_instr_as_alu(instr);
3225
3226 if (alu->dest.dest.ssa.bit_size & (8 | 16)) {
3227 unsigned bit_size = alu->dest.dest.ssa.bit_size;
3228 switch (alu->op) {
3229 case nir_op_iabs:
3230 case nir_op_bitfield_select:
3231 case nir_op_imul_high:
3232 case nir_op_umul_high:
3233 case nir_op_ineg:
3234 case nir_op_isign:
3235 return 32;
3236 case nir_op_imax:
3237 case nir_op_umax:
3238 case nir_op_imin:
3239 case nir_op_umin:
3240 case nir_op_ishr:
3241 case nir_op_ushr:
3242 case nir_op_ishl:
3243 case nir_op_uadd_sat:
3244 return (bit_size == 8 || !(chip >= GFX8 && nir_dest_is_divergent(alu->dest.dest))) ? 32
3245 : 0;
3246 case nir_op_iadd_sat:
3247 return bit_size == 8 || !nir_dest_is_divergent(alu->dest.dest) ? 32 : 0;
3248
3249 default:
3250 return 0;
3251 }
3252 }
3253
3254 if (nir_src_bit_size(alu->src[0].src) & (8 | 16)) {
3255 unsigned bit_size = nir_src_bit_size(alu->src[0].src);
3256 switch (alu->op) {
3257 case nir_op_bit_count:
3258 case nir_op_find_lsb:
3259 case nir_op_ufind_msb:
3260 case nir_op_i2b1:
3261 return 32;
3262 case nir_op_ilt:
3263 case nir_op_ige:
3264 case nir_op_ieq:
3265 case nir_op_ine:
3266 case nir_op_ult:
3267 case nir_op_uge:
3268 return (bit_size == 8 || !(chip >= GFX8 && nir_dest_is_divergent(alu->dest.dest))) ? 32
3269 : 0;
3270 default:
3271 return 0;
3272 }
3273 }
3274
3275 return 0;
3276 }
3277
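     /* Callback for nir_opt_vectorize(): only 16-bit ALU ops are worth
      * vectorizing here, so they can use the packed 16-bit instructions.
      */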
3278 static bool
3279 opt_vectorize_callback(const nir_instr *instr, void *_)
3280 {
3281 assert(instr->type == nir_instr_type_alu);
3282 nir_alu_instr *alu = nir_instr_as_alu(instr);
3283 unsigned bit_size = alu->dest.dest.ssa.bit_size;
3284 if (bit_size != 16)
3285 return false;
3286
3287 switch (alu->op) {
3288 case nir_op_fadd:
3289 case nir_op_fsub:
3290 case nir_op_fmul:
3291 case nir_op_fneg:
3292 case nir_op_fsat:
3293 case nir_op_fmin:
3294 case nir_op_fmax:
3295 case nir_op_iadd:
3296 case nir_op_isub:
3297 case nir_op_imul:
3298 case nir_op_imin:
3299 case nir_op_imax:
3300 case nir_op_umin:
3301 case nir_op_umax:
3302 return true;
3303 case nir_op_ishl: /* TODO: in NIR, these have 32bit shift operands */
3304 case nir_op_ishr: /* while Radeon needs 16bit operands when vectorized */
3305 case nir_op_ushr:
3306 default:
3307 return false;
3308 }
3309 }
3310
3311 static nir_component_mask_t
3312 non_uniform_access_callback(const nir_src *src, void *_)
3313 {
3314 if (src->ssa->num_components == 1)
3315 return 0x1;
3316 return nir_chase_binding(*src).success ? 0x2 : 0x3;
3317 }
3318
3319 VkResult
3320 radv_create_shaders(struct radv_pipeline *pipeline, struct radv_pipeline_layout *pipeline_layout,
3321 struct radv_device *device, struct radv_pipeline_cache *cache,
3322 const struct radv_pipeline_key *pipeline_key,
3323 const VkPipelineShaderStageCreateInfo **pStages,
3324 const VkPipelineCreateFlags flags, const uint8_t *custom_hash,
3325 VkPipelineCreationFeedbackEXT *pipeline_feedback,
3326 VkPipelineCreationFeedbackEXT **stage_feedbacks)
3327 {
3328 struct vk_shader_module fs_m = {0};
3329 struct vk_shader_module *modules[MESA_SHADER_STAGES] = {
3330 0,
3331 };
3332 nir_shader *nir[MESA_SHADER_STAGES] = {0};
3333 struct radv_shader_binary *binaries[MESA_SHADER_STAGES] = {NULL};
3334 struct radv_shader_info infos[MESA_SHADER_STAGES] = {0};
3335 unsigned char hash[20], gs_copy_hash[20];
3336 bool keep_executable_info =
3337 (flags & VK_PIPELINE_CREATE_CAPTURE_INTERNAL_REPRESENTATIONS_BIT_KHR) ||
3338 device->keep_shader_info;
3339 bool keep_statistic_info = (flags & VK_PIPELINE_CREATE_CAPTURE_STATISTICS_BIT_KHR) ||
3340 (device->instance->debug_flags & RADV_DEBUG_DUMP_SHADER_STATS) ||
3341 device->keep_shader_info;
3342 struct radv_pipeline_shader_stack_size **stack_sizes =
3343 pipeline->type == RADV_PIPELINE_COMPUTE ? &pipeline->compute.rt_stack_sizes : NULL;
3344 uint32_t *num_stack_sizes = stack_sizes ? &pipeline->compute.group_count : NULL;
3345
3346 radv_start_feedback(pipeline_feedback);
3347
3348 for (unsigned i = 0; i < MESA_SHADER_STAGES; ++i) {
3349 if (pStages[i]) {
3350 modules[i] = vk_shader_module_from_handle(pStages[i]->module);
3351 if (modules[i]->nir)
3352 _mesa_sha1_compute(modules[i]->nir->info.name, strlen(modules[i]->nir->info.name),
3353 modules[i]->sha1);
3354
3355 pipeline->active_stages |= mesa_to_vk_shader_stage(i);
3356 if (i < MESA_SHADER_FRAGMENT)
3357 pipeline->graphics.last_vgt_api_stage = i;
3358 }
3359 }
3360
3361 if (custom_hash)
3362 memcpy(hash, custom_hash, 20);
3363 else {
3364 radv_hash_shaders(hash, pStages, pipeline_layout, pipeline_key,
3365 radv_get_hash_flags(device, keep_statistic_info));
3366 }
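     /* Give the GS copy shader its own cache key by flipping one bit of the
      * pipeline hash.
      */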
3367 memcpy(gs_copy_hash, hash, 20);
3368 gs_copy_hash[0] ^= 1;
3369
3370 pipeline->pipeline_hash = *(uint64_t *)hash;
3371
3372 bool found_in_application_cache = true;
3373 if (modules[MESA_SHADER_GEOMETRY] && !keep_executable_info) {
3374 struct radv_shader_variant *variants[MESA_SHADER_STAGES] = {0};
3375 radv_create_shader_variants_from_pipeline_cache(device, cache, gs_copy_hash, variants, NULL,
3376 NULL, &found_in_application_cache);
3377 pipeline->gs_copy_shader = variants[MESA_SHADER_GEOMETRY];
3378 }
3379
3380 if (!keep_executable_info &&
3381 radv_create_shader_variants_from_pipeline_cache(device, cache, hash, pipeline->shaders,
3382 stack_sizes, num_stack_sizes,
3383 &found_in_application_cache) &&
3384 (!modules[MESA_SHADER_GEOMETRY] || pipeline->gs_copy_shader ||
3385 pipeline->shaders[MESA_SHADER_GEOMETRY]->info.is_ngg)) {
3386 radv_stop_feedback(pipeline_feedback, found_in_application_cache);
3387 return VK_SUCCESS;
3388 }
3389
3390 if (flags & VK_PIPELINE_CREATE_FAIL_ON_PIPELINE_COMPILE_REQUIRED_BIT_EXT) {
3391 radv_stop_feedback(pipeline_feedback, found_in_application_cache);
3392 return VK_PIPELINE_COMPILE_REQUIRED_EXT;
3393 }
3394
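     /* Graphics pipelines may legally omit the fragment shader; create a no-op
      * FS so the rest of the compilation always has one to work with.
      */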
3395 if (!modules[MESA_SHADER_FRAGMENT] && !modules[MESA_SHADER_COMPUTE]) {
3396 nir_builder fs_b = nir_builder_init_simple_shader(MESA_SHADER_FRAGMENT, NULL, "noop_fs");
3397 fs_m = vk_shader_module_from_nir(fs_b.shader);
3398 modules[MESA_SHADER_FRAGMENT] = &fs_m;
3399 }
3400
3401 for (unsigned i = 0; i < MESA_SHADER_STAGES; ++i) {
3402 const VkPipelineShaderStageCreateInfo *stage = pStages[i];
3403
3404 if (!modules[i])
3405 continue;
3406
3407 radv_start_feedback(stage_feedbacks[i]);
3408
3409 nir[i] = radv_shader_compile_to_nir(device, modules[i], stage ? stage->pName : "main", i,
3410 stage ? stage->pSpecializationInfo : NULL,
3411 pipeline_layout, pipeline_key);
3412
3413       /* We don't want to alter the meta shaders' IR directly, so clone it
3414        * first.
3415 */
3416 if (nir[i]->info.name) {
3417 nir[i] = nir_shader_clone(NULL, nir[i]);
3418 }
3419
3420 radv_stop_feedback(stage_feedbacks[i], false);
3421 }
3422
3423 bool optimize_conservatively = pipeline_key->optimisations_disabled;
3424
3425 radv_link_shaders(pipeline, pipeline_key, nir, optimize_conservatively);
3426 radv_set_driver_locations(pipeline, nir, infos);
3427
3428 for (int i = 0; i < MESA_SHADER_STAGES; ++i) {
3429 if (nir[i]) {
3430 radv_start_feedback(stage_feedbacks[i]);
3431 radv_optimize_nir(device, nir[i], optimize_conservatively, false);
3432
3433          /* Gather info again; information such as outputs_read can be out of date. */
3434 nir_shader_gather_info(nir[i], nir_shader_get_entrypoint(nir[i]));
3435 radv_lower_io(device, nir[i]);
3436
3437 radv_stop_feedback(stage_feedbacks[i], false);
3438 }
3439 }
3440
3441 if (nir[MESA_SHADER_TESS_CTRL]) {
3442 nir_lower_patch_vertices(nir[MESA_SHADER_TESS_EVAL],
3443 nir[MESA_SHADER_TESS_CTRL]->info.tess.tcs_vertices_out, NULL);
3444 gather_tess_info(device, nir, infos, pipeline_key);
3445 }
3446
3447 radv_fill_shader_info(pipeline, pipeline_layout, pStages, pipeline_key, infos, nir);
3448
3449 bool pipeline_has_ngg = (nir[MESA_SHADER_VERTEX] && infos[MESA_SHADER_VERTEX].is_ngg) ||
3450 (nir[MESA_SHADER_TESS_EVAL] && infos[MESA_SHADER_TESS_EVAL].is_ngg);
3451
3452 if (pipeline_has_ngg) {
3453 struct gfx10_ngg_info *ngg_info;
3454
3455 if (nir[MESA_SHADER_GEOMETRY])
3456 ngg_info = &infos[MESA_SHADER_GEOMETRY].ngg_info;
3457 else if (nir[MESA_SHADER_TESS_CTRL])
3458 ngg_info = &infos[MESA_SHADER_TESS_EVAL].ngg_info;
3459 else
3460 ngg_info = &infos[MESA_SHADER_VERTEX].ngg_info;
3461
3462 gfx10_get_ngg_info(pipeline_key, pipeline, nir, infos, ngg_info);
3463 } else if (nir[MESA_SHADER_GEOMETRY]) {
3464 struct gfx9_gs_info *gs_info = &infos[MESA_SHADER_GEOMETRY].gs_ring_info;
3465
3466 gfx9_get_gs_info(pipeline_key, pipeline, nir, infos, gs_info);
3467 } else {
3468 gl_shader_stage hw_vs_api_stage =
3469 nir[MESA_SHADER_TESS_EVAL] ? MESA_SHADER_TESS_EVAL : MESA_SHADER_VERTEX;
3470 infos[hw_vs_api_stage].workgroup_size = infos[hw_vs_api_stage].wave_size;
3471 }
3472
3473 radv_determine_ngg_settings(pipeline, pipeline_key, infos, nir);
3474
3475 for (int i = 0; i < MESA_SHADER_STAGES; ++i) {
3476 if (nir[i]) {
3477 radv_start_feedback(stage_feedbacks[i]);
3478
3479 /* Wave and workgroup size should already be filled. */
3480 assert(infos[i].wave_size && infos[i].workgroup_size);
3481
3482 if (!radv_use_llvm_for_stage(device, i)) {
3483 nir_lower_non_uniform_access_options options = {
3484 .types = nir_lower_non_uniform_ubo_access | nir_lower_non_uniform_ssbo_access |
3485 nir_lower_non_uniform_texture_access | nir_lower_non_uniform_image_access,
3486 .callback = &non_uniform_access_callback,
3487 .callback_data = NULL,
3488 };
3489 NIR_PASS_V(nir[i], nir_lower_non_uniform_access, &options);
3490 }
3491 NIR_PASS_V(nir[i], nir_lower_memory_model);
3492
3493 bool lower_to_scalar = false;
3494
3495 nir_load_store_vectorize_options vectorize_opts = {
3496 .modes = nir_var_mem_ssbo | nir_var_mem_ubo | nir_var_mem_push_const |
3497 nir_var_mem_shared | nir_var_mem_global,
3498 .callback = mem_vectorize_callback,
3499 .robust_modes = 0,
3500 };
3501
3502 if (device->robust_buffer_access2) {
3503 vectorize_opts.robust_modes =
3504 nir_var_mem_ubo | nir_var_mem_ssbo | nir_var_mem_global | nir_var_mem_push_const;
3505 }
3506
3507 if (nir_opt_load_store_vectorize(nir[i], &vectorize_opts)) {
3508 NIR_PASS_V(nir[i], nir_copy_prop);
3509 lower_to_scalar = true;
3510
3511            /* Gather info again to update whether 8/16-bit types are used. */
3512 nir_shader_gather_info(nir[i], nir_shader_get_entrypoint(nir[i]));
3513 }
3514
3515 lower_to_scalar |=
3516 nir_opt_shrink_vectors(nir[i], !device->instance->disable_shrink_image_store);
3517
3518 if (lower_to_scalar)
3519 nir_lower_alu_to_scalar(nir[i], NULL, NULL);
3520
3521 /* lower ALU operations */
3522 nir_lower_int64(nir[i]);
3523
3524 nir_opt_idiv_const(nir[i], 8);
3525
3526 nir_lower_idiv(nir[i],
3527 &(nir_lower_idiv_options){
3528 .imprecise_32bit_lowering = false,
3529 .allow_fp16 = device->physical_device->rad_info.chip_class >= GFX9,
3530 });
3531
3532 nir_opt_sink(nir[i], nir_move_load_input | nir_move_const_undef | nir_move_copies);
3533 nir_opt_move(nir[i], nir_move_load_input | nir_move_const_undef | nir_move_copies);
3534
3535 /* Lower I/O intrinsics to memory instructions. */
3536 bool io_to_mem = radv_lower_io_to_mem(device, nir[i], &infos[i], pipeline_key);
3537 bool lowered_ngg = pipeline_has_ngg && i == pipeline->graphics.last_vgt_api_stage &&
3538 !radv_use_llvm_for_stage(device, i);
3539 if (lowered_ngg)
3540 radv_lower_ngg(device, nir[i], &infos[i], pipeline_key);
3541
3542 radv_optimize_nir_algebraic(nir[i], io_to_mem || lowered_ngg || i == MESA_SHADER_COMPUTE);
3543
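            /* On GFX8+, run divergence analysis (on LCSSA form) so that
             * lower_bit_size_callback can base its decisions on whether the
             * result of an instruction is divergent.
             */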
3544 if (nir[i]->info.bit_sizes_int & (8 | 16)) {
3545 if (device->physical_device->rad_info.chip_class >= GFX8) {
3546 nir_convert_to_lcssa(nir[i], true, true);
3547 nir_divergence_analysis(nir[i]);
3548 }
3549
3550 if (nir_lower_bit_size(nir[i], lower_bit_size_callback, device)) {
3551 NIR_PASS_V(nir[i], nir_opt_constant_folding);
3552 NIR_PASS_V(nir[i], nir_opt_dce);
3553 }
3554
3555 if (device->physical_device->rad_info.chip_class >= GFX8)
3556 nir_opt_remove_phis(nir[i]); /* cleanup LCSSA phis */
3557 }
3558 if (((nir[i]->info.bit_sizes_int | nir[i]->info.bit_sizes_float) & 16) &&
3559 device->physical_device->rad_info.chip_class >= GFX9)
3560 NIR_PASS_V(nir[i], nir_opt_vectorize, opt_vectorize_callback, NULL);
3561
3562 /* cleanup passes */
3563 nir_lower_load_const_to_scalar(nir[i]);
3564 nir_move_options move_opts = nir_move_const_undef | nir_move_load_ubo |
3565 nir_move_load_input | nir_move_comparisons | nir_move_copies;
3566 nir_opt_sink(nir[i], move_opts | nir_move_load_ssbo);
3567 nir_opt_move(nir[i], move_opts);
3568
3569 radv_stop_feedback(stage_feedbacks[i], false);
3570 }
3571 }
3572
3573 for (int i = 0; i < MESA_SHADER_STAGES; ++i) {
3574 if (radv_can_dump_shader(device, modules[i], false))
3575 nir_print_shader(nir[i], stderr);
3576 }
3577
3578 if (modules[MESA_SHADER_GEOMETRY]) {
3579 struct radv_shader_binary *gs_copy_binary = NULL;
3580 if (!pipeline_has_ngg) {
3581 struct radv_shader_info info = {0};
3582
3583 if (infos[MESA_SHADER_GEOMETRY].vs.outinfo.export_clip_dists)
3584 info.vs.outinfo.export_clip_dists = true;
3585
3586 radv_nir_shader_info_pass(device, nir[MESA_SHADER_GEOMETRY], pipeline_layout, pipeline_key,
3587 &info);
3588 info.wave_size = 64; /* Wave32 not supported. */
3589 info.workgroup_size = 64; /* HW VS: separate waves, no workgroups */
3590 info.ballot_bit_size = 64;
3591
3592 pipeline->gs_copy_shader = radv_create_gs_copy_shader(
3593 device, nir[MESA_SHADER_GEOMETRY], &info, &gs_copy_binary, keep_executable_info,
3594 keep_statistic_info, pipeline_key->has_multiview_view_index,
3595 pipeline_key->optimisations_disabled);
3596 }
3597
3598 if (!keep_executable_info && pipeline->gs_copy_shader) {
3599 struct radv_shader_binary *gs_binaries[MESA_SHADER_STAGES] = {NULL};
3600 struct radv_shader_variant *gs_variants[MESA_SHADER_STAGES] = {0};
3601
3602 gs_binaries[MESA_SHADER_GEOMETRY] = gs_copy_binary;
3603 gs_variants[MESA_SHADER_GEOMETRY] = pipeline->gs_copy_shader;
3604
3605 radv_pipeline_cache_insert_shaders(device, cache, gs_copy_hash, gs_variants, gs_binaries,
3606 NULL, 0);
3607
3608 pipeline->gs_copy_shader = gs_variants[MESA_SHADER_GEOMETRY];
3609 }
3610 free(gs_copy_binary);
3611 }
3612
3613 if (nir[MESA_SHADER_FRAGMENT]) {
3614 if (!pipeline->shaders[MESA_SHADER_FRAGMENT]) {
3615 radv_start_feedback(stage_feedbacks[MESA_SHADER_FRAGMENT]);
3616
3617 pipeline->shaders[MESA_SHADER_FRAGMENT] = radv_shader_variant_compile(
3618 device, modules[MESA_SHADER_FRAGMENT], &nir[MESA_SHADER_FRAGMENT], 1, pipeline_layout,
3619 pipeline_key, infos + MESA_SHADER_FRAGMENT, keep_executable_info,
3620 keep_statistic_info, &binaries[MESA_SHADER_FRAGMENT]);
3621
3622 radv_stop_feedback(stage_feedbacks[MESA_SHADER_FRAGMENT], false);
3623 }
3624 }
3625
3626 if (device->physical_device->rad_info.chip_class >= GFX9 && modules[MESA_SHADER_TESS_CTRL]) {
3627 if (!pipeline->shaders[MESA_SHADER_TESS_CTRL]) {
3628 struct nir_shader *combined_nir[] = {nir[MESA_SHADER_VERTEX], nir[MESA_SHADER_TESS_CTRL]};
3629
3630 radv_start_feedback(stage_feedbacks[MESA_SHADER_TESS_CTRL]);
3631
3632 pipeline->shaders[MESA_SHADER_TESS_CTRL] = radv_shader_variant_compile(
3633 device, modules[MESA_SHADER_TESS_CTRL], combined_nir, 2, pipeline_layout, pipeline_key,
3634 &infos[MESA_SHADER_TESS_CTRL], keep_executable_info, keep_statistic_info,
3635 &binaries[MESA_SHADER_TESS_CTRL]);
3636
3637 radv_stop_feedback(stage_feedbacks[MESA_SHADER_TESS_CTRL], false);
3638 }
3639 modules[MESA_SHADER_VERTEX] = NULL;
3640 }
3641
3642 if (device->physical_device->rad_info.chip_class >= GFX9 && modules[MESA_SHADER_GEOMETRY]) {
3643 gl_shader_stage pre_stage =
3644 modules[MESA_SHADER_TESS_EVAL] ? MESA_SHADER_TESS_EVAL : MESA_SHADER_VERTEX;
3645 if (!pipeline->shaders[MESA_SHADER_GEOMETRY]) {
3646 struct nir_shader *combined_nir[] = {nir[pre_stage], nir[MESA_SHADER_GEOMETRY]};
3647
3648 radv_start_feedback(stage_feedbacks[MESA_SHADER_GEOMETRY]);
3649
3650 pipeline->shaders[MESA_SHADER_GEOMETRY] = radv_shader_variant_compile(
3651 device, modules[MESA_SHADER_GEOMETRY], combined_nir, 2, pipeline_layout, pipeline_key,
3652 &infos[MESA_SHADER_GEOMETRY], keep_executable_info,
3653 keep_statistic_info, &binaries[MESA_SHADER_GEOMETRY]);
3654
3655 radv_stop_feedback(stage_feedbacks[MESA_SHADER_GEOMETRY], false);
3656 }
3657 modules[pre_stage] = NULL;
3658 }
3659
3660 for (int i = 0; i < MESA_SHADER_STAGES; ++i) {
3661 if (modules[i] && !pipeline->shaders[i]) {
3662 radv_start_feedback(stage_feedbacks[i]);
3663
3664 pipeline->shaders[i] = radv_shader_variant_compile(
3665 device, modules[i], &nir[i], 1, pipeline_layout, pipeline_key, infos + i,
3666 keep_executable_info, keep_statistic_info, &binaries[i]);
3667
3668 radv_stop_feedback(stage_feedbacks[i], false);
3669 }
3670 }
3671
3672 if (!keep_executable_info) {
3673 radv_pipeline_cache_insert_shaders(device, cache, hash, pipeline->shaders, binaries,
3674 stack_sizes ? *stack_sizes : NULL,
3675 num_stack_sizes ? *num_stack_sizes : 0);
3676 }
3677
3678 for (int i = 0; i < MESA_SHADER_STAGES; ++i) {
3679 free(binaries[i]);
3680 if (nir[i]) {
3681 ralloc_free(nir[i]);
3682
3683 if (radv_can_dump_shader_stats(device, modules[i])) {
3684 radv_dump_shader_stats(device, pipeline, i, stderr);
3685 }
3686 }
3687 }
3688
3689 if (fs_m.nir)
3690 ralloc_free(fs_m.nir);
3691
3692 radv_stop_feedback(pipeline_feedback, false);
3693 return VK_SUCCESS;
3694 }
3695
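     /* Return the base SPI_SHADER_USER_DATA_*_0 register for a stage, following
      * how API stages map to HW stages (merged LS/HS and ES/GS on GFX9+, NGG
      * pipelines using the GS registers).
      */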
3696 static uint32_t
3697 radv_pipeline_stage_to_user_data_0(struct radv_pipeline *pipeline, gl_shader_stage stage,
3698 enum chip_class chip_class)
3699 {
3700 bool has_gs = radv_pipeline_has_gs(pipeline);
3701 bool has_tess = radv_pipeline_has_tess(pipeline);
3702 bool has_ngg = radv_pipeline_has_ngg(pipeline);
3703
3704 switch (stage) {
3705 case MESA_SHADER_FRAGMENT:
3706 return R_00B030_SPI_SHADER_USER_DATA_PS_0;
3707 case MESA_SHADER_VERTEX:
3708 if (has_tess) {
3709 if (chip_class >= GFX10) {
3710 return R_00B430_SPI_SHADER_USER_DATA_HS_0;
3711 } else if (chip_class == GFX9) {
3712 return R_00B430_SPI_SHADER_USER_DATA_LS_0;
3713 } else {
3714 return R_00B530_SPI_SHADER_USER_DATA_LS_0;
3715 }
3716 }
3717
3718 if (has_gs) {
3719 if (chip_class >= GFX10) {
3720 return R_00B230_SPI_SHADER_USER_DATA_GS_0;
3721 } else {
3722 return R_00B330_SPI_SHADER_USER_DATA_ES_0;
3723 }
3724 }
3725
3726 if (has_ngg)
3727 return R_00B230_SPI_SHADER_USER_DATA_GS_0;
3728
3729 return R_00B130_SPI_SHADER_USER_DATA_VS_0;
3730 case MESA_SHADER_GEOMETRY:
3731 return chip_class == GFX9 ? R_00B330_SPI_SHADER_USER_DATA_ES_0
3732 : R_00B230_SPI_SHADER_USER_DATA_GS_0;
3733 case MESA_SHADER_COMPUTE:
3734 return R_00B900_COMPUTE_USER_DATA_0;
3735 case MESA_SHADER_TESS_CTRL:
3736 return chip_class == GFX9 ? R_00B430_SPI_SHADER_USER_DATA_LS_0
3737 : R_00B430_SPI_SHADER_USER_DATA_HS_0;
3738 case MESA_SHADER_TESS_EVAL:
3739 if (has_gs) {
3740 return chip_class >= GFX10 ? R_00B230_SPI_SHADER_USER_DATA_GS_0
3741 : R_00B330_SPI_SHADER_USER_DATA_ES_0;
3742 } else if (has_ngg) {
3743 return R_00B230_SPI_SHADER_USER_DATA_GS_0;
3744 } else {
3745 return R_00B130_SPI_SHADER_USER_DATA_VS_0;
3746 }
3747 default:
3748 unreachable("unknown shader");
3749 }
3750 }
3751
3752 struct radv_bin_size_entry {
3753 unsigned bpp;
3754 VkExtent2D extent;
3755 };
3756
3757 static VkExtent2D
3758 radv_gfx9_compute_bin_size(const struct radv_pipeline *pipeline,
3759 const VkGraphicsPipelineCreateInfo *pCreateInfo)
3760 {
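      /* Both tables are indexed by [log2(RBs per SE)][log2(SEs)]; each entry is
       * the bin extent used while bytes-per-pixel stays below the next entry's
       * bpp threshold.
       */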
3761 static const struct radv_bin_size_entry color_size_table[][3][9] = {
3762 {
3763 /* One RB / SE */
3764 {
3765 /* One shader engine */
3766 {0, {128, 128}},
3767 {1, {64, 128}},
3768 {2, {32, 128}},
3769 {3, {16, 128}},
3770 {17, {0, 0}},
3771 {UINT_MAX, {0, 0}},
3772 },
3773 {
3774 /* Two shader engines */
3775 {0, {128, 128}},
3776 {2, {64, 128}},
3777 {3, {32, 128}},
3778 {5, {16, 128}},
3779 {17, {0, 0}},
3780 {UINT_MAX, {0, 0}},
3781 },
3782 {
3783 /* Four shader engines */
3784 {0, {128, 128}},
3785 {3, {64, 128}},
3786 {5, {16, 128}},
3787 {17, {0, 0}},
3788 {UINT_MAX, {0, 0}},
3789 },
3790 },
3791 {
3792 /* Two RB / SE */
3793 {
3794 /* One shader engine */
3795 {0, {128, 128}},
3796 {2, {64, 128}},
3797 {3, {32, 128}},
3798 {5, {16, 128}},
3799 {33, {0, 0}},
3800 {UINT_MAX, {0, 0}},
3801 },
3802 {
3803 /* Two shader engines */
3804 {0, {128, 128}},
3805 {3, {64, 128}},
3806 {5, {32, 128}},
3807 {9, {16, 128}},
3808 {33, {0, 0}},
3809 {UINT_MAX, {0, 0}},
3810 },
3811 {
3812 /* Four shader engines */
3813 {0, {256, 256}},
3814 {2, {128, 256}},
3815 {3, {128, 128}},
3816 {5, {64, 128}},
3817 {9, {16, 128}},
3818 {33, {0, 0}},
3819 {UINT_MAX, {0, 0}},
3820 },
3821 },
3822 {
3823 /* Four RB / SE */
3824 {
3825 /* One shader engine */
3826 {0, {128, 256}},
3827 {2, {128, 128}},
3828 {3, {64, 128}},
3829 {5, {32, 128}},
3830 {9, {16, 128}},
3831 {33, {0, 0}},
3832 {UINT_MAX, {0, 0}},
3833 },
3834 {
3835 /* Two shader engines */
3836 {0, {256, 256}},
3837 {2, {128, 256}},
3838 {3, {128, 128}},
3839 {5, {64, 128}},
3840 {9, {32, 128}},
3841 {17, {16, 128}},
3842 {33, {0, 0}},
3843 {UINT_MAX, {0, 0}},
3844 },
3845 {
3846 /* Four shader engines */
3847 {0, {256, 512}},
3848 {2, {256, 256}},
3849 {3, {128, 256}},
3850 {5, {128, 128}},
3851 {9, {64, 128}},
3852 {17, {16, 128}},
3853 {33, {0, 0}},
3854 {UINT_MAX, {0, 0}},
3855 },
3856 },
3857 };
3858 static const struct radv_bin_size_entry ds_size_table[][3][9] = {
3859 {
3860 // One RB / SE
3861 {
3862 // One shader engine
3863 {0, {128, 256}},
3864 {2, {128, 128}},
3865 {4, {64, 128}},
3866 {7, {32, 128}},
3867 {13, {16, 128}},
3868 {49, {0, 0}},
3869 {UINT_MAX, {0, 0}},
3870 },
3871 {
3872 // Two shader engines
3873 {0, {256, 256}},
3874 {2, {128, 256}},
3875 {4, {128, 128}},
3876 {7, {64, 128}},
3877 {13, {32, 128}},
3878 {25, {16, 128}},
3879 {49, {0, 0}},
3880 {UINT_MAX, {0, 0}},
3881 },
3882 {
3883 // Four shader engines
3884 {0, {256, 512}},
3885 {2, {256, 256}},
3886 {4, {128, 256}},
3887 {7, {128, 128}},
3888 {13, {64, 128}},
3889 {25, {16, 128}},
3890 {49, {0, 0}},
3891 {UINT_MAX, {0, 0}},
3892 },
3893 },
3894 {
3895 // Two RB / SE
3896 {
3897 // One shader engine
3898 {0, {256, 256}},
3899 {2, {128, 256}},
3900 {4, {128, 128}},
3901 {7, {64, 128}},
3902 {13, {32, 128}},
3903 {25, {16, 128}},
3904 {97, {0, 0}},
3905 {UINT_MAX, {0, 0}},
3906 },
3907 {
3908 // Two shader engines
3909 {0, {256, 512}},
3910 {2, {256, 256}},
3911 {4, {128, 256}},
3912 {7, {128, 128}},
3913 {13, {64, 128}},
3914 {25, {32, 128}},
3915 {49, {16, 128}},
3916 {97, {0, 0}},
3917 {UINT_MAX, {0, 0}},
3918 },
3919 {
3920 // Four shader engines
3921 {0, {512, 512}},
3922 {2, {256, 512}},
3923 {4, {256, 256}},
3924 {7, {128, 256}},
3925 {13, {128, 128}},
3926 {25, {64, 128}},
3927 {49, {16, 128}},
3928 {97, {0, 0}},
3929 {UINT_MAX, {0, 0}},
3930 },
3931 },
3932 {
3933 // Four RB / SE
3934 {
3935 // One shader engine
3936 {0, {256, 512}},
3937 {2, {256, 256}},
3938 {4, {128, 256}},
3939 {7, {128, 128}},
3940 {13, {64, 128}},
3941 {25, {32, 128}},
3942 {49, {16, 128}},
3943 {UINT_MAX, {0, 0}},
3944 },
3945 {
3946 // Two shader engines
3947 {0, {512, 512}},
3948 {2, {256, 512}},
3949 {4, {256, 256}},
3950 {7, {128, 256}},
3951 {13, {128, 128}},
3952 {25, {64, 128}},
3953 {49, {32, 128}},
3954 {97, {16, 128}},
3955 {UINT_MAX, {0, 0}},
3956 },
3957 {
3958 // Four shader engines
3959 {0, {512, 512}},
3960 {4, {256, 512}},
3961 {7, {256, 256}},
3962 {13, {128, 256}},
3963 {25, {128, 128}},
3964 {49, {64, 128}},
3965 {97, {16, 128}},
3966 {UINT_MAX, {0, 0}},
3967 },
3968 },
3969 };
3970
3971 RADV_FROM_HANDLE(radv_render_pass, pass, pCreateInfo->renderPass);
3972 struct radv_subpass *subpass = pass->subpasses + pCreateInfo->subpass;
3973 VkExtent2D extent = {512, 512};
3974
3975 unsigned log_num_rb_per_se =
3976 util_logbase2_ceil(pipeline->device->physical_device->rad_info.max_render_backends /
3977 pipeline->device->physical_device->rad_info.max_se);
3978 unsigned log_num_se = util_logbase2_ceil(pipeline->device->physical_device->rad_info.max_se);
3979
3980 unsigned total_samples = 1u << G_028BE0_MSAA_NUM_SAMPLES(pipeline->graphics.ms.pa_sc_aa_config);
3981 unsigned ps_iter_samples = 1u << G_028804_PS_ITER_SAMPLES(pipeline->graphics.ms.db_eqaa);
3982 unsigned effective_samples = total_samples;
3983 unsigned color_bytes_per_pixel = 0;
3984
3985 const VkPipelineColorBlendStateCreateInfo *vkblend =
3986 radv_pipeline_get_color_blend_state(pCreateInfo);
3987 if (vkblend) {
3988 for (unsigned i = 0; i < subpass->color_count; i++) {
3989 if (!vkblend->pAttachments[i].colorWriteMask)
3990 continue;
3991
3992 if (subpass->color_attachments[i].attachment == VK_ATTACHMENT_UNUSED)
3993 continue;
3994
3995 VkFormat format = pass->attachments[subpass->color_attachments[i].attachment].format;
3996 color_bytes_per_pixel += vk_format_get_blocksize(format);
3997 }
3998
3999 /* MSAA images typically don't use all samples all the time. */
4000 if (effective_samples >= 2 && ps_iter_samples <= 1)
4001 effective_samples = 2;
4002 color_bytes_per_pixel *= effective_samples;
4003 }
4004
4005 const struct radv_bin_size_entry *color_entry = color_size_table[log_num_rb_per_se][log_num_se];
4006 while (color_entry[1].bpp <= color_bytes_per_pixel)
4007 ++color_entry;
4008
4009 extent = color_entry->extent;
4010
4011 if (subpass->depth_stencil_attachment) {
4012 struct radv_render_pass_attachment *attachment =
4013 pass->attachments + subpass->depth_stencil_attachment->attachment;
4014
4015 /* Coefficients taken from AMDVLK */
4016 unsigned depth_coeff = vk_format_has_depth(attachment->format) ? 5 : 0;
4017 unsigned stencil_coeff = vk_format_has_stencil(attachment->format) ? 1 : 0;
4018 unsigned ds_bytes_per_pixel = 4 * (depth_coeff + stencil_coeff) * total_samples;
4019
4020 const struct radv_bin_size_entry *ds_entry = ds_size_table[log_num_rb_per_se][log_num_se];
4021 while (ds_entry[1].bpp <= ds_bytes_per_pixel)
4022 ++ds_entry;
4023
4024 if (ds_entry->extent.width * ds_entry->extent.height < extent.width * extent.height)
4025 extent = ds_entry->extent;
4026 }
4027
4028 return extent;
4029 }
4030
4031 static VkExtent2D
4032 radv_gfx10_compute_bin_size(const struct radv_pipeline *pipeline,
4033 const VkGraphicsPipelineCreateInfo *pCreateInfo)
4034 {
4035 RADV_FROM_HANDLE(radv_render_pass, pass, pCreateInfo->renderPass);
4036 struct radv_subpass *subpass = pass->subpasses + pCreateInfo->subpass;
4037 VkExtent2D extent = {512, 512};
4038
4039 const unsigned db_tag_size = 64;
4040 const unsigned db_tag_count = 312;
4041 const unsigned color_tag_size = 1024;
4042 const unsigned color_tag_count = 31;
4043 const unsigned fmask_tag_size = 256;
4044 const unsigned fmask_tag_count = 44;
4045
4046 const unsigned rb_count = pipeline->device->physical_device->rad_info.max_render_backends;
4047 const unsigned pipe_count =
4048 MAX2(rb_count, pipeline->device->physical_device->rad_info.num_tcc_blocks);
4049
4050 const unsigned db_tag_part = (db_tag_count * rb_count / pipe_count) * db_tag_size * pipe_count;
4051 const unsigned color_tag_part =
4052 (color_tag_count * rb_count / pipe_count) * color_tag_size * pipe_count;
4053 const unsigned fmask_tag_part =
4054 (fmask_tag_count * rb_count / pipe_count) * fmask_tag_size * pipe_count;
4055
4056 const unsigned total_samples =
4057 1u << G_028BE0_MSAA_NUM_SAMPLES(pipeline->graphics.ms.pa_sc_aa_config);
4058 const unsigned samples_log = util_logbase2_ceil(total_samples);
4059
4060 unsigned color_bytes_per_pixel = 0;
4061 unsigned fmask_bytes_per_pixel = 0;
4062
4063 const VkPipelineColorBlendStateCreateInfo *vkblend =
4064 radv_pipeline_get_color_blend_state(pCreateInfo);
4065 if (vkblend) {
4066 for (unsigned i = 0; i < subpass->color_count; i++) {
4067 if (!vkblend->pAttachments[i].colorWriteMask)
4068 continue;
4069
4070 if (subpass->color_attachments[i].attachment == VK_ATTACHMENT_UNUSED)
4071 continue;
4072
4073 VkFormat format = pass->attachments[subpass->color_attachments[i].attachment].format;
4074 color_bytes_per_pixel += vk_format_get_blocksize(format);
4075
4076 if (total_samples > 1) {
4077 assert(samples_log <= 3);
4078 const unsigned fmask_array[] = {0, 1, 1, 4};
4079 fmask_bytes_per_pixel += fmask_array[samples_log];
4080 }
4081 }
4082
4083 color_bytes_per_pixel *= total_samples;
4084 }
4085 color_bytes_per_pixel = MAX2(color_bytes_per_pixel, 1);
4086
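      /* The color tags can cover color_tag_part / bpp pixels per bin; split
       * that pixel count between width and height, giving width the extra bit
       * when the exponent is odd.
       */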
4087 const unsigned color_pixel_count_log = util_logbase2(color_tag_part / color_bytes_per_pixel);
4088 extent.width = 1ull << ((color_pixel_count_log + 1) / 2);
4089 extent.height = 1ull << (color_pixel_count_log / 2);
4090
4091 if (fmask_bytes_per_pixel) {
4092 const unsigned fmask_pixel_count_log = util_logbase2(fmask_tag_part / fmask_bytes_per_pixel);
4093
4094 const VkExtent2D fmask_extent =
4095 (VkExtent2D){.width = 1ull << ((fmask_pixel_count_log + 1) / 2),
4096 .height = 1ull << (color_pixel_count_log / 2)};
4097
4098 if (fmask_extent.width * fmask_extent.height < extent.width * extent.height)
4099 extent = fmask_extent;
4100 }
4101
4102 if (subpass->depth_stencil_attachment) {
4103 struct radv_render_pass_attachment *attachment =
4104 pass->attachments + subpass->depth_stencil_attachment->attachment;
4105
4106 /* Coefficients taken from AMDVLK */
4107 unsigned depth_coeff = vk_format_has_depth(attachment->format) ? 5 : 0;
4108 unsigned stencil_coeff = vk_format_has_stencil(attachment->format) ? 1 : 0;
4109 unsigned db_bytes_per_pixel = (depth_coeff + stencil_coeff) * total_samples;
4110
4111 const unsigned db_pixel_count_log = util_logbase2(db_tag_part / db_bytes_per_pixel);
4112
4113 const VkExtent2D db_extent = (VkExtent2D){.width = 1ull << ((db_pixel_count_log + 1) / 2),
4114 .height = 1ull << (color_pixel_count_log / 2)};
4115
4116 if (db_extent.width * db_extent.height < extent.width * extent.height)
4117 extent = db_extent;
4118 }
4119
4120 extent.width = MAX2(extent.width, 128);
4121    extent.height = MAX2(extent.height, 64);
4122
4123 return extent;
4124 }
4125
4126 static void
4127 radv_pipeline_init_disabled_binning_state(struct radv_pipeline *pipeline,
4128 const VkGraphicsPipelineCreateInfo *pCreateInfo)
4129 {
4130 uint32_t pa_sc_binner_cntl_0 = S_028C44_BINNING_MODE(V_028C44_DISABLE_BINNING_USE_LEGACY_SC) |
4131 S_028C44_DISABLE_START_OF_PRIM(1);
4132
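      /* On GFX10+, the disabled-binning mode still programs a bin size: 128x128,
       * or 128x64 when the smallest written color format is wider than 32 bits
       * per pixel.
       */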
4133 if (pipeline->device->physical_device->rad_info.chip_class >= GFX10) {
4134 RADV_FROM_HANDLE(radv_render_pass, pass, pCreateInfo->renderPass);
4135 struct radv_subpass *subpass = pass->subpasses + pCreateInfo->subpass;
4136 const VkPipelineColorBlendStateCreateInfo *vkblend =
4137 radv_pipeline_get_color_blend_state(pCreateInfo);
4138 unsigned min_bytes_per_pixel = 0;
4139
4140 if (vkblend) {
4141 for (unsigned i = 0; i < subpass->color_count; i++) {
4142 if (!vkblend->pAttachments[i].colorWriteMask)
4143 continue;
4144
4145 if (subpass->color_attachments[i].attachment == VK_ATTACHMENT_UNUSED)
4146 continue;
4147
4148 VkFormat format = pass->attachments[subpass->color_attachments[i].attachment].format;
4149 unsigned bytes = vk_format_get_blocksize(format);
4150 if (!min_bytes_per_pixel || bytes < min_bytes_per_pixel)
4151 min_bytes_per_pixel = bytes;
4152 }
4153 }
4154
4155 pa_sc_binner_cntl_0 =
4156 S_028C44_BINNING_MODE(V_028C44_DISABLE_BINNING_USE_NEW_SC) | S_028C44_BIN_SIZE_X(0) |
4157 S_028C44_BIN_SIZE_Y(0) | S_028C44_BIN_SIZE_X_EXTEND(2) | /* 128 */
4158 S_028C44_BIN_SIZE_Y_EXTEND(min_bytes_per_pixel <= 4 ? 2 : 1) | /* 128 or 64 */
4159 S_028C44_DISABLE_START_OF_PRIM(1);
4160 }
4161
4162 pipeline->graphics.binning.pa_sc_binner_cntl_0 = pa_sc_binner_cntl_0;
4163 }
4164
4165 struct radv_binning_settings
4166 radv_get_binning_settings(const struct radv_physical_device *pdev)
4167 {
4168 struct radv_binning_settings settings;
4169 if (pdev->rad_info.has_dedicated_vram) {
4170 if (pdev->rad_info.max_render_backends > 4) {
4171 settings.context_states_per_bin = 1;
4172 settings.persistent_states_per_bin = 1;
4173 } else {
4174 settings.context_states_per_bin = 3;
4175 settings.persistent_states_per_bin = 8;
4176 }
4177 settings.fpovs_per_batch = 63;
4178 } else {
4179 /* The context states are affected by the scissor bug. */
4180 settings.context_states_per_bin = 6;
4181 /* 32 causes hangs for RAVEN. */
4182 settings.persistent_states_per_bin = 16;
4183 settings.fpovs_per_batch = 63;
4184 }
4185
4186 if (pdev->rad_info.has_gfx9_scissor_bug)
4187 settings.context_states_per_bin = 1;
4188
4189 return settings;
4190 }
4191
4192 static void
4193 radv_pipeline_init_binning_state(struct radv_pipeline *pipeline,
4194 const VkGraphicsPipelineCreateInfo *pCreateInfo,
4195 const struct radv_blend_state *blend)
4196 {
4197 if (pipeline->device->physical_device->rad_info.chip_class < GFX9)
4198 return;
4199
4200 VkExtent2D bin_size;
4201 if (pipeline->device->physical_device->rad_info.chip_class >= GFX10) {
4202 bin_size = radv_gfx10_compute_bin_size(pipeline, pCreateInfo);
4203 } else if (pipeline->device->physical_device->rad_info.chip_class == GFX9) {
4204 bin_size = radv_gfx9_compute_bin_size(pipeline, pCreateInfo);
4205 } else
4206 unreachable("Unhandled generation for binning bin size calculation");
4207
4208 if (pipeline->device->pbb_allowed && bin_size.width && bin_size.height) {
4209 struct radv_binning_settings settings =
4210 radv_get_binning_settings(pipeline->device->physical_device);
4211
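      /* Bin sizes are encoded either as the dedicated 16-pixel flag
       * (BIN_SIZE_X/Y) or as log2(size) - 5 in the *_EXTEND fields for sizes
       * of 32 and up, e.g. a 128x64 bin is BIN_SIZE_X_EXTEND(2) and
       * BIN_SIZE_Y_EXTEND(1).
       */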
4212 const uint32_t pa_sc_binner_cntl_0 =
4213 S_028C44_BINNING_MODE(V_028C44_BINNING_ALLOWED) |
4214 S_028C44_BIN_SIZE_X(bin_size.width == 16) | S_028C44_BIN_SIZE_Y(bin_size.height == 16) |
4215 S_028C44_BIN_SIZE_X_EXTEND(util_logbase2(MAX2(bin_size.width, 32)) - 5) |
4216 S_028C44_BIN_SIZE_Y_EXTEND(util_logbase2(MAX2(bin_size.height, 32)) - 5) |
4217 S_028C44_CONTEXT_STATES_PER_BIN(settings.context_states_per_bin - 1) |
4218 S_028C44_PERSISTENT_STATES_PER_BIN(settings.persistent_states_per_bin - 1) |
4219 S_028C44_DISABLE_START_OF_PRIM(1) |
4220 S_028C44_FPOVS_PER_BATCH(settings.fpovs_per_batch) | S_028C44_OPTIMAL_BIN_SELECTION(1);
4221
4222 pipeline->graphics.binning.pa_sc_binner_cntl_0 = pa_sc_binner_cntl_0;
4223 } else
4224 radv_pipeline_init_disabled_binning_state(pipeline, pCreateInfo);
4225 }
4226
4227 static void
4228 radv_pipeline_generate_depth_stencil_state(struct radeon_cmdbuf *ctx_cs,
4229 const struct radv_pipeline *pipeline,
4230 const VkGraphicsPipelineCreateInfo *pCreateInfo,
4231 const struct radv_graphics_pipeline_create_info *extra)
4232 {
4233 const VkPipelineDepthStencilStateCreateInfo *vkds =
4234 radv_pipeline_get_depth_stencil_state(pCreateInfo);
4235 RADV_FROM_HANDLE(radv_render_pass, pass, pCreateInfo->renderPass);
4236 struct radv_subpass *subpass = pass->subpasses + pCreateInfo->subpass;
4237 struct radv_shader_variant *ps = pipeline->shaders[MESA_SHADER_FRAGMENT];
4238 struct radv_render_pass_attachment *attachment = NULL;
4239 uint32_t db_render_control = 0, db_render_override2 = 0;
4240 uint32_t db_render_override = 0;
4241
4242 if (subpass->depth_stencil_attachment)
4243 attachment = pass->attachments + subpass->depth_stencil_attachment->attachment;
4244
4245 bool has_depth_attachment = attachment && vk_format_has_depth(attachment->format);
4246
4247 if (vkds && has_depth_attachment) {
4248       /* From amdvlk: for 4xAA and 8xAA, decompress Z on flush for better performance. */
4249 db_render_override2 |= S_028010_DECOMPRESS_Z_ON_FLUSH(attachment->samples > 2);
4250
4251 if (pipeline->device->physical_device->rad_info.chip_class >= GFX10_3)
4252 db_render_override2 |= S_028010_CENTROID_COMPUTATION_MODE(1);
4253 }
4254
4255 if (attachment && extra) {
4256 db_render_control |= S_028000_DEPTH_CLEAR_ENABLE(extra->db_depth_clear);
4257 db_render_control |= S_028000_STENCIL_CLEAR_ENABLE(extra->db_stencil_clear);
4258
4259 db_render_control |= S_028000_RESUMMARIZE_ENABLE(extra->resummarize_enable);
4260 db_render_control |= S_028000_DEPTH_COMPRESS_DISABLE(extra->depth_compress_disable);
4261 db_render_control |= S_028000_STENCIL_COMPRESS_DISABLE(extra->stencil_compress_disable);
4262 }
4263
4264 db_render_override |= S_02800C_FORCE_HIS_ENABLE0(V_02800C_FORCE_DISABLE) |
4265 S_02800C_FORCE_HIS_ENABLE1(V_02800C_FORCE_DISABLE);
4266
4267 if (!pCreateInfo->pRasterizationState->depthClampEnable && ps->info.ps.writes_z) {
4268 /* From VK_EXT_depth_range_unrestricted spec:
4269 *
4270 * "The behavior described in Primitive Clipping still applies.
4271 * If depth clamping is disabled the depth values are still
4272 * clipped to 0 ≤ zc ≤ wc before the viewport transform. If
4273 * depth clamping is enabled the above equation is ignored and
4274 * the depth values are instead clamped to the VkViewport
4275 * minDepth and maxDepth values, which in the case of this
4276 * extension can be outside of the 0.0 to 1.0 range."
4277 */
4278 db_render_override |= S_02800C_DISABLE_VIEWPORT_CLAMP(1);
4279 }
4280
4281 radeon_set_context_reg(ctx_cs, R_028000_DB_RENDER_CONTROL, db_render_control);
4282
4283 radeon_set_context_reg_seq(ctx_cs, R_02800C_DB_RENDER_OVERRIDE, 2);
4284 radeon_emit(ctx_cs, db_render_override);
4285 radeon_emit(ctx_cs, db_render_override2);
4286 }
4287
4288 static void
4289 radv_pipeline_generate_blend_state(struct radeon_cmdbuf *ctx_cs,
4290 const struct radv_pipeline *pipeline,
4291 const struct radv_blend_state *blend)
4292 {
4293 radeon_set_context_reg_seq(ctx_cs, R_028780_CB_BLEND0_CONTROL, 8);
4294 radeon_emit_array(ctx_cs, blend->cb_blend_control, 8);
4295 radeon_set_context_reg(ctx_cs, R_028B70_DB_ALPHA_TO_MASK, blend->db_alpha_to_mask);
4296
4297 if (pipeline->device->physical_device->rad_info.has_rbplus) {
4298
4299 radeon_set_context_reg_seq(ctx_cs, R_028760_SX_MRT0_BLEND_OPT, 8);
4300 radeon_emit_array(ctx_cs, blend->sx_mrt_blend_opt, 8);
4301 }
4302
4303 radeon_set_context_reg(ctx_cs, R_028714_SPI_SHADER_COL_FORMAT, blend->spi_shader_col_format);
4304
4305 radeon_set_context_reg(ctx_cs, R_02823C_CB_SHADER_MASK, blend->cb_shader_mask);
4306 }
4307
4308 static void
4309 radv_pipeline_generate_raster_state(struct radeon_cmdbuf *ctx_cs,
4310 const struct radv_pipeline *pipeline,
4311 const VkGraphicsPipelineCreateInfo *pCreateInfo)
4312 {
4313 const VkPipelineRasterizationStateCreateInfo *vkraster = pCreateInfo->pRasterizationState;
4314 const VkConservativeRasterizationModeEXT mode = radv_get_conservative_raster_mode(vkraster);
4315 uint32_t pa_sc_conservative_rast = S_028C4C_NULL_SQUAD_AA_MASK_ENABLE(1);
4316
4317 if (pipeline->device->physical_device->rad_info.chip_class >= GFX9) {
4318 /* Conservative rasterization. */
4319 if (mode != VK_CONSERVATIVE_RASTERIZATION_MODE_DISABLED_EXT) {
4320 pa_sc_conservative_rast = S_028C4C_PREZ_AA_MASK_ENABLE(1) | S_028C4C_POSTZ_AA_MASK_ENABLE(1) |
4321 S_028C4C_CENTROID_SAMPLE_OVERRIDE(1);
4322
4323 if (mode == VK_CONSERVATIVE_RASTERIZATION_MODE_OVERESTIMATE_EXT) {
4324 pa_sc_conservative_rast |=
4325 S_028C4C_OVER_RAST_ENABLE(1) | S_028C4C_OVER_RAST_SAMPLE_SELECT(0) |
4326 S_028C4C_UNDER_RAST_ENABLE(0) | S_028C4C_UNDER_RAST_SAMPLE_SELECT(1) |
4327 S_028C4C_PBB_UNCERTAINTY_REGION_ENABLE(1);
4328 } else {
4329 assert(mode == VK_CONSERVATIVE_RASTERIZATION_MODE_UNDERESTIMATE_EXT);
4330 pa_sc_conservative_rast |=
4331 S_028C4C_OVER_RAST_ENABLE(0) | S_028C4C_OVER_RAST_SAMPLE_SELECT(1) |
4332 S_028C4C_UNDER_RAST_ENABLE(1) | S_028C4C_UNDER_RAST_SAMPLE_SELECT(0) |
4333 S_028C4C_PBB_UNCERTAINTY_REGION_ENABLE(0);
4334 }
4335 }
4336
4337 radeon_set_context_reg(ctx_cs, R_028C4C_PA_SC_CONSERVATIVE_RASTERIZATION_CNTL,
4338 pa_sc_conservative_rast);
4339 }
4340 }
4341
4342 static void
4343 radv_pipeline_generate_multisample_state(struct radeon_cmdbuf *ctx_cs,
4344 const struct radv_pipeline *pipeline)
4345 {
4346 const struct radv_multisample_state *ms = &pipeline->graphics.ms;
4347
4348 radeon_set_context_reg_seq(ctx_cs, R_028C38_PA_SC_AA_MASK_X0Y0_X1Y0, 2);
4349 radeon_emit(ctx_cs, ms->pa_sc_aa_mask[0]);
4350 radeon_emit(ctx_cs, ms->pa_sc_aa_mask[1]);
4351
4352 radeon_set_context_reg(ctx_cs, R_028804_DB_EQAA, ms->db_eqaa);
4353 radeon_set_context_reg(ctx_cs, R_028BE0_PA_SC_AA_CONFIG, ms->pa_sc_aa_config);
4354
4355 radeon_set_context_reg_seq(ctx_cs, R_028A48_PA_SC_MODE_CNTL_0, 2);
4356 radeon_emit(ctx_cs, ms->pa_sc_mode_cntl_0);
4357 radeon_emit(ctx_cs, ms->pa_sc_mode_cntl_1);
4358
4359 /* The exclusion bits can be set to improve rasterization efficiency
4360 * if no sample lies on the pixel boundary (-8 sample offset). It's
4361 * currently always TRUE because the driver doesn't support 16 samples.
4362 */
4363 bool exclusion = pipeline->device->physical_device->rad_info.chip_class >= GFX7;
4364 radeon_set_context_reg(
4365 ctx_cs, R_02882C_PA_SU_PRIM_FILTER_CNTL,
4366 S_02882C_XMAX_RIGHT_EXCLUSION(exclusion) | S_02882C_YMAX_BOTTOM_EXCLUSION(exclusion));
4367 }
4368
4369 static void
4370 radv_pipeline_generate_vgt_gs_mode(struct radeon_cmdbuf *ctx_cs,
4371 const struct radv_pipeline *pipeline)
4372 {
4373 const struct radv_vs_output_info *outinfo = get_vs_output_info(pipeline);
4374 const struct radv_shader_variant *vs = pipeline->shaders[MESA_SHADER_TESS_EVAL]
4375 ? pipeline->shaders[MESA_SHADER_TESS_EVAL]
4376 : pipeline->shaders[MESA_SHADER_VERTEX];
4377 unsigned vgt_primitiveid_en = 0;
4378 uint32_t vgt_gs_mode = 0;
4379
4380 if (radv_pipeline_has_ngg(pipeline))
4381 return;
4382
4383 if (radv_pipeline_has_gs(pipeline)) {
4384 const struct radv_shader_variant *gs = pipeline->shaders[MESA_SHADER_GEOMETRY];
4385
4386 vgt_gs_mode = ac_vgt_gs_mode(gs->info.gs.vertices_out,
4387 pipeline->device->physical_device->rad_info.chip_class);
4388 } else if (outinfo->export_prim_id || vs->info.uses_prim_id) {
4389 vgt_gs_mode = S_028A40_MODE(V_028A40_GS_SCENARIO_A);
4390 vgt_primitiveid_en |= S_028A84_PRIMITIVEID_EN(1);
4391 }
4392
4393 radeon_set_context_reg(ctx_cs, R_028A84_VGT_PRIMITIVEID_EN, vgt_primitiveid_en);
4394 radeon_set_context_reg(ctx_cs, R_028A40_VGT_GS_MODE, vgt_gs_mode);
4395 }
4396
4397 static void
4398 radv_pipeline_generate_hw_vs(struct radeon_cmdbuf *ctx_cs, struct radeon_cmdbuf *cs,
4399 const struct radv_pipeline *pipeline,
4400 const struct radv_shader_variant *shader)
4401 {
4402 uint64_t va = radv_shader_variant_get_va(shader);
4403
4404 radeon_set_sh_reg_seq(cs, R_00B120_SPI_SHADER_PGM_LO_VS, 4);
4405 radeon_emit(cs, va >> 8);
4406 radeon_emit(cs, S_00B124_MEM_BASE(va >> 40));
4407 radeon_emit(cs, shader->config.rsrc1);
4408 radeon_emit(cs, shader->config.rsrc2);
4409
4410 const struct radv_vs_output_info *outinfo = get_vs_output_info(pipeline);
4411 unsigned clip_dist_mask, cull_dist_mask, total_mask;
4412 clip_dist_mask = outinfo->clip_dist_mask;
4413 cull_dist_mask = outinfo->cull_dist_mask;
4414 total_mask = clip_dist_mask | cull_dist_mask;
4415
4416 bool writes_primitive_shading_rate =
4417 outinfo->writes_primitive_shading_rate || pipeline->device->force_vrs != RADV_FORCE_VRS_NONE;
4418 bool misc_vec_ena = outinfo->writes_pointsize || outinfo->writes_layer ||
4419 outinfo->writes_viewport_index || writes_primitive_shading_rate;
4420 unsigned spi_vs_out_config, nparams;
4421
4422 /* VS is required to export at least one param. */
4423 nparams = MAX2(outinfo->param_exports, 1);
4424 spi_vs_out_config = S_0286C4_VS_EXPORT_COUNT(nparams - 1);
4425
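   /* On GFX10, NO_PC_EXPORT signals that no parameter-cache space is needed
    * when the shader exports no parameters.
    */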
4426 if (pipeline->device->physical_device->rad_info.chip_class >= GFX10) {
4427 spi_vs_out_config |= S_0286C4_NO_PC_EXPORT(outinfo->param_exports == 0);
4428 }
4429
4430 radeon_set_context_reg(ctx_cs, R_0286C4_SPI_VS_OUT_CONFIG, spi_vs_out_config);
4431
4432 radeon_set_context_reg(
4433 ctx_cs, R_02870C_SPI_SHADER_POS_FORMAT,
4434 S_02870C_POS0_EXPORT_FORMAT(V_02870C_SPI_SHADER_4COMP) |
4435 S_02870C_POS1_EXPORT_FORMAT(outinfo->pos_exports > 1 ? V_02870C_SPI_SHADER_4COMP
4436 : V_02870C_SPI_SHADER_NONE) |
4437 S_02870C_POS2_EXPORT_FORMAT(outinfo->pos_exports > 2 ? V_02870C_SPI_SHADER_4COMP
4438 : V_02870C_SPI_SHADER_NONE) |
4439 S_02870C_POS3_EXPORT_FORMAT(outinfo->pos_exports > 3 ? V_02870C_SPI_SHADER_4COMP
4440 : V_02870C_SPI_SHADER_NONE));
4441
4442 radeon_set_context_reg(ctx_cs, R_02881C_PA_CL_VS_OUT_CNTL,
4443 S_02881C_USE_VTX_POINT_SIZE(outinfo->writes_pointsize) |
4444 S_02881C_USE_VTX_RENDER_TARGET_INDX(outinfo->writes_layer) |
4445 S_02881C_USE_VTX_VIEWPORT_INDX(outinfo->writes_viewport_index) |
4446 S_02881C_USE_VTX_VRS_RATE(writes_primitive_shading_rate) |
4447 S_02881C_VS_OUT_MISC_VEC_ENA(misc_vec_ena) |
4448 S_02881C_VS_OUT_MISC_SIDE_BUS_ENA(misc_vec_ena) |
4449 S_02881C_VS_OUT_CCDIST0_VEC_ENA((total_mask & 0x0f) != 0) |
4450 S_02881C_VS_OUT_CCDIST1_VEC_ENA((total_mask & 0xf0) != 0) |
4451 cull_dist_mask << 8 | clip_dist_mask);
4452
4453 if (pipeline->device->physical_device->rad_info.chip_class <= GFX8)
4454 radeon_set_context_reg(ctx_cs, R_028AB4_VGT_REUSE_OFF, outinfo->writes_viewport_index);
4455
4456 unsigned late_alloc_wave64, cu_mask;
4457 ac_compute_late_alloc(&pipeline->device->physical_device->rad_info, false, false,
4458 shader->config.scratch_bytes_per_wave > 0, &late_alloc_wave64, &cu_mask);
4459
4460 if (pipeline->device->physical_device->rad_info.chip_class >= GFX7) {
4461 radeon_set_sh_reg_idx(pipeline->device->physical_device, cs, R_00B118_SPI_SHADER_PGM_RSRC3_VS, 3,
4462 S_00B118_CU_EN(cu_mask) | S_00B118_WAVE_LIMIT(0x3F));
4463 radeon_set_sh_reg(cs, R_00B11C_SPI_SHADER_LATE_ALLOC_VS, S_00B11C_LIMIT(late_alloc_wave64));
4464 }
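   /* Oversubscribe the parameter cache (a quarter of the available lines)
    * only while VS late alloc is in use.
    */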
4465 if (pipeline->device->physical_device->rad_info.chip_class >= GFX10) {
4466 uint32_t oversub_pc_lines = late_alloc_wave64 ? pipeline->device->physical_device->rad_info.pc_lines / 4 : 0;
4467 gfx10_emit_ge_pc_alloc(cs, pipeline->device->physical_device->rad_info.chip_class, oversub_pc_lines);
4468 }
4469 }
4470
4471 static void
4472 radv_pipeline_generate_hw_es(struct radeon_cmdbuf *cs, const struct radv_pipeline *pipeline,
4473 const struct radv_shader_variant *shader)
4474 {
4475 uint64_t va = radv_shader_variant_get_va(shader);
4476
4477 radeon_set_sh_reg_seq(cs, R_00B320_SPI_SHADER_PGM_LO_ES, 4);
4478 radeon_emit(cs, va >> 8);
4479 radeon_emit(cs, S_00B324_MEM_BASE(va >> 40));
4480 radeon_emit(cs, shader->config.rsrc1);
4481 radeon_emit(cs, shader->config.rsrc2);
4482 }
4483
4484 static void
4485 radv_pipeline_generate_hw_ls(struct radeon_cmdbuf *cs, const struct radv_pipeline *pipeline,
4486 const struct radv_shader_variant *shader)
4487 {
4488 unsigned num_lds_blocks = pipeline->shaders[MESA_SHADER_TESS_CTRL]->info.tcs.num_lds_blocks;
4489 uint64_t va = radv_shader_variant_get_va(shader);
4490 uint32_t rsrc2 = shader->config.rsrc2;
4491
4492 radeon_set_sh_reg(cs, R_00B520_SPI_SHADER_PGM_LO_LS, va >> 8);
4493
4494 rsrc2 |= S_00B52C_LDS_SIZE(num_lds_blocks);
4495 if (pipeline->device->physical_device->rad_info.chip_class == GFX7 &&
4496 pipeline->device->physical_device->rad_info.family != CHIP_HAWAII)
4497 radeon_set_sh_reg(cs, R_00B52C_SPI_SHADER_PGM_RSRC2_LS, rsrc2);
4498
4499 radeon_set_sh_reg_seq(cs, R_00B528_SPI_SHADER_PGM_RSRC1_LS, 2);
4500 radeon_emit(cs, shader->config.rsrc1);
4501 radeon_emit(cs, rsrc2);
4502 }
4503
4504 static void
4505 radv_pipeline_generate_hw_ngg(struct radeon_cmdbuf *ctx_cs, struct radeon_cmdbuf *cs,
4506 const struct radv_pipeline *pipeline,
4507 const struct radv_shader_variant *shader)
4508 {
4509 uint64_t va = radv_shader_variant_get_va(shader);
4510 gl_shader_stage es_type =
4511 radv_pipeline_has_tess(pipeline) ? MESA_SHADER_TESS_EVAL : MESA_SHADER_VERTEX;
4512 struct radv_shader_variant *es = es_type == MESA_SHADER_TESS_EVAL
4513 ? pipeline->shaders[MESA_SHADER_TESS_EVAL]
4514 : pipeline->shaders[MESA_SHADER_VERTEX];
4515 const struct gfx10_ngg_info *ngg_state = &shader->info.ngg_info;
4516
4517 radeon_set_sh_reg(cs, R_00B320_SPI_SHADER_PGM_LO_ES, va >> 8);
4518
4519 radeon_set_sh_reg_seq(cs, R_00B228_SPI_SHADER_PGM_RSRC1_GS, 2);
4520 radeon_emit(cs, shader->config.rsrc1);
4521 radeon_emit(cs, shader->config.rsrc2);
4522
4523 const struct radv_vs_output_info *outinfo = get_vs_output_info(pipeline);
4524 unsigned clip_dist_mask, cull_dist_mask, total_mask;
4525 clip_dist_mask = outinfo->clip_dist_mask;
4526 cull_dist_mask = outinfo->cull_dist_mask;
4527 total_mask = clip_dist_mask | cull_dist_mask;
4528
4529 bool writes_primitive_shading_rate =
4530 outinfo->writes_primitive_shading_rate || pipeline->device->force_vrs != RADV_FORCE_VRS_NONE;
4531 bool misc_vec_ena = outinfo->writes_pointsize || outinfo->writes_layer ||
4532 outinfo->writes_viewport_index || writes_primitive_shading_rate;
4533 bool es_enable_prim_id = outinfo->export_prim_id || (es && es->info.uses_prim_id);
4534 bool break_wave_at_eoi = false;
4535 unsigned ge_cntl;
4536 unsigned nparams;
4537
4538 if (es_type == MESA_SHADER_TESS_EVAL) {
4539 struct radv_shader_variant *gs = pipeline->shaders[MESA_SHADER_GEOMETRY];
4540
4541 if (es_enable_prim_id || (gs && gs->info.uses_prim_id))
4542 break_wave_at_eoi = true;
4543 }
4544
4545 nparams = MAX2(outinfo->param_exports, 1);
4546 radeon_set_context_reg(
4547 ctx_cs, R_0286C4_SPI_VS_OUT_CONFIG,
4548 S_0286C4_VS_EXPORT_COUNT(nparams - 1) | S_0286C4_NO_PC_EXPORT(outinfo->param_exports == 0));
4549
4550 radeon_set_context_reg(ctx_cs, R_028708_SPI_SHADER_IDX_FORMAT,
4551 S_028708_IDX0_EXPORT_FORMAT(V_028708_SPI_SHADER_1COMP));
4552 radeon_set_context_reg(
4553 ctx_cs, R_02870C_SPI_SHADER_POS_FORMAT,
4554 S_02870C_POS0_EXPORT_FORMAT(V_02870C_SPI_SHADER_4COMP) |
4555 S_02870C_POS1_EXPORT_FORMAT(outinfo->pos_exports > 1 ? V_02870C_SPI_SHADER_4COMP
4556 : V_02870C_SPI_SHADER_NONE) |
4557 S_02870C_POS2_EXPORT_FORMAT(outinfo->pos_exports > 2 ? V_02870C_SPI_SHADER_4COMP
4558 : V_02870C_SPI_SHADER_NONE) |
4559 S_02870C_POS3_EXPORT_FORMAT(outinfo->pos_exports > 3 ? V_02870C_SPI_SHADER_4COMP
4560 : V_02870C_SPI_SHADER_NONE));
4561
4562 radeon_set_context_reg(ctx_cs, R_02881C_PA_CL_VS_OUT_CNTL,
4563 S_02881C_USE_VTX_POINT_SIZE(outinfo->writes_pointsize) |
4564 S_02881C_USE_VTX_RENDER_TARGET_INDX(outinfo->writes_layer) |
4565 S_02881C_USE_VTX_VIEWPORT_INDX(outinfo->writes_viewport_index) |
4566 S_02881C_USE_VTX_VRS_RATE(writes_primitive_shading_rate) |
4567 S_02881C_VS_OUT_MISC_VEC_ENA(misc_vec_ena) |
4568 S_02881C_VS_OUT_MISC_SIDE_BUS_ENA(misc_vec_ena) |
4569 S_02881C_VS_OUT_CCDIST0_VEC_ENA((total_mask & 0x0f) != 0) |
4570 S_02881C_VS_OUT_CCDIST1_VEC_ENA((total_mask & 0xf0) != 0) |
4571 cull_dist_mask << 8 | clip_dist_mask);
4572
4573 radeon_set_context_reg(ctx_cs, R_028A84_VGT_PRIMITIVEID_EN,
4574 S_028A84_PRIMITIVEID_EN(es_enable_prim_id) |
4575 S_028A84_NGG_DISABLE_PROVOK_REUSE(outinfo->export_prim_id));
4576
4577 radeon_set_context_reg(ctx_cs, R_028AAC_VGT_ESGS_RING_ITEMSIZE,
4578 ngg_state->vgt_esgs_ring_itemsize);
4579
4580 /* NGG specific registers. */
4581 struct radv_shader_variant *gs = pipeline->shaders[MESA_SHADER_GEOMETRY];
4582 uint32_t gs_num_invocations = gs ? gs->info.gs.invocations : 1;
4583
4584 radeon_set_context_reg(
4585 ctx_cs, R_028A44_VGT_GS_ONCHIP_CNTL,
4586 S_028A44_ES_VERTS_PER_SUBGRP(ngg_state->hw_max_esverts) |
4587 S_028A44_GS_PRIMS_PER_SUBGRP(ngg_state->max_gsprims) |
4588 S_028A44_GS_INST_PRIMS_IN_SUBGRP(ngg_state->max_gsprims * gs_num_invocations));
4589 radeon_set_context_reg(ctx_cs, R_0287FC_GE_MAX_OUTPUT_PER_SUBGROUP,
4590 S_0287FC_MAX_VERTS_PER_SUBGROUP(ngg_state->max_out_verts));
4591 radeon_set_context_reg(ctx_cs, R_028B4C_GE_NGG_SUBGRP_CNTL,
4592 S_028B4C_PRIM_AMP_FACTOR(ngg_state->prim_amp_factor) |
4593 S_028B4C_THDS_PER_SUBGRP(0)); /* for fast launch */
4594 radeon_set_context_reg(
4595 ctx_cs, R_028B90_VGT_GS_INSTANCE_CNT,
4596 S_028B90_CNT(gs_num_invocations) | S_028B90_ENABLE(gs_num_invocations > 1) |
4597 S_028B90_EN_MAX_VERT_OUT_PER_GS_INSTANCE(ngg_state->max_vert_out_per_gs_instance));
4598
4599 ge_cntl = S_03096C_PRIM_GRP_SIZE(ngg_state->max_gsprims) |
4600 S_03096C_VERT_GRP_SIZE(ngg_state->enable_vertex_grouping ? ngg_state->hw_max_esverts : 256) | /* 256 = disable vertex grouping */
4601 S_03096C_BREAK_WAVE_AT_EOI(break_wave_at_eoi);
4602
4603 /* Bug workaround for a possible hang with non-tessellation cases.
4604 * Tessellation always sets GE_CNTL.VERT_GRP_SIZE = 0
4605 *
4606 * Requirement: GE_CNTL.VERT_GRP_SIZE = VGT_GS_ONCHIP_CNTL.ES_VERTS_PER_SUBGRP - 5
4607 */
4608 if (pipeline->device->physical_device->rad_info.chip_class == GFX10 &&
4609 !radv_pipeline_has_tess(pipeline) && ngg_state->hw_max_esverts != 256) {
4610 ge_cntl &= C_03096C_VERT_GRP_SIZE;
4611
4612 if (ngg_state->hw_max_esverts > 5) {
4613 ge_cntl |= S_03096C_VERT_GRP_SIZE(ngg_state->hw_max_esverts - 5);
4614 }
4615 }
4616
4617 radeon_set_uconfig_reg(ctx_cs, R_03096C_GE_CNTL, ge_cntl);
4618
4619 unsigned late_alloc_wave64, cu_mask;
4620 ac_compute_late_alloc(&pipeline->device->physical_device->rad_info, true, shader->info.has_ngg_culling,
4621 shader->config.scratch_bytes_per_wave > 0, &late_alloc_wave64, &cu_mask);
4622
4623 radeon_set_sh_reg_idx(
4624 pipeline->device->physical_device, cs, R_00B21C_SPI_SHADER_PGM_RSRC3_GS, 3,
4625 S_00B21C_CU_EN(cu_mask) | S_00B21C_WAVE_LIMIT(0x3F));
4626 radeon_set_sh_reg_idx(
4627 pipeline->device->physical_device, cs, R_00B204_SPI_SHADER_PGM_RSRC4_GS, 3,
4628 S_00B204_CU_EN(0xffff) | S_00B204_SPI_SHADER_LATE_ALLOC_GS_GFX10(late_alloc_wave64));
4629
4630 uint32_t oversub_pc_lines = late_alloc_wave64 ? pipeline->device->physical_device->rad_info.pc_lines / 4 : 0;
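   /* NGG culling gets extra parameter-cache oversubscription, scaled with
    * how many parameters the shader exports.
    */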
4631 if (shader->info.has_ngg_culling) {
4632 unsigned oversub_factor = 2;
4633
4634 if (outinfo->param_exports > 4)
4635 oversub_factor = 4;
4636 else if (outinfo->param_exports > 2)
4637 oversub_factor = 3;
4638
4639 oversub_pc_lines *= oversub_factor;
4640 }
4641
4642 gfx10_emit_ge_pc_alloc(cs, pipeline->device->physical_device->rad_info.chip_class, oversub_pc_lines);
4643 }
4644
4645 static void
4646 radv_pipeline_generate_hw_hs(struct radeon_cmdbuf *cs, const struct radv_pipeline *pipeline,
4647 const struct radv_shader_variant *shader)
4648 {
4649 uint64_t va = radv_shader_variant_get_va(shader);
4650
4651 if (pipeline->device->physical_device->rad_info.chip_class >= GFX9) {
4652 if (pipeline->device->physical_device->rad_info.chip_class >= GFX10) {
4653 radeon_set_sh_reg(cs, R_00B520_SPI_SHADER_PGM_LO_LS, va >> 8);
4654 } else {
4655 radeon_set_sh_reg(cs, R_00B410_SPI_SHADER_PGM_LO_LS, va >> 8);
4656 }
4657
4658 radeon_set_sh_reg_seq(cs, R_00B428_SPI_SHADER_PGM_RSRC1_HS, 2);
4659 radeon_emit(cs, shader->config.rsrc1);
4660 radeon_emit(cs, shader->config.rsrc2);
4661 } else {
4662 radeon_set_sh_reg_seq(cs, R_00B420_SPI_SHADER_PGM_LO_HS, 4);
4663 radeon_emit(cs, va >> 8);
4664 radeon_emit(cs, S_00B424_MEM_BASE(va >> 40));
4665 radeon_emit(cs, shader->config.rsrc1);
4666 radeon_emit(cs, shader->config.rsrc2);
4667 }
4668 }
4669
4670 static void
4671 radv_pipeline_generate_vertex_shader(struct radeon_cmdbuf *ctx_cs, struct radeon_cmdbuf *cs,
4672 const struct radv_pipeline *pipeline)
4673 {
4674 struct radv_shader_variant *vs;
4675
4676 /* Skip shaders merged into HS/GS */
4677 vs = pipeline->shaders[MESA_SHADER_VERTEX];
4678 if (!vs)
4679 return;
4680
4681 if (vs->info.vs.as_ls)
4682 radv_pipeline_generate_hw_ls(cs, pipeline, vs);
4683 else if (vs->info.vs.as_es)
4684 radv_pipeline_generate_hw_es(cs, pipeline, vs);
4685 else if (vs->info.is_ngg)
4686 radv_pipeline_generate_hw_ngg(ctx_cs, cs, pipeline, vs);
4687 else
4688 radv_pipeline_generate_hw_vs(ctx_cs, cs, pipeline, vs);
4689 }
4690
4691 static void
4692 radv_pipeline_generate_tess_shaders(struct radeon_cmdbuf *ctx_cs, struct radeon_cmdbuf *cs,
4693 const struct radv_pipeline *pipeline)
4694 {
4695 struct radv_shader_variant *tes, *tcs;
4696
4697 tcs = pipeline->shaders[MESA_SHADER_TESS_CTRL];
4698 tes = pipeline->shaders[MESA_SHADER_TESS_EVAL];
4699
4700 if (tes) {
4701 if (tes->info.is_ngg) {
4702 radv_pipeline_generate_hw_ngg(ctx_cs, cs, pipeline, tes);
4703 } else if (tes->info.tes.as_es)
4704 radv_pipeline_generate_hw_es(cs, pipeline, tes);
4705 else
4706 radv_pipeline_generate_hw_vs(ctx_cs, cs, pipeline, tes);
4707 }
4708
4709 radv_pipeline_generate_hw_hs(cs, pipeline, tcs);
4710
4711 if (pipeline->device->physical_device->rad_info.chip_class >= GFX10 &&
4712 !radv_pipeline_has_gs(pipeline) && !radv_pipeline_has_ngg(pipeline)) {
4713 radeon_set_context_reg(ctx_cs, R_028A44_VGT_GS_ONCHIP_CNTL,
4714 S_028A44_ES_VERTS_PER_SUBGRP(250) | S_028A44_GS_PRIMS_PER_SUBGRP(126) |
4715 S_028A44_GS_INST_PRIMS_IN_SUBGRP(126));
4716 }
4717 }
4718
4719 static void
4720 radv_pipeline_generate_tess_state(struct radeon_cmdbuf *ctx_cs,
4721 const struct radv_pipeline *pipeline,
4722 const VkGraphicsPipelineCreateInfo *pCreateInfo)
4723 {
4724 struct radv_shader_variant *tes = radv_get_shader(pipeline, MESA_SHADER_TESS_EVAL);
4725 unsigned type = 0, partitioning = 0, topology = 0, distribution_mode = 0;
4726 unsigned num_tcs_input_cp, num_tcs_output_cp, num_patches;
4727 unsigned ls_hs_config;
4728
4729 num_tcs_input_cp = pCreateInfo->pTessellationState->patchControlPoints;
4730 num_tcs_output_cp =
4731 pipeline->shaders[MESA_SHADER_TESS_CTRL]->info.tcs.tcs_vertices_out; // TCS VERTICES OUT
4732 num_patches = pipeline->shaders[MESA_SHADER_TESS_CTRL]->info.num_tess_patches;
4733
4734 ls_hs_config = S_028B58_NUM_PATCHES(num_patches) | S_028B58_HS_NUM_INPUT_CP(num_tcs_input_cp) |
4735 S_028B58_HS_NUM_OUTPUT_CP(num_tcs_output_cp);
4736
4737 if (pipeline->device->physical_device->rad_info.chip_class >= GFX7) {
4738 radeon_set_context_reg_idx(ctx_cs, R_028B58_VGT_LS_HS_CONFIG, 2, ls_hs_config);
4739 } else {
4740 radeon_set_context_reg(ctx_cs, R_028B58_VGT_LS_HS_CONFIG, ls_hs_config);
4741 }
4742
4743 switch (tes->info.tes.primitive_mode) {
4744 case GL_TRIANGLES:
4745 type = V_028B6C_TESS_TRIANGLE;
4746 break;
4747 case GL_QUADS:
4748 type = V_028B6C_TESS_QUAD;
4749 break;
4750 case GL_ISOLINES:
4751 type = V_028B6C_TESS_ISOLINE;
4752 break;
4753 }
4754
4755 switch (tes->info.tes.spacing) {
4756 case TESS_SPACING_EQUAL:
4757 partitioning = V_028B6C_PART_INTEGER;
4758 break;
4759 case TESS_SPACING_FRACTIONAL_ODD:
4760 partitioning = V_028B6C_PART_FRAC_ODD;
4761 break;
4762 case TESS_SPACING_FRACTIONAL_EVEN:
4763 partitioning = V_028B6C_PART_FRAC_EVEN;
4764 break;
4765 default:
4766 break;
4767 }
4768
4769 bool ccw = tes->info.tes.ccw;
4770 const VkPipelineTessellationDomainOriginStateCreateInfo *domain_origin_state =
4771 vk_find_struct_const(pCreateInfo->pTessellationState,
4772 PIPELINE_TESSELLATION_DOMAIN_ORIGIN_STATE_CREATE_INFO);
4773
4774 if (domain_origin_state &&
4775 domain_origin_state->domainOrigin != VK_TESSELLATION_DOMAIN_ORIGIN_UPPER_LEFT)
4776 ccw = !ccw;
4777
4778 if (tes->info.tes.point_mode)
4779 topology = V_028B6C_OUTPUT_POINT;
4780 else if (tes->info.tes.primitive_mode == GL_ISOLINES)
4781 topology = V_028B6C_OUTPUT_LINE;
4782 else if (ccw)
4783 topology = V_028B6C_OUTPUT_TRIANGLE_CCW;
4784 else
4785 topology = V_028B6C_OUTPUT_TRIANGLE_CW;
4786
4787 if (pipeline->device->physical_device->rad_info.has_distributed_tess) {
4788 if (pipeline->device->physical_device->rad_info.family == CHIP_FIJI ||
4789 pipeline->device->physical_device->rad_info.family >= CHIP_POLARIS10)
4790 distribution_mode = V_028B6C_TRAPEZOIDS;
4791 else
4792 distribution_mode = V_028B6C_DONUTS;
4793 } else
4794 distribution_mode = V_028B6C_NO_DIST;
4795
4796 radeon_set_context_reg(ctx_cs, R_028B6C_VGT_TF_PARAM,
4797 S_028B6C_TYPE(type) | S_028B6C_PARTITIONING(partitioning) |
4798 S_028B6C_TOPOLOGY(topology) |
4799 S_028B6C_DISTRIBUTION_MODE(distribution_mode));
4800 }
4801
4802 static void
4803 radv_pipeline_generate_hw_gs(struct radeon_cmdbuf *ctx_cs, struct radeon_cmdbuf *cs,
4804 const struct radv_pipeline *pipeline,
4805 const struct radv_shader_variant *gs)
4806 {
4807 const struct gfx9_gs_info *gs_state = &gs->info.gs_ring_info;
4808 unsigned gs_max_out_vertices;
4809 const uint8_t *num_components;
4810 uint8_t max_stream;
4811 unsigned offset;
4812 uint64_t va;
4813
4814 gs_max_out_vertices = gs->info.gs.vertices_out;
4815 max_stream = gs->info.gs.max_stream;
4816 num_components = gs->info.gs.num_stream_output_components;
4817
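   /* The GSVS ring stores the four vertex streams back to back:
    * VGT_GSVS_RING_OFFSET_1..3 are the running dword offsets where streams
    * 1..3 start (components per vertex times the maximum vertex count) and
    * VGT_GSVS_RING_ITEMSIZE is the total size.
    */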
4818 offset = num_components[0] * gs_max_out_vertices;
4819
4820 radeon_set_context_reg_seq(ctx_cs, R_028A60_VGT_GSVS_RING_OFFSET_1, 3);
4821 radeon_emit(ctx_cs, offset);
4822 if (max_stream >= 1)
4823 offset += num_components[1] * gs_max_out_vertices;
4824 radeon_emit(ctx_cs, offset);
4825 if (max_stream >= 2)
4826 offset += num_components[2] * gs_max_out_vertices;
4827 radeon_emit(ctx_cs, offset);
4828 if (max_stream >= 3)
4829 offset += num_components[3] * gs_max_out_vertices;
4830 radeon_set_context_reg(ctx_cs, R_028AB0_VGT_GSVS_RING_ITEMSIZE, offset);
4831
4832 radeon_set_context_reg_seq(ctx_cs, R_028B5C_VGT_GS_VERT_ITEMSIZE, 4);
4833 radeon_emit(ctx_cs, num_components[0]);
4834 radeon_emit(ctx_cs, (max_stream >= 1) ? num_components[1] : 0);
4835 radeon_emit(ctx_cs, (max_stream >= 2) ? num_components[2] : 0);
4836 radeon_emit(ctx_cs, (max_stream >= 3) ? num_components[3] : 0);
4837
4838 uint32_t gs_num_invocations = gs->info.gs.invocations;
4839 radeon_set_context_reg(
4840 ctx_cs, R_028B90_VGT_GS_INSTANCE_CNT,
4841 S_028B90_CNT(MIN2(gs_num_invocations, 127)) | S_028B90_ENABLE(gs_num_invocations > 0));
4842
4843 radeon_set_context_reg(ctx_cs, R_028AAC_VGT_ESGS_RING_ITEMSIZE,
4844 gs_state->vgt_esgs_ring_itemsize);
4845
4846 va = radv_shader_variant_get_va(gs);
4847
4848 if (pipeline->device->physical_device->rad_info.chip_class >= GFX9) {
4849 if (pipeline->device->physical_device->rad_info.chip_class >= GFX10) {
4850 radeon_set_sh_reg(cs, R_00B320_SPI_SHADER_PGM_LO_ES, va >> 8);
4851 } else {
4852 radeon_set_sh_reg(cs, R_00B210_SPI_SHADER_PGM_LO_ES, va >> 8);
4853 }
4854
4855 radeon_set_sh_reg_seq(cs, R_00B228_SPI_SHADER_PGM_RSRC1_GS, 2);
4856 radeon_emit(cs, gs->config.rsrc1);
4857 radeon_emit(cs, gs->config.rsrc2 | S_00B22C_LDS_SIZE(gs_state->lds_size));
4858
4859 radeon_set_context_reg(ctx_cs, R_028A44_VGT_GS_ONCHIP_CNTL, gs_state->vgt_gs_onchip_cntl);
4860 radeon_set_context_reg(ctx_cs, R_028A94_VGT_GS_MAX_PRIMS_PER_SUBGROUP,
4861 gs_state->vgt_gs_max_prims_per_subgroup);
4862 } else {
4863 radeon_set_sh_reg_seq(cs, R_00B220_SPI_SHADER_PGM_LO_GS, 4);
4864 radeon_emit(cs, va >> 8);
4865 radeon_emit(cs, S_00B224_MEM_BASE(va >> 40));
4866 radeon_emit(cs, gs->config.rsrc1);
4867 radeon_emit(cs, gs->config.rsrc2);
4868 }
4869
4870 if (pipeline->device->physical_device->rad_info.chip_class >= GFX7) {
4871 radeon_set_sh_reg_idx(
4872 pipeline->device->physical_device, cs, R_00B21C_SPI_SHADER_PGM_RSRC3_GS, 3,
4873 S_00B21C_CU_EN(0xffff) | S_00B21C_WAVE_LIMIT(0x3F));
4874
4875 if (pipeline->device->physical_device->rad_info.chip_class >= GFX10) {
4876 radeon_set_sh_reg_idx(
4877 pipeline->device->physical_device, cs, R_00B204_SPI_SHADER_PGM_RSRC4_GS, 3,
4878 S_00B204_CU_EN(0xffff) | S_00B204_SPI_SHADER_LATE_ALLOC_GS_GFX10(0));
4879 }
4880 }
4881
4882 radv_pipeline_generate_hw_vs(ctx_cs, cs, pipeline, pipeline->gs_copy_shader);
4883 }
4884
4885 static void
4886 radv_pipeline_generate_geometry_shader(struct radeon_cmdbuf *ctx_cs, struct radeon_cmdbuf *cs,
4887 const struct radv_pipeline *pipeline)
4888 {
4889 struct radv_shader_variant *gs;
4890
4891 gs = pipeline->shaders[MESA_SHADER_GEOMETRY];
4892 if (!gs)
4893 return;
4894
4895 if (gs->info.is_ngg)
4896 radv_pipeline_generate_hw_ngg(ctx_cs, cs, pipeline, gs);
4897 else
4898 radv_pipeline_generate_hw_gs(ctx_cs, cs, pipeline, gs);
4899
4900 radeon_set_context_reg(ctx_cs, R_028B38_VGT_GS_MAX_VERT_OUT, gs->info.gs.vertices_out);
4901 }
4902
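/* Translate a VS output parameter offset into an SPI_PS_INPUT_CNTL value:
 * real parameter-cache slots keep their offset (optionally flat-shaded,
 * FP16 or forced to passthrough), while DEFAULT_VAL offsets become the
 * 0x20 "use default value" encoding.
 */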
4903 static uint32_t
4904 offset_to_ps_input(uint32_t offset, bool flat_shade, bool explicit, bool float16)
4905 {
4906 uint32_t ps_input_cntl;
4907 if (offset <= AC_EXP_PARAM_OFFSET_31) {
4908 ps_input_cntl = S_028644_OFFSET(offset);
4909 if (flat_shade || explicit)
4910 ps_input_cntl |= S_028644_FLAT_SHADE(1);
4911 if (explicit) {
4912 /* Force parameter cache to be read in passthrough
4913 * mode.
4914 */
4915 ps_input_cntl |= S_028644_OFFSET(1 << 5);
4916 }
4917 if (float16) {
4918 ps_input_cntl |= S_028644_FP16_INTERP_MODE(1) | S_028644_ATTR0_VALID(1);
4919 }
4920 } else {
4921 /* The input is a DEFAULT_VAL constant. */
4922 assert(offset >= AC_EXP_PARAM_DEFAULT_VAL_0000 && offset <= AC_EXP_PARAM_DEFAULT_VAL_1111);
4923 offset -= AC_EXP_PARAM_DEFAULT_VAL_0000;
4924 ps_input_cntl = S_028644_OFFSET(0x20) | S_028644_DEFAULT_VAL(offset);
4925 }
4926 return ps_input_cntl;
4927 }
4928
4929 static void
4930 radv_pipeline_generate_ps_inputs(struct radeon_cmdbuf *ctx_cs, const struct radv_pipeline *pipeline)
4931 {
4932 struct radv_shader_variant *ps = pipeline->shaders[MESA_SHADER_FRAGMENT];
4933 const struct radv_vs_output_info *outinfo = get_vs_output_info(pipeline);
4934 uint32_t ps_input_cntl[32];
4935
4936 unsigned ps_offset = 0;
4937
4938 if (ps->info.ps.prim_id_input) {
4939 unsigned vs_offset = outinfo->vs_output_param_offset[VARYING_SLOT_PRIMITIVE_ID];
4940 if (vs_offset != AC_EXP_PARAM_UNDEFINED) {
4941 ps_input_cntl[ps_offset] = offset_to_ps_input(vs_offset, true, false, false);
4942 ++ps_offset;
4943 }
4944 }
4945
4946 if (ps->info.ps.layer_input) {
4947 unsigned vs_offset = outinfo->vs_output_param_offset[VARYING_SLOT_LAYER];
4948 if (vs_offset != AC_EXP_PARAM_UNDEFINED)
4949 ps_input_cntl[ps_offset] = offset_to_ps_input(vs_offset, true, false, false);
4950 else
4951 ps_input_cntl[ps_offset] =
4952 offset_to_ps_input(AC_EXP_PARAM_DEFAULT_VAL_0000, true, false, false);
4953 ++ps_offset;
4954 }
4955
4956 if (ps->info.ps.viewport_index_input) {
4957 unsigned vs_offset = outinfo->vs_output_param_offset[VARYING_SLOT_VIEWPORT];
4958 if (vs_offset != AC_EXP_PARAM_UNDEFINED)
4959 ps_input_cntl[ps_offset] = offset_to_ps_input(vs_offset, true, false, false);
4960 else
4961 ps_input_cntl[ps_offset] =
4962 offset_to_ps_input(AC_EXP_PARAM_DEFAULT_VAL_0000, true, false, false);
4963 ++ps_offset;
4964 }
4965
4966 if (ps->info.ps.has_pcoord) {
4967 unsigned val;
4968 val = S_028644_PT_SPRITE_TEX(1) | S_028644_OFFSET(0x20);
4969 ps_input_cntl[ps_offset] = val;
4970 ps_offset++;
4971 }
4972
4973 if (ps->info.ps.num_input_clips_culls) {
4974 unsigned vs_offset;
4975
4976 vs_offset = outinfo->vs_output_param_offset[VARYING_SLOT_CLIP_DIST0];
4977 if (vs_offset != AC_EXP_PARAM_UNDEFINED) {
4978 ps_input_cntl[ps_offset] = offset_to_ps_input(vs_offset, false, false, false);
4979 ++ps_offset;
4980 }
4981
4982 vs_offset = outinfo->vs_output_param_offset[VARYING_SLOT_CLIP_DIST1];
4983 if (vs_offset != AC_EXP_PARAM_UNDEFINED && ps->info.ps.num_input_clips_culls > 4) {
4984 ps_input_cntl[ps_offset] = offset_to_ps_input(vs_offset, false, false, false);
4985 ++ps_offset;
4986 }
4987 }
4988
4989 for (unsigned i = 0; i < 32 && (1u << i) <= ps->info.ps.input_mask; ++i) {
4990 unsigned vs_offset;
4991 bool flat_shade;
4992 bool explicit;
4993 bool float16;
4994 if (!(ps->info.ps.input_mask & (1u << i)))
4995 continue;
4996
4997 vs_offset = outinfo->vs_output_param_offset[VARYING_SLOT_VAR0 + i];
4998 if (vs_offset == AC_EXP_PARAM_UNDEFINED) {
4999 ps_input_cntl[ps_offset] = S_028644_OFFSET(0x20);
5000 ++ps_offset;
5001 continue;
5002 }
5003
5004 flat_shade = !!(ps->info.ps.flat_shaded_mask & (1u << ps_offset));
5005 explicit = !!(ps->info.ps.explicit_shaded_mask & (1u << ps_offset));
5006 float16 = !!(ps->info.ps.float16_shaded_mask & (1u << ps_offset));
5007
5008 ps_input_cntl[ps_offset] = offset_to_ps_input(vs_offset, flat_shade, explicit, float16);
5009 ++ps_offset;
5010 }
5011
5012 if (ps_offset) {
5013 radeon_set_context_reg_seq(ctx_cs, R_028644_SPI_PS_INPUT_CNTL_0, ps_offset);
5014 for (unsigned i = 0; i < ps_offset; i++) {
5015 radeon_emit(ctx_cs, ps_input_cntl[i]);
5016 }
5017 }
5018 }
5019
5020 static uint32_t
5021 radv_compute_db_shader_control(const struct radv_device *device,
5022 const struct radv_pipeline *pipeline,
5023 const struct radv_shader_variant *ps)
5024 {
5025 unsigned conservative_z_export = V_02880C_EXPORT_ANY_Z;
5026 unsigned z_order;
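   /* Use early Z followed by late Z unless the shader has memory side
    * effects without forcing early fragment tests, in which case fall back
    * to late Z.
    */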
5027 if (ps->info.ps.early_fragment_test || !ps->info.ps.writes_memory)
5028 z_order = V_02880C_EARLY_Z_THEN_LATE_Z;
5029 else
5030 z_order = V_02880C_LATE_Z;
5031
5032 if (ps->info.ps.depth_layout == FRAG_DEPTH_LAYOUT_GREATER)
5033 conservative_z_export = V_02880C_EXPORT_GREATER_THAN_Z;
5034 else if (ps->info.ps.depth_layout == FRAG_DEPTH_LAYOUT_LESS)
5035 conservative_z_export = V_02880C_EXPORT_LESS_THAN_Z;
5036
5037 bool disable_rbplus = device->physical_device->rad_info.has_rbplus &&
5038 !device->physical_device->rad_info.rbplus_allowed;
5039
5040    /* Exporting gl_SampleMask shouldn't be needed when MSAA is disabled,
5041     * but omitting the export breaks Project Cars (DXVK). See
5042 * https://bugs.freedesktop.org/show_bug.cgi?id=109401
5043 */
5044 bool mask_export_enable = ps->info.ps.writes_sample_mask;
5045
5046 return S_02880C_Z_EXPORT_ENABLE(ps->info.ps.writes_z) |
5047 S_02880C_STENCIL_TEST_VAL_EXPORT_ENABLE(ps->info.ps.writes_stencil) |
5048 S_02880C_KILL_ENABLE(!!ps->info.ps.can_discard) |
5049 S_02880C_MASK_EXPORT_ENABLE(mask_export_enable) |
5050 S_02880C_CONSERVATIVE_Z_EXPORT(conservative_z_export) | S_02880C_Z_ORDER(z_order) |
5051 S_02880C_DEPTH_BEFORE_SHADER(ps->info.ps.early_fragment_test) |
5052 S_02880C_PRE_SHADER_DEPTH_COVERAGE_ENABLE(ps->info.ps.post_depth_coverage) |
5053 S_02880C_EXEC_ON_HIER_FAIL(ps->info.ps.writes_memory) |
5054 S_02880C_EXEC_ON_NOOP(ps->info.ps.writes_memory) |
5055 S_02880C_DUAL_QUAD_DISABLE(disable_rbplus);
5056 }
5057
5058 static void
5059 radv_pipeline_generate_fragment_shader(struct radeon_cmdbuf *ctx_cs, struct radeon_cmdbuf *cs,
5060 struct radv_pipeline *pipeline)
5061 {
5062 struct radv_shader_variant *ps;
5063 uint64_t va;
5064 assert(pipeline->shaders[MESA_SHADER_FRAGMENT]);
5065
5066 ps = pipeline->shaders[MESA_SHADER_FRAGMENT];
5067 va = radv_shader_variant_get_va(ps);
5068
5069 radeon_set_sh_reg_seq(cs, R_00B020_SPI_SHADER_PGM_LO_PS, 4);
5070 radeon_emit(cs, va >> 8);
5071 radeon_emit(cs, S_00B024_MEM_BASE(va >> 40));
5072 radeon_emit(cs, ps->config.rsrc1);
5073 radeon_emit(cs, ps->config.rsrc2);
5074
5075 radeon_set_context_reg(ctx_cs, R_02880C_DB_SHADER_CONTROL,
5076 radv_compute_db_shader_control(pipeline->device, pipeline, ps));
5077
5078 radeon_set_context_reg_seq(ctx_cs, R_0286CC_SPI_PS_INPUT_ENA, 2);
5079 radeon_emit(ctx_cs, ps->config.spi_ps_input_ena);
5080 radeon_emit(ctx_cs, ps->config.spi_ps_input_addr);
5081
5082 radeon_set_context_reg(
5083 ctx_cs, R_0286D8_SPI_PS_IN_CONTROL,
5084 S_0286D8_NUM_INTERP(ps->info.ps.num_interp) | S_0286D8_PS_W32_EN(ps->info.wave_size == 32));
5085
5086 radeon_set_context_reg(ctx_cs, R_0286E0_SPI_BARYC_CNTL, pipeline->graphics.spi_baryc_cntl);
5087
5088 radeon_set_context_reg(
5089 ctx_cs, R_028710_SPI_SHADER_Z_FORMAT,
5090 ac_get_spi_shader_z_format(ps->info.ps.writes_z, ps->info.ps.writes_stencil,
5091 ps->info.ps.writes_sample_mask));
5092 }
5093
5094 static void
5095 radv_pipeline_generate_vgt_vertex_reuse(struct radeon_cmdbuf *ctx_cs,
5096 const struct radv_pipeline *pipeline)
5097 {
5098 if (pipeline->device->physical_device->rad_info.family < CHIP_POLARIS10 ||
5099 pipeline->device->physical_device->rad_info.chip_class >= GFX10)
5100 return;
5101
5102 unsigned vtx_reuse_depth = 30;
5103 if (radv_pipeline_has_tess(pipeline) &&
5104 radv_get_shader(pipeline, MESA_SHADER_TESS_EVAL)->info.tes.spacing ==
5105 TESS_SPACING_FRACTIONAL_ODD) {
5106 vtx_reuse_depth = 14;
5107 }
5108 radeon_set_context_reg(ctx_cs, R_028C58_VGT_VERTEX_REUSE_BLOCK_CNTL,
5109 S_028C58_VTX_REUSE_DEPTH(vtx_reuse_depth));
5110 }
5111
5112 static void
5113 radv_pipeline_generate_vgt_shader_config(struct radeon_cmdbuf *ctx_cs,
5114 const struct radv_pipeline *pipeline)
5115 {
5116 uint32_t stages = 0;
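   /* Map the API stages onto the HW stages: with tessellation, VS/TCS run on
    * the LS/HS stages and the TES feeds either the ES stage (with a GS or
    * NGG) or the legacy hardware VS stage; NGG additionally turns on the
    * primitive generator.
    */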
5117 if (radv_pipeline_has_tess(pipeline)) {
5118 stages |= S_028B54_LS_EN(V_028B54_LS_STAGE_ON) | S_028B54_HS_EN(1) | S_028B54_DYNAMIC_HS(1);
5119
5120 if (radv_pipeline_has_gs(pipeline))
5121 stages |= S_028B54_ES_EN(V_028B54_ES_STAGE_DS) | S_028B54_GS_EN(1);
5122 else if (radv_pipeline_has_ngg(pipeline))
5123 stages |= S_028B54_ES_EN(V_028B54_ES_STAGE_DS);
5124 else
5125 stages |= S_028B54_VS_EN(V_028B54_VS_STAGE_DS);
5126 } else if (radv_pipeline_has_gs(pipeline)) {
5127 stages |= S_028B54_ES_EN(V_028B54_ES_STAGE_REAL) | S_028B54_GS_EN(1);
5128 } else if (radv_pipeline_has_ngg(pipeline)) {
5129 stages |= S_028B54_ES_EN(V_028B54_ES_STAGE_REAL);
5130 }
5131
5132 if (radv_pipeline_has_ngg(pipeline)) {
5133 stages |= S_028B54_PRIMGEN_EN(1);
5134 if (pipeline->streamout_shader)
5135 stages |= S_028B54_NGG_WAVE_ID_EN(1);
5136 if (radv_pipeline_has_ngg_passthrough(pipeline))
5137 stages |= S_028B54_PRIMGEN_PASSTHRU_EN(1);
5138 } else if (radv_pipeline_has_gs(pipeline)) {
5139 stages |= S_028B54_VS_EN(V_028B54_VS_STAGE_COPY_SHADER);
5140 }
5141
5142 if (pipeline->device->physical_device->rad_info.chip_class >= GFX9)
5143 stages |= S_028B54_MAX_PRIMGRP_IN_WAVE(2);
5144
5145 if (pipeline->device->physical_device->rad_info.chip_class >= GFX10) {
5146 uint8_t hs_size = 64, gs_size = 64, vs_size = 64;
5147
5148 if (radv_pipeline_has_tess(pipeline))
5149 hs_size = pipeline->shaders[MESA_SHADER_TESS_CTRL]->info.wave_size;
5150
5151 if (pipeline->shaders[MESA_SHADER_GEOMETRY]) {
5152 vs_size = gs_size = pipeline->shaders[MESA_SHADER_GEOMETRY]->info.wave_size;
5153 if (radv_pipeline_has_gs_copy_shader(pipeline))
5154 vs_size = pipeline->gs_copy_shader->info.wave_size;
5155 } else if (pipeline->shaders[MESA_SHADER_TESS_EVAL])
5156 vs_size = pipeline->shaders[MESA_SHADER_TESS_EVAL]->info.wave_size;
5157 else if (pipeline->shaders[MESA_SHADER_VERTEX])
5158 vs_size = pipeline->shaders[MESA_SHADER_VERTEX]->info.wave_size;
5159
5160 if (radv_pipeline_has_ngg(pipeline)) {
5161 assert(!radv_pipeline_has_gs_copy_shader(pipeline));
5162 gs_size = vs_size;
5163 }
5164
5165 /* legacy GS only supports Wave64 */
5166 stages |= S_028B54_HS_W32_EN(hs_size == 32 ? 1 : 0) |
5167 S_028B54_GS_W32_EN(gs_size == 32 ? 1 : 0) |
5168 S_028B54_VS_W32_EN(vs_size == 32 ? 1 : 0);
5169 }
5170
5171 radeon_set_context_reg(ctx_cs, R_028B54_VGT_SHADER_STAGES_EN, stages);
5172 }
5173
5174 static void
5175 radv_pipeline_generate_cliprect_rule(struct radeon_cmdbuf *ctx_cs,
5176 const VkGraphicsPipelineCreateInfo *pCreateInfo)
5177 {
5178 const VkPipelineDiscardRectangleStateCreateInfoEXT *discard_rectangle_info =
5179 vk_find_struct_const(pCreateInfo->pNext, PIPELINE_DISCARD_RECTANGLE_STATE_CREATE_INFO_EXT);
5180 uint32_t cliprect_rule = 0;
5181
5182 if (!discard_rectangle_info) {
5183 cliprect_rule = 0xffff;
5184 } else {
5185 for (unsigned i = 0; i < (1u << MAX_DISCARD_RECTANGLES); ++i) {
5186          /* Interpret i as a bitmask of the discard rectangles that
5187           * contain the pixel, and set bit i in the rule if that
5188           * combination of rectangles should pass the cliprect
5189           * test.
5190 */
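         /* e.g. with two INCLUSIVE rectangles, every index whose low two
          * bits are non-zero gets its bit set, so a pixel passes as soon as
          * it lies inside at least one of the rectangles.
          */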
5191 unsigned relevant_subset = i & ((1u << discard_rectangle_info->discardRectangleCount) - 1);
5192
5193 if (discard_rectangle_info->discardRectangleMode ==
5194 VK_DISCARD_RECTANGLE_MODE_INCLUSIVE_EXT &&
5195 !relevant_subset)
5196 continue;
5197
5198 if (discard_rectangle_info->discardRectangleMode ==
5199 VK_DISCARD_RECTANGLE_MODE_EXCLUSIVE_EXT &&
5200 relevant_subset)
5201 continue;
5202
5203 cliprect_rule |= 1u << i;
5204 }
5205 }
5206
5207 radeon_set_context_reg(ctx_cs, R_02820C_PA_SC_CLIPRECT_RULE, cliprect_rule);
5208 }
5209
5210 static void
5211 gfx10_pipeline_generate_ge_cntl(struct radeon_cmdbuf *ctx_cs, struct radv_pipeline *pipeline)
5212 {
5213 bool break_wave_at_eoi = false;
5214 unsigned primgroup_size;
5215 unsigned vertgroup_size = 256; /* 256 = disable vertex grouping */
5216
5217 if (radv_pipeline_has_tess(pipeline)) {
5218 primgroup_size = pipeline->shaders[MESA_SHADER_TESS_CTRL]->info.num_tess_patches;
5219 } else if (radv_pipeline_has_gs(pipeline)) {
5220 const struct gfx9_gs_info *gs_state =
5221 &pipeline->shaders[MESA_SHADER_GEOMETRY]->info.gs_ring_info;
5222 unsigned vgt_gs_onchip_cntl = gs_state->vgt_gs_onchip_cntl;
5223 primgroup_size = G_028A44_GS_PRIMS_PER_SUBGRP(vgt_gs_onchip_cntl);
5224 } else {
5225 primgroup_size = 128; /* recommended without a GS and tess */
5226 }
5227
5228 if (radv_pipeline_has_tess(pipeline)) {
5229 if (pipeline->shaders[MESA_SHADER_TESS_CTRL]->info.uses_prim_id ||
5230 radv_get_shader(pipeline, MESA_SHADER_TESS_EVAL)->info.uses_prim_id)
5231 break_wave_at_eoi = true;
5232 }
5233
5234 radeon_set_uconfig_reg(ctx_cs, R_03096C_GE_CNTL,
5235 S_03096C_PRIM_GRP_SIZE(primgroup_size) |
5236 S_03096C_VERT_GRP_SIZE(vertgroup_size) |
5237 S_03096C_PACKET_TO_ONE_PA(0) /* line stipple */ |
5238 S_03096C_BREAK_WAVE_AT_EOI(break_wave_at_eoi));
5239 }
5240
5241 static void
5242 radv_pipeline_generate_vgt_gs_out(struct radeon_cmdbuf *ctx_cs,
5243 const struct radv_pipeline *pipeline,
5244 const VkGraphicsPipelineCreateInfo *pCreateInfo,
5245 const struct radv_graphics_pipeline_create_info *extra)
5246 {
5247 uint32_t gs_out;
5248
5249 if (radv_pipeline_has_gs(pipeline)) {
5250 gs_out =
5251 si_conv_gl_prim_to_gs_out(pipeline->shaders[MESA_SHADER_GEOMETRY]->info.gs.output_prim);
5252 } else if (radv_pipeline_has_tess(pipeline)) {
5253 if (pipeline->shaders[MESA_SHADER_TESS_EVAL]->info.tes.point_mode) {
5254 gs_out = V_028A6C_POINTLIST;
5255 } else {
5256 gs_out = si_conv_gl_prim_to_gs_out(
5257 pipeline->shaders[MESA_SHADER_TESS_EVAL]->info.tes.primitive_mode);
5258 }
5259 } else {
5260 gs_out = si_conv_prim_to_gs_out(pCreateInfo->pInputAssemblyState->topology);
5261 }
5262
5263 if (extra && extra->use_rectlist) {
5264 gs_out = V_028A6C_TRISTRIP;
5265 if (radv_pipeline_has_ngg(pipeline))
5266 gs_out = V_028A6C_RECTLIST;
5267 }
5268
5269 radeon_set_context_reg(ctx_cs, R_028A6C_VGT_GS_OUT_PRIM_TYPE, gs_out);
5270 }
5271
5272 static bool
5273 gfx103_pipeline_vrs_coarse_shading(const struct radv_pipeline *pipeline)
5274 {
5275 struct radv_shader_variant *ps = pipeline->shaders[MESA_SHADER_FRAGMENT];
5276 struct radv_device *device = pipeline->device;
5277
5278 if (device->instance->debug_flags & RADV_DEBUG_NO_VRS_FLAT_SHADING)
5279 return false;
5280
5281 if (!ps->info.ps.allow_flat_shading)
5282 return false;
5283
5284 return true;
5285 }
5286
5287 static void
5288 gfx103_pipeline_generate_vrs_state(struct radeon_cmdbuf *ctx_cs,
5289 const struct radv_pipeline *pipeline,
5290 const VkGraphicsPipelineCreateInfo *pCreateInfo)
5291 {
5292 uint32_t mode = V_028064_VRS_COMB_MODE_PASSTHRU;
5293 uint8_t rate_x = 0, rate_y = 0;
5294 bool enable_vrs = false;
5295
5296 if (vk_find_struct_const(pCreateInfo->pNext,
5297 PIPELINE_FRAGMENT_SHADING_RATE_STATE_CREATE_INFO_KHR) ||
5298 radv_is_state_dynamic(pCreateInfo, VK_DYNAMIC_STATE_FRAGMENT_SHADING_RATE_KHR)) {
5299 /* Enable draw call VRS because it's explicitly requested. */
5300 enable_vrs = true;
5301 } else if (gfx103_pipeline_vrs_coarse_shading(pipeline)) {
5302 /* Enable VRS coarse shading 2x2 if the driver determined that
5303 * it's safe to enable.
5304 */
5305 mode = V_028064_VRS_COMB_MODE_OVERRIDE;
5306 rate_x = rate_y = 1;
5307 } else if (pipeline->device->force_vrs != RADV_FORCE_VRS_NONE) {
5308 /* Force enable vertex VRS if requested by the user. */
5309 radeon_set_context_reg(
5310 ctx_cs, R_028848_PA_CL_VRS_CNTL,
5311 S_028848_SAMPLE_ITER_COMBINER_MODE(V_028848_VRS_COMB_MODE_OVERRIDE) |
5312 S_028848_VERTEX_RATE_COMBINER_MODE(V_028848_VRS_COMB_MODE_OVERRIDE));
5313
5314 /* If the shader is using discard, turn off coarse shading
5315 * because discard at 2x2 pixel granularity degrades quality
5316 * too much. MIN allows sample shading but not coarse shading.
5317 */
5318 struct radv_shader_variant *ps = pipeline->shaders[MESA_SHADER_FRAGMENT];
5319
5320 mode = ps->info.ps.can_discard ? V_028064_VRS_COMB_MODE_MIN : V_028064_VRS_COMB_MODE_PASSTHRU;
5321 }
5322
5323 radeon_set_context_reg(ctx_cs, R_028A98_VGT_DRAW_PAYLOAD_CNTL, S_028A98_EN_VRS_RATE(enable_vrs));
5324
5325 radeon_set_context_reg(ctx_cs, R_028064_DB_VRS_OVERRIDE_CNTL,
5326 S_028064_VRS_OVERRIDE_RATE_COMBINER_MODE(mode) |
5327 S_028064_VRS_OVERRIDE_RATE_X(rate_x) |
5328 S_028064_VRS_OVERRIDE_RATE_Y(rate_y));
5329 }
5330
5331 static void
5332 radv_pipeline_generate_pm4(struct radv_pipeline *pipeline,
5333 const VkGraphicsPipelineCreateInfo *pCreateInfo,
5334 const struct radv_graphics_pipeline_create_info *extra,
5335 const struct radv_blend_state *blend)
5336 {
5337 struct radeon_cmdbuf *ctx_cs = &pipeline->ctx_cs;
5338 struct radeon_cmdbuf *cs = &pipeline->cs;
5339
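   /* cs receives the SH registers and ctx_cs the context registers; both
    * share one allocation, and ctx_cs is hashed so that binding a pipeline
    * with identical context state can skip re-emitting it.
    */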
5340 cs->max_dw = 64;
5341 ctx_cs->max_dw = 256;
5342 cs->buf = malloc(4 * (cs->max_dw + ctx_cs->max_dw));
5343 ctx_cs->buf = cs->buf + cs->max_dw;
5344
5345 radv_pipeline_generate_depth_stencil_state(ctx_cs, pipeline, pCreateInfo, extra);
5346 radv_pipeline_generate_blend_state(ctx_cs, pipeline, blend);
5347 radv_pipeline_generate_raster_state(ctx_cs, pipeline, pCreateInfo);
5348 radv_pipeline_generate_multisample_state(ctx_cs, pipeline);
5349 radv_pipeline_generate_vgt_gs_mode(ctx_cs, pipeline);
5350 radv_pipeline_generate_vertex_shader(ctx_cs, cs, pipeline);
5351
5352 if (radv_pipeline_has_tess(pipeline)) {
5353 radv_pipeline_generate_tess_shaders(ctx_cs, cs, pipeline);
5354 radv_pipeline_generate_tess_state(ctx_cs, pipeline, pCreateInfo);
5355 }
5356
5357 radv_pipeline_generate_geometry_shader(ctx_cs, cs, pipeline);
5358 radv_pipeline_generate_fragment_shader(ctx_cs, cs, pipeline);
5359 radv_pipeline_generate_ps_inputs(ctx_cs, pipeline);
5360 radv_pipeline_generate_vgt_vertex_reuse(ctx_cs, pipeline);
5361 radv_pipeline_generate_vgt_shader_config(ctx_cs, pipeline);
5362 radv_pipeline_generate_cliprect_rule(ctx_cs, pCreateInfo);
5363 radv_pipeline_generate_vgt_gs_out(ctx_cs, pipeline, pCreateInfo, extra);
5364
5365 if (pipeline->device->physical_device->rad_info.chip_class >= GFX10 &&
5366 !radv_pipeline_has_ngg(pipeline))
5367 gfx10_pipeline_generate_ge_cntl(ctx_cs, pipeline);
5368
5369 if (pipeline->device->physical_device->rad_info.chip_class >= GFX10_3)
5370 gfx103_pipeline_generate_vrs_state(ctx_cs, pipeline, pCreateInfo);
5371
5372 pipeline->ctx_cs_hash = _mesa_hash_data(ctx_cs->buf, ctx_cs->cdw * 4);
5373
5374 assert(ctx_cs->cdw <= ctx_cs->max_dw);
5375 assert(cs->cdw <= cs->max_dw);
5376 }
5377
5378 static void
5379 radv_pipeline_init_vertex_input_state(struct radv_pipeline *pipeline,
5380 const VkGraphicsPipelineCreateInfo *pCreateInfo,
5381 const struct radv_pipeline_key *key)
5382 {
5383 const struct radv_shader_info *info = &radv_get_shader(pipeline, MESA_SHADER_VERTEX)->info;
5384 if (!key->vs.dynamic_input_state) {
5385 const VkPipelineVertexInputStateCreateInfo *vi_info = pCreateInfo->pVertexInputState;
5386
5387 for (uint32_t i = 0; i < vi_info->vertexBindingDescriptionCount; i++) {
5388 const VkVertexInputBindingDescription *desc = &vi_info->pVertexBindingDescriptions[i];
5389
5390 pipeline->binding_stride[desc->binding] = desc->stride;
5391 }
5392
5393 for (uint32_t i = 0; i < vi_info->vertexAttributeDescriptionCount; i++) {
5394 const VkVertexInputAttributeDescription *desc = &vi_info->pVertexAttributeDescriptions[i];
5395
5396 uint32_t end = desc->offset + vk_format_get_blocksize(desc->format);
5397 pipeline->attrib_ends[desc->location] = end;
5398 if (pipeline->binding_stride[desc->binding])
5399 pipeline->attrib_index_offset[desc->location] =
5400 desc->offset / pipeline->binding_stride[desc->binding];
5401 pipeline->attrib_bindings[desc->location] = desc->binding;
5402 }
5403 }
5404
5405 pipeline->use_per_attribute_vb_descs = info->vs.use_per_attribute_vb_descs;
5406 pipeline->last_vertex_attrib_bit = util_last_bit(info->vs.vb_desc_usage_mask);
5407 if (pipeline->shaders[MESA_SHADER_VERTEX])
5408 pipeline->next_vertex_stage = MESA_SHADER_VERTEX;
5409 else if (pipeline->shaders[MESA_SHADER_TESS_CTRL])
5410 pipeline->next_vertex_stage = MESA_SHADER_TESS_CTRL;
5411 else
5412 pipeline->next_vertex_stage = MESA_SHADER_GEOMETRY;
5413 if (pipeline->next_vertex_stage == MESA_SHADER_VERTEX) {
5414 const struct radv_shader_variant *vs_shader = pipeline->shaders[MESA_SHADER_VERTEX];
5415 pipeline->can_use_simple_input = vs_shader->info.is_ngg == pipeline->device->physical_device->use_ngg &&
5416 vs_shader->info.wave_size == pipeline->device->physical_device->ge_wave_size;
5417 } else {
5418 pipeline->can_use_simple_input = false;
5419 }
5420 if (info->vs.dynamic_inputs)
5421 pipeline->vb_desc_usage_mask = BITFIELD_MASK(pipeline->last_vertex_attrib_bit);
5422 else
5423 pipeline->vb_desc_usage_mask = info->vs.vb_desc_usage_mask;
5424 pipeline->vb_desc_alloc_size = util_bitcount(pipeline->vb_desc_usage_mask) * 16;
5425 }
5426
5427 static struct radv_shader_variant *
5428 radv_pipeline_get_streamout_shader(struct radv_pipeline *pipeline)
5429 {
5430 int i;
5431
5432 for (i = MESA_SHADER_GEOMETRY; i >= MESA_SHADER_VERTEX; i--) {
5433 struct radv_shader_variant *shader = radv_get_shader(pipeline, i);
5434
5435 if (shader && shader->info.so.num_outputs > 0)
5436 return shader;
5437 }
5438
5439 return NULL;
5440 }
5441
5442 static bool
5443 radv_shader_need_indirect_descriptor_sets(struct radv_pipeline *pipeline, gl_shader_stage stage)
5444 {
5445 struct radv_userdata_info *loc =
5446 radv_lookup_user_sgpr(pipeline, stage, AC_UD_INDIRECT_DESCRIPTOR_SETS);
5447 return loc->sgpr_idx != -1;
5448 }
5449
5450 static void
5451 radv_pipeline_init_shader_stages_state(struct radv_pipeline *pipeline)
5452 {
5453 struct radv_device *device = pipeline->device;
5454
5455 for (unsigned i = 0; i < MESA_SHADER_STAGES; i++) {
5456 pipeline->user_data_0[i] = radv_pipeline_stage_to_user_data_0(
5457 pipeline, i, device->physical_device->rad_info.chip_class);
5458
5459 if (pipeline->shaders[i]) {
5460 pipeline->need_indirect_descriptor_sets |=
5461 radv_shader_need_indirect_descriptor_sets(pipeline, i);
5462 }
5463 }
5464
5465 struct radv_userdata_info *loc =
5466 radv_lookup_user_sgpr(pipeline, MESA_SHADER_VERTEX, AC_UD_VS_BASE_VERTEX_START_INSTANCE);
5467 if (loc->sgpr_idx != -1) {
5468 pipeline->graphics.vtx_base_sgpr = pipeline->user_data_0[MESA_SHADER_VERTEX];
5469 pipeline->graphics.vtx_base_sgpr += loc->sgpr_idx * 4;
5470 pipeline->graphics.vtx_emit_num = loc->num_sgprs;
5471 pipeline->graphics.uses_drawid =
5472 radv_get_shader(pipeline, MESA_SHADER_VERTEX)->info.vs.needs_draw_id;
5473 pipeline->graphics.uses_baseinstance =
5474 radv_get_shader(pipeline, MESA_SHADER_VERTEX)->info.vs.needs_base_instance;
5475 }
5476 }
5477
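/* Main graphics pipeline initialization: compile the shaders, derive the
 * fixed-function state (blend, multisample, raster, depth/stencil, etc.) and
 * finally emit the pipeline PM4 packets.
 */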
5478 static VkResult
5479 radv_pipeline_init(struct radv_pipeline *pipeline, struct radv_device *device,
5480 struct radv_pipeline_cache *cache,
5481 const VkGraphicsPipelineCreateInfo *pCreateInfo,
5482 const struct radv_graphics_pipeline_create_info *extra)
5483 {
5484 RADV_FROM_HANDLE(radv_pipeline_layout, pipeline_layout, pCreateInfo->layout);
5485 VkResult result;
5486
5487 pipeline->device = device;
5488 pipeline->graphics.last_vgt_api_stage = MESA_SHADER_NONE;
5489
5490 struct radv_blend_state blend = radv_pipeline_init_blend_state(pipeline, pCreateInfo, extra);
5491
5492 const VkPipelineCreationFeedbackCreateInfoEXT *creation_feedback =
5493 vk_find_struct_const(pCreateInfo->pNext, PIPELINE_CREATION_FEEDBACK_CREATE_INFO_EXT);
5494 radv_init_feedback(creation_feedback);
5495
5496 VkPipelineCreationFeedbackEXT *pipeline_feedback =
5497 creation_feedback ? creation_feedback->pPipelineCreationFeedback : NULL;
5498
5499 const VkPipelineShaderStageCreateInfo *pStages[MESA_SHADER_STAGES] = {
5500 0,
5501 };
5502 VkPipelineCreationFeedbackEXT *stage_feedbacks[MESA_SHADER_STAGES] = {0};
5503 for (uint32_t i = 0; i < pCreateInfo->stageCount; i++) {
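/* Each pStages entry has exactly one stage bit set; the VkShaderStageFlagBits
 * bit positions match gl_shader_stage, so ffs() yields the Mesa stage index. */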
5504 gl_shader_stage stage = ffs(pCreateInfo->pStages[i].stage) - 1;
5505 pStages[stage] = &pCreateInfo->pStages[i];
5506 if (creation_feedback)
5507 stage_feedbacks[stage] = &creation_feedback->pPipelineStageCreationFeedbacks[i];
5508 }
5509
5510 struct radv_pipeline_key key =
5511 radv_generate_graphics_pipeline_key(pipeline, pCreateInfo, &blend);
5512
5513 result = radv_create_shaders(pipeline, pipeline_layout, device, cache, &key, pStages,
5514 pCreateInfo->flags, NULL, pipeline_feedback, stage_feedbacks);
5515 if (result != VK_SUCCESS)
5516 return result;
5517
5518 pipeline->graphics.spi_baryc_cntl = S_0286E0_FRONT_FACE_ALL_BITS(1);
5519 radv_pipeline_init_multisample_state(pipeline, &blend, pCreateInfo);
5520 radv_pipeline_init_input_assembly_state(pipeline, pCreateInfo, extra);
5521 radv_pipeline_init_dynamic_state(pipeline, pCreateInfo, extra);
5522 radv_pipeline_init_raster_state(pipeline, pCreateInfo);
5523 radv_pipeline_init_depth_stencil_state(pipeline, pCreateInfo);
5524
5525 if (pipeline->device->physical_device->rad_info.chip_class >= GFX10_3)
5526 gfx103_pipeline_init_vrs_state(pipeline, pCreateInfo);
5527
5528 /* Ensure that some export memory is always allocated, for two reasons:
5529 *
5530 * 1) Correctness: The hardware ignores the EXEC mask if no export
5531 * memory is allocated, so KILL and alpha test do not work correctly
5532 * without this.
5533 * 2) Performance: Every shader needs at least a NULL export, even when
5534 * it writes no color/depth output. The NULL export instruction
5535 * stalls without this setting.
5536 *
5537 * Don't add this to CB_SHADER_MASK.
5538 *
5539 * GFX10 supports pixel shaders without exports by setting both the
5540 * color and Z formats to SPI_SHADER_ZERO. The hw will skip export
5541 * instructions if any are present.
5542 */
5543 struct radv_shader_variant *ps = pipeline->shaders[MESA_SHADER_FRAGMENT];
5544 if ((pipeline->device->physical_device->rad_info.chip_class <= GFX9 ||
5545 ps->info.ps.can_discard) &&
5546 !blend.spi_shader_col_format) {
5547 if (!ps->info.ps.writes_z && !ps->info.ps.writes_stencil && !ps->info.ps.writes_sample_mask)
5548 blend.spi_shader_col_format = V_028714_SPI_SHADER_32_R;
5549 }
5550
5551 if (extra && (extra->custom_blend_mode == V_028808_CB_ELIMINATE_FAST_CLEAR ||
5552 extra->custom_blend_mode == V_028808_CB_FMASK_DECOMPRESS ||
5553 extra->custom_blend_mode == V_028808_CB_DCC_DECOMPRESS ||
5554 extra->custom_blend_mode == V_028808_CB_RESOLVE)) {
5555 /* According to the CB spec, CB_SHADER_MASK should be set to
5556 * enable writes to all four channels of MRT0.
5557 */
5558 blend.cb_shader_mask = 0xf;
5559 }
5560
5561 pipeline->graphics.col_format = blend.spi_shader_col_format;
5562 pipeline->graphics.cb_target_mask = blend.cb_target_mask;
5563
5564 if (radv_pipeline_has_gs(pipeline) && !radv_pipeline_has_ngg(pipeline)) {
5565 struct radv_shader_variant *gs = pipeline->shaders[MESA_SHADER_GEOMETRY];
5566
5567 radv_pipeline_init_gs_ring_state(pipeline, &gs->info.gs_ring_info);
5568 }
5569
5570 if (radv_pipeline_has_tess(pipeline)) {
5571 pipeline->graphics.tess_patch_control_points =
5572 pCreateInfo->pTessellationState->patchControlPoints;
5573 }
5574
5575 radv_pipeline_init_vertex_input_state(pipeline, pCreateInfo, &key);
5576 radv_pipeline_init_binning_state(pipeline, pCreateInfo, &blend);
5577 radv_pipeline_init_shader_stages_state(pipeline);
5578 radv_pipeline_init_scratch(device, pipeline);
5579
5580 /* Find the last pre-rasterization shader stage that writes streamout outputs. */
5581 pipeline->streamout_shader = radv_pipeline_get_streamout_shader(pipeline);
5582
5583 pipeline->graphics.is_ngg = radv_pipeline_has_ngg(pipeline);
5584 pipeline->graphics.has_ngg_culling =
5585 pipeline->graphics.is_ngg &&
5586 pipeline->shaders[pipeline->graphics.last_vgt_api_stage]->info.has_ngg_culling;
5587
5588 pipeline->push_constant_size = pipeline_layout->push_constant_size;
5589 pipeline->dynamic_offset_count = pipeline_layout->dynamic_offset_count;
5590
5591 radv_pipeline_generate_pm4(pipeline, pCreateInfo, extra, &blend);
5592
5593 return result;
5594 }
5595
5596 VkResult
5597 radv_graphics_pipeline_create(VkDevice _device, VkPipelineCache _cache,
5598 const VkGraphicsPipelineCreateInfo *pCreateInfo,
5599 const struct radv_graphics_pipeline_create_info *extra,
5600 const VkAllocationCallbacks *pAllocator, VkPipeline *pPipeline)
5601 {
5602 RADV_FROM_HANDLE(radv_device, device, _device);
5603 RADV_FROM_HANDLE(radv_pipeline_cache, cache, _cache);
5604 struct radv_pipeline *pipeline;
5605 VkResult result;
5606
5607 pipeline = vk_zalloc2(&device->vk.alloc, pAllocator, sizeof(*pipeline), 8,
5608 VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
5609 if (pipeline == NULL)
5610 return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
5611
5612 vk_object_base_init(&device->vk, &pipeline->base, VK_OBJECT_TYPE_PIPELINE);
5613 pipeline->type = RADV_PIPELINE_GRAPHICS;
5614
5615 result = radv_pipeline_init(pipeline, device, cache, pCreateInfo, extra);
5616 if (result != VK_SUCCESS) {
5617 radv_pipeline_destroy(device, pipeline, pAllocator);
5618 return result;
5619 }
5620
5621 *pPipeline = radv_pipeline_to_handle(pipeline);
5622
5623 return VK_SUCCESS;
5624 }
5625
5626 VkResult
5627 radv_CreateGraphicsPipelines(VkDevice _device, VkPipelineCache pipelineCache, uint32_t count,
5628 const VkGraphicsPipelineCreateInfo *pCreateInfos,
5629 const VkAllocationCallbacks *pAllocator, VkPipeline *pPipelines)
5630 {
5631 VkResult result = VK_SUCCESS;
5632 unsigned i = 0;
5633
5634 for (; i < count; i++) {
5635 VkResult r;
5636 r = radv_graphics_pipeline_create(_device, pipelineCache, &pCreateInfos[i], NULL, pAllocator,
5637 &pPipelines[i]);
5638 if (r != VK_SUCCESS) {
5639 result = r;
5640 pPipelines[i] = VK_NULL_HANDLE;
5641
5642 if (pCreateInfos[i].flags & VK_PIPELINE_CREATE_EARLY_RETURN_ON_FAILURE_BIT_EXT)
5643 break;
5644 }
5645 }
5646
5647 for (; i < count; ++i)
5648 pPipelines[i] = VK_NULL_HANDLE;
5649
5650 return result;
5651 }
5652
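/* Emit the compute shader program address and RSRC1/RSRC2 (plus RSRC3 on
 * GFX10+) registers.
 */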
5653 static void
5654 radv_pipeline_generate_hw_cs(struct radeon_cmdbuf *cs, const struct radv_pipeline *pipeline)
5655 {
5656 struct radv_shader_variant *shader = pipeline->shaders[MESA_SHADER_COMPUTE];
5657 uint64_t va = radv_shader_variant_get_va(shader);
5658 struct radv_device *device = pipeline->device;
5659
5660 radeon_set_sh_reg(cs, R_00B830_COMPUTE_PGM_LO, va >> 8);
5661
5662 radeon_set_sh_reg_seq(cs, R_00B848_COMPUTE_PGM_RSRC1, 2);
5663 radeon_emit(cs, shader->config.rsrc1);
5664 radeon_emit(cs, shader->config.rsrc2);
5665 if (device->physical_device->rad_info.chip_class >= GFX10) {
5666 radeon_set_sh_reg(cs, R_00B8A0_COMPUTE_PGM_RSRC3, shader->config.rsrc3);
5667 }
5668 }
5669
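/* Emit COMPUTE_RESOURCE_LIMITS and the threadgroup dimensions
 * (COMPUTE_NUM_THREAD_X/Y/Z) for the compute shader.
 */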
5670 static void
5671 radv_pipeline_generate_compute_state(struct radeon_cmdbuf *cs, const struct radv_pipeline *pipeline)
5672 {
5673 struct radv_shader_variant *shader = pipeline->shaders[MESA_SHADER_COMPUTE];
5674 struct radv_device *device = pipeline->device;
5675 unsigned threads_per_threadgroup;
5676 unsigned threadgroups_per_cu = 1;
5677 unsigned waves_per_threadgroup;
5678 unsigned max_waves_per_sh = 0;
5679
5680 /* Calculate best compute resource limits. */
5681 threads_per_threadgroup =
5682 shader->info.cs.block_size[0] * shader->info.cs.block_size[1] * shader->info.cs.block_size[2];
5683 waves_per_threadgroup = DIV_ROUND_UP(threads_per_threadgroup, shader->info.wave_size);
5684
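/* On GFX10+, allow two threadgroups per CU when a threadgroup is only a single wave. */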
5685 if (device->physical_device->rad_info.chip_class >= GFX10 && waves_per_threadgroup == 1)
5686 threadgroups_per_cu = 2;
5687
5688 radeon_set_sh_reg(
5689 cs, R_00B854_COMPUTE_RESOURCE_LIMITS,
5690 ac_get_compute_resource_limits(&device->physical_device->rad_info, waves_per_threadgroup,
5691 max_waves_per_sh, threadgroups_per_cu));
5692
5693 radeon_set_sh_reg_seq(cs, R_00B81C_COMPUTE_NUM_THREAD_X, 3);
5694 radeon_emit(cs, S_00B81C_NUM_THREAD_FULL(shader->info.cs.block_size[0]));
5695 radeon_emit(cs, S_00B81C_NUM_THREAD_FULL(shader->info.cs.block_size[1]));
5696 radeon_emit(cs, S_00B81C_NUM_THREAD_FULL(shader->info.cs.block_size[2]));
5697 }
5698
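/* Build the small PM4 command stream that binds the compute pipeline state;
 * 19 dwords are enough on GFX10+ (which adds RSRC3), 16 otherwise.
 */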
5699 static void
5700 radv_compute_generate_pm4(struct radv_pipeline *pipeline)
5701 {
5702 struct radv_device *device = pipeline->device;
5703 struct radeon_cmdbuf *cs = &pipeline->cs;
5704
5705 cs->max_dw = device->physical_device->rad_info.chip_class >= GFX10 ? 19 : 16;
5706 cs->buf = malloc(cs->max_dw * 4);
5707
5708 radv_pipeline_generate_hw_cs(cs, pipeline);
5709 radv_pipeline_generate_compute_state(cs, pipeline);
5710
5711 assert(pipeline->cs.cdw <= pipeline->cs.max_dw);
5712 }
5713
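/* Build the shader key for a compute pipeline: optimization flags and the
 * required/full subgroup size requested through VK_EXT_subgroup_size_control.
 */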
5714 static struct radv_pipeline_key
5715 radv_generate_compute_pipeline_key(struct radv_pipeline *pipeline,
5716 const VkComputePipelineCreateInfo *pCreateInfo)
5717 {
5718 const VkPipelineShaderStageCreateInfo *stage = &pCreateInfo->stage;
5719 struct radv_pipeline_key key;
5720 memset(&key, 0, sizeof(key));
5721
5722 if (pCreateInfo->flags & VK_PIPELINE_CREATE_DISABLE_OPTIMIZATION_BIT)
5723 key.optimisations_disabled = 1;
5724
5725 const VkPipelineShaderStageRequiredSubgroupSizeCreateInfoEXT *subgroup_size =
5726 vk_find_struct_const(stage->pNext,
5727 PIPELINE_SHADER_STAGE_REQUIRED_SUBGROUP_SIZE_CREATE_INFO_EXT);
5728
5729 if (subgroup_size) {
5730 assert(subgroup_size->requiredSubgroupSize == 32 ||
5731 subgroup_size->requiredSubgroupSize == 64);
5732 key.cs.compute_subgroup_size = subgroup_size->requiredSubgroupSize;
5733 } else if (stage->flags & VK_PIPELINE_SHADER_STAGE_CREATE_REQUIRE_FULL_SUBGROUPS_BIT_EXT) {
5734 key.cs.require_full_subgroups = true;
5735 }
5736
5737 return key;
5738 }
5739
5740 VkResult
5741 radv_compute_pipeline_create(VkDevice _device, VkPipelineCache _cache,
5742 const VkComputePipelineCreateInfo *pCreateInfo,
5743 const VkAllocationCallbacks *pAllocator, const uint8_t *custom_hash,
5744 struct radv_pipeline_shader_stack_size *rt_stack_sizes,
5745 uint32_t rt_group_count, VkPipeline *pPipeline)
5746 {
5747 RADV_FROM_HANDLE(radv_device, device, _device);
5748 RADV_FROM_HANDLE(radv_pipeline_cache, cache, _cache);
5749 RADV_FROM_HANDLE(radv_pipeline_layout, pipeline_layout, pCreateInfo->layout);
5750 const VkPipelineShaderStageCreateInfo *pStages[MESA_SHADER_STAGES] = {
5751 0,
5752 };
5753 VkPipelineCreationFeedbackEXT *stage_feedbacks[MESA_SHADER_STAGES] = {0};
5754 struct radv_pipeline *pipeline;
5755 VkResult result;
5756
5757 pipeline = vk_zalloc2(&device->vk.alloc, pAllocator, sizeof(*pipeline), 8,
5758 VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
5759 if (pipeline == NULL) {
5760 free(rt_stack_sizes);
5761 return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
5762 }
5763
5764 vk_object_base_init(&device->vk, &pipeline->base, VK_OBJECT_TYPE_PIPELINE);
5765 pipeline->type = RADV_PIPELINE_COMPUTE;
5766
5767 pipeline->device = device;
5768 pipeline->graphics.last_vgt_api_stage = MESA_SHADER_NONE;
5769 pipeline->compute.rt_stack_sizes = rt_stack_sizes;
5770 pipeline->compute.group_count = rt_group_count;
5771
5772 const VkPipelineCreationFeedbackCreateInfoEXT *creation_feedback =
5773 vk_find_struct_const(pCreateInfo->pNext, PIPELINE_CREATION_FEEDBACK_CREATE_INFO_EXT);
5774 radv_init_feedback(creation_feedback);
5775
5776 VkPipelineCreationFeedbackEXT *pipeline_feedback =
5777 creation_feedback ? creation_feedback->pPipelineCreationFeedback : NULL;
5778 if (creation_feedback)
5779 stage_feedbacks[MESA_SHADER_COMPUTE] = &creation_feedback->pPipelineStageCreationFeedbacks[0];
5780
5781 pStages[MESA_SHADER_COMPUTE] = &pCreateInfo->stage;
5782
5783 struct radv_pipeline_key key = radv_generate_compute_pipeline_key(pipeline, pCreateInfo);
5784
5785 result = radv_create_shaders(pipeline, pipeline_layout, device, cache, &key, pStages,
5786 pCreateInfo->flags, custom_hash, pipeline_feedback, stage_feedbacks);
5787 if (result != VK_SUCCESS) {
5788 radv_pipeline_destroy(device, pipeline, pAllocator);
5789 return result;
5790 }
5791
5792 pipeline->user_data_0[MESA_SHADER_COMPUTE] = radv_pipeline_stage_to_user_data_0(
5793 pipeline, MESA_SHADER_COMPUTE, device->physical_device->rad_info.chip_class);
5794 pipeline->need_indirect_descriptor_sets |=
5795 radv_shader_need_indirect_descriptor_sets(pipeline, MESA_SHADER_COMPUTE);
5796 radv_pipeline_init_scratch(device, pipeline);
5797
5798 pipeline->push_constant_size = pipeline_layout->push_constant_size;
5799 pipeline->dynamic_offset_count = pipeline_layout->dynamic_offset_count;
5800
5801 radv_compute_generate_pm4(pipeline);
5802
5803 *pPipeline = radv_pipeline_to_handle(pipeline);
5804
5805 return VK_SUCCESS;
5806 }
5807
5808 VkResult
5809 radv_CreateComputePipelines(VkDevice _device, VkPipelineCache pipelineCache, uint32_t count,
5810 const VkComputePipelineCreateInfo *pCreateInfos,
5811 const VkAllocationCallbacks *pAllocator, VkPipeline *pPipelines)
5812 {
5813 VkResult result = VK_SUCCESS;
5814
5815 unsigned i = 0;
5816 for (; i < count; i++) {
5817 VkResult r;
5818 r = radv_compute_pipeline_create(_device, pipelineCache, &pCreateInfos[i], pAllocator, NULL,
5819 NULL, 0, &pPipelines[i]);
5820 if (r != VK_SUCCESS) {
5821 result = r;
5822 pPipelines[i] = VK_NULL_HANDLE;
5823
5824 if (pCreateInfos[i].flags & VK_PIPELINE_CREATE_EARLY_RETURN_ON_FAILURE_BIT_EXT)
5825 break;
5826 }
5827 }
5828
5829 for (; i < count; ++i)
5830 pPipelines[i] = VK_NULL_HANDLE;
5831
5832 return result;
5833 }
5834
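/* One pipeline executable is reported per active shader stage, plus one extra
 * entry for the GS copy shader when legacy (non-NGG) GS is used.
 */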
5835 static uint32_t
5836 radv_get_executable_count(const struct radv_pipeline *pipeline)
5837 {
5838 uint32_t ret = 0;
5839 for (int i = 0; i < MESA_SHADER_STAGES; ++i) {
5840 if (!pipeline->shaders[i])
5841 continue;
5842
5843 if (i == MESA_SHADER_GEOMETRY && !radv_pipeline_has_ngg(pipeline)) {
5844 ret += 2u;
5845 } else {
5846 ret += 1u;
5847 }
5848 }
5849 return ret;
5850 }
5851
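/* Map an executable index (as used by VK_KHR_pipeline_executable_properties)
 * back to the shader variant and stage it refers to, accounting for the extra
 * GS copy shader entry on non-NGG pipelines.
 */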
5852 static struct radv_shader_variant *
5853 radv_get_shader_from_executable_index(const struct radv_pipeline *pipeline, int index,
5854 gl_shader_stage *stage)
5855 {
5856 for (int i = 0; i < MESA_SHADER_STAGES; ++i) {
5857 if (!pipeline->shaders[i])
5858 continue;
5859 if (!index) {
5860 *stage = i;
5861 return pipeline->shaders[i];
5862 }
5863
5864 --index;
5865
5866 if (i == MESA_SHADER_GEOMETRY && !radv_pipeline_has_ngg(pipeline)) {
5867 if (!index) {
5868 *stage = i;
5869 return pipeline->gs_copy_shader;
5870 }
5871 --index;
5872 }
5873 }
5874
5875 *stage = MESA_SHADER_NONE;
5876 return NULL;
5877 }
5878
5879 /* Basically strlcpy (which does not exist on Linux), specialized for
5880 * the fixed-size VK_MAX_DESCRIPTION_SIZE description strings. */
5881 static void
5882 desc_copy(char *desc, const char *src)
5883 {
5884 int len = strlen(src);
5885 assert(len < VK_MAX_DESCRIPTION_SIZE);
5886 memcpy(desc, src, len);
5887 memset(desc + len, 0, VK_MAX_DESCRIPTION_SIZE - len);
5888 }
5889
5890 VkResult
5891 radv_GetPipelineExecutablePropertiesKHR(VkDevice _device, const VkPipelineInfoKHR *pPipelineInfo,
5892 uint32_t *pExecutableCount,
5893 VkPipelineExecutablePropertiesKHR *pProperties)
5894 {
5895 RADV_FROM_HANDLE(radv_pipeline, pipeline, pPipelineInfo->pipeline);
5896 const uint32_t total_count = radv_get_executable_count(pipeline);
5897
5898 if (!pProperties) {
5899 *pExecutableCount = total_count;
5900 return VK_SUCCESS;
5901 }
5902
5903 const uint32_t count = MIN2(total_count, *pExecutableCount);
5904 for (unsigned i = 0, executable_idx = 0; i < MESA_SHADER_STAGES && executable_idx < count; ++i) {
5905 if (!pipeline->shaders[i])
5906 continue;
5907 pProperties[executable_idx].stages = mesa_to_vk_shader_stage(i);
5908 const char *name = NULL;
5909 const char *description = NULL;
5910 switch (i) {
5911 case MESA_SHADER_VERTEX:
5912 name = "Vertex Shader";
5913 description = "Vulkan Vertex Shader";
5914 break;
5915 case MESA_SHADER_TESS_CTRL:
5916 if (!pipeline->shaders[MESA_SHADER_VERTEX]) {
5917 pProperties[executable_idx].stages |= VK_SHADER_STAGE_VERTEX_BIT;
5918 name = "Vertex + Tessellation Control Shaders";
5919 description = "Combined Vulkan Vertex and Tessellation Control Shaders";
5920 } else {
5921 name = "Tessellation Control Shader";
5922 description = "Vulkan Tessellation Control Shader";
5923 }
5924 break;
5925 case MESA_SHADER_TESS_EVAL:
5926 name = "Tessellation Evaluation Shader";
5927 description = "Vulkan Tessellation Evaluation Shader";
5928 break;
5929 case MESA_SHADER_GEOMETRY:
5930 if (radv_pipeline_has_tess(pipeline) && !pipeline->shaders[MESA_SHADER_TESS_EVAL]) {
5931 pProperties[executable_idx].stages |= VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT;
5932 name = "Tessellation Evaluation + Geometry Shaders";
5933 description = "Combined Vulkan Tessellation Evaluation and Geometry Shaders";
5934 } else if (!radv_pipeline_has_tess(pipeline) && !pipeline->shaders[MESA_SHADER_VERTEX]) {
5935 pProperties[executable_idx].stages |= VK_SHADER_STAGE_VERTEX_BIT;
5936 name = "Vertex + Geometry Shader";
5937 description = "Combined Vulkan Vertex and Geometry Shaders";
5938 } else {
5939 name = "Geometry Shader";
5940 description = "Vulkan Geometry Shader";
5941 }
5942 break;
5943 case MESA_SHADER_FRAGMENT:
5944 name = "Fragment Shader";
5945 description = "Vulkan Fragment Shader";
5946 break;
5947 case MESA_SHADER_COMPUTE:
5948 name = "Compute Shader";
5949 description = "Vulkan Compute Shader";
5950 break;
5951 }
5952
5953 pProperties[executable_idx].subgroupSize = pipeline->shaders[i]->info.wave_size;
5954 desc_copy(pProperties[executable_idx].name, name);
5955 desc_copy(pProperties[executable_idx].description, description);
5956
5957 ++executable_idx;
5958 if (i == MESA_SHADER_GEOMETRY && !radv_pipeline_has_ngg(pipeline)) {
5959 assert(pipeline->gs_copy_shader);
5960 if (executable_idx >= count)
5961 break;
5962
5963 pProperties[executable_idx].stages = VK_SHADER_STAGE_GEOMETRY_BIT;
5964 pProperties[executable_idx].subgroupSize = 64;
5965 desc_copy(pProperties[executable_idx].name, "GS Copy Shader");
5966 desc_copy(pProperties[executable_idx].description,
5967 "Extra shader stage that loads the GS output ringbuffer into the rasterizer");
5968
5969 ++executable_idx;
5970 }
5971 }
5972
5973 VkResult result = *pExecutableCount < total_count ? VK_INCOMPLETE : VK_SUCCESS;
5974 *pExecutableCount = count;
5975 return result;
5976 }
5977
5978 VkResult
5979 radv_GetPipelineExecutableStatisticsKHR(VkDevice _device,
5980 const VkPipelineExecutableInfoKHR *pExecutableInfo,
5981 uint32_t *pStatisticCount,
5982 VkPipelineExecutableStatisticKHR *pStatistics)
5983 {
5984 RADV_FROM_HANDLE(radv_device, device, _device);
5985 RADV_FROM_HANDLE(radv_pipeline, pipeline, pExecutableInfo->pipeline);
5986 gl_shader_stage stage;
5987 struct radv_shader_variant *shader =
5988 radv_get_shader_from_executable_index(pipeline, pExecutableInfo->executableIndex, &stage);
5989
5990 enum chip_class chip_class = device->physical_device->rad_info.chip_class;
5991 unsigned lds_increment = chip_class >= GFX7 ? 512 : 256;
5992 unsigned max_waves = radv_get_max_waves(device, shader, stage);
5993
5994 VkPipelineExecutableStatisticKHR *s = pStatistics;
5995 VkPipelineExecutableStatisticKHR *end = s + (pStatistics ? *pStatisticCount : 0);
5996 VkResult result = VK_SUCCESS;
5997
5998 if (s < end) {
5999 desc_copy(s->name, "SGPRs");
6000 desc_copy(s->description, "Number of SGPR registers allocated per subgroup");
6001 s->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
6002 s->value.u64 = shader->config.num_sgprs;
6003 }
6004 ++s;
6005
6006 if (s < end) {
6007 desc_copy(s->name, "VGPRs");
6008 desc_copy(s->description, "Number of VGPR registers allocated per subgroup");
6009 s->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
6010 s->value.u64 = shader->config.num_vgprs;
6011 }
6012 ++s;
6013
6014 if (s < end) {
6015 desc_copy(s->name, "Spilled SGPRs");
6016 desc_copy(s->description, "Number of SGPR registers spilled per subgroup");
6017 s->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
6018 s->value.u64 = shader->config.spilled_sgprs;
6019 }
6020 ++s;
6021
6022 if (s < end) {
6023 desc_copy(s->name, "Spilled VGPRs");
6024 desc_copy(s->description, "Number of VGPR registers spilled per subgroup");
6025 s->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
6026 s->value.u64 = shader->config.spilled_vgprs;
6027 }
6028 ++s;
6029
6030 if (s < end) {
6031 desc_copy(s->name, "Code size");
6032 desc_copy(s->description, "Code size in bytes");
6033 s->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
6034 s->value.u64 = shader->exec_size;
6035 }
6036 ++s;
6037
6038 if (s < end) {
6039 desc_copy(s->name, "LDS size");
6040 desc_copy(s->description, "LDS size in bytes per workgroup");
6041 s->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
6042 s->value.u64 = shader->config.lds_size * lds_increment;
6043 }
6044 ++s;
6045
6046 if (s < end) {
6047 desc_copy(s->name, "Scratch size");
6048 desc_copy(s->description, "Private memory in bytes per subgroup");
6049 s->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
6050 s->value.u64 = shader->config.scratch_bytes_per_wave;
6051 }
6052 ++s;
6053
6054 if (s < end) {
6055 desc_copy(s->name, "Subgroups per SIMD");
6056 desc_copy(s->description, "The maximum number of subgroups in flight on a SIMD unit");
6057 s->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
6058 s->value.u64 = max_waves;
6059 }
6060 ++s;
6061
6062 if (shader->statistics) {
6063 for (unsigned i = 0; i < aco_num_statistics; i++) {
6064 const struct aco_compiler_statistic_info *info = &aco_statistic_infos[i];
6065 if (s < end) {
6066 desc_copy(s->name, info->name);
6067 desc_copy(s->description, info->desc);
6068 s->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
6069 s->value.u64 = shader->statistics[i];
6070 }
6071 ++s;
6072 }
6073 }
6074
6075 if (!pStatistics)
6076 *pStatisticCount = s - pStatistics;
6077 else if (s > end) {
6078 *pStatisticCount = end - pStatistics;
6079 result = VK_INCOMPLETE;
6080 } else {
6081 *pStatisticCount = s - pStatistics;
6082 }
6083
6084 return result;
6085 }
6086
6087 static VkResult
6088 radv_copy_representation(void *data, size_t *data_size, const char *src)
6089 {
6090 size_t total_size = strlen(src) + 1;
6091
6092 if (!data) {
6093 *data_size = total_size;
6094 return VK_SUCCESS;
6095 }
6096
6097 size_t size = MIN2(total_size, *data_size);
6098
6099 memcpy(data, src, size);
6100 if (size)
6101 *((char *)data + size - 1) = 0;
6102 return size < total_size ? VK_INCOMPLETE : VK_SUCCESS;
6103 }
6104
6105 VkResult
6106 radv_GetPipelineExecutableInternalRepresentationsKHR(
6107 VkDevice device, const VkPipelineExecutableInfoKHR *pExecutableInfo,
6108 uint32_t *pInternalRepresentationCount,
6109 VkPipelineExecutableInternalRepresentationKHR *pInternalRepresentations)
6110 {
6111 RADV_FROM_HANDLE(radv_pipeline, pipeline, pExecutableInfo->pipeline);
6112 gl_shader_stage stage;
6113 struct radv_shader_variant *shader =
6114 radv_get_shader_from_executable_index(pipeline, pExecutableInfo->executableIndex, &stage);
6115
6116 VkPipelineExecutableInternalRepresentationKHR *p = pInternalRepresentations;
6117 VkPipelineExecutableInternalRepresentationKHR *end =
6118 p + (pInternalRepresentations ? *pInternalRepresentationCount : 0);
6119 VkResult result = VK_SUCCESS;
6120 /* optimized NIR */
6121 if (p < end) {
6122 p->isText = true;
6123 desc_copy(p->name, "NIR Shader(s)");
6124 desc_copy(p->description, "The optimized NIR shader(s)");
6125 if (radv_copy_representation(p->pData, &p->dataSize, shader->nir_string) != VK_SUCCESS)
6126 result = VK_INCOMPLETE;
6127 }
6128 ++p;
6129
6130 /* backend IR */
6131 if (p < end) {
6132 p->isText = true;
6133 if (radv_use_llvm_for_stage(pipeline->device, stage)) {
6134 desc_copy(p->name, "LLVM IR");
6135 desc_copy(p->description, "The LLVM IR after some optimizations");
6136 } else {
6137 desc_copy(p->name, "ACO IR");
6138 desc_copy(p->description, "The ACO IR after some optimizations");
6139 }
6140 if (radv_copy_representation(p->pData, &p->dataSize, shader->ir_string) != VK_SUCCESS)
6141 result = VK_INCOMPLETE;
6142 }
6143 ++p;
6144
6145 /* Disassembler */
6146 if (p < end && shader->disasm_string) {
6147 p->isText = true;
6148 desc_copy(p->name, "Assembly");
6149 desc_copy(p->description, "Final Assembly");
6150 if (radv_copy_representation(p->pData, &p->dataSize, shader->disasm_string) != VK_SUCCESS)
6151 result = VK_INCOMPLETE;
6152 }
6153 ++p;
6154
6155 if (!pInternalRepresentations)
6156 *pInternalRepresentationCount = p - pInternalRepresentations;
6157 else if (p > end) {
6158 result = VK_INCOMPLETE;
6159 *pInternalRepresentationCount = end - pInternalRepresentations;
6160 } else {
6161 *pInternalRepresentationCount = p - pInternalRepresentations;
6162 }
6163
6164 return result;
6165 }
6166