1 /*
2 * Copyright © 2016 Red Hat.
3 * Copyright © 2016 Bas Nieuwenhuizen
4 *
5 * based in part on anv driver which is:
6 * Copyright © 2015 Intel Corporation
7 *
8 * Permission is hereby granted, free of charge, to any person obtaining a
9 * copy of this software and associated documentation files (the "Software"),
10 * to deal in the Software without restriction, including without limitation
11 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
12 * and/or sell copies of the Software, and to permit persons to whom the
13 * Software is furnished to do so, subject to the following conditions:
14 *
15 * The above copyright notice and this permission notice (including the next
16 * paragraph) shall be included in all copies or substantial portions of the
17 * Software.
18 *
19 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
20 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
22 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
24 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
25 * IN THE SOFTWARE.
26 */
27
28 #include "nir/nir.h"
29 #include "nir/nir_builder.h"
30 #include "spirv/nir_spirv.h"
31 #include "util/disk_cache.h"
32 #include "util/mesa-sha1.h"
33 #include "util/os_time.h"
34 #include "util/u_atomic.h"
35 #include "radv_cs.h"
36 #include "radv_debug.h"
37 #include "radv_meta.h"
38 #include "radv_private.h"
39 #include "radv_shader.h"
40 #include "radv_shader_args.h"
41 #include "vk_pipeline.h"
42 #include "vk_util.h"
43
44 #include "util/debug.h"
45 #include "ac_binary.h"
46 #include "ac_nir.h"
47 #include "ac_shader_util.h"
48 #include "aco_interface.h"
49 #include "sid.h"
50 #include "vk_format.h"
51
52 struct radv_blend_state {
53 uint32_t blend_enable_4bit;
54 uint32_t need_src_alpha;
55
56 uint32_t cb_target_mask;
57 uint32_t cb_target_enabled_4bit;
58 uint32_t sx_mrt_blend_opt[8];
59 uint32_t cb_blend_control[8];
60
61 uint32_t spi_shader_col_format;
62 uint32_t col_format_is_int8;
63 uint32_t col_format_is_int10;
64 uint32_t col_format_is_float32;
65 uint32_t cb_shader_mask;
66 uint32_t db_alpha_to_mask;
67
68 uint32_t commutative_4bit;
69
70 bool mrt0_is_dual_src;
71 };
72
73 struct radv_depth_stencil_state {
74 uint32_t db_render_control;
75 uint32_t db_render_override;
76 uint32_t db_render_override2;
77 };
78
79 struct radv_dsa_order_invariance {
80 /* Whether the final result in Z/S buffers is guaranteed to be
81 * invariant under changes to the order in which fragments arrive.
82 */
83 bool zs;
84
85 /* Whether the set of fragments that pass the combined Z/S test is
86 * guaranteed to be invariant under changes to the order in which
87 * fragments arrive.
88 */
89 bool pass_set;
90 };
91
92 static bool
93 radv_is_raster_enabled(const struct radv_graphics_pipeline *pipeline,
94 const VkGraphicsPipelineCreateInfo *pCreateInfo)
95 {
96 return !pCreateInfo->pRasterizationState->rasterizerDiscardEnable ||
97 (pipeline->dynamic_states & RADV_DYNAMIC_RASTERIZER_DISCARD_ENABLE);
98 }
99
100 static bool
101 radv_is_static_vrs_enabled(const struct radv_graphics_pipeline *pipeline,
102 const struct radv_graphics_pipeline_info *info)
103 {
104 return info->fsr.size.width != 1 || info->fsr.size.height != 1 ||
105 info->fsr.combiner_ops[0] != VK_FRAGMENT_SHADING_RATE_COMBINER_OP_KEEP_KHR ||
106 info->fsr.combiner_ops[1] != VK_FRAGMENT_SHADING_RATE_COMBINER_OP_KEEP_KHR;
107 }
108
109 static bool
110 radv_is_vrs_enabled(const struct radv_graphics_pipeline *pipeline,
111 const struct radv_graphics_pipeline_info *info)
112 {
113 return radv_is_static_vrs_enabled(pipeline, info) ||
114 (pipeline->dynamic_states & RADV_DYNAMIC_FRAGMENT_SHADING_RATE);
115 }
116
117 static bool
118 radv_pipeline_has_ds_attachments(const struct radv_rendering_info *ri_info)
119 {
120 return ri_info->depth_att_format != VK_FORMAT_UNDEFINED ||
121 ri_info->stencil_att_format != VK_FORMAT_UNDEFINED;
122 }
123
124 static bool
125 radv_pipeline_has_color_attachments(const struct radv_rendering_info *ri_info)
126 {
127 for (uint32_t i = 0; i < ri_info->color_att_count; ++i) {
128 if (ri_info->color_att_formats[i] != VK_FORMAT_UNDEFINED)
129 return true;
130 }
131
132 return false;
133 }
134
135 static bool
136 radv_pipeline_has_ngg(const struct radv_graphics_pipeline *pipeline)
137 {
138 struct radv_shader *shader = pipeline->base.shaders[pipeline->last_vgt_api_stage];
139
140 return shader->info.is_ngg;
141 }
142
143 bool
144 radv_pipeline_has_ngg_passthrough(const struct radv_graphics_pipeline *pipeline)
145 {
146 assert(radv_pipeline_has_ngg(pipeline));
147
148 struct radv_shader *shader = pipeline->base.shaders[pipeline->last_vgt_api_stage];
149
150 return shader->info.is_ngg_passthrough;
151 }
152
153 bool
154 radv_pipeline_has_gs_copy_shader(const struct radv_pipeline *pipeline)
155 {
156 return !!pipeline->gs_copy_shader;
157 }
158
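/* Allocate the pipeline slab that backs the shader code of this pipeline.
 * The slab is reference counted: it starts with one reference and is freed
 * in radv_pipeline_slab_destroy() once the last reference is dropped.
 */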
159 static struct radv_pipeline_slab *
160 radv_pipeline_slab_create(struct radv_device *device, struct radv_pipeline *pipeline,
161 uint32_t code_size)
162 {
163 struct radv_pipeline_slab *slab;
164
165 slab = calloc(1, sizeof(*slab));
166 if (!slab)
167 return NULL;
168
169 slab->ref_count = 1;
170
171 slab->alloc = radv_alloc_shader_memory(device, code_size, pipeline);
172 if (!slab->alloc) {
173 free(slab);
174 return NULL;
175 }
176
177 return slab;
178 }
179
180 void
181 radv_pipeline_slab_destroy(struct radv_device *device, struct radv_pipeline_slab *slab)
182 {
183 if (!p_atomic_dec_zero(&slab->ref_count))
184 return;
185
186 radv_free_shader_memory(device, slab->alloc);
187 free(slab);
188 }
189
190 void
191 radv_pipeline_destroy(struct radv_device *device, struct radv_pipeline *pipeline,
192 const VkAllocationCallbacks *allocator)
193 {
194 if (pipeline->type == RADV_PIPELINE_COMPUTE) {
195 struct radv_compute_pipeline *compute_pipeline = radv_pipeline_to_compute(pipeline);
196
197 free(compute_pipeline->rt_group_handles);
198 free(compute_pipeline->rt_stack_sizes);
199 } else if (pipeline->type == RADV_PIPELINE_LIBRARY) {
200 struct radv_library_pipeline *library_pipeline = radv_pipeline_to_library(pipeline);
201
202 free(library_pipeline->groups);
203 for (uint32_t i = 0; i < library_pipeline->stage_count; i++) {
204 RADV_FROM_HANDLE(vk_shader_module, module, library_pipeline->stages[i].module);
205 if (module) {
206 vk_object_base_finish(&module->base);
207 ralloc_free(module);
208 }
209 }
210 free(library_pipeline->stages);
211 free(library_pipeline->identifiers);
212 free(library_pipeline->hashes);
213 }
214
215 if (pipeline->slab)
216 radv_pipeline_slab_destroy(device, pipeline->slab);
217
218 for (unsigned i = 0; i < MESA_VULKAN_SHADER_STAGES; ++i)
219 if (pipeline->shaders[i])
220 radv_shader_destroy(device, pipeline->shaders[i]);
221
222 if (pipeline->gs_copy_shader)
223 radv_shader_destroy(device, pipeline->gs_copy_shader);
224
225 if (pipeline->cs.buf)
226 free(pipeline->cs.buf);
227
228 vk_object_base_finish(&pipeline->base);
229 vk_free2(&device->vk.alloc, allocator, pipeline);
230 }
231
232 VKAPI_ATTR void VKAPI_CALL
233 radv_DestroyPipeline(VkDevice _device, VkPipeline _pipeline,
234 const VkAllocationCallbacks *pAllocator)
235 {
236 RADV_FROM_HANDLE(radv_device, device, _device);
237 RADV_FROM_HANDLE(radv_pipeline, pipeline, _pipeline);
238
239 if (!_pipeline)
240 return;
241
242 radv_pipeline_destroy(device, pipeline, pAllocator);
243 }
244
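/* Collect device-wide settings that affect shader compilation (wave sizes,
 * robustness, LLVM vs. ACO, etc.) into a RADV_HASH_SHADER_* bitmask that is
 * folded into the shader hash, so cached binaries built with different
 * configurations do not collide.
 */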
245 uint32_t
246 radv_get_hash_flags(const struct radv_device *device, bool stats)
247 {
248 uint32_t hash_flags = 0;
249
250 if (device->physical_device->use_ngg_culling)
251 hash_flags |= RADV_HASH_SHADER_USE_NGG_CULLING;
252 if (device->instance->perftest_flags & RADV_PERFTEST_EMULATE_RT)
253 hash_flags |= RADV_HASH_SHADER_EMULATE_RT;
254 if (device->physical_device->rt_wave_size == 64)
255 hash_flags |= RADV_HASH_SHADER_RT_WAVE64;
256 if (device->physical_device->cs_wave_size == 32)
257 hash_flags |= RADV_HASH_SHADER_CS_WAVE32;
258 if (device->physical_device->ps_wave_size == 32)
259 hash_flags |= RADV_HASH_SHADER_PS_WAVE32;
260 if (device->physical_device->ge_wave_size == 32)
261 hash_flags |= RADV_HASH_SHADER_GE_WAVE32;
262 if (device->physical_device->use_llvm)
263 hash_flags |= RADV_HASH_SHADER_LLVM;
264 if (stats)
265 hash_flags |= RADV_HASH_SHADER_KEEP_STATISTICS;
266 if (device->robust_buffer_access) /* forces per-attribute vertex descriptors */
267 hash_flags |= RADV_HASH_SHADER_ROBUST_BUFFER_ACCESS;
268 if (device->robust_buffer_access2) /* affects load/store vectorizer */
269 hash_flags |= RADV_HASH_SHADER_ROBUST_BUFFER_ACCESS2;
270 if (device->instance->debug_flags & RADV_DEBUG_SPLIT_FMA)
271 hash_flags |= RADV_HASH_SHADER_SPLIT_FMA;
272 return hash_flags;
273 }
274
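/* Compute the scratch (spill) requirements of the whole pipeline: the
 * per-wave scratch size is the maximum over all stages, and max_waves bounds
 * how many waves may need scratch memory at the same time.
 */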
275 static void
276 radv_pipeline_init_scratch(const struct radv_device *device, struct radv_pipeline *pipeline)
277 {
278 unsigned scratch_bytes_per_wave = 0;
279 unsigned max_waves = 0;
280
281 for (int i = 0; i < MESA_VULKAN_SHADER_STAGES; ++i) {
282 if (pipeline->shaders[i] && pipeline->shaders[i]->config.scratch_bytes_per_wave) {
283 unsigned max_stage_waves = device->scratch_waves;
284
285 scratch_bytes_per_wave =
286 MAX2(scratch_bytes_per_wave, pipeline->shaders[i]->config.scratch_bytes_per_wave);
287
288 max_stage_waves =
289 MIN2(max_stage_waves, 4 * device->physical_device->rad_info.num_cu *
290 radv_get_max_waves(device, pipeline->shaders[i], i));
291 max_waves = MAX2(max_waves, max_stage_waves);
292 }
293 }
294
295 pipeline->scratch_bytes_per_wave = scratch_bytes_per_wave;
296 pipeline->max_waves = max_waves;
297 }
298
299 static uint32_t
300 si_translate_blend_function(VkBlendOp op)
301 {
302 switch (op) {
303 case VK_BLEND_OP_ADD:
304 return V_028780_COMB_DST_PLUS_SRC;
305 case VK_BLEND_OP_SUBTRACT:
306 return V_028780_COMB_SRC_MINUS_DST;
307 case VK_BLEND_OP_REVERSE_SUBTRACT:
308 return V_028780_COMB_DST_MINUS_SRC;
309 case VK_BLEND_OP_MIN:
310 return V_028780_COMB_MIN_DST_SRC;
311 case VK_BLEND_OP_MAX:
312 return V_028780_COMB_MAX_DST_SRC;
313 default:
314 return 0;
315 }
316 }
317
318 static uint32_t
319 si_translate_blend_factor(enum amd_gfx_level gfx_level, VkBlendFactor factor)
320 {
321 switch (factor) {
322 case VK_BLEND_FACTOR_ZERO:
323 return V_028780_BLEND_ZERO;
324 case VK_BLEND_FACTOR_ONE:
325 return V_028780_BLEND_ONE;
326 case VK_BLEND_FACTOR_SRC_COLOR:
327 return V_028780_BLEND_SRC_COLOR;
328 case VK_BLEND_FACTOR_ONE_MINUS_SRC_COLOR:
329 return V_028780_BLEND_ONE_MINUS_SRC_COLOR;
330 case VK_BLEND_FACTOR_DST_COLOR:
331 return V_028780_BLEND_DST_COLOR;
332 case VK_BLEND_FACTOR_ONE_MINUS_DST_COLOR:
333 return V_028780_BLEND_ONE_MINUS_DST_COLOR;
334 case VK_BLEND_FACTOR_SRC_ALPHA:
335 return V_028780_BLEND_SRC_ALPHA;
336 case VK_BLEND_FACTOR_ONE_MINUS_SRC_ALPHA:
337 return V_028780_BLEND_ONE_MINUS_SRC_ALPHA;
338 case VK_BLEND_FACTOR_DST_ALPHA:
339 return V_028780_BLEND_DST_ALPHA;
340 case VK_BLEND_FACTOR_ONE_MINUS_DST_ALPHA:
341 return V_028780_BLEND_ONE_MINUS_DST_ALPHA;
342 case VK_BLEND_FACTOR_CONSTANT_COLOR:
343 return gfx_level >= GFX11 ? V_028780_BLEND_CONSTANT_COLOR_GFX11
344 : V_028780_BLEND_CONSTANT_COLOR_GFX6;
345 case VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_COLOR:
346 return gfx_level >= GFX11 ? V_028780_BLEND_ONE_MINUS_CONSTANT_COLOR_GFX11
347 : V_028780_BLEND_ONE_MINUS_CONSTANT_COLOR_GFX6;
348 case VK_BLEND_FACTOR_CONSTANT_ALPHA:
349 return gfx_level >= GFX11 ? V_028780_BLEND_CONSTANT_ALPHA_GFX11
350 : V_028780_BLEND_CONSTANT_ALPHA_GFX6;
351 case VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_ALPHA:
352 return gfx_level >= GFX11 ? V_028780_BLEND_ONE_MINUS_CONSTANT_ALPHA_GFX11
353 : V_028780_BLEND_ONE_MINUS_CONSTANT_ALPHA_GFX6;
354 case VK_BLEND_FACTOR_SRC_ALPHA_SATURATE:
355 return V_028780_BLEND_SRC_ALPHA_SATURATE;
356 case VK_BLEND_FACTOR_SRC1_COLOR:
357 return gfx_level >= GFX11 ? V_028780_BLEND_SRC1_COLOR_GFX11 : V_028780_BLEND_SRC1_COLOR_GFX6;
358 case VK_BLEND_FACTOR_ONE_MINUS_SRC1_COLOR:
359 return gfx_level >= GFX11 ? V_028780_BLEND_INV_SRC1_COLOR_GFX11
360 : V_028780_BLEND_INV_SRC1_COLOR_GFX6;
361 case VK_BLEND_FACTOR_SRC1_ALPHA:
362 return gfx_level >= GFX11 ? V_028780_BLEND_SRC1_ALPHA_GFX11 : V_028780_BLEND_SRC1_ALPHA_GFX6;
363 case VK_BLEND_FACTOR_ONE_MINUS_SRC1_ALPHA:
364 return gfx_level >= GFX11 ? V_028780_BLEND_INV_SRC1_ALPHA_GFX11
365 : V_028780_BLEND_INV_SRC1_ALPHA_GFX6;
366 default:
367 return 0;
368 }
369 }
370
371 static uint32_t
372 si_translate_blend_opt_function(unsigned op)
373 {
374 switch (op) {
375 case V_028780_COMB_DST_PLUS_SRC:
376 return V_028760_OPT_COMB_ADD;
377 case V_028780_COMB_SRC_MINUS_DST:
378 return V_028760_OPT_COMB_SUBTRACT;
379 case V_028780_COMB_DST_MINUS_SRC:
380 return V_028760_OPT_COMB_REVSUBTRACT;
381 case V_028780_COMB_MIN_DST_SRC:
382 return V_028760_OPT_COMB_MIN;
383 case V_028780_COMB_MAX_DST_SRC:
384 return V_028760_OPT_COMB_MAX;
385 default:
386 return V_028760_OPT_COMB_BLEND_DISABLED;
387 }
388 }
389
390 static uint32_t
391 si_translate_blend_opt_factor(unsigned factor, bool is_alpha)
392 {
393 switch (factor) {
394 case V_028780_BLEND_ZERO:
395 return V_028760_BLEND_OPT_PRESERVE_NONE_IGNORE_ALL;
396 case V_028780_BLEND_ONE:
397 return V_028760_BLEND_OPT_PRESERVE_ALL_IGNORE_NONE;
398 case V_028780_BLEND_SRC_COLOR:
399 return is_alpha ? V_028760_BLEND_OPT_PRESERVE_A1_IGNORE_A0
400 : V_028760_BLEND_OPT_PRESERVE_C1_IGNORE_C0;
401 case V_028780_BLEND_ONE_MINUS_SRC_COLOR:
402 return is_alpha ? V_028760_BLEND_OPT_PRESERVE_A0_IGNORE_A1
403 : V_028760_BLEND_OPT_PRESERVE_C0_IGNORE_C1;
404 case V_028780_BLEND_SRC_ALPHA:
405 return V_028760_BLEND_OPT_PRESERVE_A1_IGNORE_A0;
406 case V_028780_BLEND_ONE_MINUS_SRC_ALPHA:
407 return V_028760_BLEND_OPT_PRESERVE_A0_IGNORE_A1;
408 case V_028780_BLEND_SRC_ALPHA_SATURATE:
409 return is_alpha ? V_028760_BLEND_OPT_PRESERVE_ALL_IGNORE_NONE
410 : V_028760_BLEND_OPT_PRESERVE_NONE_IGNORE_A0;
411 default:
412 return V_028760_BLEND_OPT_PRESERVE_NONE_IGNORE_NONE;
413 }
414 }
415
416 /**
417 * Get rid of DST in the blend factors by commuting the operands:
418 * func(src * DST, dst * 0) ---> func(src * 0, dst * SRC)
419 */
420 static void
421 si_blend_remove_dst(unsigned *func, unsigned *src_factor, unsigned *dst_factor,
422 unsigned expected_dst, unsigned replacement_src)
423 {
424 if (*src_factor == expected_dst && *dst_factor == V_028780_BLEND_ZERO) {
425 *src_factor = V_028780_BLEND_ZERO;
426 *dst_factor = replacement_src;
427
428 /* Commuting the operands requires reversing subtractions. */
429 if (*func == V_028780_COMB_SRC_MINUS_DST)
430 *func = V_028780_COMB_DST_MINUS_SRC;
431 else if (*func == V_028780_COMB_DST_MINUS_SRC)
432 *func = V_028780_COMB_SRC_MINUS_DST;
433 }
434 }
435
436 static bool
437 si_blend_factor_uses_dst(unsigned factor)
438 {
439 return factor == V_028780_BLEND_DST_COLOR ||
440 factor == V_028780_BLEND_DST_ALPHA ||
441 factor == V_028780_BLEND_SRC_ALPHA_SATURATE ||
442 factor == V_028780_BLEND_ONE_MINUS_DST_ALPHA ||
443 factor == V_028780_BLEND_ONE_MINUS_DST_COLOR;
444 }
445
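/* Check whether a hardware blend factor reads the second color export (SRC1),
 * i.e. dual-source blending. The register encodings differ between GFX11 and
 * earlier generations, hence the two switch statements.
 */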
446 static bool
447 is_dual_src(enum amd_gfx_level gfx_level, unsigned factor)
448 {
449 if (gfx_level >= GFX11) {
450 switch (factor) {
451 case V_028780_BLEND_SRC1_COLOR_GFX11:
452 case V_028780_BLEND_INV_SRC1_COLOR_GFX11:
453 case V_028780_BLEND_SRC1_ALPHA_GFX11:
454 case V_028780_BLEND_INV_SRC1_ALPHA_GFX11:
455 return true;
456 default:
457 return false;
458 }
459 } else {
460 switch (factor) {
461 case V_028780_BLEND_SRC1_COLOR_GFX6:
462 case V_028780_BLEND_INV_SRC1_COLOR_GFX6:
463 case V_028780_BLEND_SRC1_ALPHA_GFX6:
464 case V_028780_BLEND_INV_SRC1_ALPHA_GFX6:
465 return true;
466 default:
467 return false;
468 }
469 }
470 }
471
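/* Pick the SPI color export format for a render target format, taking into
 * account whether blending is enabled and whether the blend equation needs
 * the alpha channel.
 */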
472 static unsigned
473 radv_choose_spi_color_format(const struct radv_device *device, VkFormat vk_format,
474 bool blend_enable, bool blend_need_alpha)
475 {
476 const struct util_format_description *desc = vk_format_description(vk_format);
477 bool use_rbplus = device->physical_device->rad_info.rbplus_allowed;
478 struct ac_spi_color_formats formats = {0};
479 unsigned format, ntype, swap;
480
481 format = radv_translate_colorformat(vk_format);
482 ntype = radv_translate_color_numformat(vk_format, desc,
483 vk_format_get_first_non_void_channel(vk_format));
484 swap = radv_translate_colorswap(vk_format, false);
485
486 ac_choose_spi_color_formats(format, swap, ntype, false, use_rbplus, &formats);
487
488 if (blend_enable && blend_need_alpha)
489 return formats.blend_alpha;
490 else if (blend_need_alpha)
491 return formats.alpha;
492 else if (blend_enable)
493 return formats.blend;
494 else
495 return formats.normal;
496 }
497
498 static bool
499 format_is_int8(VkFormat format)
500 {
501 const struct util_format_description *desc = vk_format_description(format);
502 int channel = vk_format_get_first_non_void_channel(format);
503
504 return channel >= 0 && desc->channel[channel].pure_integer && desc->channel[channel].size == 8;
505 }
506
507 static bool
508 format_is_int10(VkFormat format)
509 {
510 const struct util_format_description *desc = vk_format_description(format);
511
512 if (desc->nr_channels != 4)
513 return false;
514 for (unsigned i = 0; i < 4; i++) {
515 if (desc->channel[i].pure_integer && desc->channel[i].size == 10)
516 return true;
517 }
518 return false;
519 }
520
521 static bool
522 format_is_float32(VkFormat format)
523 {
524 const struct util_format_description *desc = vk_format_description(format);
525 int channel = vk_format_get_first_non_void_channel(format);
526
527 return channel >= 0 &&
528 desc->channel[channel].type == UTIL_FORMAT_TYPE_FLOAT && desc->channel[channel].size == 32;
529 }
530
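/* Derive SPI_SHADER_COL_FORMAT, CB_SHADER_MASK and the int8/int10/float32
 * masks from the bound color attachment formats and the blend state.
 */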
531 static void
532 radv_pipeline_compute_spi_color_formats(const struct radv_graphics_pipeline *pipeline,
533 const VkGraphicsPipelineCreateInfo *pCreateInfo,
534 struct radv_blend_state *blend,
535 const struct radv_graphics_pipeline_info *info)
536 {
537 unsigned col_format = 0, is_int8 = 0, is_int10 = 0, is_float32 = 0;
538 unsigned num_targets;
539
540 for (unsigned i = 0; i < info->ri.color_att_count; ++i) {
541 unsigned cf;
542 VkFormat fmt = info->ri.color_att_formats[i];
543
544 if (fmt == VK_FORMAT_UNDEFINED || !(blend->cb_target_mask & (0xfu << (i * 4)))) {
545 cf = V_028714_SPI_SHADER_ZERO;
546 } else {
547 bool blend_enable = blend->blend_enable_4bit & (0xfu << (i * 4));
548
549 cf = radv_choose_spi_color_format(pipeline->base.device, fmt, blend_enable,
550 blend->need_src_alpha & (1 << i));
551
552 if (format_is_int8(fmt))
553 is_int8 |= 1 << i;
554 if (format_is_int10(fmt))
555 is_int10 |= 1 << i;
556 if (format_is_float32(fmt))
557 is_float32 |= 1 << i;
558 }
559
560 col_format |= cf << (4 * i);
561 }
562
563 if (!(col_format & 0xf) && blend->need_src_alpha & (1 << 0)) {
564 /* When a subpass doesn't have any color attachments, write the
565 * alpha channel of MRT0 when alpha coverage is enabled because
566 * the depth attachment needs it.
567 */
568 col_format |= V_028714_SPI_SHADER_32_AR;
569 }
570
571 /* If the i-th target format is set, all previous target formats must
572 * be non-zero to avoid hangs.
573 */
574 num_targets = (util_last_bit(col_format) + 3) / 4;
575 for (unsigned i = 0; i < num_targets; i++) {
576 if (!(col_format & (0xfu << (i * 4)))) {
577 col_format |= V_028714_SPI_SHADER_32_R << (i * 4);
578 }
579 }
580
581 /* The output for dual source blending should have the same format as
582 * the first output.
583 */
584 if (blend->mrt0_is_dual_src) {
585 assert(!(col_format >> 4));
586 col_format |= (col_format & 0xf) << 4;
587 }
588
589 blend->cb_shader_mask = ac_get_cb_shader_mask(col_format);
590 blend->spi_shader_col_format = col_format;
591 blend->col_format_is_int8 = is_int8;
592 blend->col_format_is_int10 = is_int10;
593 blend->col_format_is_float32 = is_float32;
594 }
595
596 /*
597 * Ordered so that for each i,
598 * radv_format_meta_fs_key(device, radv_fs_key_format_exemplars[i]) == i.
599 */
600 const VkFormat radv_fs_key_format_exemplars[NUM_META_FS_KEYS] = {
601 VK_FORMAT_R32_SFLOAT,
602 VK_FORMAT_R32G32_SFLOAT,
603 VK_FORMAT_R8G8B8A8_UNORM,
604 VK_FORMAT_R16G16B16A16_UNORM,
605 VK_FORMAT_R16G16B16A16_SNORM,
606 VK_FORMAT_R16G16B16A16_UINT,
607 VK_FORMAT_R16G16B16A16_SINT,
608 VK_FORMAT_R32G32B32A32_SFLOAT,
609 VK_FORMAT_R8G8B8A8_UINT,
610 VK_FORMAT_R8G8B8A8_SINT,
611 VK_FORMAT_A2R10G10B10_UINT_PACK32,
612 VK_FORMAT_A2R10G10B10_SINT_PACK32,
613 };
614
615 unsigned
616 radv_format_meta_fs_key(struct radv_device *device, VkFormat format)
617 {
618 unsigned col_format = radv_choose_spi_color_format(device, format, false, false);
619 assert(col_format != V_028714_SPI_SHADER_32_AR);
620
621 bool is_int8 = format_is_int8(format);
622 bool is_int10 = format_is_int10(format);
623
624 if (col_format == V_028714_SPI_SHADER_UINT16_ABGR && is_int8)
625 return 8;
626 else if (col_format == V_028714_SPI_SHADER_SINT16_ABGR && is_int8)
627 return 9;
628 else if (col_format == V_028714_SPI_SHADER_UINT16_ABGR && is_int10)
629 return 10;
630 else if (col_format == V_028714_SPI_SHADER_SINT16_ABGR && is_int10)
631 return 11;
632 else {
633 if (col_format >= V_028714_SPI_SHADER_32_AR)
634 --col_format; /* Skip V_028714_SPI_SHADER_32_AR since there is no such VkFormat */
635
636 --col_format; /* Skip V_028714_SPI_SHADER_ZERO */
637 return col_format;
638 }
639 }
640
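/* Record which channels use a commutative blend configuration; this is later
 * used to decide whether out-of-order rasterization is safe to enable.
 */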
641 static void
642 radv_blend_check_commutativity(enum amd_gfx_level gfx_level, struct radv_blend_state *blend,
643 unsigned op, unsigned src, unsigned dst, unsigned chanmask)
644 {
645 bool is_src_allowed = false;
646
647 /* Src factor is allowed when it does not depend on Dst. */
648 if (src == V_028780_BLEND_ZERO ||
649 src == V_028780_BLEND_ONE ||
650 src == V_028780_BLEND_SRC_COLOR ||
651 src == V_028780_BLEND_SRC_ALPHA ||
652 src == V_028780_BLEND_SRC_ALPHA_SATURATE ||
653 src == V_028780_BLEND_ONE_MINUS_SRC_COLOR ||
654 src == V_028780_BLEND_ONE_MINUS_SRC_ALPHA) {
655 is_src_allowed = true;
656 }
657
658 if (gfx_level >= GFX11) {
659 if (src == V_028780_BLEND_CONSTANT_COLOR_GFX11 ||
660 src == V_028780_BLEND_CONSTANT_ALPHA_GFX11 ||
661 src == V_028780_BLEND_SRC1_COLOR_GFX11 ||
662 src == V_028780_BLEND_SRC1_ALPHA_GFX11 ||
663 src == V_028780_BLEND_ONE_MINUS_CONSTANT_COLOR_GFX11 ||
664 src == V_028780_BLEND_ONE_MINUS_CONSTANT_ALPHA_GFX11 ||
665 src == V_028780_BLEND_INV_SRC1_COLOR_GFX11 ||
666 src == V_028780_BLEND_INV_SRC1_ALPHA_GFX11) {
667 is_src_allowed = true;
668 }
669 } else {
670 if (src == V_028780_BLEND_CONSTANT_COLOR_GFX6 ||
671 src == V_028780_BLEND_CONSTANT_ALPHA_GFX6 ||
672 src == V_028780_BLEND_SRC1_COLOR_GFX6 ||
673 src == V_028780_BLEND_SRC1_ALPHA_GFX6 ||
674 src == V_028780_BLEND_ONE_MINUS_CONSTANT_COLOR_GFX6 ||
675 src == V_028780_BLEND_ONE_MINUS_CONSTANT_ALPHA_GFX6 ||
676 src == V_028780_BLEND_INV_SRC1_COLOR_GFX6 ||
677 src == V_028780_BLEND_INV_SRC1_ALPHA_GFX6) {
678 is_src_allowed = true;
679 }
680 }
681
682 if (dst == V_028780_BLEND_ONE && is_src_allowed) {
683 /* Addition is commutative, but floating point addition isn't
684 * associative: subtle changes can be introduced via different
685 * rounding. Be conservative, only enable for min and max.
686 */
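/* For example, with 32-bit floats (1e20f + -1e20f) + 1.0f == 1.0f, while
 * 1e20f + (-1e20f + 1.0f) == 0.0f, so the grouping of additions matters.
 */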
687 if (op == V_028780_COMB_MAX_DST_SRC || op == V_028780_COMB_MIN_DST_SRC)
688 blend->commutative_4bit |= chanmask;
689 }
690 }
691
692 static struct radv_blend_state
693 radv_pipeline_init_blend_state(struct radv_graphics_pipeline *pipeline,
694 const VkGraphicsPipelineCreateInfo *pCreateInfo,
695 const struct radv_graphics_pipeline_info *info)
696 {
697 const struct radv_device *device = pipeline->base.device;
698 struct radv_blend_state blend = {0};
699 unsigned cb_color_control = 0;
700 const enum amd_gfx_level gfx_level = device->physical_device->rad_info.gfx_level;
701 int i;
702
703 if (info->cb.logic_op_enable)
704 cb_color_control |= S_028808_ROP3(info->cb.logic_op);
705 else
706 cb_color_control |= S_028808_ROP3(V_028808_ROP3_COPY);
707
708 if (device->instance->debug_flags & RADV_DEBUG_NO_ATOC_DITHERING)
709 {
710 blend.db_alpha_to_mask = S_028B70_ALPHA_TO_MASK_OFFSET0(2) | S_028B70_ALPHA_TO_MASK_OFFSET1(2) |
711 S_028B70_ALPHA_TO_MASK_OFFSET2(2) | S_028B70_ALPHA_TO_MASK_OFFSET3(2) |
712 S_028B70_OFFSET_ROUND(0);
713 }
714 else
715 {
716 blend.db_alpha_to_mask = S_028B70_ALPHA_TO_MASK_OFFSET0(3) | S_028B70_ALPHA_TO_MASK_OFFSET1(1) |
717 S_028B70_ALPHA_TO_MASK_OFFSET2(0) | S_028B70_ALPHA_TO_MASK_OFFSET3(2) |
718 S_028B70_OFFSET_ROUND(1);
719 }
720
721 if (info->ms.alpha_to_coverage_enable) {
722 blend.db_alpha_to_mask |= S_028B70_ALPHA_TO_MASK_ENABLE(1);
723 blend.need_src_alpha |= 0x1;
724 }
725
726 blend.cb_target_mask = 0;
727 for (i = 0; i < info->cb.att_count; i++) {
728 unsigned blend_cntl = 0;
729 unsigned srcRGB_opt, dstRGB_opt, srcA_opt, dstA_opt;
730 unsigned eqRGB = info->cb.att[i].color_blend_op;
731 unsigned srcRGB = info->cb.att[i].src_color_blend_factor;
732 unsigned dstRGB = info->cb.att[i].dst_color_blend_factor;
733 unsigned eqA = info->cb.att[i].alpha_blend_op;
734 unsigned srcA = info->cb.att[i].src_alpha_blend_factor;
735 unsigned dstA = info->cb.att[i].dst_alpha_blend_factor;
736
737 blend.sx_mrt_blend_opt[i] = S_028760_COLOR_COMB_FCN(V_028760_OPT_COMB_BLEND_DISABLED) |
738 S_028760_ALPHA_COMB_FCN(V_028760_OPT_COMB_BLEND_DISABLED);
739
740 if (!info->cb.att[i].color_write_mask)
741 continue;
742
743 /* Ignore other blend targets if dual-source blending
744 * is enabled to prevent wrong behaviour.
745 */
746 if (blend.mrt0_is_dual_src)
747 continue;
748
749 blend.cb_target_mask |= (unsigned)info->cb.att[i].color_write_mask << (4 * i);
750 blend.cb_target_enabled_4bit |= 0xfu << (4 * i);
751 if (!info->cb.att[i].blend_enable) {
752 blend.cb_blend_control[i] = blend_cntl;
753 continue;
754 }
755
756 if (is_dual_src(gfx_level, srcRGB) || is_dual_src(gfx_level, dstRGB) ||
757 is_dual_src(gfx_level, srcA) || is_dual_src(gfx_level, dstA))
758 if (i == 0)
759 blend.mrt0_is_dual_src = true;
760
761
762 if (eqRGB == V_028780_COMB_MIN_DST_SRC || eqRGB == V_028780_COMB_MAX_DST_SRC) {
763 srcRGB = V_028780_BLEND_ONE;
764 dstRGB = V_028780_BLEND_ONE;
765 }
766 if (eqA == V_028780_COMB_MIN_DST_SRC || eqA == V_028780_COMB_MAX_DST_SRC) {
767 srcA = V_028780_BLEND_ONE;
768 dstA = V_028780_BLEND_ONE;
769 }
770
771 radv_blend_check_commutativity(gfx_level, &blend, eqRGB, srcRGB, dstRGB, 0x7u << (4 * i));
772 radv_blend_check_commutativity(gfx_level, &blend, eqA, srcA, dstA, 0x8u << (4 * i));
773
774 /* Blending optimizations for RB+.
775 * These transformations don't change the behavior.
776 *
777 * First, get rid of DST in the blend factors:
778 * func(src * DST, dst * 0) ---> func(src * 0, dst * SRC)
779 */
780 si_blend_remove_dst(&eqRGB, &srcRGB, &dstRGB, V_028780_BLEND_DST_COLOR,
781 V_028780_BLEND_SRC_COLOR);
782
783 si_blend_remove_dst(&eqA, &srcA, &dstA, V_028780_BLEND_DST_COLOR,
784 V_028780_BLEND_SRC_COLOR);
785
786 si_blend_remove_dst(&eqA, &srcA, &dstA, V_028780_BLEND_DST_ALPHA,
787 V_028780_BLEND_SRC_ALPHA);
788
789 /* Look up the ideal settings from tables. */
790 srcRGB_opt = si_translate_blend_opt_factor(srcRGB, false);
791 dstRGB_opt = si_translate_blend_opt_factor(dstRGB, false);
792 srcA_opt = si_translate_blend_opt_factor(srcA, true);
793 dstA_opt = si_translate_blend_opt_factor(dstA, true);
794
795 /* Handle interdependencies. */
796 if (si_blend_factor_uses_dst(srcRGB))
797 dstRGB_opt = V_028760_BLEND_OPT_PRESERVE_NONE_IGNORE_NONE;
798 if (si_blend_factor_uses_dst(srcA))
799 dstA_opt = V_028760_BLEND_OPT_PRESERVE_NONE_IGNORE_NONE;
800
801 if (srcRGB == V_028780_BLEND_SRC_ALPHA_SATURATE &&
802 (dstRGB == V_028780_BLEND_ZERO || dstRGB == V_028780_BLEND_SRC_ALPHA ||
803 dstRGB == V_028780_BLEND_SRC_ALPHA_SATURATE))
804 dstRGB_opt = V_028760_BLEND_OPT_PRESERVE_NONE_IGNORE_A0;
805
806 /* Set the final value. */
807 blend.sx_mrt_blend_opt[i] =
808 S_028760_COLOR_SRC_OPT(srcRGB_opt) | S_028760_COLOR_DST_OPT(dstRGB_opt) |
809 S_028760_COLOR_COMB_FCN(si_translate_blend_opt_function(eqRGB)) |
810 S_028760_ALPHA_SRC_OPT(srcA_opt) | S_028760_ALPHA_DST_OPT(dstA_opt) |
811 S_028760_ALPHA_COMB_FCN(si_translate_blend_opt_function(eqA));
812 blend_cntl |= S_028780_ENABLE(1);
813
814 blend_cntl |= S_028780_COLOR_COMB_FCN(eqRGB);
815 blend_cntl |= S_028780_COLOR_SRCBLEND(srcRGB);
816 blend_cntl |= S_028780_COLOR_DESTBLEND(dstRGB);
817 if (srcA != srcRGB || dstA != dstRGB || eqA != eqRGB) {
818 blend_cntl |= S_028780_SEPARATE_ALPHA_BLEND(1);
819 blend_cntl |= S_028780_ALPHA_COMB_FCN(eqA);
820 blend_cntl |= S_028780_ALPHA_SRCBLEND(srcA);
821 blend_cntl |= S_028780_ALPHA_DESTBLEND(dstA);
822 }
823 blend.cb_blend_control[i] = blend_cntl;
824
825 blend.blend_enable_4bit |= 0xfu << (i * 4);
826
827 if (srcRGB == V_028780_BLEND_SRC_ALPHA || dstRGB == V_028780_BLEND_SRC_ALPHA ||
828 srcRGB == V_028780_BLEND_SRC_ALPHA_SATURATE ||
829 dstRGB == V_028780_BLEND_SRC_ALPHA_SATURATE ||
830 srcRGB == V_028780_BLEND_ONE_MINUS_SRC_ALPHA ||
831 dstRGB == V_028780_BLEND_ONE_MINUS_SRC_ALPHA)
832 blend.need_src_alpha |= 1 << i;
833 }
834 for (i = info->cb.att_count; i < 8; i++) {
835 blend.cb_blend_control[i] = 0;
836 blend.sx_mrt_blend_opt[i] = S_028760_COLOR_COMB_FCN(V_028760_OPT_COMB_BLEND_DISABLED) |
837 S_028760_ALPHA_COMB_FCN(V_028760_OPT_COMB_BLEND_DISABLED);
838 }
839
840 if (device->physical_device->rad_info.has_rbplus) {
841 /* Disable RB+ blend optimizations for dual source blending. */
842 if (blend.mrt0_is_dual_src) {
843 for (i = 0; i < 8; i++) {
844 blend.sx_mrt_blend_opt[i] = S_028760_COLOR_COMB_FCN(V_028760_OPT_COMB_NONE) |
845 S_028760_ALPHA_COMB_FCN(V_028760_OPT_COMB_NONE);
846 }
847 }
848
849 /* RB+ doesn't work with dual source blending, logic op and
850 * RESOLVE.
851 */
852 if (blend.mrt0_is_dual_src || info->cb.logic_op_enable ||
853 (device->physical_device->rad_info.gfx_level >= GFX11 && blend.blend_enable_4bit))
854 cb_color_control |= S_028808_DISABLE_DUAL_QUAD(1);
855 }
856
857 if (blend.cb_target_mask)
858 cb_color_control |= S_028808_MODE(V_028808_CB_NORMAL);
859 else
860 cb_color_control |= S_028808_MODE(V_028808_CB_DISABLE);
861
862 radv_pipeline_compute_spi_color_formats(pipeline, pCreateInfo, &blend, info);
863
864 pipeline->cb_color_control = cb_color_control;
865
866 return blend;
867 }
868
869 static uint32_t
870 si_translate_fill(VkPolygonMode func)
871 {
872 switch (func) {
873 case VK_POLYGON_MODE_FILL:
874 return V_028814_X_DRAW_TRIANGLES;
875 case VK_POLYGON_MODE_LINE:
876 return V_028814_X_DRAW_LINES;
877 case VK_POLYGON_MODE_POINT:
878 return V_028814_X_DRAW_POINTS;
879 default:
880 assert(0);
881 return V_028814_X_DRAW_POINTS;
882 }
883 }
884
885 static unsigned
886 radv_pipeline_color_samples(const struct radv_graphics_pipeline_info *info)
887 {
888 if (info->color_att_samples && radv_pipeline_has_color_attachments(&info->ri)) {
889 return info->color_att_samples;
890 }
891
892 return info->ms.raster_samples;
893 }
894
895 static unsigned
896 radv_pipeline_depth_samples(const struct radv_graphics_pipeline_info *info)
897 {
898 if (info->ds_att_samples && radv_pipeline_has_ds_attachments(&info->ri)) {
899 return info->ds_att_samples;
900 }
901
902 return info->ms.raster_samples;
903 }
904
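/* Compute how many samples the fragment shader runs per pixel: the next power
 * of two of ceil(minSampleShading * color samples), or 1 when sample shading
 * is disabled.
 */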
905 static uint8_t
906 radv_pipeline_get_ps_iter_samples(const struct radv_graphics_pipeline_info *info)
907 {
908 uint32_t ps_iter_samples = 1;
909 uint32_t num_samples = radv_pipeline_color_samples(info);
910
911 if (info->ms.sample_shading_enable) {
912 ps_iter_samples = ceilf(info->ms.min_sample_shading * num_samples);
913 ps_iter_samples = util_next_power_of_two(ps_iter_samples);
914 }
915 return ps_iter_samples;
916 }
917
918 static bool
919 radv_is_depth_write_enabled(const struct radv_depth_stencil_info *ds_info)
920 {
921 return ds_info->depth_test_enable && ds_info->depth_write_enable &&
922 ds_info->depth_compare_op != VK_COMPARE_OP_NEVER;
923 }
924
925 static bool
926 radv_writes_stencil(const struct radv_stencil_op_info *info)
927 {
928 return info->write_mask &&
929 (info->fail_op != VK_STENCIL_OP_KEEP || info->pass_op != VK_STENCIL_OP_KEEP ||
930 info->depth_fail_op != VK_STENCIL_OP_KEEP);
931 }
932
933 static bool
934 radv_is_stencil_write_enabled(const struct radv_depth_stencil_info *ds_info)
935 {
936 return ds_info->stencil_test_enable &&
937 (radv_writes_stencil(&ds_info->front) || radv_writes_stencil(&ds_info->back));
938 }
939
940 static bool
941 radv_order_invariant_stencil_op(VkStencilOp op)
942 {
943 /* REPLACE is normally order invariant, except when the stencil
944 * reference value is written by the fragment shader. Tracking this
945 * interaction does not seem worth the effort, so be conservative.
946 */
947 return op != VK_STENCIL_OP_INCREMENT_AND_CLAMP && op != VK_STENCIL_OP_DECREMENT_AND_CLAMP &&
948 op != VK_STENCIL_OP_REPLACE;
949 }
950
951 static bool
952 radv_order_invariant_stencil_state(const struct radv_stencil_op_info *info)
953 {
954 /* Compute whether, assuming Z writes are disabled, this stencil state
955 * is order invariant in the sense that the set of passing fragments as
956 * well as the final stencil buffer result does not depend on the order
957 * of fragments.
958 */
959 return !info->write_mask ||
960 /* The following assumes that Z writes are disabled. */
961 (info->compare_op == VK_COMPARE_OP_ALWAYS &&
962 radv_order_invariant_stencil_op(info->pass_op) &&
963 radv_order_invariant_stencil_op(info->depth_fail_op)) ||
964 (info->compare_op == VK_COMPARE_OP_NEVER &&
965 radv_order_invariant_stencil_op(info->fail_op));
966 }
967
968 static bool
969 radv_pipeline_has_dynamic_ds_states(const struct radv_graphics_pipeline *pipeline)
970 {
971 return !!(pipeline->dynamic_states & (RADV_DYNAMIC_DEPTH_TEST_ENABLE |
972 RADV_DYNAMIC_DEPTH_WRITE_ENABLE |
973 RADV_DYNAMIC_DEPTH_COMPARE_OP |
974 RADV_DYNAMIC_STENCIL_TEST_ENABLE |
975 RADV_DYNAMIC_STENCIL_WRITE_MASK |
976 RADV_DYNAMIC_STENCIL_OP));
977 }
978
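/* Decide whether the driver can enable out-of-order rasterization on its own:
 * this requires the depth/stencil results to be order invariant and, when
 * blending is enabled, that only commutative blend configurations are used.
 */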
979 static bool
980 radv_pipeline_out_of_order_rast(struct radv_graphics_pipeline *pipeline,
981 const struct radv_blend_state *blend,
982 const struct radv_graphics_pipeline_info *info)
983 {
984 unsigned colormask = blend->cb_target_enabled_4bit;
985
986 if (!pipeline->base.device->physical_device->out_of_order_rast_allowed)
987 return false;
988
989 /* Be conservative if a logic operation is enabled with color buffers. */
990 if (colormask && info->cb.logic_op_enable)
991 return false;
992
993 /* Be conservative if an extended dynamic depth/stencil state is
994 * enabled because the driver can't update out-of-order rasterization
995 * dynamically.
996 */
997 if (radv_pipeline_has_dynamic_ds_states(pipeline))
998 return false;
999
1000 /* Default depth/stencil invariance when no attachment is bound. */
1001 struct radv_dsa_order_invariance dsa_order_invariant = {.zs = true, .pass_set = true};
1002
1003 bool has_stencil = info->ri.stencil_att_format != VK_FORMAT_UNDEFINED;
1004 struct radv_dsa_order_invariance order_invariance[2];
1005 struct radv_shader *ps = pipeline->base.shaders[MESA_SHADER_FRAGMENT];
1006
1007 /* Compute depth/stencil order invariance in order to know if
1008 * it's safe to enable out-of-order.
1009 */
1010 bool zfunc_is_ordered = info->ds.depth_compare_op == VK_COMPARE_OP_NEVER ||
1011 info->ds.depth_compare_op == VK_COMPARE_OP_LESS ||
1012 info->ds.depth_compare_op == VK_COMPARE_OP_LESS_OR_EQUAL ||
1013 info->ds.depth_compare_op == VK_COMPARE_OP_GREATER ||
1014 info->ds.depth_compare_op == VK_COMPARE_OP_GREATER_OR_EQUAL;
1015 bool depth_write_enabled = radv_is_depth_write_enabled(&info->ds);
1016 bool stencil_write_enabled = radv_is_stencil_write_enabled(&info->ds);
1017 bool ds_write_enabled = depth_write_enabled || stencil_write_enabled;
1018
1019 bool nozwrite_and_order_invariant_stencil =
1020 !ds_write_enabled ||
1021 (!depth_write_enabled && radv_order_invariant_stencil_state(&info->ds.front) &&
1022 radv_order_invariant_stencil_state(&info->ds.back));
1023
1024 order_invariance[1].zs = nozwrite_and_order_invariant_stencil ||
1025 (!stencil_write_enabled && zfunc_is_ordered);
1026 order_invariance[0].zs = !depth_write_enabled || zfunc_is_ordered;
1027
1028 order_invariance[1].pass_set =
1029 nozwrite_and_order_invariant_stencil ||
1030 (!stencil_write_enabled &&
1031 (info->ds.depth_compare_op == VK_COMPARE_OP_ALWAYS ||
1032 info->ds.depth_compare_op == VK_COMPARE_OP_NEVER));
1033 order_invariance[0].pass_set =
1034 !depth_write_enabled ||
1035 (info->ds.depth_compare_op == VK_COMPARE_OP_ALWAYS ||
1036 info->ds.depth_compare_op == VK_COMPARE_OP_NEVER);
1037
1038 dsa_order_invariant = order_invariance[has_stencil];
1039 if (!dsa_order_invariant.zs)
1040 return false;
1041
1042 /* The set of PS invocations is always order invariant,
1043 * except when early Z/S tests are requested.
1044 */
1045 if (ps && ps->info.ps.writes_memory && ps->info.ps.early_fragment_test &&
1046 !dsa_order_invariant.pass_set)
1047 return false;
1048
1049 /* Determine if out-of-order rasterization should be disabled when occlusion queries are used. */
1050 pipeline->disable_out_of_order_rast_for_occlusion = !dsa_order_invariant.pass_set;
1051
1052 /* No color buffers are enabled for writing. */
1053 if (!colormask)
1054 return true;
1055
1056 unsigned blendmask = colormask & blend->blend_enable_4bit;
1057
1058 if (blendmask) {
1059 /* Only commutative blending. */
1060 if (blendmask & ~blend->commutative_4bit)
1061 return false;
1062
1063 if (!dsa_order_invariant.pass_set)
1064 return false;
1065 }
1066
1067 if (colormask & ~blendmask)
1068 return false;
1069
1070 return true;
1071 }
1072
1073 static void
1074 radv_pipeline_init_multisample_state(struct radv_graphics_pipeline *pipeline,
1075 const struct radv_blend_state *blend,
1076 const struct radv_graphics_pipeline_info *info,
1077 unsigned rast_prim)
1078 {
1079 const struct radv_physical_device *pdevice = pipeline->base.device->physical_device;
1080 struct radv_multisample_state *ms = &pipeline->ms;
1081 unsigned num_tile_pipes = pdevice->rad_info.num_tile_pipes;
1082 const VkConservativeRasterizationModeEXT mode = info->rs.conservative_mode;
1083 bool out_of_order_rast = false;
1084 int ps_iter_samples = 1;
1085
1086 ms->num_samples = info->ms.raster_samples;
1087
1088 /* From the Vulkan 1.1.129 spec, 26.7. Sample Shading:
1089 *
1090 * "Sample shading is enabled for a graphics pipeline:
1091 *
1092 * - If the interface of the fragment shader entry point of the
1093 * graphics pipeline includes an input variable decorated
1094 * with SampleId or SamplePosition. In this case
1095 * minSampleShadingFactor takes the value 1.0.
1096 * - Else if the sampleShadingEnable member of the
1097 * VkPipelineMultisampleStateCreateInfo structure specified
1098 * when creating the graphics pipeline is set to VK_TRUE. In
1099 * this case minSampleShadingFactor takes the value of
1100 * VkPipelineMultisampleStateCreateInfo::minSampleShading.
1101 *
1102 * Otherwise, sample shading is considered disabled."
1103 */
1104 if (pipeline->base.shaders[MESA_SHADER_FRAGMENT]->info.ps.uses_sample_shading) {
1105 ps_iter_samples = ms->num_samples;
1106 } else {
1107 ps_iter_samples = radv_pipeline_get_ps_iter_samples(info);
1108 }
1109
1110 if (info->rs.order == VK_RASTERIZATION_ORDER_RELAXED_AMD) {
1111 /* Out-of-order rasterization is explicitly enabled by the
1112 * application.
1113 */
1114 out_of_order_rast = true;
1115 } else {
1116 /* Determine if the driver can enable out-of-order
1117 * rasterization internally.
1118 */
1119 out_of_order_rast = radv_pipeline_out_of_order_rast(pipeline, blend, info);
1120 }
1121
1122 ms->pa_sc_aa_config = 0;
1123 ms->db_eqaa = S_028804_HIGH_QUALITY_INTERSECTIONS(1) | S_028804_INCOHERENT_EQAA_READS(1) |
1124 S_028804_INTERPOLATE_COMP_Z(1) | S_028804_STATIC_ANCHOR_ASSOCIATIONS(1);
1125
1126 /* Adjust MSAA state if conservative rasterization is enabled. */
1127 if (mode != VK_CONSERVATIVE_RASTERIZATION_MODE_DISABLED_EXT) {
1128 ms->pa_sc_aa_config |= S_028BE0_AA_MASK_CENTROID_DTMN(1);
1129
1130 ms->db_eqaa |=
1131 S_028804_ENABLE_POSTZ_OVERRASTERIZATION(1) | S_028804_OVERRASTERIZATION_AMOUNT(4);
1132 }
1133
1134 ms->pa_sc_mode_cntl_1 =
1135 S_028A4C_WALK_FENCE_ENABLE(1) | // TODO linear dst fixes
1136 S_028A4C_WALK_FENCE_SIZE(num_tile_pipes == 2 ? 2 : 3) |
1137 S_028A4C_OUT_OF_ORDER_PRIMITIVE_ENABLE(out_of_order_rast) |
1138 S_028A4C_OUT_OF_ORDER_WATER_MARK(0x7) |
1139 /* always 1: */
1140 S_028A4C_WALK_ALIGN8_PRIM_FITS_ST(1) | S_028A4C_SUPERTILE_WALK_ORDER_ENABLE(1) |
1141 S_028A4C_TILE_WALK_ORDER_ENABLE(1) | S_028A4C_MULTI_SHADER_ENGINE_PRIM_DISCARD_ENABLE(1) |
1142 S_028A4C_FORCE_EOV_CNTDWN_ENABLE(1) | S_028A4C_FORCE_EOV_REZ_ENABLE(1);
1143 ms->pa_sc_mode_cntl_0 = S_028A48_ALTERNATE_RBS_PER_TILE(pdevice->rad_info.gfx_level >= GFX9) |
1144 S_028A48_VPORT_SCISSOR_ENABLE(1) |
1145 S_028A48_LINE_STIPPLE_ENABLE(info->rs.stippled_line_enable);
1146
1147 if (info->rs.line_raster_mode == VK_LINE_RASTERIZATION_MODE_BRESENHAM_EXT &&
1148 radv_rast_prim_is_line(rast_prim)) {
1149 /* From the Vulkan spec 1.3.221:
1150 *
1151 * "When Bresenham lines are being rasterized, sample locations may all be treated as being at
1152 * the pixel center (this may affect attribute and depth interpolation)."
1153 *
1154 * "One consequence of this is that Bresenham lines cover the same pixels regardless of the
1155 * number of rasterization samples, and cover all samples in those pixels (unless masked out
1156 * or killed)."
1157 */
1158 ms->num_samples = 1;
1159 }
1160
1161 if (ms->num_samples > 1) {
1162 uint32_t z_samples = radv_pipeline_depth_samples(info);
1163 unsigned log_samples = util_logbase2(ms->num_samples);
1164 unsigned log_z_samples = util_logbase2(z_samples);
1165 unsigned log_ps_iter_samples = util_logbase2(ps_iter_samples);
1166 ms->pa_sc_mode_cntl_0 |= S_028A48_MSAA_ENABLE(1);
1167 ms->db_eqaa |= S_028804_MAX_ANCHOR_SAMPLES(log_z_samples) |
1168 S_028804_PS_ITER_SAMPLES(log_ps_iter_samples) |
1169 S_028804_MASK_EXPORT_NUM_SAMPLES(log_samples) |
1170 S_028804_ALPHA_TO_MASK_NUM_SAMPLES(log_samples);
1171 ms->pa_sc_aa_config |=
1172 S_028BE0_MSAA_NUM_SAMPLES(log_samples) |
1173 S_028BE0_MAX_SAMPLE_DIST(radv_get_default_max_sample_dist(log_samples)) |
1174 S_028BE0_MSAA_EXPOSED_SAMPLES(log_samples) | /* CM_R_028BE0_PA_SC_AA_CONFIG */
1175 S_028BE0_COVERED_CENTROID_IS_CENTER(pdevice->rad_info.gfx_level >= GFX10_3);
1176 ms->pa_sc_mode_cntl_1 |= S_028A4C_PS_ITER_SAMPLE(ps_iter_samples > 1);
1177 if (ps_iter_samples > 1)
1178 pipeline->spi_baryc_cntl |= S_0286E0_POS_FLOAT_LOCATION(2);
1179 }
1180
1181 ms->pa_sc_aa_mask[0] = info->ms.sample_mask | ((uint32_t)info->ms.sample_mask << 16);
1182 ms->pa_sc_aa_mask[1] = info->ms.sample_mask | ((uint32_t)info->ms.sample_mask << 16);
1183 }
1184
1185 static void
1186 gfx103_pipeline_init_vrs_state(struct radv_graphics_pipeline *pipeline,
1187 const struct radv_graphics_pipeline_info *info)
1188 {
1189 struct radv_shader *ps = pipeline->base.shaders[MESA_SHADER_FRAGMENT];
1190 struct radv_multisample_state *ms = &pipeline->ms;
1191 struct radv_vrs_state *vrs = &pipeline->vrs;
1192
1193 if (info->ms.sample_shading_enable ||
1194 ps->info.ps.uses_sample_shading || ps->info.ps.reads_sample_mask_in) {
1195 /* Disable VRS and use the rates from PS_ITER_SAMPLES if:
1196 *
1197 * 1) sample shading is enabled or per-sample interpolation is
1198 * used by the fragment shader
1199 * 2) the fragment shader reads gl_SampleMaskIn because the
1200 * 16-bit sample coverage mask isn't enough for MSAA8x and
1201 * 2x2 coarse shading isn't enough.
1202 */
1203 vrs->pa_cl_vrs_cntl = S_028848_SAMPLE_ITER_COMBINER_MODE(V_028848_VRS_COMB_MODE_OVERRIDE);
1204
1205 /* Make sure sample shading is enabled even if only MSAA1x is
1206 * used because the SAMPLE_ITER combiner is in passthrough
1207 * mode if PS_ITER_SAMPLE is 0, and it uses the per-draw rate.
1208 * The default VRS rate when sample shading is enabled is 1x1.
1209 */
1210 if (!G_028A4C_PS_ITER_SAMPLE(ms->pa_sc_mode_cntl_1))
1211 ms->pa_sc_mode_cntl_1 |= S_028A4C_PS_ITER_SAMPLE(1);
1212 } else {
1213 vrs->pa_cl_vrs_cntl = S_028848_SAMPLE_ITER_COMBINER_MODE(V_028848_VRS_COMB_MODE_PASSTHRU);
1214 }
1215 }
1216
1217 static uint32_t
1218 si_conv_tess_prim_to_gs_out(enum tess_primitive_mode prim)
1219 {
1220 switch (prim) {
1221 case TESS_PRIMITIVE_TRIANGLES:
1222 case TESS_PRIMITIVE_QUADS:
1223 return V_028A6C_TRISTRIP;
1224 case TESS_PRIMITIVE_ISOLINES:
1225 return V_028A6C_LINESTRIP;
1226 default:
1227 assert(0);
1228 return 0;
1229 }
1230 }
1231
1232 static uint32_t
1233 si_conv_gl_prim_to_gs_out(unsigned gl_prim)
1234 {
1235 switch (gl_prim) {
1236 case SHADER_PRIM_POINTS:
1237 return V_028A6C_POINTLIST;
1238 case SHADER_PRIM_LINES:
1239 case SHADER_PRIM_LINE_STRIP:
1240 case SHADER_PRIM_LINES_ADJACENCY:
1241 return V_028A6C_LINESTRIP;
1242
1243 case SHADER_PRIM_TRIANGLES:
1244 case SHADER_PRIM_TRIANGLE_STRIP_ADJACENCY:
1245 case SHADER_PRIM_TRIANGLE_STRIP:
1246 case SHADER_PRIM_QUADS:
1247 return V_028A6C_TRISTRIP;
1248 default:
1249 assert(0);
1250 return 0;
1251 }
1252 }
1253
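/* Translate a VkDynamicState enum into the corresponding RADV_DYNAMIC_* bit. */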
1254 static uint64_t
1255 radv_dynamic_state_mask(VkDynamicState state)
1256 {
1257 switch (state) {
1258 case VK_DYNAMIC_STATE_VIEWPORT:
1259 case VK_DYNAMIC_STATE_VIEWPORT_WITH_COUNT:
1260 return RADV_DYNAMIC_VIEWPORT;
1261 case VK_DYNAMIC_STATE_SCISSOR:
1262 case VK_DYNAMIC_STATE_SCISSOR_WITH_COUNT:
1263 return RADV_DYNAMIC_SCISSOR;
1264 case VK_DYNAMIC_STATE_LINE_WIDTH:
1265 return RADV_DYNAMIC_LINE_WIDTH;
1266 case VK_DYNAMIC_STATE_DEPTH_BIAS:
1267 return RADV_DYNAMIC_DEPTH_BIAS;
1268 case VK_DYNAMIC_STATE_BLEND_CONSTANTS:
1269 return RADV_DYNAMIC_BLEND_CONSTANTS;
1270 case VK_DYNAMIC_STATE_DEPTH_BOUNDS:
1271 return RADV_DYNAMIC_DEPTH_BOUNDS;
1272 case VK_DYNAMIC_STATE_STENCIL_COMPARE_MASK:
1273 return RADV_DYNAMIC_STENCIL_COMPARE_MASK;
1274 case VK_DYNAMIC_STATE_STENCIL_WRITE_MASK:
1275 return RADV_DYNAMIC_STENCIL_WRITE_MASK;
1276 case VK_DYNAMIC_STATE_STENCIL_REFERENCE:
1277 return RADV_DYNAMIC_STENCIL_REFERENCE;
1278 case VK_DYNAMIC_STATE_DISCARD_RECTANGLE_EXT:
1279 return RADV_DYNAMIC_DISCARD_RECTANGLE;
1280 case VK_DYNAMIC_STATE_SAMPLE_LOCATIONS_EXT:
1281 return RADV_DYNAMIC_SAMPLE_LOCATIONS;
1282 case VK_DYNAMIC_STATE_LINE_STIPPLE_EXT:
1283 return RADV_DYNAMIC_LINE_STIPPLE;
1284 case VK_DYNAMIC_STATE_CULL_MODE:
1285 return RADV_DYNAMIC_CULL_MODE;
1286 case VK_DYNAMIC_STATE_FRONT_FACE:
1287 return RADV_DYNAMIC_FRONT_FACE;
1288 case VK_DYNAMIC_STATE_PRIMITIVE_TOPOLOGY:
1289 return RADV_DYNAMIC_PRIMITIVE_TOPOLOGY;
1290 case VK_DYNAMIC_STATE_DEPTH_TEST_ENABLE:
1291 return RADV_DYNAMIC_DEPTH_TEST_ENABLE;
1292 case VK_DYNAMIC_STATE_DEPTH_WRITE_ENABLE:
1293 return RADV_DYNAMIC_DEPTH_WRITE_ENABLE;
1294 case VK_DYNAMIC_STATE_DEPTH_COMPARE_OP:
1295 return RADV_DYNAMIC_DEPTH_COMPARE_OP;
1296 case VK_DYNAMIC_STATE_DEPTH_BOUNDS_TEST_ENABLE:
1297 return RADV_DYNAMIC_DEPTH_BOUNDS_TEST_ENABLE;
1298 case VK_DYNAMIC_STATE_STENCIL_TEST_ENABLE:
1299 return RADV_DYNAMIC_STENCIL_TEST_ENABLE;
1300 case VK_DYNAMIC_STATE_STENCIL_OP:
1301 return RADV_DYNAMIC_STENCIL_OP;
1302 case VK_DYNAMIC_STATE_VERTEX_INPUT_BINDING_STRIDE:
1303 return RADV_DYNAMIC_VERTEX_INPUT_BINDING_STRIDE;
1304 case VK_DYNAMIC_STATE_FRAGMENT_SHADING_RATE_KHR:
1305 return RADV_DYNAMIC_FRAGMENT_SHADING_RATE;
1306 case VK_DYNAMIC_STATE_PATCH_CONTROL_POINTS_EXT:
1307 return RADV_DYNAMIC_PATCH_CONTROL_POINTS;
1308 case VK_DYNAMIC_STATE_RASTERIZER_DISCARD_ENABLE:
1309 return RADV_DYNAMIC_RASTERIZER_DISCARD_ENABLE;
1310 case VK_DYNAMIC_STATE_DEPTH_BIAS_ENABLE:
1311 return RADV_DYNAMIC_DEPTH_BIAS_ENABLE;
1312 case VK_DYNAMIC_STATE_LOGIC_OP_EXT:
1313 return RADV_DYNAMIC_LOGIC_OP;
1314 case VK_DYNAMIC_STATE_PRIMITIVE_RESTART_ENABLE:
1315 return RADV_DYNAMIC_PRIMITIVE_RESTART_ENABLE;
1316 case VK_DYNAMIC_STATE_COLOR_WRITE_ENABLE_EXT:
1317 return RADV_DYNAMIC_COLOR_WRITE_ENABLE;
1318 case VK_DYNAMIC_STATE_VERTEX_INPUT_EXT:
1319 return RADV_DYNAMIC_VERTEX_INPUT;
1320 default:
1321 unreachable("Unhandled dynamic state");
1322 }
1323 }
1324
1325 static bool
1326 radv_pipeline_is_blend_enabled(const struct radv_graphics_pipeline *pipeline,
1327 const struct radv_color_blend_info *cb_info)
1328 {
1329 for (uint32_t i = 0; i < cb_info->att_count; i++) {
1330 if (cb_info->att[i].color_write_mask && cb_info->att[i].blend_enable)
1331 return true;
1332 }
1333
1334 return false;
1335 }
1336
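/* Compute the mask of dynamic states that are actually relevant for this
 * pipeline; most states are dropped when rasterization is disabled or when the
 * corresponding feature is statically disabled and not marked dynamic.
 */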
1337 static uint64_t
1338 radv_pipeline_needed_dynamic_state(const struct radv_graphics_pipeline *pipeline,
1339 const struct radv_graphics_pipeline_info *info)
1340 {
1341 bool has_color_att = radv_pipeline_has_color_attachments(&info->ri);
1342 bool raster_enabled = !info->rs.discard_enable ||
1343 (pipeline->dynamic_states & RADV_DYNAMIC_RASTERIZER_DISCARD_ENABLE);
1344 uint64_t states = RADV_DYNAMIC_ALL;
1345
1346 /* Disable dynamic states that are useless to mesh shading. */
1347 if (radv_pipeline_has_stage(pipeline, MESA_SHADER_MESH)) {
1348 if (!raster_enabled)
1349 return RADV_DYNAMIC_RASTERIZER_DISCARD_ENABLE;
1350
1351 states &= ~(RADV_DYNAMIC_VERTEX_INPUT | RADV_DYNAMIC_VERTEX_INPUT_BINDING_STRIDE |
1352 RADV_DYNAMIC_PRIMITIVE_RESTART_ENABLE | RADV_DYNAMIC_PRIMITIVE_TOPOLOGY);
1353 }
1354
1355 /* If rasterization is disabled we do not care about any of the
1356 * dynamic states, since they are all rasterization related only,
1357 * except primitive topology, primitive restart enable, vertex
1358 * binding stride and rasterization discard itself.
1359 */
1360 if (!raster_enabled) {
1361 return RADV_DYNAMIC_PRIMITIVE_TOPOLOGY | RADV_DYNAMIC_VERTEX_INPUT_BINDING_STRIDE |
1362 RADV_DYNAMIC_PRIMITIVE_RESTART_ENABLE | RADV_DYNAMIC_RASTERIZER_DISCARD_ENABLE |
1363 RADV_DYNAMIC_VERTEX_INPUT;
1364 }
1365
1366 if (!info->rs.depth_bias_enable &&
1367 !(pipeline->dynamic_states & RADV_DYNAMIC_DEPTH_BIAS_ENABLE))
1368 states &= ~RADV_DYNAMIC_DEPTH_BIAS;
1369
1370 if (!info->ds.depth_bounds_test_enable &&
1371 !(pipeline->dynamic_states & RADV_DYNAMIC_DEPTH_BOUNDS_TEST_ENABLE))
1372 states &= ~RADV_DYNAMIC_DEPTH_BOUNDS;
1373
1374 if (!info->ds.stencil_test_enable &&
1375 !(pipeline->dynamic_states & RADV_DYNAMIC_STENCIL_TEST_ENABLE))
1376 states &= ~(RADV_DYNAMIC_STENCIL_COMPARE_MASK | RADV_DYNAMIC_STENCIL_WRITE_MASK |
1377 RADV_DYNAMIC_STENCIL_REFERENCE | RADV_DYNAMIC_STENCIL_OP);
1378
1379 if (!info->dr.count)
1380 states &= ~RADV_DYNAMIC_DISCARD_RECTANGLE;
1381
1382 if (!info->ms.sample_locs_enable)
1383 states &= ~RADV_DYNAMIC_SAMPLE_LOCATIONS;
1384
1385 if (!info->rs.stippled_line_enable)
1386 states &= ~RADV_DYNAMIC_LINE_STIPPLE;
1387
1388 if (!radv_is_vrs_enabled(pipeline, info))
1389 states &= ~RADV_DYNAMIC_FRAGMENT_SHADING_RATE;
1390
1391 if (!has_color_att || !radv_pipeline_is_blend_enabled(pipeline, &info->cb))
1392 states &= ~RADV_DYNAMIC_BLEND_CONSTANTS;
1393
1394 if (!has_color_att)
1395 states &= ~RADV_DYNAMIC_COLOR_WRITE_ENABLE;
1396
1397 return states;
1398 }
1399
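/* Compute the IA_MULTI_VGT_PARAM fields (primgroup size, partial wave and
 * switch-on-EOI flags), including several workarounds for hardware bugs on
 * older GFX6-GFX8 parts.
 */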
1400 static struct radv_ia_multi_vgt_param_helpers
1401 radv_compute_ia_multi_vgt_param_helpers(struct radv_graphics_pipeline *pipeline)
1402 {
1403 const struct radv_physical_device *pdevice = pipeline->base.device->physical_device;
1404 struct radv_ia_multi_vgt_param_helpers ia_multi_vgt_param = {0};
1405
1406 if (radv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_CTRL))
1407 ia_multi_vgt_param.primgroup_size =
1408 pipeline->base.shaders[MESA_SHADER_TESS_CTRL]->info.num_tess_patches;
1409 else if (radv_pipeline_has_stage(pipeline, MESA_SHADER_GEOMETRY))
1410 ia_multi_vgt_param.primgroup_size = 64;
1411 else
1412 ia_multi_vgt_param.primgroup_size = 128; /* recommended without a GS */
1413
1414 /* GS requirement. */
1415 ia_multi_vgt_param.partial_es_wave = false;
1416 if (radv_pipeline_has_stage(pipeline, MESA_SHADER_GEOMETRY) && pdevice->rad_info.gfx_level <= GFX8)
1417 if (SI_GS_PER_ES / ia_multi_vgt_param.primgroup_size >= pdevice->gs_table_depth - 3)
1418 ia_multi_vgt_param.partial_es_wave = true;
1419
1420 ia_multi_vgt_param.ia_switch_on_eoi = false;
1421 if (pipeline->base.shaders[MESA_SHADER_FRAGMENT]->info.ps.prim_id_input)
1422 ia_multi_vgt_param.ia_switch_on_eoi = true;
1423 if (radv_pipeline_has_stage(pipeline, MESA_SHADER_GEOMETRY) && pipeline->base.shaders[MESA_SHADER_GEOMETRY]->info.uses_prim_id)
1424 ia_multi_vgt_param.ia_switch_on_eoi = true;
1425 if (radv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_CTRL)) {
1426 /* SWITCH_ON_EOI must be set if PrimID is used. */
1427 if (pipeline->base.shaders[MESA_SHADER_TESS_CTRL]->info.uses_prim_id ||
1428 radv_get_shader(&pipeline->base, MESA_SHADER_TESS_EVAL)->info.uses_prim_id)
1429 ia_multi_vgt_param.ia_switch_on_eoi = true;
1430 }
1431
1432 ia_multi_vgt_param.partial_vs_wave = false;
1433 if (radv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_CTRL)) {
1434 /* Bug with tessellation and GS on Bonaire and older 2 SE chips. */
1435 if ((pdevice->rad_info.family == CHIP_TAHITI ||
1436 pdevice->rad_info.family == CHIP_PITCAIRN ||
1437 pdevice->rad_info.family == CHIP_BONAIRE) &&
1438 radv_pipeline_has_stage(pipeline, MESA_SHADER_GEOMETRY))
1439 ia_multi_vgt_param.partial_vs_wave = true;
1440 /* Needed for 028B6C_DISTRIBUTION_MODE != 0 */
1441 if (pdevice->rad_info.has_distributed_tess) {
1442 if (radv_pipeline_has_stage(pipeline, MESA_SHADER_GEOMETRY)) {
1443 if (pdevice->rad_info.gfx_level <= GFX8)
1444 ia_multi_vgt_param.partial_es_wave = true;
1445 } else {
1446 ia_multi_vgt_param.partial_vs_wave = true;
1447 }
1448 }
1449 }
1450
1451 if (radv_pipeline_has_stage(pipeline, MESA_SHADER_GEOMETRY)) {
1452 /* On these chips there is the possibility of a hang if the
1453 * pipeline uses a GS and partial_vs_wave is not set.
1454 *
1455 * This mostly does not hit 4-SE chips, as those typically set
1456 * ia_switch_on_eoi and then partial_vs_wave is set for pipelines
1457 * with GS due to another workaround.
1458 *
1459 * Reproducer: https://bugs.freedesktop.org/show_bug.cgi?id=109242
1460 */
1461 if (pdevice->rad_info.family == CHIP_TONGA ||
1462 pdevice->rad_info.family == CHIP_FIJI ||
1463 pdevice->rad_info.family == CHIP_POLARIS10 ||
1464 pdevice->rad_info.family == CHIP_POLARIS11 ||
1465 pdevice->rad_info.family == CHIP_POLARIS12 ||
1466 pdevice->rad_info.family == CHIP_VEGAM) {
1467 ia_multi_vgt_param.partial_vs_wave = true;
1468 }
1469 }
1470
1471 ia_multi_vgt_param.base =
1472 S_028AA8_PRIMGROUP_SIZE(ia_multi_vgt_param.primgroup_size - 1) |
1473 /* The following field was moved to VGT_SHADER_STAGES_EN in GFX9. */
1474 S_028AA8_MAX_PRIMGRP_IN_WAVE(pdevice->rad_info.gfx_level == GFX8 ? 2 : 0) |
1475 S_030960_EN_INST_OPT_BASIC(pdevice->rad_info.gfx_level >= GFX9) |
1476 S_030960_EN_INST_OPT_ADV(pdevice->rad_info.gfx_level >= GFX9);
1477
1478 return ia_multi_vgt_param;
1479 }
1480
1481 static uint32_t
1482 radv_get_attrib_stride(const VkPipelineVertexInputStateCreateInfo *vi, uint32_t attrib_binding)
1483 {
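   /* Look up the stride of the vertex binding that this attribute sources from;
    * returns 0 if no matching binding description exists. */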
1484 for (uint32_t i = 0; i < vi->vertexBindingDescriptionCount; i++) {
1485 const VkVertexInputBindingDescription *input_binding = &vi->pVertexBindingDescriptions[i];
1486
1487 if (input_binding->binding == attrib_binding)
1488 return input_binding->stride;
1489 }
1490
1491 return 0;
1492 }
1493
1494 static struct radv_vertex_input_info
1495 radv_pipeline_init_vertex_input_info(struct radv_graphics_pipeline *pipeline,
1496 const VkGraphicsPipelineCreateInfo *pCreateInfo)
1497 {
1498 const struct radv_physical_device *pdevice = pipeline->base.device->physical_device;
1499 const VkPipelineVertexInputStateCreateInfo *vi = pCreateInfo->pVertexInputState;
1500 struct radv_vertex_input_info info = {0};
1501
1502 if (!(pipeline->dynamic_states & RADV_DYNAMIC_VERTEX_INPUT)) {
1503 /* Vertex input */
1504 const VkPipelineVertexInputDivisorStateCreateInfoEXT *divisor_state =
1505 vk_find_struct_const(vi->pNext, PIPELINE_VERTEX_INPUT_DIVISOR_STATE_CREATE_INFO_EXT);
1506
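      /* Bindings with VK_VERTEX_INPUT_RATE_INSTANCE default to a divisor of 1;
       * VK_EXT_vertex_attribute_divisor state (parsed just below) can override it. */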
1507 uint32_t binding_input_rate = 0;
1508 uint32_t instance_rate_divisors[MAX_VERTEX_ATTRIBS];
1509 for (unsigned i = 0; i < vi->vertexBindingDescriptionCount; ++i) {
1510 const VkVertexInputBindingDescription *desc = &vi->pVertexBindingDescriptions[i];
1511
1512 if (desc->inputRate) {
1513 unsigned binding = vi->pVertexBindingDescriptions[i].binding;
1514 binding_input_rate |= 1u << binding;
1515 instance_rate_divisors[binding] = 1;
1516 }
1517
1518 info.binding_stride[desc->binding] = desc->stride;
1519 }
1520
1521 if (divisor_state) {
1522 for (unsigned i = 0; i < divisor_state->vertexBindingDivisorCount; ++i) {
1523 instance_rate_divisors[divisor_state->pVertexBindingDivisors[i].binding] =
1524 divisor_state->pVertexBindingDivisors[i].divisor;
1525 }
1526 }
1527
1528 for (unsigned i = 0; i < vi->vertexAttributeDescriptionCount; ++i) {
1529 const VkVertexInputAttributeDescription *desc = &vi->pVertexAttributeDescriptions[i];
1530 const struct util_format_description *format_desc;
1531 unsigned location = desc->location;
1532 unsigned binding = desc->binding;
1533 unsigned num_format, data_format;
1534 bool post_shuffle;
1535
1536 if (binding_input_rate & (1u << binding)) {
1537 info.instance_rate_inputs |= 1u << location;
1538 info.instance_rate_divisors[location] = instance_rate_divisors[binding];
1539 }
1540
1541 format_desc = vk_format_description(desc->format);
1542 radv_translate_vertex_format(pdevice, desc->format, format_desc, &data_format, &num_format,
1543 &post_shuffle, &info.vertex_alpha_adjust[location]);
1544
1545 info.vertex_attribute_formats[location] = data_format | (num_format << 4);
1546 info.vertex_attribute_bindings[location] = desc->binding;
1547 info.vertex_attribute_offsets[location] = desc->offset;
1548
1549 const struct ac_data_format_info *dfmt_info = ac_get_data_format_info(data_format);
1550 unsigned attrib_align =
1551 dfmt_info->chan_byte_size ? dfmt_info->chan_byte_size : dfmt_info->element_size;
1552
1553 /* If desc->offset is misaligned, then the buffer offset must be too. Just
1554 * skip updating vertex_binding_align in this case.
1555 */
1556 if (desc->offset % attrib_align == 0)
1557 info.vertex_binding_align[desc->binding] =
1558 MAX2(info.vertex_binding_align[desc->binding], attrib_align);
1559
1560 if (!(pipeline->dynamic_states & RADV_DYNAMIC_VERTEX_INPUT_BINDING_STRIDE)) {
1561 /* From the Vulkan spec 1.2.157:
1562 *
1563 * "If the bound pipeline state object was created
1564 * with the
1565 * VK_DYNAMIC_STATE_VERTEX_INPUT_BINDING_STRIDE
1566 * dynamic state enabled then pStrides[i] specifies
1567 * the distance in bytes between two consecutive
1568 * elements within the corresponding buffer. In this
1569 * case the VkVertexInputBindingDescription::stride
1570 * state from the pipeline state object is ignored."
1571 *
1572 * Make sure the vertex attribute stride is zero to
1573 * avoid computing a wrong offset if it's initialized
1574 * to something other than zero.
1575 */
1576 info.vertex_attribute_strides[location] = radv_get_attrib_stride(vi, desc->binding);
1577 }
1578
1579 if (post_shuffle)
1580 info.vertex_post_shuffle |= 1 << location;
1581
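         /* Track where each attribute ends within a vertex and how many whole strides
          * fit before its offset; these are presumably consumed later, e.g. when
          * sizing the vertex buffer descriptors. */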
1582 uint32_t end = desc->offset + vk_format_get_blocksize(desc->format);
1583 info.attrib_ends[desc->location] = end;
1584 if (info.binding_stride[desc->binding])
1585 info.attrib_index_offset[desc->location] =
1586 desc->offset / info.binding_stride[desc->binding];
1587 info.attrib_bindings[desc->location] = desc->binding;
1588 }
1589 }
1590
1591 return info;
1592 }
1593
1594 static struct radv_input_assembly_info
1595 radv_pipeline_init_input_assembly_info(struct radv_graphics_pipeline *pipeline,
1596 const VkGraphicsPipelineCreateInfo *pCreateInfo)
1597 {
1598 const VkPipelineInputAssemblyStateCreateInfo *ia = pCreateInfo->pInputAssemblyState;
1599 struct radv_input_assembly_info info = {0};
1600
1601 info.primitive_topology = si_translate_prim(ia->topology);
1602 info.primitive_restart_enable = !!ia->primitiveRestartEnable;
1603
1604 return info;
1605 }
1606
1607 static struct radv_tessellation_info
1608 radv_pipeline_init_tessellation_info(struct radv_graphics_pipeline *pipeline,
1609 const VkGraphicsPipelineCreateInfo *pCreateInfo)
1610 {
1611 const VkPipelineTessellationStateCreateInfo *ts = pCreateInfo->pTessellationState;
1612 const VkShaderStageFlagBits tess_stages = VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT |
1613 VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT;
1614 struct radv_tessellation_info info = {0};
1615
1616 if ((pipeline->active_stages & tess_stages) == tess_stages) {
1617 info.patch_control_points = ts->patchControlPoints;
1618
1619 const VkPipelineTessellationDomainOriginStateCreateInfo *domain_origin_state =
1620 vk_find_struct_const(ts->pNext, PIPELINE_TESSELLATION_DOMAIN_ORIGIN_STATE_CREATE_INFO);
1621 if (domain_origin_state) {
1622 info.domain_origin = domain_origin_state->domainOrigin;
1623 }
1624 }
1625
1626 return info;
1627 }
1628
1629 static struct radv_viewport_info
1630 radv_pipeline_init_viewport_info(struct radv_graphics_pipeline *pipeline,
1631 const VkGraphicsPipelineCreateInfo *pCreateInfo)
1632 {
1633 const VkPipelineViewportStateCreateInfo *vp = pCreateInfo->pViewportState;
1634 struct radv_viewport_info info = {0};
1635
1636 if (radv_is_raster_enabled(pipeline, pCreateInfo)) {
1637 if (!(pipeline->dynamic_states & RADV_DYNAMIC_VIEWPORT)) {
1638 typed_memcpy(info.viewports, vp->pViewports, vp->viewportCount);
1639 }
1640 info.viewport_count = vp->viewportCount;
1641
1642 if (!(pipeline->dynamic_states & RADV_DYNAMIC_SCISSOR)) {
1643 typed_memcpy(info.scissors, vp->pScissors, vp->scissorCount);
1644 }
1645 info.scissor_count = vp->scissorCount;
1646
1647 const VkPipelineViewportDepthClipControlCreateInfoEXT *depth_clip_control =
1648 vk_find_struct_const(vp->pNext, PIPELINE_VIEWPORT_DEPTH_CLIP_CONTROL_CREATE_INFO_EXT);
1649 if (depth_clip_control) {
1650 info.negative_one_to_one = !!depth_clip_control->negativeOneToOne;
1651 }
1652 }
1653
1654 return info;
1655 }
1656
1657 static struct radv_rasterization_info
1658 radv_pipeline_init_rasterization_info(struct radv_graphics_pipeline *pipeline,
1659 const VkGraphicsPipelineCreateInfo *pCreateInfo)
1660 {
1661 const VkPipelineRasterizationStateCreateInfo *rs = pCreateInfo->pRasterizationState;
1662 struct radv_rasterization_info info = {0};
1663
1664 info.discard_enable = rs->rasterizerDiscardEnable;
1665 info.front_face = rs->frontFace;
1666 info.cull_mode = rs->cullMode;
1667 info.polygon_mode = si_translate_fill(rs->polygonMode);
1668 info.depth_bias_enable = rs->depthBiasEnable;
1669 info.depth_clamp_enable = rs->depthClampEnable;
1670 info.line_width = rs->lineWidth;
1671 info.depth_bias_constant_factor = rs->depthBiasConstantFactor;
1672 info.depth_bias_clamp = rs->depthBiasClamp;
1673 info.depth_bias_slope_factor = rs->depthBiasSlopeFactor;
1674 info.depth_clip_disable = rs->depthClampEnable;
1675
1676 const VkPipelineRasterizationProvokingVertexStateCreateInfoEXT *provoking_vtx_info =
1677 vk_find_struct_const(rs->pNext, PIPELINE_RASTERIZATION_PROVOKING_VERTEX_STATE_CREATE_INFO_EXT);
1678 if (provoking_vtx_info &&
1679 provoking_vtx_info->provokingVertexMode == VK_PROVOKING_VERTEX_MODE_LAST_VERTEX_EXT) {
1680 info.provoking_vtx_last = true;
1681 }
1682
1683 const VkPipelineRasterizationConservativeStateCreateInfoEXT *conservative_raster =
1684 vk_find_struct_const(rs->pNext, PIPELINE_RASTERIZATION_CONSERVATIVE_STATE_CREATE_INFO_EXT);
1685 if (conservative_raster) {
1686 info.conservative_mode = conservative_raster->conservativeRasterizationMode;
1687 }
1688
1689 const VkPipelineRasterizationLineStateCreateInfoEXT *rast_line_info =
1690 vk_find_struct_const(rs->pNext, PIPELINE_RASTERIZATION_LINE_STATE_CREATE_INFO_EXT);
1691 if (rast_line_info) {
1692 info.stippled_line_enable = rast_line_info->stippledLineEnable;
1693 info.line_raster_mode = rast_line_info->lineRasterizationMode;
1694 info.line_stipple_factor = rast_line_info->lineStippleFactor;
1695 info.line_stipple_pattern = rast_line_info->lineStipplePattern;
1696 }
1697
1698 const VkPipelineRasterizationDepthClipStateCreateInfoEXT *depth_clip_state =
1699 vk_find_struct_const(rs->pNext, PIPELINE_RASTERIZATION_DEPTH_CLIP_STATE_CREATE_INFO_EXT);
1700 if (depth_clip_state) {
1701 info.depth_clip_disable = !depth_clip_state->depthClipEnable;
1702 }
1703
1704 const VkPipelineRasterizationStateRasterizationOrderAMD *raster_order =
1705 vk_find_struct_const(rs->pNext, PIPELINE_RASTERIZATION_STATE_RASTERIZATION_ORDER_AMD);
1706 if (raster_order) {
1707 info.order = raster_order->rasterizationOrder;
1708 }
1709
1710 return info;
1711 }
1712
1713 static struct radv_discard_rectangle_info
1714 radv_pipeline_init_discard_rectangle_info(struct radv_graphics_pipeline *pipeline,
1715 const VkGraphicsPipelineCreateInfo *pCreateInfo)
1716 {
1717 const VkPipelineDiscardRectangleStateCreateInfoEXT *discard_rectangle_info =
1718 vk_find_struct_const(pCreateInfo->pNext, PIPELINE_DISCARD_RECTANGLE_STATE_CREATE_INFO_EXT);
1719 struct radv_discard_rectangle_info info = {0};
1720
1721 if (discard_rectangle_info) {
1722 info.mode = discard_rectangle_info->discardRectangleMode;
1723 if (!(pipeline->dynamic_states & RADV_DYNAMIC_DISCARD_RECTANGLE)) {
1724 typed_memcpy(info.rects, discard_rectangle_info->pDiscardRectangles,
1725 discard_rectangle_info->discardRectangleCount);
1726 }
1727 info.count = discard_rectangle_info->discardRectangleCount;
1728 }
1729
1730 return info;
1731 }
1732
1733 static struct radv_multisample_info
1734 radv_pipeline_init_multisample_info(struct radv_graphics_pipeline *pipeline,
1735 const VkGraphicsPipelineCreateInfo *pCreateInfo)
1736 {
1737 const VkPipelineMultisampleStateCreateInfo *ms = pCreateInfo->pMultisampleState;
1738 struct radv_multisample_info info = {0};
1739
1740 if (radv_is_raster_enabled(pipeline, pCreateInfo)) {
1741 info.raster_samples = ms->rasterizationSamples;
1742 info.sample_shading_enable = ms->sampleShadingEnable;
1743 info.min_sample_shading = ms->minSampleShading;
1744 info.alpha_to_coverage_enable = ms->alphaToCoverageEnable;
1745 if (ms->pSampleMask) {
1746 info.sample_mask = ms->pSampleMask[0] & 0xffff;
1747 } else {
1748 info.sample_mask = 0xffff;
1749 }
1750
1751 const VkPipelineSampleLocationsStateCreateInfoEXT *sample_location_info =
1752 vk_find_struct_const(ms->pNext, PIPELINE_SAMPLE_LOCATIONS_STATE_CREATE_INFO_EXT);
1753 if (sample_location_info) {
1754 /* If sampleLocationsEnable is VK_FALSE, the default sample locations are used and the
1755 * values specified in sampleLocationsInfo are ignored.
1756 */
1757 info.sample_locs_enable = sample_location_info->sampleLocationsEnable;
1758 if (sample_location_info->sampleLocationsEnable) {
1759 const VkSampleLocationsInfoEXT *pSampleLocationsInfo =
1760 &sample_location_info->sampleLocationsInfo;
1761 assert(pSampleLocationsInfo->sampleLocationsCount <= MAX_SAMPLE_LOCATIONS);
1762
1763 info.sample_locs_per_pixel = pSampleLocationsInfo->sampleLocationsPerPixel;
1764 info.sample_locs_grid_size = pSampleLocationsInfo->sampleLocationGridSize;
1765 for (uint32_t i = 0; i < pSampleLocationsInfo->sampleLocationsCount; i++) {
1766 info.sample_locs[i] = pSampleLocationsInfo->pSampleLocations[i];
1767 }
1768 info.sample_locs_count = pSampleLocationsInfo->sampleLocationsCount;
1769 }
1770 }
1771 } else {
1772 info.raster_samples = VK_SAMPLE_COUNT_1_BIT;
1773 }
1774
1775 return info;
1776 }
1777
1778 static struct radv_depth_stencil_info
1779 radv_pipeline_init_depth_stencil_info(struct radv_graphics_pipeline *pipeline,
1780 const VkGraphicsPipelineCreateInfo *pCreateInfo)
1781 {
1782 const VkPipelineDepthStencilStateCreateInfo *ds = pCreateInfo->pDepthStencilState;
1783 const VkPipelineRenderingCreateInfo *ri =
1784 vk_find_struct_const(pCreateInfo->pNext, PIPELINE_RENDERING_CREATE_INFO);
1785 struct radv_depth_stencil_info info = {0};
1786
1787 if (radv_is_raster_enabled(pipeline, pCreateInfo) &&
1788 (ri->depthAttachmentFormat != VK_FORMAT_UNDEFINED ||
1789 ri->stencilAttachmentFormat != VK_FORMAT_UNDEFINED)) {
1790 info.depth_bounds_test_enable = ds->depthBoundsTestEnable;
1791 info.depth_bounds.min = ds->minDepthBounds;
1792 info.depth_bounds.max = ds->maxDepthBounds;
1793 info.stencil_test_enable = ds->stencilTestEnable;
1794 info.front.fail_op = ds->front.failOp;
1795 info.front.pass_op = ds->front.passOp;
1796 info.front.depth_fail_op = ds->front.depthFailOp;
1797 info.front.compare_op = ds->front.compareOp;
1798 info.front.compare_mask = ds->front.compareMask;
1799 info.front.write_mask = ds->front.writeMask;
1800 info.front.reference = ds->front.reference;
1801 info.back.fail_op = ds->back.failOp;
1802 info.back.pass_op = ds->back.passOp;
1803 info.back.depth_fail_op = ds->back.depthFailOp;
1804 info.back.compare_op = ds->back.compareOp;
1805 info.back.compare_mask = ds->back.compareMask;
1806 info.back.write_mask = ds->back.writeMask;
1807 info.back.reference = ds->back.reference;
1808 info.depth_test_enable = ds->depthTestEnable;
1809 info.depth_write_enable = ds->depthWriteEnable;
1810 info.depth_compare_op = ds->depthCompareOp;
1811 }
1812
1813 return info;
1814 }
1815
1816 static struct radv_rendering_info
1817 radv_pipeline_init_rendering_info(struct radv_graphics_pipeline *pipeline,
1818 const VkGraphicsPipelineCreateInfo *pCreateInfo)
1819 {
1820 const VkPipelineRenderingCreateInfo *ri =
1821 vk_find_struct_const(pCreateInfo->pNext, PIPELINE_RENDERING_CREATE_INFO);
1822 struct radv_rendering_info info = {0};
1823
1824 info.view_mask = ri->viewMask;
1825 for (uint32_t i = 0; i < ri->colorAttachmentCount; i++) {
1826 info.color_att_formats[i] = ri->pColorAttachmentFormats[i];
1827 }
1828 info.color_att_count = ri->colorAttachmentCount;
1829 info.depth_att_format = ri->depthAttachmentFormat;
1830 info.stencil_att_format = ri->stencilAttachmentFormat;
1831
1832 return info;
1833 }
1834
1835 static struct radv_color_blend_info
1836 radv_pipeline_init_color_blend_info(struct radv_graphics_pipeline *pipeline,
1837 const VkGraphicsPipelineCreateInfo *pCreateInfo)
1838 {
1839 const struct radv_physical_device *pdevice = pipeline->base.device->physical_device;
1840 const VkPipelineColorBlendStateCreateInfo *cb = pCreateInfo->pColorBlendState;
1841 const VkPipelineRenderingCreateInfo *ri =
1842 vk_find_struct_const(pCreateInfo->pNext, PIPELINE_RENDERING_CREATE_INFO);
1843 struct radv_color_blend_info info = {0};
1844 bool has_color_att = false;
1845
1846 for (uint32_t i = 0; i < ri->colorAttachmentCount; ++i) {
1847 if (ri->pColorAttachmentFormats[i] != VK_FORMAT_UNDEFINED) {
1848 has_color_att = true;
1849 break;
1850 }
1851 }
1852
1853 if (radv_is_raster_enabled(pipeline, pCreateInfo) && has_color_att) {
1854 for (uint32_t i = 0; i < cb->attachmentCount; i++) {
1855 const VkPipelineColorBlendAttachmentState *att = &cb->pAttachments[i];
1856
1857 info.att[i].color_write_mask = att->colorWriteMask;
1858 info.att[i].blend_enable = att->blendEnable;
1859 info.att[i].color_blend_op = si_translate_blend_function(att->colorBlendOp);
1860 info.att[i].alpha_blend_op = si_translate_blend_function(att->alphaBlendOp);
1861 info.att[i].src_color_blend_factor =
1862 si_translate_blend_factor(pdevice->rad_info.gfx_level, att->srcColorBlendFactor);
1863 info.att[i].dst_color_blend_factor =
1864 si_translate_blend_factor(pdevice->rad_info.gfx_level, att->dstColorBlendFactor);
1865 info.att[i].src_alpha_blend_factor =
1866 si_translate_blend_factor(pdevice->rad_info.gfx_level, att->srcAlphaBlendFactor);
1867 info.att[i].dst_alpha_blend_factor =
1868 si_translate_blend_factor(pdevice->rad_info.gfx_level, att->dstAlphaBlendFactor);
1869 }
1870 info.att_count = cb->attachmentCount;
1871
1872 for (uint32_t i = 0; i < 4; i++) {
1873 info.blend_constants[i] = cb->blendConstants[i];
1874 }
1875
1876 info.logic_op_enable = cb->logicOpEnable;
1877 if (info.logic_op_enable)
1878 info.logic_op = si_translate_blend_logic_op(cb->logicOp);
1879
1880 const VkPipelineColorWriteCreateInfoEXT *color_write_info =
1881 vk_find_struct_const(cb->pNext, PIPELINE_COLOR_WRITE_CREATE_INFO_EXT);
1882 if (color_write_info) {
1883 for (uint32_t i = 0; i < color_write_info->attachmentCount; i++) {
1884 info.color_write_enable |=
1885 color_write_info->pColorWriteEnables[i] ? (0xfu << (i * 4)) : 0;
1886 }
1887 } else {
1888 info.color_write_enable = 0xffffffffu;
1889 }
1890 }
1891
1892 return info;
1893 }
1894
1895 static struct radv_fragment_shading_rate_info
1896 radv_pipeline_init_fragment_shading_rate_info(struct radv_graphics_pipeline *pipeline,
1897 const VkGraphicsPipelineCreateInfo *pCreateInfo)
1898 {
1899 const VkPipelineFragmentShadingRateStateCreateInfoKHR *shading_rate =
1900 vk_find_struct_const(pCreateInfo->pNext, PIPELINE_FRAGMENT_SHADING_RATE_STATE_CREATE_INFO_KHR);
1901 struct radv_fragment_shading_rate_info info = {0};
1902
1903 if (shading_rate && !(pipeline->dynamic_states & RADV_DYNAMIC_FRAGMENT_SHADING_RATE)) {
1904 info.size = shading_rate->fragmentSize;
1905 for (int i = 0; i < 2; i++)
1906 info.combiner_ops[i] = shading_rate->combinerOps[i];
1907 } else {
1908 info.size = (VkExtent2D){ 1, 1 };
1909 info.combiner_ops[0] = VK_FRAGMENT_SHADING_RATE_COMBINER_OP_KEEP_KHR;
1910 info.combiner_ops[1] = VK_FRAGMENT_SHADING_RATE_COMBINER_OP_KEEP_KHR;
1911 }
1912
1913 return info;
1914 }
1915
1916 static struct radv_graphics_pipeline_info
1917 radv_pipeline_init_graphics_info(struct radv_graphics_pipeline *pipeline,
1918 const VkGraphicsPipelineCreateInfo *pCreateInfo)
1919 {
1920 struct radv_graphics_pipeline_info info = {0};
1921
1922 /* Vertex input interface structs have to be ignored if the pipeline includes a mesh shader. */
1923 if (!(pipeline->active_stages & VK_SHADER_STAGE_MESH_BIT_NV)) {
1924 info.vi = radv_pipeline_init_vertex_input_info(pipeline, pCreateInfo);
1925 info.ia = radv_pipeline_init_input_assembly_info(pipeline, pCreateInfo);
1926 }
1927
1928 info.ts = radv_pipeline_init_tessellation_info(pipeline, pCreateInfo);
1929 info.vp = radv_pipeline_init_viewport_info(pipeline, pCreateInfo);
1930 info.rs = radv_pipeline_init_rasterization_info(pipeline, pCreateInfo);
1931 info.dr = radv_pipeline_init_discard_rectangle_info(pipeline, pCreateInfo);
1932
1933 info.ms = radv_pipeline_init_multisample_info(pipeline, pCreateInfo);
1934 info.ds = radv_pipeline_init_depth_stencil_info(pipeline, pCreateInfo);
1935 info.ri = radv_pipeline_init_rendering_info(pipeline, pCreateInfo);
1936 info.cb = radv_pipeline_init_color_blend_info(pipeline, pCreateInfo);
1937
1938 info.fsr = radv_pipeline_init_fragment_shading_rate_info(pipeline, pCreateInfo);
1939
1940 /* VK_AMD_mixed_attachment_samples */
1941 const VkAttachmentSampleCountInfoAMD *sample_info =
1942 vk_find_struct_const(pCreateInfo->pNext, ATTACHMENT_SAMPLE_COUNT_INFO_AMD);
1943 if (sample_info) {
1944 for (uint32_t i = 0; i < sample_info->colorAttachmentCount; ++i) {
1945 if (info.ri.color_att_formats[i] != VK_FORMAT_UNDEFINED) {
1946 info.color_att_samples = MAX2(info.color_att_samples, sample_info->pColorAttachmentSamples[i]);
1947 }
1948 }
1949 info.ds_att_samples = sample_info->depthStencilAttachmentSamples;
1950 }
1951
1952 return info;
1953 }
1954
1955 static void
1956 radv_pipeline_init_input_assembly_state(struct radv_graphics_pipeline *pipeline,
1957 const struct radv_graphics_pipeline_info *info)
1958 {
1959 pipeline->ia_multi_vgt_param = radv_compute_ia_multi_vgt_param_helpers(pipeline);
1960 }
1961
1962 static void
1963 radv_pipeline_init_dynamic_state(struct radv_graphics_pipeline *pipeline,
1964 const struct radv_graphics_pipeline_info *info)
1965 {
1966 uint64_t needed_states = radv_pipeline_needed_dynamic_state(pipeline, info);
1967 uint64_t states = needed_states;
1968
1969 pipeline->dynamic_state = default_dynamic_state;
1970 pipeline->needed_dynamic_state = needed_states;
1971
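   /* Mask off the states the application declared dynamic; whatever remains must
    * be baked into the pipeline from the create info below. */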
1972 states &= ~pipeline->dynamic_states;
1973
1974 struct radv_dynamic_state *dynamic = &pipeline->dynamic_state;
1975
1976 if (needed_states & RADV_DYNAMIC_VIEWPORT) {
1977 dynamic->viewport.count = info->vp.viewport_count;
1978 if (states & RADV_DYNAMIC_VIEWPORT) {
1979 typed_memcpy(dynamic->viewport.viewports, info->vp.viewports, info->vp.viewport_count);
1980 for (unsigned i = 0; i < dynamic->viewport.count; i++)
1981 radv_get_viewport_xform(&dynamic->viewport.viewports[i],
1982 dynamic->viewport.xform[i].scale, dynamic->viewport.xform[i].translate);
1983 }
1984 }
1985
1986 if (needed_states & RADV_DYNAMIC_SCISSOR) {
1987 dynamic->scissor.count = info->vp.scissor_count;
1988 if (states & RADV_DYNAMIC_SCISSOR) {
1989 typed_memcpy(dynamic->scissor.scissors, info->vp.scissors, info->vp.scissor_count);
1990 }
1991 }
1992
1993 if (states & RADV_DYNAMIC_LINE_WIDTH) {
1994 dynamic->line_width = info->rs.line_width;
1995 }
1996
1997 if (states & RADV_DYNAMIC_DEPTH_BIAS) {
1998 dynamic->depth_bias.bias = info->rs.depth_bias_constant_factor;
1999 dynamic->depth_bias.clamp = info->rs.depth_bias_clamp;
2000 dynamic->depth_bias.slope = info->rs.depth_bias_slope_factor;
2001 }
2002
2003 /* Section 9.2 of the Vulkan 1.0.15 spec says:
2004 *
2005 * pColorBlendState is [...] NULL if the pipeline has rasterization
2006 * disabled or if the subpass of the render pass the pipeline is
2007 * created against does not use any color attachments.
2008 */
2009 if (states & RADV_DYNAMIC_BLEND_CONSTANTS) {
2010 typed_memcpy(dynamic->blend_constants, info->cb.blend_constants, 4);
2011 }
2012
2013 if (states & RADV_DYNAMIC_CULL_MODE) {
2014 dynamic->cull_mode = info->rs.cull_mode;
2015 }
2016
2017 if (states & RADV_DYNAMIC_FRONT_FACE) {
2018 dynamic->front_face = info->rs.front_face;
2019 }
2020
2021 if (states & RADV_DYNAMIC_PRIMITIVE_TOPOLOGY) {
2022 dynamic->primitive_topology = info->ia.primitive_topology;
2023 }
2024
2025 /* If there is no depthstencil attachment, then don't read
2026 * pDepthStencilState. The Vulkan spec states that pDepthStencilState may
2027 * be NULL in this case. Even if pDepthStencilState is non-NULL, there is
2028 * no need to override the depthstencil defaults in
2029 * radv_pipeline::dynamic_state when there is no depthstencil attachment.
2030 *
2031 * Section 9.2 of the Vulkan 1.0.15 spec says:
2032 *
2033 * pDepthStencilState is [...] NULL if the pipeline has rasterization
2034 * disabled or if the subpass of the render pass the pipeline is created
2035 * against does not use a depth/stencil attachment.
2036 */
2037 if (needed_states && radv_pipeline_has_ds_attachments(&info->ri)) {
2038 if (states & RADV_DYNAMIC_DEPTH_BOUNDS) {
2039 dynamic->depth_bounds.min = info->ds.depth_bounds.min;
2040 dynamic->depth_bounds.max = info->ds.depth_bounds.max;
2041 }
2042
2043 if (states & RADV_DYNAMIC_STENCIL_COMPARE_MASK) {
2044 dynamic->stencil_compare_mask.front = info->ds.front.compare_mask;
2045 dynamic->stencil_compare_mask.back = info->ds.back.compare_mask;
2046 }
2047
2048 if (states & RADV_DYNAMIC_STENCIL_WRITE_MASK) {
2049 dynamic->stencil_write_mask.front = info->ds.front.write_mask;
2050 dynamic->stencil_write_mask.back = info->ds.back.write_mask;
2051 }
2052
2053 if (states & RADV_DYNAMIC_STENCIL_REFERENCE) {
2054 dynamic->stencil_reference.front = info->ds.front.reference;
2055 dynamic->stencil_reference.back = info->ds.back.reference;
2056 }
2057
2058 if (states & RADV_DYNAMIC_DEPTH_TEST_ENABLE) {
2059 dynamic->depth_test_enable = info->ds.depth_test_enable;
2060 }
2061
2062 if (states & RADV_DYNAMIC_DEPTH_WRITE_ENABLE) {
2063 dynamic->depth_write_enable = info->ds.depth_write_enable;
2064 }
2065
2066 if (states & RADV_DYNAMIC_DEPTH_COMPARE_OP) {
2067 dynamic->depth_compare_op = info->ds.depth_compare_op;
2068 }
2069
2070 if (states & RADV_DYNAMIC_DEPTH_BOUNDS_TEST_ENABLE) {
2071 dynamic->depth_bounds_test_enable = info->ds.depth_bounds_test_enable;
2072 }
2073
2074 if (states & RADV_DYNAMIC_STENCIL_TEST_ENABLE) {
2075 dynamic->stencil_test_enable = info->ds.stencil_test_enable;
2076 }
2077
2078 if (states & RADV_DYNAMIC_STENCIL_OP) {
2079 dynamic->stencil_op.front.compare_op = info->ds.front.compare_op;
2080 dynamic->stencil_op.front.fail_op = info->ds.front.fail_op;
2081 dynamic->stencil_op.front.pass_op = info->ds.front.pass_op;
2082 dynamic->stencil_op.front.depth_fail_op = info->ds.front.depth_fail_op;
2083
2084 dynamic->stencil_op.back.compare_op = info->ds.back.compare_op;
2085 dynamic->stencil_op.back.fail_op = info->ds.back.fail_op;
2086 dynamic->stencil_op.back.pass_op = info->ds.back.pass_op;
2087 dynamic->stencil_op.back.depth_fail_op = info->ds.back.depth_fail_op;
2088 }
2089 }
2090
2091 if (needed_states & RADV_DYNAMIC_DISCARD_RECTANGLE) {
2092 dynamic->discard_rectangle.count = info->dr.count;
2093 if (states & RADV_DYNAMIC_DISCARD_RECTANGLE) {
2094 typed_memcpy(dynamic->discard_rectangle.rectangles, info->dr.rects, info->dr.count);
2095 }
2096 }
2097
2098 if (needed_states & RADV_DYNAMIC_SAMPLE_LOCATIONS) {
2099 if (info->ms.sample_locs_enable) {
2100 dynamic->sample_location.per_pixel = info->ms.sample_locs_per_pixel;
2101 dynamic->sample_location.grid_size = info->ms.sample_locs_grid_size;
2102 dynamic->sample_location.count = info->ms.sample_locs_count;
2103 typed_memcpy(&dynamic->sample_location.locations[0], info->ms.sample_locs,
2104 info->ms.sample_locs_count);
2105 }
2106 }
2107
2108 if (needed_states & RADV_DYNAMIC_LINE_STIPPLE) {
2109 dynamic->line_stipple.factor = info->rs.line_stipple_factor;
2110 dynamic->line_stipple.pattern = info->rs.line_stipple_pattern;
2111 }
2112
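   /* At this point `states` only contains states that are needed and not declared
    * dynamic, so a cleared stride/vertex-input bit means that state is either
    * unused or provided dynamically, and the stride must be fetched at draw time. */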
2113 if (!(states & RADV_DYNAMIC_VERTEX_INPUT_BINDING_STRIDE) ||
2114 !(states & RADV_DYNAMIC_VERTEX_INPUT))
2115 pipeline->uses_dynamic_stride = true;
2116
2117 if (states & RADV_DYNAMIC_FRAGMENT_SHADING_RATE) {
2118 dynamic->fragment_shading_rate.size = info->fsr.size;
2119 for (int i = 0; i < 2; i++)
2120 dynamic->fragment_shading_rate.combiner_ops[i] = info->fsr.combiner_ops[i];
2121 }
2122
2123 if (states & RADV_DYNAMIC_DEPTH_BIAS_ENABLE) {
2124 dynamic->depth_bias_enable = info->rs.depth_bias_enable;
2125 }
2126
2127 if (states & RADV_DYNAMIC_PRIMITIVE_RESTART_ENABLE) {
2128 dynamic->primitive_restart_enable = info->ia.primitive_restart_enable;
2129 }
2130
2131 if (states & RADV_DYNAMIC_RASTERIZER_DISCARD_ENABLE) {
2132 dynamic->rasterizer_discard_enable = info->rs.discard_enable;
2133 }
2134
2135 if (radv_pipeline_has_color_attachments(&info->ri) && states & RADV_DYNAMIC_LOGIC_OP) {
2136 if (info->cb.logic_op_enable) {
2137 dynamic->logic_op = info->cb.logic_op;
2138 } else {
2139 dynamic->logic_op = V_028808_ROP3_COPY;
2140 }
2141 }
2142
2143 if (states & RADV_DYNAMIC_COLOR_WRITE_ENABLE) {
2144 dynamic->color_write_enable = info->cb.color_write_enable;
2145 }
2146
2147 pipeline->dynamic_state.mask = states;
2148 }
2149
2150 static void
2151 radv_pipeline_init_raster_state(struct radv_graphics_pipeline *pipeline,
2152 const struct radv_graphics_pipeline_info *info)
2153 {
2154 const struct radv_device *device = pipeline->base.device;
2155
2156 pipeline->pa_su_sc_mode_cntl =
2157 S_028814_FACE(info->rs.front_face) |
2158 S_028814_CULL_FRONT(!!(info->rs.cull_mode & VK_CULL_MODE_FRONT_BIT)) |
2159 S_028814_CULL_BACK(!!(info->rs.cull_mode & VK_CULL_MODE_BACK_BIT)) |
2160 S_028814_POLY_MODE(info->rs.polygon_mode != V_028814_X_DRAW_TRIANGLES) |
2161 S_028814_POLYMODE_FRONT_PTYPE(info->rs.polygon_mode) |
2162 S_028814_POLYMODE_BACK_PTYPE(info->rs.polygon_mode) |
2163 S_028814_POLY_OFFSET_FRONT_ENABLE(info->rs.depth_bias_enable) |
2164 S_028814_POLY_OFFSET_BACK_ENABLE(info->rs.depth_bias_enable) |
2165 S_028814_POLY_OFFSET_PARA_ENABLE(info->rs.depth_bias_enable) |
2166 S_028814_PROVOKING_VTX_LAST(info->rs.provoking_vtx_last);
2167
2168 if (device->physical_device->rad_info.gfx_level >= GFX10) {
2169 /* It should also be set if PERPENDICULAR_ENDCAP_ENA is set. */
2170 pipeline->pa_su_sc_mode_cntl |=
2171 S_028814_KEEP_TOGETHER_ENABLE(info->rs.polygon_mode != V_028814_X_DRAW_TRIANGLES);
2172 }
2173
2174 pipeline->pa_cl_clip_cntl =
2175 S_028810_DX_CLIP_SPACE_DEF(!pipeline->negative_one_to_one) |
2176 S_028810_ZCLIP_NEAR_DISABLE(info->rs.depth_clip_disable) |
2177 S_028810_ZCLIP_FAR_DISABLE(info->rs.depth_clip_disable) |
2178 S_028810_DX_RASTERIZATION_KILL(info->rs.discard_enable) |
2179 S_028810_DX_LINEAR_ATTR_CLIP_ENA(1);
2180
2181 pipeline->uses_conservative_overestimate =
2182 info->rs.conservative_mode == VK_CONSERVATIVE_RASTERIZATION_MODE_OVERESTIMATE_EXT;
2183
2184 pipeline->depth_clamp_mode = RADV_DEPTH_CLAMP_MODE_VIEWPORT;
2185 if (!info->rs.depth_clamp_enable) {
2186 /* For optimal performance, depth clamping should always be enabled except if the
2187 * application disables clamping explicitly or uses depth values outside of the [0.0, 1.0]
2188 * range.
2189 */
2190 if (info->rs.depth_clip_disable ||
2191 device->vk.enabled_extensions.EXT_depth_range_unrestricted) {
2192 pipeline->depth_clamp_mode = RADV_DEPTH_CLAMP_MODE_DISABLED;
2193 } else {
2194 pipeline->depth_clamp_mode = RADV_DEPTH_CLAMP_MODE_ZERO_TO_ONE;
2195 }
2196 }
2197 }
2198
2199 static struct radv_depth_stencil_state
2200 radv_pipeline_init_depth_stencil_state(struct radv_graphics_pipeline *pipeline,
2201 const struct radv_graphics_pipeline_info *info)
2202 {
2203 const struct radv_physical_device *pdevice = pipeline->base.device->physical_device;
2204 struct radv_depth_stencil_state ds_state = {0};
2205 uint32_t db_depth_control = 0;
2206
2207 bool has_depth_attachment = info->ri.depth_att_format != VK_FORMAT_UNDEFINED;
2208 bool has_stencil_attachment = info->ri.stencil_att_format != VK_FORMAT_UNDEFINED;
2209
2210 if (has_depth_attachment) {
2211 /* from amdvlk: For 4xAA and 8xAA need to decompress on flush for better performance */
2212 ds_state.db_render_override2 |= S_028010_DECOMPRESS_Z_ON_FLUSH(info->ms.raster_samples > 2);
2213
2214 if (pdevice->rad_info.gfx_level >= GFX10_3)
2215 ds_state.db_render_override2 |= S_028010_CENTROID_COMPUTATION_MODE(1);
2216
2217 db_depth_control = S_028800_Z_ENABLE(info->ds.depth_test_enable) |
2218 S_028800_Z_WRITE_ENABLE(info->ds.depth_write_enable) |
2219 S_028800_ZFUNC(info->ds.depth_compare_op) |
2220 S_028800_DEPTH_BOUNDS_ENABLE(info->ds.depth_bounds_test_enable);
2221 }
2222
2223 if (has_stencil_attachment && info->ds.stencil_test_enable) {
2224 db_depth_control |= S_028800_STENCIL_ENABLE(1) | S_028800_BACKFACE_ENABLE(1);
2225 db_depth_control |= S_028800_STENCILFUNC(info->ds.front.compare_op);
2226 db_depth_control |= S_028800_STENCILFUNC_BF(info->ds.back.compare_op);
2227 }
2228
2229 ds_state.db_render_override |= S_02800C_FORCE_HIS_ENABLE0(V_02800C_FORCE_DISABLE) |
2230 S_02800C_FORCE_HIS_ENABLE1(V_02800C_FORCE_DISABLE);
2231
2232 if (pipeline->depth_clamp_mode == RADV_DEPTH_CLAMP_MODE_DISABLED)
2233 ds_state.db_render_override |= S_02800C_DISABLE_VIEWPORT_CLAMP(1);
2234
2235 if (pdevice->rad_info.gfx_level >= GFX11) {
2236 unsigned max_allowed_tiles_in_wave = 0;
2237 unsigned num_samples = MAX2(radv_pipeline_color_samples(info),
2238 radv_pipeline_depth_samples(info));
2239
2240 if (pdevice->rad_info.has_dedicated_vram) {
2241 if (num_samples == 8)
2242 max_allowed_tiles_in_wave = 7;
2243 else if (num_samples == 4)
2244 max_allowed_tiles_in_wave = 14;
2245 } else {
2246 if (num_samples == 8)
2247 max_allowed_tiles_in_wave = 8;
2248 }
2249
2250 /* TODO: We may want to disable this workaround for future chips. */
2251 if (num_samples >= 4) {
2252 if (max_allowed_tiles_in_wave)
2253 max_allowed_tiles_in_wave--;
2254 else
2255 max_allowed_tiles_in_wave = 15;
2256 }
2257
2258 ds_state.db_render_control |= S_028000_OREO_MODE(V_028000_OMODE_O_THEN_B) |
2259 S_028000_MAX_ALLOWED_TILES_IN_WAVE(max_allowed_tiles_in_wave);
2260 }
2261
2262 pipeline->db_depth_control = db_depth_control;
2263
2264 return ds_state;
2265 }
2266
2267 static void
2268 gfx9_get_gs_info(const struct radv_pipeline_key *key, const struct radv_pipeline *pipeline,
2269 struct radv_pipeline_stage *stages, struct gfx9_gs_info *out)
2270 {
2271 const struct radv_physical_device *pdevice = pipeline->device->physical_device;
2272 struct radv_shader_info *gs_info = &stages[MESA_SHADER_GEOMETRY].info;
2273 struct radv_es_output_info *es_info;
2274 bool has_tess = !!stages[MESA_SHADER_TESS_CTRL].nir;
2275
2276 if (pdevice->rad_info.gfx_level >= GFX9)
2277 es_info = has_tess ? &gs_info->tes.es_info : &gs_info->vs.es_info;
2278 else
2279 es_info = has_tess ? &stages[MESA_SHADER_TESS_EVAL].info.tes.es_info
2280 : &stages[MESA_SHADER_VERTEX].info.vs.es_info;
2281
2282 unsigned gs_num_invocations = MAX2(gs_info->gs.invocations, 1);
2283 bool uses_adjacency;
2284 switch (key->vs.topology) {
2285 case V_008958_DI_PT_LINELIST_ADJ:
2286 case V_008958_DI_PT_LINESTRIP_ADJ:
2287 case V_008958_DI_PT_TRILIST_ADJ:
2288 case V_008958_DI_PT_TRISTRIP_ADJ:
2289 uses_adjacency = true;
2290 break;
2291 default:
2292 uses_adjacency = false;
2293 break;
2294 }
2295
2296 /* All these are in dwords: */
2297 /* We can't allow using the whole LDS, because GS waves compete with
2298 * other shader stages for LDS space. */
2299 const unsigned max_lds_size = 8 * 1024;
2300 const unsigned esgs_itemsize = es_info->esgs_itemsize / 4;
2301 unsigned esgs_lds_size;
2302
2303 /* All these are per subgroup: */
2304 const unsigned max_out_prims = 32 * 1024;
2305 const unsigned max_es_verts = 255;
2306 const unsigned ideal_gs_prims = 64;
2307 unsigned max_gs_prims, gs_prims;
2308 unsigned min_es_verts, es_verts, worst_case_es_verts;
2309
2310 if (uses_adjacency || gs_num_invocations > 1)
2311 max_gs_prims = 127 / gs_num_invocations;
2312 else
2313 max_gs_prims = 255;
2314
2315 /* MAX_PRIMS_PER_SUBGROUP = gs_prims * max_vert_out * gs_invocations.
2316 * Make sure we don't go over the maximum value.
2317 */
2318 if (gs_info->gs.vertices_out > 0) {
2319 max_gs_prims =
2320 MIN2(max_gs_prims, max_out_prims / (gs_info->gs.vertices_out * gs_num_invocations));
2321 }
2322 assert(max_gs_prims > 0);
2323
2324 /* If the primitive has adjacency, halve the number of vertices
2325 * that will be reused in multiple primitives.
2326 */
2327 min_es_verts = gs_info->gs.vertices_in / (uses_adjacency ? 2 : 1);
2328
2329 gs_prims = MIN2(ideal_gs_prims, max_gs_prims);
2330 worst_case_es_verts = MIN2(min_es_verts * gs_prims, max_es_verts);
2331
2332 /* Compute ESGS LDS size based on the worst case number of ES vertices
2333 * needed to create the target number of GS prims per subgroup.
2334 */
2335 esgs_lds_size = esgs_itemsize * worst_case_es_verts;
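   /* Illustrative sizing with hypothetical numbers: for a triangle-list GS with
    * esgs_itemsize = 4 dwords, min_es_verts = 3 and gs_prims = 64, this gives
    * worst_case_es_verts = MIN2(3 * 64, 255) = 192 and esgs_lds_size = 768 dwords,
    * comfortably below the 8K-dword cap. */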
2336
2337 /* If total LDS usage is too big, refactor partitions based on ratio
2338 * of ESGS item sizes.
2339 */
2340 if (esgs_lds_size > max_lds_size) {
2341 /* Our target GS Prims Per Subgroup was too large. Calculate
2342 * the maximum number of GS Prims Per Subgroup that will fit
2343 * into LDS, capped by the maximum that the hardware can support.
2344 */
2345 gs_prims = MIN2((max_lds_size / (esgs_itemsize * min_es_verts)), max_gs_prims);
2346 assert(gs_prims > 0);
2347 worst_case_es_verts = MIN2(min_es_verts * gs_prims, max_es_verts);
2348
2349 esgs_lds_size = esgs_itemsize * worst_case_es_verts;
2350 assert(esgs_lds_size <= max_lds_size);
2351 }
2352
2353 /* Now calculate remaining ESGS information. */
2354 if (esgs_lds_size)
2355 es_verts = MIN2(esgs_lds_size / esgs_itemsize, max_es_verts);
2356 else
2357 es_verts = max_es_verts;
2358
2359 /* Vertices for adjacency primitives are not always reused, so restore
2360 * it for ES_VERTS_PER_SUBGRP.
2361 */
2362 min_es_verts = gs_info->gs.vertices_in;
2363
2364 /* For normal primitives, the VGT only checks if they are past the ES
2365 * verts per subgroup after allocating a full GS primitive and, if they
2366 * are, kicks off a new subgroup. But if those additional ES verts are
2367 * unique (i.e. not reused) we need to make sure there is enough LDS
2368 * space to account for those ES verts beyond ES_VERTS_PER_SUBGRP.
2369 */
2370 es_verts -= min_es_verts - 1;
2371
2372 uint32_t es_verts_per_subgroup = es_verts;
2373 uint32_t gs_prims_per_subgroup = gs_prims;
2374 uint32_t gs_inst_prims_in_subgroup = gs_prims * gs_num_invocations;
2375 uint32_t max_prims_per_subgroup = gs_inst_prims_in_subgroup * gs_info->gs.vertices_out;
2376 out->lds_size = align(esgs_lds_size, 128) / 128;
2377 out->vgt_gs_onchip_cntl = S_028A44_ES_VERTS_PER_SUBGRP(es_verts_per_subgroup) |
2378 S_028A44_GS_PRIMS_PER_SUBGRP(gs_prims_per_subgroup) |
2379 S_028A44_GS_INST_PRIMS_IN_SUBGRP(gs_inst_prims_in_subgroup);
2380 out->vgt_gs_max_prims_per_subgroup = S_028A94_MAX_PRIMS_PER_SUBGROUP(max_prims_per_subgroup);
2381 out->vgt_esgs_ring_itemsize = esgs_itemsize;
2382 assert(max_prims_per_subgroup <= max_out_prims);
2383
2384 gl_shader_stage es_stage = has_tess ? MESA_SHADER_TESS_EVAL : MESA_SHADER_VERTEX;
2385 unsigned workgroup_size = ac_compute_esgs_workgroup_size(
2386 pdevice->rad_info.gfx_level, stages[es_stage].info.wave_size,
2387 es_verts_per_subgroup, gs_inst_prims_in_subgroup);
2388 stages[es_stage].info.workgroup_size = workgroup_size;
2389 stages[MESA_SHADER_GEOMETRY].info.workgroup_size = workgroup_size;
2390 }
2391
2392 static void
2393 clamp_gsprims_to_esverts(unsigned *max_gsprims, unsigned max_esverts, unsigned min_verts_per_prim,
2394 bool use_adjacency)
2395 {
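   /* With maximum vertex reuse, every primitive after the first needs at least one
    * ES vertex that is not reused, so N primitives require at least
    * min_verts_per_prim + (N - 1) vertices; adjacency halves the effective reuse. */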
2396 unsigned max_reuse = max_esverts - min_verts_per_prim;
2397 if (use_adjacency)
2398 max_reuse /= 2;
2399 *max_gsprims = MIN2(*max_gsprims, 1 + max_reuse);
2400 }
2401
2402 static unsigned
2403 radv_get_num_input_vertices(const struct radv_pipeline_stage *stages)
2404 {
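   /* Vertices per input primitive: the GS input primitive when a GS is present, the
    * tessellated primitive when tessellation is used, otherwise 3 (a triangle list
    * is assumed for NGG sizing purposes). */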
2405 if (stages[MESA_SHADER_GEOMETRY].nir) {
2406 nir_shader *gs = stages[MESA_SHADER_GEOMETRY].nir;
2407
2408 return gs->info.gs.vertices_in;
2409 }
2410
2411 if (stages[MESA_SHADER_TESS_CTRL].nir) {
2412 nir_shader *tes = stages[MESA_SHADER_TESS_EVAL].nir;
2413
2414 if (tes->info.tess.point_mode)
2415 return 1;
2416 if (tes->info.tess._primitive_mode == TESS_PRIMITIVE_ISOLINES)
2417 return 2;
2418 return 3;
2419 }
2420
2421 return 3;
2422 }
2423
2424 static void
2425 gfx10_emit_ge_pc_alloc(struct radeon_cmdbuf *cs, enum amd_gfx_level gfx_level,
2426 uint32_t oversub_pc_lines)
2427 {
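   /* NUM_PC_LINES holds the line count minus one; when oversub_pc_lines is 0,
    * OVERSUB_EN is cleared and the (wrapped) count field should be ignored by the
    * hardware. */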
2428 radeon_set_uconfig_reg(
2429 cs, R_030980_GE_PC_ALLOC,
2430 S_030980_OVERSUB_EN(oversub_pc_lines > 0) | S_030980_NUM_PC_LINES(oversub_pc_lines - 1));
2431 }
2432
2433 static void
2434 gfx10_get_ngg_ms_info(struct radv_pipeline_stage *stage, struct gfx10_ngg_info *ngg)
2435 {
2436 /* Special case for mesh shader workgroups.
2437 *
2438 * Mesh shaders don't have any real vertex input, but they can produce
2439 * an arbitrary number of vertices and primitives (up to 256).
2440 * We need to precisely control the number of mesh shader workgroups
2441 * that are launched from draw calls.
2442 *
2443 * To achieve that, we set:
2444 * - input primitive topology to point list
2445 * - input vertex and primitive count to 1
2446 * - max output vertex count and primitive amplification factor
2447 * to the boundaries of the shader
2448 *
2449 * With that, in the draw call:
2450 * - drawing 1 input vertex ~ launching 1 mesh shader workgroup
2451 *
2452 * In the shader:
2453 * - base vertex ~ first workgroup index (firstTask in NV_mesh_shader)
2454 * - input vertex id ~ workgroup id (in 1D - shader needs to calculate in 3D)
2455 *
2456 * Notes:
2457 * - without GS_EN=1 PRIM_AMP_FACTOR and MAX_VERTS_PER_SUBGROUP don't seem to work
2458 * - with GS_EN=1 we must also set VGT_GS_MAX_VERT_OUT (otherwise the GPU hangs)
2459 * - with GS_FAST_LAUNCH=1 every lane's VGPRs are initialized to the same input vertex index
2460 *
2461 */
2462 nir_shader *ms = stage->nir;
2463
2464 ngg->enable_vertex_grouping = true;
2465 ngg->esgs_ring_size = 1;
2466 ngg->hw_max_esverts = 1;
2467 ngg->max_gsprims = 1;
2468 ngg->max_out_verts = ms->info.mesh.max_vertices_out;
2469 ngg->max_vert_out_per_gs_instance = false;
2470 ngg->ngg_emit_size = 0;
2471 ngg->prim_amp_factor = ms->info.mesh.max_primitives_out;
2472 ngg->vgt_esgs_ring_itemsize = 1;
2473
2474 unsigned min_ngg_workgroup_size =
2475 ac_compute_ngg_workgroup_size(ngg->hw_max_esverts, ngg->max_gsprims,
2476 ngg->max_out_verts, ngg->prim_amp_factor);
2477
2478 unsigned api_workgroup_size =
2479 ac_compute_cs_workgroup_size(ms->info.workgroup_size, false, UINT32_MAX);
2480
2481 stage->info.workgroup_size = MAX2(min_ngg_workgroup_size, api_workgroup_size);
2482 }
2483
2484 static void
2485 gfx10_get_ngg_info(const struct radv_pipeline_key *key, struct radv_pipeline *pipeline,
2486 struct radv_pipeline_stage *stages, struct gfx10_ngg_info *ngg)
2487 {
2488 const struct radv_physical_device *pdevice = pipeline->device->physical_device;
2489 struct radv_shader_info *gs_info = &stages[MESA_SHADER_GEOMETRY].info;
2490 struct radv_es_output_info *es_info =
2491 stages[MESA_SHADER_TESS_CTRL].nir ? &gs_info->tes.es_info : &gs_info->vs.es_info;
2492 unsigned gs_type = stages[MESA_SHADER_GEOMETRY].nir ? MESA_SHADER_GEOMETRY : MESA_SHADER_VERTEX;
2493 unsigned max_verts_per_prim = radv_get_num_input_vertices(stages);
2494 unsigned min_verts_per_prim = gs_type == MESA_SHADER_GEOMETRY ? max_verts_per_prim : 1;
2495 unsigned gs_num_invocations = stages[MESA_SHADER_GEOMETRY].nir ? MAX2(gs_info->gs.invocations, 1) : 1;
2496 bool uses_adjacency;
2497 switch (key->vs.topology) {
2498 case V_008958_DI_PT_LINELIST_ADJ:
2499 case V_008958_DI_PT_LINESTRIP_ADJ:
2500 case V_008958_DI_PT_TRILIST_ADJ:
2501 case V_008958_DI_PT_TRISTRIP_ADJ:
2502 uses_adjacency = true;
2503 break;
2504 default:
2505 uses_adjacency = false;
2506 break;
2507 }
2508
2509 /* All these are in dwords: */
2510 /* We can't allow using the whole LDS, because GS waves compete with
2511 * other shader stages for LDS space.
2512 *
2513 * TODO: We should really take the shader's internal LDS use into
2514 * account. The linker will fail if the size is greater than
2515 * 8K dwords.
2516 */
2517 const unsigned max_lds_size = 8 * 1024 - 768;
2518 const unsigned target_lds_size = max_lds_size;
2519 unsigned esvert_lds_size = 0;
2520 unsigned gsprim_lds_size = 0;
2521
2522 /* All these are per subgroup: */
2523 const unsigned min_esverts = pdevice->rad_info.gfx_level >= GFX10_3 ? 29 : 24;
2524 bool max_vert_out_per_gs_instance = false;
2525 unsigned max_esverts_base = 128;
2526 unsigned max_gsprims_base = 128; /* default prim group size clamp */
2527
2528 /* Hardware has the following non-natural restrictions on the value
2529 * of GE_CNTL.VERT_GRP_SIZE based on the primitive type of
2530 * the draw:
2531 * - at most 252 for any line input primitive type
2532 * - at most 251 for any quad input primitive type
2533 * - at most 251 for triangle strips with adjacency (this happens to
2534 * be the natural limit for triangle *lists* with adjacency)
2535 */
2536 max_esverts_base = MIN2(max_esverts_base, 251 + max_verts_per_prim - 1);
2537
2538 if (gs_type == MESA_SHADER_GEOMETRY) {
2539 unsigned max_out_verts_per_gsprim = gs_info->gs.vertices_out * gs_num_invocations;
2540
2541 if (max_out_verts_per_gsprim <= 256) {
2542 if (max_out_verts_per_gsprim) {
2543 max_gsprims_base = MIN2(max_gsprims_base, 256 / max_out_verts_per_gsprim);
2544 }
2545 } else {
2546 /* Use special multi-cycling mode in which each GS
2547 * instance gets its own subgroup. Does not work with
2548 * tessellation. */
2549 max_vert_out_per_gs_instance = true;
2550 max_gsprims_base = 1;
2551 max_out_verts_per_gsprim = gs_info->gs.vertices_out;
2552 }
2553
2554 esvert_lds_size = es_info->esgs_itemsize / 4;
2555 gsprim_lds_size = (gs_info->gs.gsvs_vertex_size / 4 + 1) * max_out_verts_per_gsprim;
2556 } else {
2557 /* VS and TES. */
2558 /* LDS size for passing data from GS to ES. */
2559 struct radv_streamout_info *so_info = stages[MESA_SHADER_TESS_CTRL].nir
2560 ? &stages[MESA_SHADER_TESS_EVAL].info.so
2561 : &stages[MESA_SHADER_VERTEX].info.so;
2562
2563 if (so_info->num_outputs)
2564 esvert_lds_size = 4 * so_info->num_outputs + 1;
2565
2566 /* GS stores Primitive IDs (one DWORD) into LDS at the address
2567 * corresponding to the ES thread of the provoking vertex. All
2568 * ES threads load and export PrimitiveID for their thread.
2569 */
2570 if (!stages[MESA_SHADER_TESS_CTRL].nir && stages[MESA_SHADER_VERTEX].info.vs.outinfo.export_prim_id)
2571 esvert_lds_size = MAX2(esvert_lds_size, 1);
2572 }
2573
2574 unsigned max_gsprims = max_gsprims_base;
2575 unsigned max_esverts = max_esverts_base;
2576
2577 if (esvert_lds_size)
2578 max_esverts = MIN2(max_esverts, target_lds_size / esvert_lds_size);
2579 if (gsprim_lds_size)
2580 max_gsprims = MIN2(max_gsprims, target_lds_size / gsprim_lds_size);
2581
2582 max_esverts = MIN2(max_esverts, max_gsprims * max_verts_per_prim);
2583 clamp_gsprims_to_esverts(&max_gsprims, max_esverts, min_verts_per_prim, uses_adjacency);
2584 assert(max_esverts >= max_verts_per_prim && max_gsprims >= 1);
2585
2586 if (esvert_lds_size || gsprim_lds_size) {
2587 /* Now that we have a rough proportionality between esverts
2588 * and gsprims based on the primitive type, scale both of them
2589 * down simultaneously based on required LDS space.
2590 *
2591 * We could be smarter about this if we knew how much vertex
2592 * reuse to expect.
2593 */
2594 unsigned lds_total = max_esverts * esvert_lds_size + max_gsprims * gsprim_lds_size;
2595 if (lds_total > target_lds_size) {
2596 max_esverts = max_esverts * target_lds_size / lds_total;
2597 max_gsprims = max_gsprims * target_lds_size / lds_total;
2598
2599 max_esverts = MIN2(max_esverts, max_gsprims * max_verts_per_prim);
2600 clamp_gsprims_to_esverts(&max_gsprims, max_esverts, min_verts_per_prim, uses_adjacency);
2601 assert(max_esverts >= max_verts_per_prim && max_gsprims >= 1);
2602 }
2603 }
2604
2605 /* Round up towards full wave sizes for better ALU utilization. */
2606 if (!max_vert_out_per_gs_instance) {
2607 unsigned orig_max_esverts;
2608 unsigned orig_max_gsprims;
2609 unsigned wavesize;
2610
2611 if (gs_type == MESA_SHADER_GEOMETRY) {
2612 wavesize = gs_info->wave_size;
2613 } else {
2614 wavesize = stages[MESA_SHADER_TESS_CTRL].nir ? stages[MESA_SHADER_TESS_EVAL].info.wave_size
2615 : stages[MESA_SHADER_VERTEX].info.wave_size;
2616 }
2617
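      /* Iterate because aligning one of the two limits can invalidate the
       * constraints just applied to the other; loop until both values settle. */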
2618 do {
2619 orig_max_esverts = max_esverts;
2620 orig_max_gsprims = max_gsprims;
2621
2622 max_esverts = align(max_esverts, wavesize);
2623 max_esverts = MIN2(max_esverts, max_esverts_base);
2624 if (esvert_lds_size)
2625 max_esverts =
2626 MIN2(max_esverts, (max_lds_size - max_gsprims * gsprim_lds_size) / esvert_lds_size);
2627 max_esverts = MIN2(max_esverts, max_gsprims * max_verts_per_prim);
2628
2629 /* Hardware restriction: minimum value of max_esverts */
2630 if (pdevice->rad_info.gfx_level == GFX10)
2631 max_esverts = MAX2(max_esverts, min_esverts - 1 + max_verts_per_prim);
2632 else
2633 max_esverts = MAX2(max_esverts, min_esverts);
2634
2635 max_gsprims = align(max_gsprims, wavesize);
2636 max_gsprims = MIN2(max_gsprims, max_gsprims_base);
2637 if (gsprim_lds_size) {
2638 /* Don't count unusable vertices to the LDS
2639 * size. Those are vertices above the maximum
2640 * number of vertices that can occur in the
2641 * workgroup, which is e.g. max_gsprims * 3
2642 * for triangles.
2643 */
2644 unsigned usable_esverts = MIN2(max_esverts, max_gsprims * max_verts_per_prim);
2645 max_gsprims = MIN2(max_gsprims,
2646 (max_lds_size - usable_esverts * esvert_lds_size) / gsprim_lds_size);
2647 }
2648 clamp_gsprims_to_esverts(&max_gsprims, max_esverts, min_verts_per_prim, uses_adjacency);
2649 assert(max_esverts >= max_verts_per_prim && max_gsprims >= 1);
2650 } while (orig_max_esverts != max_esverts || orig_max_gsprims != max_gsprims);
2651
2652 /* Verify the restriction. */
2653 if (pdevice->rad_info.gfx_level == GFX10)
2654 assert(max_esverts >= min_esverts - 1 + max_verts_per_prim);
2655 else
2656 assert(max_esverts >= min_esverts);
2657 } else {
2658 /* Hardware restriction: minimum value of max_esverts */
2659 if (pdevice->rad_info.gfx_level == GFX10)
2660 max_esverts = MAX2(max_esverts, min_esverts - 1 + max_verts_per_prim);
2661 else
2662 max_esverts = MAX2(max_esverts, min_esverts);
2663 }
2664
2665 unsigned max_out_vertices = max_vert_out_per_gs_instance ? gs_info->gs.vertices_out
2666 : gs_type == MESA_SHADER_GEOMETRY
2667 ? max_gsprims * gs_num_invocations * gs_info->gs.vertices_out
2668 : max_esverts;
2669 assert(max_out_vertices <= 256);
2670
2671 unsigned prim_amp_factor = 1;
2672 if (gs_type == MESA_SHADER_GEOMETRY) {
2673 /* Number of output primitives per GS input primitive after
2674 * GS instancing. */
2675 prim_amp_factor = gs_info->gs.vertices_out;
2676 }
2677
2678 /* On Gfx10, the GE only checks against the maximum number of ES verts
2679 * after allocating a full GS primitive. So we need to ensure that
2680 * whenever this check passes, there is enough space for a full
2681 * primitive without vertex reuse.
2682 */
2683 if (pdevice->rad_info.gfx_level == GFX10)
2684 ngg->hw_max_esverts = max_esverts - max_verts_per_prim + 1;
2685 else
2686 ngg->hw_max_esverts = max_esverts;
2687
2688 ngg->max_gsprims = max_gsprims;
2689 ngg->max_out_verts = max_out_vertices;
2690 ngg->prim_amp_factor = prim_amp_factor;
2691 ngg->max_vert_out_per_gs_instance = max_vert_out_per_gs_instance;
2692 ngg->ngg_emit_size = max_gsprims * gsprim_lds_size;
2693 ngg->enable_vertex_grouping = true;
2694
2695 /* Don't count unusable vertices. */
2696 ngg->esgs_ring_size = MIN2(max_esverts, max_gsprims * max_verts_per_prim) * esvert_lds_size * 4;
2697
2698 if (gs_type == MESA_SHADER_GEOMETRY) {
2699 ngg->vgt_esgs_ring_itemsize = es_info->esgs_itemsize / 4;
2700 } else {
2701 ngg->vgt_esgs_ring_itemsize = 1;
2702 }
2703
2704 assert(ngg->hw_max_esverts >= min_esverts); /* HW limitation */
2705
2706 gl_shader_stage es_stage = stages[MESA_SHADER_TESS_CTRL].nir ? MESA_SHADER_TESS_EVAL : MESA_SHADER_VERTEX;
2707 unsigned workgroup_size =
2708 ac_compute_ngg_workgroup_size(
2709 max_esverts, max_gsprims * gs_num_invocations, max_out_vertices, prim_amp_factor);
2710 stages[MESA_SHADER_GEOMETRY].info.workgroup_size = workgroup_size;
2711 stages[es_stage].info.workgroup_size = workgroup_size;
2712 }
2713
2714 static void
2715 radv_pipeline_init_gs_ring_state(struct radv_graphics_pipeline *pipeline, const struct gfx9_gs_info *gs)
2716 {
2717 const struct radv_physical_device *pdevice = pipeline->base.device->physical_device;
2718 unsigned num_se = pdevice->rad_info.max_se;
2719 unsigned wave_size = 64;
2720 unsigned max_gs_waves = 32 * num_se; /* max 32 per SE on GCN */
2721 /* On GFX6-GFX7, the value comes from VGT_GS_VERTEX_REUSE = 16.
2722 * On GFX8+, the value comes from VGT_VERTEX_REUSE_BLOCK_CNTL = 30 (+2).
2723 */
2724 unsigned gs_vertex_reuse = (pdevice->rad_info.gfx_level >= GFX8 ? 32 : 16) * num_se;
2725 unsigned alignment = 256 * num_se;
2726 /* The maximum size is 63.999 MB per SE. */
2727 unsigned max_size = ((unsigned)(63.999 * 1024 * 1024) & ~255) * num_se;
2728 struct radv_shader_info *gs_info = &pipeline->base.shaders[MESA_SHADER_GEOMETRY]->info;
2729
2730 /* Calculate the minimum size. */
2731 unsigned min_esgs_ring_size =
2732 align(gs->vgt_esgs_ring_itemsize * 4 * gs_vertex_reuse * wave_size, alignment);
2733 /* These are recommended sizes, not minimum sizes. */
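   /* Illustrative sizing with hypothetical numbers: with 4 SEs (max_gs_waves = 128),
    * a 1-dword ESGS item and 3 vertices per input primitive, esgs_ring_size is
    * 128 * 2 * 64 * 4 * 3 = 196608 bytes (192 KiB) before alignment. */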
2734 unsigned esgs_ring_size =
2735 max_gs_waves * 2 * wave_size * gs->vgt_esgs_ring_itemsize * 4 * gs_info->gs.vertices_in;
2736 unsigned gsvs_ring_size = max_gs_waves * 2 * wave_size * gs_info->gs.max_gsvs_emit_size;
2737
2738 min_esgs_ring_size = align(min_esgs_ring_size, alignment);
2739 esgs_ring_size = align(esgs_ring_size, alignment);
2740 gsvs_ring_size = align(gsvs_ring_size, alignment);
2741
2742 if (pdevice->rad_info.gfx_level <= GFX8)
2743 pipeline->esgs_ring_size = CLAMP(esgs_ring_size, min_esgs_ring_size, max_size);
2744
2745 pipeline->gsvs_ring_size = MIN2(gsvs_ring_size, max_size);
2746 }
2747
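/* Return the shader that executes the given API stage, taking merged stages into account:
 * on GFX9+ the VS may live in the TCS or GS binary and the TES may live in the GS binary.
 */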
2748 struct radv_shader *
2749 radv_get_shader(const struct radv_pipeline *pipeline, gl_shader_stage stage)
2750 {
2751 if (stage == MESA_SHADER_VERTEX) {
2752 if (pipeline->shaders[MESA_SHADER_VERTEX])
2753 return pipeline->shaders[MESA_SHADER_VERTEX];
2754 if (pipeline->shaders[MESA_SHADER_TESS_CTRL])
2755 return pipeline->shaders[MESA_SHADER_TESS_CTRL];
2756 if (pipeline->shaders[MESA_SHADER_GEOMETRY])
2757 return pipeline->shaders[MESA_SHADER_GEOMETRY];
2758 } else if (stage == MESA_SHADER_TESS_EVAL) {
2759 if (!pipeline->shaders[MESA_SHADER_TESS_CTRL])
2760 return NULL;
2761 if (pipeline->shaders[MESA_SHADER_TESS_EVAL])
2762 return pipeline->shaders[MESA_SHADER_TESS_EVAL];
2763 if (pipeline->shaders[MESA_SHADER_GEOMETRY])
2764 return pipeline->shaders[MESA_SHADER_GEOMETRY];
2765 }
2766 return pipeline->shaders[stage];
2767 }
2768
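/* Return the output info of the last pre-rasterization stage (the GS copy shader for legacy
 * GS, otherwise GS, TES, mesh or vertex shader).
 */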
2769 static const struct radv_vs_output_info *
2770 get_vs_output_info(const struct radv_graphics_pipeline *pipeline)
2771 {
2772 if (radv_pipeline_has_stage(pipeline, MESA_SHADER_GEOMETRY))
2773 if (radv_pipeline_has_ngg(pipeline))
2774 return &pipeline->base.shaders[MESA_SHADER_GEOMETRY]->info.vs.outinfo;
2775 else
2776 return &pipeline->base.gs_copy_shader->info.vs.outinfo;
2777 else if (radv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_CTRL))
2778 return &pipeline->base.shaders[MESA_SHADER_TESS_EVAL]->info.tes.outinfo;
2779 else if (radv_pipeline_has_stage(pipeline, MESA_SHADER_MESH))
2780 return &pipeline->base.shaders[MESA_SHADER_MESH]->info.ms.outinfo;
2781 else
2782 return &pipeline->base.shaders[MESA_SHADER_VERTEX]->info.vs.outinfo;
2783 }
2784
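/* Rewrite reads of the VIEWPORT input in the fragment shader to a constant zero. Used when
 * the last pre-rasterization stage doesn't export the viewport index.
 */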
2785 static bool
2786 radv_lower_viewport_to_zero(nir_shader *nir)
2787 {
2788 nir_function_impl *impl = nir_shader_get_entrypoint(nir);
2789 bool progress = false;
2790
2791 nir_builder b;
2792 nir_builder_init(&b, impl);
2793
2794 /* There should be only one deref load for VIEWPORT after lower_io_to_temporaries. */
2795 nir_foreach_block(block, impl) {
2796 nir_foreach_instr(instr, block) {
2797 if (instr->type != nir_instr_type_intrinsic)
2798 continue;
2799
2800 nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
2801 if (intr->intrinsic != nir_intrinsic_load_deref)
2802 continue;
2803
2804 nir_variable *var = nir_intrinsic_get_var(intr, 0);
2805 if (var->data.mode != nir_var_shader_in ||
2806 var->data.location != VARYING_SLOT_VIEWPORT)
2807 continue;
2808
2809 b.cursor = nir_before_instr(instr);
2810
2811 nir_ssa_def_rewrite_uses(&intr->dest.ssa, nir_imm_zero(&b, 1, 32));
2812 progress = true;
2813 break;
2814 }
2815 if (progress)
2816 break;
2817 }
2818
2819 if (progress)
2820 nir_metadata_preserve(impl, nir_metadata_block_index | nir_metadata_dominance);
2821 else
2822 nir_metadata_preserve(impl, nir_metadata_all);
2823
2824 return progress;
2825 }
2826
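/* Return the LAYER output variable, creating it when the shader doesn't already declare one. */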
2827 static nir_variable *
2828 find_layer_out_var(nir_shader *nir)
2829 {
2830 nir_variable *var = nir_find_variable_with_location(nir, nir_var_shader_out, VARYING_SLOT_LAYER);
2831 if (var != NULL)
2832 return var;
2833
2834 var = nir_variable_create(nir, nir_var_shader_out, glsl_int_type(), "layer id");
2835 var->data.location = VARYING_SLOT_LAYER;
2836 var->data.interpolation = INTERP_MODE_NONE;
2837
2838 return var;
2839 }
2840
2841 static bool
2842 radv_lower_multiview(nir_shader *nir)
2843 {
2844 /* This pass is not suitable for mesh shaders, because it can't know
2845 * the mapping between API mesh shader invocations and output primitives.
2846 * Needs to be handled in ac_nir_lower_ngg.
2847 */
2848 if (nir->info.stage == MESA_SHADER_MESH)
2849 return false;
2850
2851 nir_function_impl *impl = nir_shader_get_entrypoint(nir);
2852 bool progress = false;
2853
2854 nir_builder b;
2855 nir_builder_init(&b, impl);
2856
2857 /* Iterate in reverse order since there should be only one deref store to POS after
2858 * lower_io_to_temporaries for vertex shaders and inject the layer there. For geometry shaders,
2859 * the layer is injected right before every emit_vertex_with_counter.
2860 */
2861 nir_variable *layer = NULL;
2862 nir_foreach_block_reverse(block, impl) {
2863 nir_foreach_instr_reverse(instr, block) {
2864 if (instr->type != nir_instr_type_intrinsic)
2865 continue;
2866
2867 if (nir->info.stage == MESA_SHADER_GEOMETRY) {
2868 nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
2869 if (intr->intrinsic != nir_intrinsic_emit_vertex_with_counter)
2870 continue;
2871
2872 b.cursor = nir_before_instr(instr);
2873 } else {
2874 nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
2875 if (intr->intrinsic != nir_intrinsic_store_deref)
2876 continue;
2877
2878 nir_variable *var = nir_intrinsic_get_var(intr, 0);
2879 if (var->data.mode != nir_var_shader_out || var->data.location != VARYING_SLOT_POS)
2880 continue;
2881
2882 b.cursor = nir_after_instr(instr);
2883 }
2884
2885 if (!layer)
2886 layer = find_layer_out_var(nir);
2887
2888 nir_store_var(&b, layer, nir_load_view_index(&b), 1);
2889
2890 /* Update outputs_written to reflect that the pass added a new output. */
2891 nir->info.outputs_written |= BITFIELD64_BIT(VARYING_SLOT_LAYER);
2892
2893 progress = true;
2894 if (nir->info.stage == MESA_SHADER_VERTEX)
2895 break;
2896 }
2897 if (nir->info.stage == MESA_SHADER_VERTEX && progress)
2898 break;
2899 }
2900
2901 if (progress)
2902 nir_metadata_preserve(impl, nir_metadata_block_index | nir_metadata_dominance);
2903 else
2904 nir_metadata_preserve(impl, nir_metadata_all);
2905
2906 return progress;
2907 }
2908
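/* Add a PRIMITIVE_ID output and store the primitive ID into it at the end of the entrypoint,
 * so the value can be consumed by the fragment shader.
 */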
2909 static bool
2910 radv_export_implicit_primitive_id(nir_shader *nir)
2911 {
2912 nir_function_impl *impl = nir_shader_get_entrypoint(nir);
2913 nir_builder b;
2914 nir_builder_init(&b, impl);
2915
2916 b.cursor = nir_after_cf_list(&impl->body);
2917
2918 nir_variable *var = nir_variable_create(nir, nir_var_shader_out, glsl_int_type(), NULL);
2919 var->data.location = VARYING_SLOT_PRIMITIVE_ID;
2920 var->data.interpolation = INTERP_MODE_NONE;
2921
2922 nir_store_var(&b, var, nir_load_primitive_id(&b), 1);
2923
2924 /* Update outputs_written to reflect that the pass added a new output. */
2925 nir->info.outputs_written |= BITFIELD64_BIT(VARYING_SLOT_PRIMITIVE_ID);
2926
2927 nir_metadata_preserve(impl, nir_metadata_block_index | nir_metadata_dominance);
2928
2929 return true;
2930 }
2931
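/* Link the NIR shaders of all active stages: lower I/O arrays to elements, scalarize and
 * optimize cross-stage varyings, drop unused outputs (color exports, PSIZ), handle implicit
 * primitive ID / viewport / layer exports and compact the remaining varyings.
 */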
2932 static void
2933 radv_link_shaders(struct radv_pipeline *pipeline,
2934 const struct radv_pipeline_key *pipeline_key,
2935 const struct radv_pipeline_stage *stages,
2936 bool optimize_conservatively,
2937 gl_shader_stage last_vgt_api_stage)
2938 {
2939 const struct radv_physical_device *pdevice = pipeline->device->physical_device;
2940 nir_shader *ordered_shaders[MESA_VULKAN_SHADER_STAGES];
2941 int shader_count = 0;
2942
2943 if (stages[MESA_SHADER_FRAGMENT].nir) {
2944 ordered_shaders[shader_count++] = stages[MESA_SHADER_FRAGMENT].nir;
2945 }
2946 if (stages[MESA_SHADER_GEOMETRY].nir) {
2947 ordered_shaders[shader_count++] = stages[MESA_SHADER_GEOMETRY].nir;
2948 }
2949 if (stages[MESA_SHADER_TESS_EVAL].nir) {
2950 ordered_shaders[shader_count++] = stages[MESA_SHADER_TESS_EVAL].nir;
2951 }
2952 if (stages[MESA_SHADER_TESS_CTRL].nir) {
2953 ordered_shaders[shader_count++] = stages[MESA_SHADER_TESS_CTRL].nir;
2954 }
2955 if (stages[MESA_SHADER_VERTEX].nir) {
2956 ordered_shaders[shader_count++] = stages[MESA_SHADER_VERTEX].nir;
2957 }
2958 if (stages[MESA_SHADER_MESH].nir) {
2959 ordered_shaders[shader_count++] = stages[MESA_SHADER_MESH].nir;
2960 }
2961 if (stages[MESA_SHADER_TASK].nir) {
2962 ordered_shaders[shader_count++] = stages[MESA_SHADER_TASK].nir;
2963 }
2964 if (stages[MESA_SHADER_COMPUTE].nir) {
2965 ordered_shaders[shader_count++] = stages[MESA_SHADER_COMPUTE].nir;
2966 }
2967
2968 if (stages[MESA_SHADER_MESH].nir && stages[MESA_SHADER_FRAGMENT].nir) {
2969 nir_shader *ps = stages[MESA_SHADER_FRAGMENT].nir;
2970
2971 nir_foreach_shader_in_variable(var, ps) {
2972 /* These variables are per-primitive when used with a mesh shader. */
2973 if (var->data.location == VARYING_SLOT_PRIMITIVE_ID ||
2974 var->data.location == VARYING_SLOT_VIEWPORT ||
2975 var->data.location == VARYING_SLOT_LAYER)
2976 var->data.per_primitive = true;
2977 }
2978 }
2979
2980 bool has_geom_tess = stages[MESA_SHADER_GEOMETRY].nir || stages[MESA_SHADER_TESS_CTRL].nir;
2981 bool merged_gs = stages[MESA_SHADER_GEOMETRY].nir && pdevice->rad_info.gfx_level >= GFX9;
2982
2983 if (!optimize_conservatively && shader_count > 1) {
2984 unsigned first = ordered_shaders[shader_count - 1]->info.stage;
2985 unsigned last = ordered_shaders[0]->info.stage;
2986
2987 if (ordered_shaders[0]->info.stage == MESA_SHADER_FRAGMENT &&
2988 ordered_shaders[1]->info.has_transform_feedback_varyings)
2989 nir_link_xfb_varyings(ordered_shaders[1], ordered_shaders[0]);
2990
2991 for (int i = 1; i < shader_count; ++i) {
2992 nir_lower_io_arrays_to_elements(ordered_shaders[i], ordered_shaders[i - 1]);
2993 nir_validate_shader(ordered_shaders[i], "after nir_lower_io_arrays_to_elements");
2994 nir_validate_shader(ordered_shaders[i - 1], "after nir_lower_io_arrays_to_elements");
2995 }
2996
2997 for (int i = 0; i < shader_count; ++i) {
2998 nir_variable_mode mask = 0;
2999
3000 if (ordered_shaders[i]->info.stage != first)
3001 mask = mask | nir_var_shader_in;
3002
3003 if (ordered_shaders[i]->info.stage != last)
3004 mask = mask | nir_var_shader_out;
3005
3006 bool progress = false;
3007 NIR_PASS(progress, ordered_shaders[i], nir_lower_io_to_scalar_early, mask);
3008 if (progress) {
3009 /* Optimize the new vector code and then remove dead vars */
3010 NIR_PASS(_, ordered_shaders[i], nir_copy_prop);
3011 NIR_PASS(_, ordered_shaders[i], nir_opt_shrink_vectors);
3012
3013 if (ordered_shaders[i]->info.stage != last) {
3014 /* Optimize swizzled movs of load_const for
3015 * nir_link_opt_varyings's constant propagation
3016 */
3017 NIR_PASS(_, ordered_shaders[i], nir_opt_constant_folding);
3018 /* For nir_link_opt_varyings's duplicate input opt */
3019 NIR_PASS(_, ordered_shaders[i], nir_opt_cse);
3020 }
3021
3022 /* Run copy-propagation to help remove dead
3023 * output variables (some shaders have useless
3024 * copies to/from an output), so compaction
3025 * later will be more effective.
3026 *
3027 * This will have been done earlier but it might
3028 * not have worked because the outputs were vector.
3029 */
3030 if (ordered_shaders[i]->info.stage == MESA_SHADER_TESS_CTRL)
3031 NIR_PASS(_, ordered_shaders[i], nir_opt_copy_prop_vars);
3032
3033 NIR_PASS(_, ordered_shaders[i], nir_opt_dce);
3034 NIR_PASS(_, ordered_shaders[i], nir_remove_dead_variables,
3035 nir_var_function_temp | nir_var_shader_in | nir_var_shader_out, NULL);
3036 }
3037 }
3038 }
3039
3040 /* Export the primitive ID implicitly when the fragment shader reads it but VS or TES
3041 * doesn't export it. For GS, MS and NGG, the primitive ID is added during NGG lowering instead.
3042 */
3043 if (stages[MESA_SHADER_FRAGMENT].nir &&
3044 (stages[MESA_SHADER_FRAGMENT].nir->info.inputs_read & VARYING_BIT_PRIMITIVE_ID) &&
3045 !(stages[last_vgt_api_stage].nir->info.outputs_written & VARYING_BIT_PRIMITIVE_ID) &&
3046 ((last_vgt_api_stage == MESA_SHADER_VERTEX && !stages[MESA_SHADER_VERTEX].info.is_ngg) ||
3047 (last_vgt_api_stage == MESA_SHADER_TESS_EVAL && !stages[MESA_SHADER_TESS_EVAL].info.is_ngg))) {
3048 radv_export_implicit_primitive_id(stages[last_vgt_api_stage].nir);
3049 }
3050
3051 if (!optimize_conservatively) {
3052 bool uses_xfb = last_vgt_api_stage != -1 &&
3053 stages[last_vgt_api_stage].nir->xfb_info;
3054
3055 for (unsigned i = 0; i < shader_count; ++i) {
3056 shader_info *info = &ordered_shaders[i]->info;
3057
3058 /* Remove exports without color attachment or writemask. */
3059 if (info->stage == MESA_SHADER_FRAGMENT) {
3060 bool fixup_derefs = false;
3061 nir_foreach_variable_with_modes(var, ordered_shaders[i], nir_var_shader_out) {
3062 int idx = var->data.location;
3063 idx -= FRAG_RESULT_DATA0;
3064 if (idx < 0)
3065 continue;
3066
3067 unsigned col_format = (pipeline_key->ps.col_format >> (4 * idx)) & 0xf;
3068 unsigned cb_target_mask = (pipeline_key->ps.cb_target_mask >> (4 * idx)) & 0xf;
3069
3070 if (col_format == V_028714_SPI_SHADER_ZERO ||
3071 (col_format == V_028714_SPI_SHADER_32_R && !cb_target_mask &&
3072 !pipeline_key->ps.mrt0_is_dual_src)) {
3073 /* Remove the color export if it's unused or in the presence of holes. */
3074 info->outputs_written &= ~BITFIELD64_BIT(var->data.location);
3075 var->data.location = 0;
3076 var->data.mode = nir_var_shader_temp;
3077 fixup_derefs = true;
3078 }
3079 }
3080 if (fixup_derefs) {
3081 NIR_PASS_V(ordered_shaders[i], nir_fixup_deref_modes);
3082 NIR_PASS(_, ordered_shaders[i], nir_remove_dead_variables, nir_var_shader_temp,
3083 NULL);
3084 NIR_PASS(_, ordered_shaders[i], nir_opt_dce);
3085 }
3086 continue;
3087 }
3088
3089 /* Remove PSIZ from shaders when it's not needed.
3090 * This is typically produced by translation layers like Zink or D9VK.
3091 */
3092 if (uses_xfb || !(info->outputs_written & VARYING_BIT_PSIZ))
3093 continue;
3094
3095 bool next_stage_needs_psiz =
3096 i != 0 && /* ordered_shaders is backwards, so next stage is: i - 1 */
3097 ordered_shaders[i - 1]->info.inputs_read & VARYING_BIT_PSIZ;
3098 bool topology_uses_psiz =
3099 info->stage == last_vgt_api_stage &&
3100 ((info->stage == MESA_SHADER_VERTEX && pipeline_key->vs.topology == V_008958_DI_PT_POINTLIST) ||
3101 (info->stage == MESA_SHADER_TESS_EVAL && info->tess.point_mode) ||
3102 (info->stage == MESA_SHADER_GEOMETRY && info->gs.output_primitive == SHADER_PRIM_POINTS) ||
3103 (info->stage == MESA_SHADER_MESH && info->mesh.primitive_type == SHADER_PRIM_POINTS));
3104
3105 nir_variable *psiz_var =
3106 nir_find_variable_with_location(ordered_shaders[i], nir_var_shader_out, VARYING_SLOT_PSIZ);
3107
3108 if (!next_stage_needs_psiz && !topology_uses_psiz && psiz_var) {
3109 /* Change PSIZ to a global variable which allows it to be DCE'd. */
3110 psiz_var->data.location = 0;
3111 psiz_var->data.mode = nir_var_shader_temp;
3112
3113 info->outputs_written &= ~VARYING_BIT_PSIZ;
3114 NIR_PASS_V(ordered_shaders[i], nir_fixup_deref_modes);
3115 NIR_PASS(_, ordered_shaders[i], nir_remove_dead_variables, nir_var_shader_temp, NULL);
3116 NIR_PASS(_, ordered_shaders[i], nir_opt_dce);
3117 }
3118 }
3119 }
3120
3121 /* Lower the viewport index to zero when the last vertex stage doesn't export it. */
3122 if (stages[MESA_SHADER_FRAGMENT].nir &&
3123 (stages[MESA_SHADER_FRAGMENT].nir->info.inputs_read & VARYING_BIT_VIEWPORT) &&
3124 !(stages[last_vgt_api_stage].nir->info.outputs_written & VARYING_BIT_VIEWPORT)) {
3125 NIR_PASS(_, stages[MESA_SHADER_FRAGMENT].nir, radv_lower_viewport_to_zero);
3126 }
3127
3128 /* Export the layer in the last VGT stage if multiview is used. */
3129 if (pipeline_key->has_multiview_view_index && last_vgt_api_stage != -1 &&
3130 !(stages[last_vgt_api_stage].nir->info.outputs_written &
3131 VARYING_BIT_LAYER)) {
3132 nir_shader *last_vgt_shader = stages[last_vgt_api_stage].nir;
3133 NIR_PASS(_, last_vgt_shader, radv_lower_multiview);
3134 }
3135
3136 for (int i = 1; !optimize_conservatively && (i < shader_count); ++i) {
3137 if (nir_link_opt_varyings(ordered_shaders[i], ordered_shaders[i - 1])) {
3138 nir_validate_shader(ordered_shaders[i], "after nir_link_opt_varyings");
3139 nir_validate_shader(ordered_shaders[i - 1], "after nir_link_opt_varyings");
3140
3141 NIR_PASS(_, ordered_shaders[i - 1], nir_opt_constant_folding);
3142 NIR_PASS(_, ordered_shaders[i - 1], nir_opt_algebraic);
3143 NIR_PASS(_, ordered_shaders[i - 1], nir_opt_dce);
3144 }
3145
3146 NIR_PASS(_, ordered_shaders[i], nir_remove_dead_variables, nir_var_shader_out, NULL);
3147 NIR_PASS(_, ordered_shaders[i - 1], nir_remove_dead_variables, nir_var_shader_in, NULL);
3148
3149 bool progress = nir_remove_unused_varyings(ordered_shaders[i], ordered_shaders[i - 1]);
3150
3151 nir_compact_varyings(ordered_shaders[i], ordered_shaders[i - 1], true);
3152 nir_validate_shader(ordered_shaders[i], "after nir_compact_varyings");
3153 nir_validate_shader(ordered_shaders[i - 1], "after nir_compact_varyings");
3154 if (ordered_shaders[i]->info.stage == MESA_SHADER_MESH) {
3155 /* nir_compact_varyings can change the location of per-vertex and per-primitive outputs */
3156 nir_shader_gather_info(ordered_shaders[i], nir_shader_get_entrypoint(ordered_shaders[i]));
3157 }
3158
3159 if (ordered_shaders[i]->info.stage == MESA_SHADER_TESS_CTRL ||
3160 ordered_shaders[i]->info.stage == MESA_SHADER_MESH ||
3161 (ordered_shaders[i]->info.stage == MESA_SHADER_VERTEX && has_geom_tess) ||
3162 (ordered_shaders[i]->info.stage == MESA_SHADER_TESS_EVAL && merged_gs)) {
3163 NIR_PASS(_, ordered_shaders[i], nir_lower_io_to_vector, nir_var_shader_out);
3164 if (ordered_shaders[i]->info.stage == MESA_SHADER_TESS_CTRL)
3165 NIR_PASS(_, ordered_shaders[i], nir_vectorize_tess_levels);
3166 NIR_PASS(_, ordered_shaders[i], nir_opt_combine_stores, nir_var_shader_out);
3167 }
3168 if (ordered_shaders[i - 1]->info.stage == MESA_SHADER_GEOMETRY ||
3169 ordered_shaders[i - 1]->info.stage == MESA_SHADER_TESS_CTRL ||
3170 ordered_shaders[i - 1]->info.stage == MESA_SHADER_TESS_EVAL) {
3171 NIR_PASS(_, ordered_shaders[i - 1], nir_lower_io_to_vector, nir_var_shader_in);
3172 }
3173
3174 if (progress) {
3175 progress = false;
3176 NIR_PASS(progress, ordered_shaders[i], nir_lower_global_vars_to_local);
3177 if (progress) {
3178 ac_nir_lower_indirect_derefs(ordered_shaders[i], pdevice->rad_info.gfx_level);
3179 /* remove dead writes, which can remove input loads */
3180 NIR_PASS(_, ordered_shaders[i], nir_lower_vars_to_ssa);
3181 NIR_PASS(_, ordered_shaders[i], nir_opt_dce);
3182 }
3183
3184 progress = false;
3185 NIR_PASS(progress, ordered_shaders[i - 1], nir_lower_global_vars_to_local);
3186 if (progress) {
3187 ac_nir_lower_indirect_derefs(ordered_shaders[i - 1], pdevice->rad_info.gfx_level);
3188 }
3189 }
3190 }
3191 }
3192
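/* Assign driver locations to shader I/O variables. VS/TCS/TES/GS inputs and outputs get
 * linked locations; on GFX9+ the linked counts are also copied to the merged stage.
 */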
3193 static void
3194 radv_set_driver_locations(struct radv_pipeline *pipeline, struct radv_pipeline_stage *stages,
3195 gl_shader_stage last_vgt_api_stage)
3196 {
3197 const struct radv_physical_device *pdevice = pipeline->device->physical_device;
3198
3199 if (stages[MESA_SHADER_FRAGMENT].nir) {
3200 nir_foreach_shader_out_variable(var, stages[MESA_SHADER_FRAGMENT].nir)
3201 {
3202 var->data.driver_location = var->data.location + var->data.index;
3203 }
3204 }
3205
3206 if (stages[MESA_SHADER_MESH].nir) {
3207 /* ac_nir_lower_ngg ignores driver locations for mesh shaders,
3208 * but set them all to zero just to be on the safe side.
3209 */
3210 nir_foreach_shader_out_variable(var, stages[MESA_SHADER_MESH].nir) {
3211 var->data.driver_location = 0;
3212 }
3213 return;
3214 }
3215
3216 if (!stages[MESA_SHADER_VERTEX].nir)
3217 return;
3218
3219 bool has_tess = stages[MESA_SHADER_TESS_CTRL].nir;
3220 bool has_gs = stages[MESA_SHADER_GEOMETRY].nir;
3221
3222 /* Merged stage for VS and TES */
3223 unsigned vs_info_idx = MESA_SHADER_VERTEX;
3224 unsigned tes_info_idx = MESA_SHADER_TESS_EVAL;
3225
3226 if (pdevice->rad_info.gfx_level >= GFX9) {
3227 /* These are merged into the next stage */
3228 vs_info_idx = has_tess ? MESA_SHADER_TESS_CTRL : MESA_SHADER_GEOMETRY;
3229 tes_info_idx = has_gs ? MESA_SHADER_GEOMETRY : MESA_SHADER_TESS_EVAL;
3230 }
3231
3232 nir_foreach_shader_in_variable (var, stages[MESA_SHADER_VERTEX].nir) {
3233 var->data.driver_location = var->data.location;
3234 }
3235
3236 if (has_tess) {
3237 nir_linked_io_var_info vs2tcs = nir_assign_linked_io_var_locations(
3238 stages[MESA_SHADER_VERTEX].nir, stages[MESA_SHADER_TESS_CTRL].nir);
3239 nir_linked_io_var_info tcs2tes = nir_assign_linked_io_var_locations(
3240 stages[MESA_SHADER_TESS_CTRL].nir, stages[MESA_SHADER_TESS_EVAL].nir);
3241
3242 stages[MESA_SHADER_VERTEX].info.vs.num_linked_outputs = vs2tcs.num_linked_io_vars;
3243 stages[MESA_SHADER_TESS_CTRL].info.tcs.num_linked_inputs = vs2tcs.num_linked_io_vars;
3244 stages[MESA_SHADER_TESS_CTRL].info.tcs.num_linked_outputs = tcs2tes.num_linked_io_vars;
3245 stages[MESA_SHADER_TESS_CTRL].info.tcs.num_linked_patch_outputs = tcs2tes.num_linked_patch_io_vars;
3246 stages[MESA_SHADER_TESS_EVAL].info.tes.num_linked_inputs = tcs2tes.num_linked_io_vars;
3247 stages[MESA_SHADER_TESS_EVAL].info.tes.num_linked_patch_inputs = tcs2tes.num_linked_patch_io_vars;
3248
3249 /* Copy data to merged stage */
3250 stages[vs_info_idx].info.vs.num_linked_outputs = vs2tcs.num_linked_io_vars;
3251 stages[tes_info_idx].info.tes.num_linked_inputs = tcs2tes.num_linked_io_vars;
3252 stages[tes_info_idx].info.tes.num_linked_patch_inputs = tcs2tes.num_linked_patch_io_vars;
3253
3254 if (has_gs) {
3255 nir_linked_io_var_info tes2gs = nir_assign_linked_io_var_locations(
3256 stages[MESA_SHADER_TESS_EVAL].nir, stages[MESA_SHADER_GEOMETRY].nir);
3257
3258 stages[MESA_SHADER_TESS_EVAL].info.tes.num_linked_outputs = tes2gs.num_linked_io_vars;
3259 stages[MESA_SHADER_GEOMETRY].info.gs.num_linked_inputs = tes2gs.num_linked_io_vars;
3260
3261 /* Copy data to merged stage */
3262 stages[tes_info_idx].info.tes.num_linked_outputs = tes2gs.num_linked_io_vars;
3263 }
3264 } else if (has_gs) {
3265 nir_linked_io_var_info vs2gs = nir_assign_linked_io_var_locations(
3266 stages[MESA_SHADER_VERTEX].nir, stages[MESA_SHADER_GEOMETRY].nir);
3267
3268 stages[MESA_SHADER_VERTEX].info.vs.num_linked_outputs = vs2gs.num_linked_io_vars;
3269 stages[MESA_SHADER_GEOMETRY].info.gs.num_linked_inputs = vs2gs.num_linked_io_vars;
3270
3271 /* Copy data to merged stage */
3272 stages[vs_info_idx].info.vs.num_linked_outputs = vs2gs.num_linked_io_vars;
3273 }
3274
3275 assert(last_vgt_api_stage != MESA_SHADER_NONE);
3276 nir_foreach_shader_out_variable(var, stages[last_vgt_api_stage].nir)
3277 {
3278 var->data.driver_location = var->data.location;
3279 }
3280 }
3281
3282 static struct radv_pipeline_key
3283 radv_generate_pipeline_key(const struct radv_pipeline *pipeline, VkPipelineCreateFlags flags)
3284 {
3285 struct radv_device *device = pipeline->device;
3286 struct radv_pipeline_key key;
3287
3288 memset(&key, 0, sizeof(key));
3289
3290 if (flags & VK_PIPELINE_CREATE_DISABLE_OPTIMIZATION_BIT)
3291 key.optimisations_disabled = 1;
3292
3293 key.disable_aniso_single_level = device->instance->disable_aniso_single_level &&
3294 device->physical_device->rad_info.gfx_level < GFX8;
3295
3296 key.image_2d_view_of_3d = device->image_2d_view_of_3d &&
3297 device->physical_device->rad_info.gfx_level == GFX9;
3298
3299 return key;
3300 }
3301
3302 static struct radv_pipeline_key
3303 radv_generate_graphics_pipeline_key(const struct radv_graphics_pipeline *pipeline,
3304 const VkGraphicsPipelineCreateInfo *pCreateInfo,
3305 const struct radv_graphics_pipeline_info *info,
3306 const struct radv_blend_state *blend)
3307 {
3308 struct radv_device *device = pipeline->base.device;
3309 struct radv_pipeline_key key = radv_generate_pipeline_key(&pipeline->base, pCreateInfo->flags);
3310
3311 key.has_multiview_view_index = !!info->ri.view_mask;
3312
3313 if (pipeline->dynamic_states & RADV_DYNAMIC_VERTEX_INPUT) {
3314 key.vs.dynamic_input_state = true;
3315 }
3316
3317 /* Vertex input state */
3318 key.vs.instance_rate_inputs = info->vi.instance_rate_inputs;
3319 key.vs.vertex_post_shuffle = info->vi.vertex_post_shuffle;
3320
3321 for (uint32_t i = 0; i < MAX_VERTEX_ATTRIBS; i++) {
3322 key.vs.instance_rate_divisors[i] = info->vi.instance_rate_divisors[i];
3323 key.vs.vertex_attribute_formats[i] = info->vi.vertex_attribute_formats[i];
3324 key.vs.vertex_attribute_bindings[i] = info->vi.vertex_attribute_bindings[i];
3325 key.vs.vertex_attribute_offsets[i] = info->vi.vertex_attribute_offsets[i];
3326 key.vs.vertex_attribute_strides[i] = info->vi.vertex_attribute_strides[i];
3327 key.vs.vertex_alpha_adjust[i] = info->vi.vertex_alpha_adjust[i];
3328 }
3329
3330 for (uint32_t i = 0; i < MAX_VBS; i++) {
3331 key.vs.vertex_binding_align[i] = info->vi.vertex_binding_align[i];
3332 }
3333
3334 key.tcs.tess_input_vertices = info->ts.patch_control_points;
3335
3336 if (info->ms.raster_samples > 1) {
3337 uint32_t ps_iter_samples = radv_pipeline_get_ps_iter_samples(info);
3338 key.ps.num_samples = info->ms.raster_samples;
3339 key.ps.log2_ps_iter_samples = util_logbase2(ps_iter_samples);
3340 }
3341
3342 key.ps.col_format = blend->spi_shader_col_format;
3343 key.ps.cb_target_mask = blend->cb_target_mask;
3344 key.ps.mrt0_is_dual_src = blend->mrt0_is_dual_src;
3345 if (device->physical_device->rad_info.gfx_level < GFX8) {
3346 key.ps.is_int8 = blend->col_format_is_int8;
3347 key.ps.is_int10 = blend->col_format_is_int10;
3348 }
3349 if (device->physical_device->rad_info.gfx_level >= GFX11) {
3350 key.ps.alpha_to_coverage_via_mrtz = info->ms.alpha_to_coverage_enable;
3351 }
3352
3353 key.vs.topology = info->ia.primitive_topology;
3354
3355 if (device->physical_device->rad_info.gfx_level >= GFX10) {
3356 key.vs.provoking_vtx_last = info->rs.provoking_vtx_last;
3357 }
3358
3359 if (device->instance->debug_flags & RADV_DEBUG_DISCARD_TO_DEMOTE)
3360 key.ps.lower_discard_to_demote = true;
3361
3362 if (device->instance->enable_mrt_output_nan_fixup)
3363 key.ps.enable_mrt_output_nan_fixup = blend->col_format_is_float32;
3364
3365
3366 key.ps.force_vrs_enabled = device->force_vrs_enabled;
3367
3368 if (device->instance->debug_flags & RADV_DEBUG_INVARIANT_GEOM)
3369 key.invariant_geom = true;
3370
3371 key.use_ngg = device->physical_device->use_ngg;
3372
3373 if ((radv_is_vrs_enabled(pipeline, info) || device->force_vrs_enabled) &&
3374 (device->physical_device->rad_info.family == CHIP_NAVI21 ||
3375 device->physical_device->rad_info.family == CHIP_NAVI22 ||
3376 device->physical_device->rad_info.family == CHIP_VANGOGH))
3377 key.adjust_frag_coord_z = true;
3378
3379 if (device->instance->disable_sinking_load_input_fs)
3380 key.disable_sinking_load_input_fs = true;
3381
3382 if (device->primitives_generated_query)
3383 key.primitives_generated_query = true;
3384
3385 key.ps.has_epilog = false; /* TODO: hook up PS epilogs */
3386
3387 return key;
3388 }
3389
3390 static uint8_t
3391 radv_get_wave_size(struct radv_device *device, gl_shader_stage stage,
3392 const struct radv_shader_info *info)
3393 {
3394 if (stage == MESA_SHADER_GEOMETRY && !info->is_ngg)
3395 return 64;
3396 else if (stage == MESA_SHADER_COMPUTE) {
3397 return info->cs.subgroup_size;
3398 } else if (stage == MESA_SHADER_FRAGMENT)
3399 return device->physical_device->ps_wave_size;
3400 else if (stage == MESA_SHADER_TASK)
3401 return device->physical_device->cs_wave_size;
3402 else
3403 return device->physical_device->ge_wave_size;
3404 }
3405
3406 static uint8_t
3407 radv_get_ballot_bit_size(struct radv_device *device, gl_shader_stage stage,
3408 const struct radv_shader_info *info)
3409 {
3410 if (stage == MESA_SHADER_COMPUTE && info->cs.subgroup_size)
3411 return info->cs.subgroup_size;
3412 return 64;
3413 }
3414
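/* Decide the NGG settings (culling, early primitive export, passthrough, extra LDS) for a
 * VS or TES that is the last pre-rasterization stage without a geometry shader.
 */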
3415 static void
3416 radv_determine_ngg_settings(struct radv_pipeline *pipeline,
3417 const struct radv_pipeline_key *pipeline_key,
3418 struct radv_pipeline_stage *stages,
3419 gl_shader_stage last_vgt_api_stage)
3420 {
3421 const struct radv_physical_device *pdevice = pipeline->device->physical_device;
3422
3423 /* Shader settings for VS or TES without GS. */
3424 if (last_vgt_api_stage == MESA_SHADER_VERTEX ||
3425 last_vgt_api_stage == MESA_SHADER_TESS_EVAL) {
3426 uint64_t ps_inputs_read =
3427 stages[MESA_SHADER_FRAGMENT].nir ? stages[MESA_SHADER_FRAGMENT].nir->info.inputs_read : 0;
3428 gl_shader_stage es_stage = last_vgt_api_stage;
3429
3430 unsigned num_vertices_per_prim = si_conv_prim_to_gs_out(pipeline_key->vs.topology) + 1;
3431 if (es_stage == MESA_SHADER_TESS_EVAL)
3432 num_vertices_per_prim = stages[es_stage].nir->info.tess.point_mode ? 1
3433 : stages[es_stage].nir->info.tess._primitive_mode == TESS_PRIMITIVE_ISOLINES ? 2
3434 : 3;
3435 /* TODO: Enable culling for LLVM. */
3436 stages[es_stage].info.has_ngg_culling = radv_consider_culling(
3437 pdevice, stages[es_stage].nir, ps_inputs_read, num_vertices_per_prim, &stages[es_stage].info) &&
3438 !radv_use_llvm_for_stage(pipeline->device, es_stage);
3439
3440 nir_function_impl *impl = nir_shader_get_entrypoint(stages[es_stage].nir);
3441 stages[es_stage].info.has_ngg_early_prim_export = exec_list_is_singular(&impl->body);
3442
3443 /* Invocations that process an input vertex */
3444 const struct gfx10_ngg_info *ngg_info = &stages[es_stage].info.ngg_info;
3445 unsigned max_vtx_in = MIN2(256, ngg_info->enable_vertex_grouping ? ngg_info->hw_max_esverts : num_vertices_per_prim * ngg_info->max_gsprims);
3446
3447 unsigned lds_bytes_if_culling_off = 0;
3448 /* We need LDS space when VS needs to export the primitive ID. */
3449 if (es_stage == MESA_SHADER_VERTEX && stages[es_stage].info.vs.outinfo.export_prim_id)
3450 lds_bytes_if_culling_off = max_vtx_in * 4u;
3451 stages[es_stage].info.num_lds_blocks_when_not_culling =
3452 DIV_ROUND_UP(lds_bytes_if_culling_off, pdevice->rad_info.lds_encode_granularity);
3453
3454 /* NGG passthrough mode should be disabled when culling is enabled or when the vertex shader
3455 * exports the primitive ID.
3456 */
3457 stages[es_stage].info.is_ngg_passthrough = stages[es_stage].info.is_ngg_passthrough &&
3458 !stages[es_stage].info.has_ngg_culling &&
3459 !(es_stage == MESA_SHADER_VERTEX &&
3460 stages[es_stage].info.vs.outinfo.export_prim_id);
3461 }
3462 }
3463
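/* Determine which stage runs as NGG. NGG is disabled for extreme GS geometry with
 * tessellation and for streamout when NGG streamout isn't supported; passthrough mode is
 * only considered without GS and without streamout.
 */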
3464 static void
3465 radv_fill_shader_info_ngg(struct radv_pipeline *pipeline,
3466 const struct radv_pipeline_key *pipeline_key,
3467 struct radv_pipeline_stage *stages)
3468 {
3469 struct radv_device *device = pipeline->device;
3470
3471 if (pipeline_key->use_ngg) {
3472 if (stages[MESA_SHADER_TESS_CTRL].nir) {
3473 stages[MESA_SHADER_TESS_EVAL].info.is_ngg = true;
3474 } else if (stages[MESA_SHADER_VERTEX].nir) {
3475 stages[MESA_SHADER_VERTEX].info.is_ngg = true;
3476 } else if (stages[MESA_SHADER_MESH].nir) {
3477 stages[MESA_SHADER_MESH].info.is_ngg = true;
3478 }
3479
3480 if (stages[MESA_SHADER_TESS_CTRL].nir && stages[MESA_SHADER_GEOMETRY].nir &&
3481 stages[MESA_SHADER_GEOMETRY].nir->info.gs.invocations *
3482 stages[MESA_SHADER_GEOMETRY].nir->info.gs.vertices_out >
3483 256) {
3484 /* Fall back to the legacy path if tessellation is
3485 * enabled with extreme geometry because
3486 * EN_MAX_VERT_OUT_PER_GS_INSTANCE doesn't work and it
3487 * might hang.
3488 */
3489 stages[MESA_SHADER_TESS_EVAL].info.is_ngg = false;
3490
3491 /* GFX11+ requires NGG. */
3492 assert(device->physical_device->rad_info.gfx_level < GFX11);
3493 }
3494
3495 gl_shader_stage last_xfb_stage = MESA_SHADER_VERTEX;
3496
3497 for (int i = MESA_SHADER_VERTEX; i <= MESA_SHADER_GEOMETRY; i++) {
3498 if (stages[i].nir)
3499 last_xfb_stage = i;
3500 }
3501
3502 bool uses_xfb = stages[last_xfb_stage].nir &&
3503 stages[last_xfb_stage].nir->xfb_info;
3504
3505 if (!device->physical_device->use_ngg_streamout && uses_xfb) {
3506 /* GFX11+ requires NGG. */
3507 assert(device->physical_device->rad_info.gfx_level < GFX11);
3508
3509 if (stages[MESA_SHADER_TESS_CTRL].nir)
3510 stages[MESA_SHADER_TESS_EVAL].info.is_ngg = false;
3511 else
3512 stages[MESA_SHADER_VERTEX].info.is_ngg = false;
3513 }
3514
3515 /* Determine if the pipeline is eligible for the NGG passthrough
3516 * mode. It can't be enabled for geometry shaders, for NGG
3517 * streamout or for vertex shaders that export the primitive ID
3518 * (this is checked later because we don't have the info here.)
3519 */
3520 if (!stages[MESA_SHADER_GEOMETRY].nir && !uses_xfb) {
3521 if (stages[MESA_SHADER_TESS_CTRL].nir && stages[MESA_SHADER_TESS_EVAL].info.is_ngg) {
3522 stages[MESA_SHADER_TESS_EVAL].info.is_ngg_passthrough = true;
3523 } else if (stages[MESA_SHADER_VERTEX].nir && stages[MESA_SHADER_VERTEX].info.is_ngg) {
3524 stages[MESA_SHADER_VERTEX].info.is_ngg_passthrough = true;
3525 }
3526 }
3527 }
3528 }
3529
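/* Gather the per-stage shader info. Merged stages on GFX9+ (VS+TCS, VS/TES+GS) share one
 * info struct; subgroup and workgroup sizes are computed at the end.
 */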
3530 static void
3531 radv_fill_shader_info(struct radv_pipeline *pipeline,
3532 struct radv_pipeline_layout *pipeline_layout,
3533 const struct radv_pipeline_key *pipeline_key,
3534 struct radv_pipeline_stage *stages,
3535 gl_shader_stage last_vgt_api_stage)
3536 {
3537 struct radv_device *device = pipeline->device;
3538 unsigned active_stages = 0;
3539 unsigned filled_stages = 0;
3540
3541 for (int i = 0; i < MESA_VULKAN_SHADER_STAGES; i++) {
3542 if (stages[i].nir)
3543 active_stages |= (1 << i);
3544 }
3545
3546 if (stages[MESA_SHADER_TESS_CTRL].nir) {
3547 stages[MESA_SHADER_VERTEX].info.vs.as_ls = true;
3548 }
3549
3550 if (stages[MESA_SHADER_GEOMETRY].nir) {
3551 if (stages[MESA_SHADER_TESS_CTRL].nir)
3552 stages[MESA_SHADER_TESS_EVAL].info.tes.as_es = true;
3553 else
3554 stages[MESA_SHADER_VERTEX].info.vs.as_es = true;
3555 }
3556
3557 if (stages[MESA_SHADER_FRAGMENT].nir) {
3558 radv_nir_shader_info_init(&stages[MESA_SHADER_FRAGMENT].info);
3559 radv_nir_shader_info_pass(device, stages[MESA_SHADER_FRAGMENT].nir, pipeline_layout,
3560 pipeline_key, &stages[MESA_SHADER_FRAGMENT].info);
3561
3562 assert(last_vgt_api_stage != MESA_SHADER_NONE);
3563 struct radv_shader_info *pre_ps_info = &stages[last_vgt_api_stage].info;
3564 struct radv_vs_output_info *outinfo = NULL;
3565 if (last_vgt_api_stage == MESA_SHADER_VERTEX ||
3566 last_vgt_api_stage == MESA_SHADER_GEOMETRY) {
3567 outinfo = &pre_ps_info->vs.outinfo;
3568 } else if (last_vgt_api_stage == MESA_SHADER_TESS_EVAL) {
3569 outinfo = &pre_ps_info->tes.outinfo;
3570 } else if (last_vgt_api_stage == MESA_SHADER_MESH) {
3571 outinfo = &pre_ps_info->ms.outinfo;
3572 }
3573
3574 /* Add PS input requirements to the output of the pre-PS stage. */
3575 bool ps_prim_id_in = stages[MESA_SHADER_FRAGMENT].info.ps.prim_id_input;
3576 bool ps_clip_dists_in = !!stages[MESA_SHADER_FRAGMENT].info.ps.num_input_clips_culls;
3577
3578 assert(outinfo);
3579 outinfo->export_clip_dists |= ps_clip_dists_in;
3580 if (last_vgt_api_stage == MESA_SHADER_VERTEX ||
3581 last_vgt_api_stage == MESA_SHADER_TESS_EVAL) {
3582 outinfo->export_prim_id |= ps_prim_id_in;
3583 }
3584
3585 filled_stages |= (1 << MESA_SHADER_FRAGMENT);
3586 }
3587
3588 if (device->physical_device->rad_info.gfx_level >= GFX9 &&
3589 stages[MESA_SHADER_TESS_CTRL].nir) {
3590 struct nir_shader *combined_nir[] = {stages[MESA_SHADER_VERTEX].nir, stages[MESA_SHADER_TESS_CTRL].nir};
3591
3592 radv_nir_shader_info_init(&stages[MESA_SHADER_TESS_CTRL].info);
3593
3594 /* Copy data to merged stage. */
3595 stages[MESA_SHADER_TESS_CTRL].info.vs.as_ls = true;
3596
3597 for (int i = 0; i < 2; i++) {
3598 radv_nir_shader_info_pass(device, combined_nir[i], pipeline_layout, pipeline_key,
3599 &stages[MESA_SHADER_TESS_CTRL].info);
3600 }
3601
3602 filled_stages |= (1 << MESA_SHADER_VERTEX);
3603 filled_stages |= (1 << MESA_SHADER_TESS_CTRL);
3604 }
3605
3606 if (device->physical_device->rad_info.gfx_level >= GFX9 &&
3607 stages[MESA_SHADER_GEOMETRY].nir) {
3608 gl_shader_stage pre_stage =
3609 stages[MESA_SHADER_TESS_EVAL].nir ? MESA_SHADER_TESS_EVAL : MESA_SHADER_VERTEX;
3610 struct nir_shader *combined_nir[] = {stages[pre_stage].nir, stages[MESA_SHADER_GEOMETRY].nir};
3611
3612 radv_nir_shader_info_init(&stages[MESA_SHADER_GEOMETRY].info);
3613
3614 /* Copy data to merged stage. */
3615 if (pre_stage == MESA_SHADER_VERTEX) {
3616 stages[MESA_SHADER_GEOMETRY].info.vs.as_es = stages[MESA_SHADER_VERTEX].info.vs.as_es;
3617 } else {
3618 stages[MESA_SHADER_GEOMETRY].info.tes.as_es = stages[MESA_SHADER_TESS_EVAL].info.tes.as_es;
3619 }
3620 stages[MESA_SHADER_GEOMETRY].info.is_ngg = stages[pre_stage].info.is_ngg;
3621 stages[MESA_SHADER_GEOMETRY].info.gs.es_type = pre_stage;
3622
3623 for (int i = 0; i < 2; i++) {
3624 radv_nir_shader_info_pass(device, combined_nir[i], pipeline_layout, pipeline_key,
3625 &stages[MESA_SHADER_GEOMETRY].info);
3626 }
3627
3628 filled_stages |= (1 << pre_stage);
3629 filled_stages |= (1 << MESA_SHADER_GEOMETRY);
3630 }
3631
3632 active_stages ^= filled_stages;
3633 while (active_stages) {
3634 int i = u_bit_scan(&active_stages);
3635 radv_nir_shader_info_init(&stages[i].info);
3636 radv_nir_shader_info_pass(device, stages[i].nir, pipeline_layout, pipeline_key,
3637 &stages[i].info);
3638 }
3639
3640 if (stages[MESA_SHADER_COMPUTE].nir) {
3641 unsigned subgroup_size = pipeline_key->cs.compute_subgroup_size;
3642 unsigned req_subgroup_size = subgroup_size;
3643 bool require_full_subgroups = pipeline_key->cs.require_full_subgroups;
3644
3645 if (!subgroup_size)
3646 subgroup_size = device->physical_device->cs_wave_size;
3647
3648 unsigned local_size = stages[MESA_SHADER_COMPUTE].nir->info.workgroup_size[0] *
3649 stages[MESA_SHADER_COMPUTE].nir->info.workgroup_size[1] *
3650 stages[MESA_SHADER_COMPUTE].nir->info.workgroup_size[2];
3651
3652 /* Games don't always request full subgroups when they should,
3653 * which can cause bugs if cswave32 is enabled.
3654 */
3655 if (device->physical_device->cs_wave_size == 32 &&
3656 stages[MESA_SHADER_COMPUTE].nir->info.cs.uses_wide_subgroup_intrinsics && !req_subgroup_size &&
3657 local_size % RADV_SUBGROUP_SIZE == 0)
3658 require_full_subgroups = true;
3659
3660 if (require_full_subgroups && !req_subgroup_size) {
3661 /* don't use wave32 pretending to be wave64 */
3662 subgroup_size = RADV_SUBGROUP_SIZE;
3663 }
3664
3665 stages[MESA_SHADER_COMPUTE].info.cs.subgroup_size = subgroup_size;
3666 }
3667
3668 for (int i = 0; i < MESA_VULKAN_SHADER_STAGES; i++) {
3669 if (stages[i].nir) {
3670 stages[i].info.wave_size = radv_get_wave_size(device, i, &stages[i].info);
3671 stages[i].info.ballot_bit_size = radv_get_ballot_bit_size(device, i, &stages[i].info);
3672 }
3673 }
3674
3675 /* PS always operates without workgroups. */
3676 if (stages[MESA_SHADER_FRAGMENT].nir)
3677 stages[MESA_SHADER_FRAGMENT].info.workgroup_size = stages[MESA_SHADER_FRAGMENT].info.wave_size;
3678
3679 if (stages[MESA_SHADER_COMPUTE].nir) {
3680 /* Variable workgroup size is not supported by Vulkan. */
3681 assert(!stages[MESA_SHADER_COMPUTE].nir->info.workgroup_size_variable);
3682
3683 stages[MESA_SHADER_COMPUTE].info.workgroup_size =
3684 ac_compute_cs_workgroup_size(
3685 stages[MESA_SHADER_COMPUTE].nir->info.workgroup_size, false, UINT32_MAX);
3686 }
3687
3688 if (stages[MESA_SHADER_TASK].nir) {
3689 /* Task/mesh I/O uses the task ring buffers. */
3690 stages[MESA_SHADER_TASK].info.cs.uses_task_rings = true;
3691 stages[MESA_SHADER_MESH].info.cs.uses_task_rings = true;
3692
3693 stages[MESA_SHADER_TASK].info.workgroup_size =
3694 ac_compute_cs_workgroup_size(
3695 stages[MESA_SHADER_TASK].nir->info.workgroup_size, false, UINT32_MAX);
3696 }
3697 }
3698
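/* Declare the shader arguments (user SGPRs etc.) for each active stage. Merged stages on
 * GFX9+ reuse the argument layout of the stage they are merged into.
 */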
3699 static void
3700 radv_declare_pipeline_args(struct radv_device *device, struct radv_pipeline_stage *stages,
3701 const struct radv_pipeline_key *pipeline_key)
3702 {
3703 enum amd_gfx_level gfx_level = device->physical_device->rad_info.gfx_level;
3704 unsigned active_stages = 0;
3705
3706 for (int i = 0; i < MESA_VULKAN_SHADER_STAGES; i++) {
3707 if (stages[i].nir)
3708 active_stages |= (1 << i);
3709 }
3710
3711 for (int i = 0; i < MESA_VULKAN_SHADER_STAGES; ++i) {
3712 stages[i].args.is_gs_copy_shader = false;
3713 stages[i].args.explicit_scratch_args = !radv_use_llvm_for_stage(device, i);
3714 stages[i].args.remap_spi_ps_input = !radv_use_llvm_for_stage(device, i);
3715 stages[i].args.load_grid_size_from_user_sgpr = device->load_grid_size_from_user_sgpr;
3716 }
3717
3718 if (gfx_level >= GFX9 && stages[MESA_SHADER_TESS_CTRL].nir) {
3719 radv_declare_shader_args(gfx_level, pipeline_key, &stages[MESA_SHADER_TESS_CTRL].info,
3720 MESA_SHADER_TESS_CTRL, true, MESA_SHADER_VERTEX,
3721 &stages[MESA_SHADER_TESS_CTRL].args);
3722 stages[MESA_SHADER_TESS_CTRL].info.user_sgprs_locs = stages[MESA_SHADER_TESS_CTRL].args.user_sgprs_locs;
3723 stages[MESA_SHADER_TESS_CTRL].info.inline_push_constant_mask =
3724 stages[MESA_SHADER_TESS_CTRL].args.ac.inline_push_const_mask;
3725
3726 stages[MESA_SHADER_VERTEX].args = stages[MESA_SHADER_TESS_CTRL].args;
3727 active_stages &= ~(1 << MESA_SHADER_VERTEX);
3728 active_stages &= ~(1 << MESA_SHADER_TESS_CTRL);
3729 }
3730
3731 if (gfx_level >= GFX9 && stages[MESA_SHADER_GEOMETRY].nir) {
3732 gl_shader_stage pre_stage =
3733 stages[MESA_SHADER_TESS_EVAL].nir ? MESA_SHADER_TESS_EVAL : MESA_SHADER_VERTEX;
3734 radv_declare_shader_args(gfx_level, pipeline_key, &stages[MESA_SHADER_GEOMETRY].info,
3735 MESA_SHADER_GEOMETRY, true, pre_stage,
3736 &stages[MESA_SHADER_GEOMETRY].args);
3737 stages[MESA_SHADER_GEOMETRY].info.user_sgprs_locs = stages[MESA_SHADER_GEOMETRY].args.user_sgprs_locs;
3738 stages[MESA_SHADER_GEOMETRY].info.inline_push_constant_mask =
3739 stages[MESA_SHADER_GEOMETRY].args.ac.inline_push_const_mask;
3740
3741 stages[pre_stage].args = stages[MESA_SHADER_GEOMETRY].args;
3742 active_stages &= ~(1 << pre_stage);
3743 active_stages &= ~(1 << MESA_SHADER_GEOMETRY);
3744 }
3745
3746 u_foreach_bit(i, active_stages) {
3747 radv_declare_shader_args(gfx_level, pipeline_key, &stages[i].info, i, false,
3748 MESA_SHADER_VERTEX, &stages[i].args);
3749 stages[i].info.user_sgprs_locs = stages[i].args.user_sgprs_locs;
3750 stages[i].info.inline_push_constant_mask = stages[i].args.ac.inline_push_const_mask;
3751 }
3752 }
3753
3754 static void
3755 merge_tess_info(struct shader_info *tes_info, struct shader_info *tcs_info)
3756 {
3757 /* The Vulkan 1.0.38 spec, section 21.1 Tessellator says:
3758 *
3759 * "PointMode. Controls generation of points rather than triangles
3760 * or lines. This functionality defaults to disabled, and is
3761 * enabled if either shader stage includes the execution mode.
3762 *
3763 * and about Triangles, Quads, IsoLines, VertexOrderCw, VertexOrderCcw,
3764 * PointMode, SpacingEqual, SpacingFractionalEven, SpacingFractionalOdd,
3765 * and OutputVertices, it says:
3766 *
3767 * "One mode must be set in at least one of the tessellation
3768 * shader stages."
3769 *
3770 * So, the fields can be set in either the TCS or TES, but they must
3771 * agree if set in both. Our backend looks at TES, so bitwise-or in
3772 * the values from the TCS.
3773 */
3774 assert(tcs_info->tess.tcs_vertices_out == 0 || tes_info->tess.tcs_vertices_out == 0 ||
3775 tcs_info->tess.tcs_vertices_out == tes_info->tess.tcs_vertices_out);
3776 tes_info->tess.tcs_vertices_out |= tcs_info->tess.tcs_vertices_out;
3777
3778 assert(tcs_info->tess.spacing == TESS_SPACING_UNSPECIFIED ||
3779 tes_info->tess.spacing == TESS_SPACING_UNSPECIFIED ||
3780 tcs_info->tess.spacing == tes_info->tess.spacing);
3781 tes_info->tess.spacing |= tcs_info->tess.spacing;
3782
3783 assert(tcs_info->tess._primitive_mode == TESS_PRIMITIVE_UNSPECIFIED ||
3784 tes_info->tess._primitive_mode == TESS_PRIMITIVE_UNSPECIFIED ||
3785 tcs_info->tess._primitive_mode == tes_info->tess._primitive_mode);
3786 tes_info->tess._primitive_mode |= tcs_info->tess._primitive_mode;
3787 tes_info->tess.ccw |= tcs_info->tess.ccw;
3788 tes_info->tess.point_mode |= tcs_info->tess.point_mode;
3789
3790 /* Copy the merged info back to the TCS */
3791 tcs_info->tess.tcs_vertices_out = tes_info->tess.tcs_vertices_out;
3792 tcs_info->tess.spacing = tes_info->tess.spacing;
3793 tcs_info->tess._primitive_mode = tes_info->tess._primitive_mode;
3794 tcs_info->tess.ccw = tes_info->tess.ccw;
3795 tcs_info->tess.point_mode = tes_info->tess.point_mode;
3796 }
3797
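/* Compute tessellation info shared by VS/TCS/TES: patches per workgroup, TCS LDS size,
 * which tess factors the TES reads, and whether the LS/HS "tcs_in_out_eq" optimization applies.
 */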
3798 static void
3799 gather_tess_info(struct radv_device *device, struct radv_pipeline_stage *stages,
3800 const struct radv_pipeline_key *pipeline_key)
3801 {
3802 merge_tess_info(&stages[MESA_SHADER_TESS_EVAL].nir->info,
3803 &stages[MESA_SHADER_TESS_CTRL].nir->info);
3804
3805 unsigned tess_in_patch_size = pipeline_key->tcs.tess_input_vertices;
3806 unsigned tess_out_patch_size = stages[MESA_SHADER_TESS_CTRL].nir->info.tess.tcs_vertices_out;
3807
3808 /* Number of tessellation patches per workgroup processed by the current pipeline. */
3809 unsigned num_patches = get_tcs_num_patches(
3810 tess_in_patch_size, tess_out_patch_size,
3811 stages[MESA_SHADER_TESS_CTRL].info.tcs.num_linked_inputs,
3812 stages[MESA_SHADER_TESS_CTRL].info.tcs.num_linked_outputs,
3813 stages[MESA_SHADER_TESS_CTRL].info.tcs.num_linked_patch_outputs,
3814 device->physical_device->hs.tess_offchip_block_dw_size, device->physical_device->rad_info.gfx_level,
3815 device->physical_device->rad_info.family);
3816
3817 /* LDS size used by VS+TCS for storing TCS inputs and outputs. */
3818 unsigned tcs_lds_size = calculate_tess_lds_size(
3819 device->physical_device->rad_info.gfx_level, tess_in_patch_size, tess_out_patch_size,
3820 stages[MESA_SHADER_TESS_CTRL].info.tcs.num_linked_inputs, num_patches,
3821 stages[MESA_SHADER_TESS_CTRL].info.tcs.num_linked_outputs,
3822 stages[MESA_SHADER_TESS_CTRL].info.tcs.num_linked_patch_outputs);
3823
3824 stages[MESA_SHADER_TESS_CTRL].info.num_tess_patches = num_patches;
3825 stages[MESA_SHADER_TESS_CTRL].info.tcs.num_lds_blocks = tcs_lds_size;
3826 stages[MESA_SHADER_TESS_CTRL].info.tcs.tes_reads_tess_factors =
3827 !!(stages[MESA_SHADER_TESS_EVAL].nir->info.inputs_read &
3828 (VARYING_BIT_TESS_LEVEL_INNER | VARYING_BIT_TESS_LEVEL_OUTER));
3829 stages[MESA_SHADER_TESS_CTRL].info.tcs.tes_inputs_read = stages[MESA_SHADER_TESS_EVAL].nir->info.inputs_read;
3830 stages[MESA_SHADER_TESS_CTRL].info.tcs.tes_patch_inputs_read =
3831 stages[MESA_SHADER_TESS_EVAL].nir->info.patch_inputs_read;
3832
3833 stages[MESA_SHADER_TESS_EVAL].info.num_tess_patches = num_patches;
3834 stages[MESA_SHADER_GEOMETRY].info.num_tess_patches = num_patches;
3835 stages[MESA_SHADER_VERTEX].info.num_tess_patches = num_patches;
3836 stages[MESA_SHADER_TESS_CTRL].info.tcs.tcs_vertices_out = tess_out_patch_size;
3837 stages[MESA_SHADER_VERTEX].info.tcs.tcs_vertices_out = tess_out_patch_size;
3838
3839 if (!radv_use_llvm_for_stage(device, MESA_SHADER_VERTEX)) {
3840 /* When the number of TCS input and output vertices is the same (typically 3):
3841 * - There is an equal number of LS and HS invocations
3842 * - In case of merged LSHS shaders, the LS and HS halves of the shader
3843 * always process the exact same vertex. We can use this knowledge to optimize them.
3844 *
3845 * We don't set tcs_in_out_eq if the float controls differ because that might
3846 * involve different float modes for the same block and our optimizer
3847 * doesn't handle an instruction dominating another with a different mode.
3848 */
3849 stages[MESA_SHADER_VERTEX].info.vs.tcs_in_out_eq =
3850 device->physical_device->rad_info.gfx_level >= GFX9 &&
3851 tess_in_patch_size == tess_out_patch_size &&
3852 stages[MESA_SHADER_VERTEX].nir->info.float_controls_execution_mode ==
3853 stages[MESA_SHADER_TESS_CTRL].nir->info.float_controls_execution_mode;
3854
3855 if (stages[MESA_SHADER_VERTEX].info.vs.tcs_in_out_eq)
3856 stages[MESA_SHADER_VERTEX].info.vs.tcs_temp_only_input_mask =
3857 stages[MESA_SHADER_TESS_CTRL].nir->info.inputs_read &
3858 stages[MESA_SHADER_VERTEX].nir->info.outputs_written &
3859 ~stages[MESA_SHADER_TESS_CTRL].nir->info.tess.tcs_cross_invocation_inputs_read &
3860 ~stages[MESA_SHADER_TESS_CTRL].nir->info.inputs_read_indirectly &
3861 ~stages[MESA_SHADER_VERTEX].nir->info.outputs_accessed_indirectly;
3862
3863 /* Copy data to TCS so it can be accessed by the backend if they are merged. */
3864 stages[MESA_SHADER_TESS_CTRL].info.vs.tcs_in_out_eq = stages[MESA_SHADER_VERTEX].info.vs.tcs_in_out_eq;
3865 stages[MESA_SHADER_TESS_CTRL].info.vs.tcs_temp_only_input_mask =
3866 stages[MESA_SHADER_VERTEX].info.vs.tcs_temp_only_input_mask;
3867 }
3868
3869 for (gl_shader_stage s = MESA_SHADER_VERTEX; s <= MESA_SHADER_TESS_CTRL; ++s)
3870 stages[s].info.workgroup_size =
3871 ac_compute_lshs_workgroup_size(device->physical_device->rad_info.gfx_level, s, num_patches,
3872 tess_in_patch_size, tess_out_patch_size);
3873 }
3874
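/* Decide whether two memory accesses may be vectorized into one, based on the intrinsic
 * type, the access alignment and the resulting vector size (used for load/store vectorization).
 */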
3875 static bool
3876 mem_vectorize_callback(unsigned align_mul, unsigned align_offset, unsigned bit_size,
3877 unsigned num_components, nir_intrinsic_instr *low, nir_intrinsic_instr *high,
3878 void *data)
3879 {
3880 if (num_components > 4)
3881 return false;
3882
3883 /* >128 bit loads are split except with SMEM */
3884 if (bit_size * num_components > 128)
3885 return false;
3886
3887 uint32_t align;
3888 if (align_offset)
3889 align = 1 << (ffs(align_offset) - 1);
3890 else
3891 align = align_mul;
3892
3893 switch (low->intrinsic) {
3894 case nir_intrinsic_load_global:
3895 case nir_intrinsic_store_global:
3896 case nir_intrinsic_store_ssbo:
3897 case nir_intrinsic_load_ssbo:
3898 case nir_intrinsic_load_ubo:
3899 case nir_intrinsic_load_push_constant: {
3900 unsigned max_components;
3901 if (align % 4 == 0)
3902 max_components = NIR_MAX_VEC_COMPONENTS;
3903 else if (align % 2 == 0)
3904 max_components = 16u / bit_size;
3905 else
3906 max_components = 8u / bit_size;
3907 return (align % (bit_size / 8u)) == 0 && num_components <= max_components;
3908 }
3909 case nir_intrinsic_load_deref:
3910 case nir_intrinsic_store_deref:
3911 assert(nir_deref_mode_is(nir_src_as_deref(low->src[0]), nir_var_mem_shared));
3912 FALLTHROUGH;
3913 case nir_intrinsic_load_shared:
3914 case nir_intrinsic_store_shared:
3915 if (bit_size * num_components ==
3916 96) { /* 96 bit loads require 128 bit alignment and are split otherwise */
3917 return align % 16 == 0;
3918 } else if (bit_size == 16 && (align % 4)) {
3919 /* AMD hardware can't do 2-byte aligned f16vec2 loads, but they are useful for ALU
3920 * vectorization, because our vectorizer requires the scalar IR to already contain vectors.
3921 */
3922 return (align % 2 == 0) && num_components <= 2;
3923 } else {
3924 if (num_components == 3) {
3925 /* AMD hardware can't do 3-component loads except for 96-bit loads, handled above. */
3926 return false;
3927 }
3928 unsigned req = bit_size * num_components;
3929 if (req == 64 || req == 128) /* 64-bit and 128-bit loads can use ds_read2_b{32,64} */
3930 req /= 2u;
3931 return align % (req / 8u) == 0;
3932 }
3933 default:
3934 return false;
3935 }
3936 return false;
3937 }
3938
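/* Callback for nir_lower_bit_size: return 32 for 8/16-bit ALU operations that should be
 * executed at 32 bits on this chip, or 0 to keep the native bit size.
 */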
3939 static unsigned
3940 lower_bit_size_callback(const nir_instr *instr, void *_)
3941 {
3942 struct radv_device *device = _;
3943 enum amd_gfx_level chip = device->physical_device->rad_info.gfx_level;
3944
3945 if (instr->type != nir_instr_type_alu)
3946 return 0;
3947 nir_alu_instr *alu = nir_instr_as_alu(instr);
3948
3949 /* If an instruction is not scalarized by this point,
3950 * it can be emitted as a packed instruction */
3951 if (alu->dest.dest.ssa.num_components > 1)
3952 return 0;
3953
3954 if (alu->dest.dest.ssa.bit_size & (8 | 16)) {
3955 unsigned bit_size = alu->dest.dest.ssa.bit_size;
3956 switch (alu->op) {
3957 case nir_op_bitfield_select:
3958 case nir_op_imul_high:
3959 case nir_op_umul_high:
3960 return 32;
3961 case nir_op_iabs:
3962 case nir_op_imax:
3963 case nir_op_umax:
3964 case nir_op_imin:
3965 case nir_op_umin:
3966 case nir_op_ishr:
3967 case nir_op_ushr:
3968 case nir_op_ishl:
3969 case nir_op_isign:
3970 case nir_op_uadd_sat:
3971 case nir_op_usub_sat:
3972 return (bit_size == 8 || !(chip >= GFX8 && nir_dest_is_divergent(alu->dest.dest))) ? 32
3973 : 0;
3974 case nir_op_iadd_sat:
3975 case nir_op_isub_sat:
3976 return bit_size == 8 || !nir_dest_is_divergent(alu->dest.dest) ? 32 : 0;
3977
3978 default:
3979 return 0;
3980 }
3981 }
3982
3983 if (nir_src_bit_size(alu->src[0].src) & (8 | 16)) {
3984 unsigned bit_size = nir_src_bit_size(alu->src[0].src);
3985 switch (alu->op) {
3986 case nir_op_bit_count:
3987 case nir_op_find_lsb:
3988 case nir_op_ufind_msb:
3989 case nir_op_i2b1:
3990 return 32;
3991 case nir_op_ilt:
3992 case nir_op_ige:
3993 case nir_op_ieq:
3994 case nir_op_ine:
3995 case nir_op_ult:
3996 case nir_op_uge:
3997 return (bit_size == 8 || !(chip >= GFX8 && nir_dest_is_divergent(alu->dest.dest))) ? 32
3998 : 0;
3999 default:
4000 return 0;
4001 }
4002 }
4003
4004 return 0;
4005 }
4006
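/* Vectorization width callback: allow packing 16-bit ALU operations into vec2 on GFX9+,
 * otherwise keep instructions scalar.
 */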
4007 static uint8_t
4008 opt_vectorize_callback(const nir_instr *instr, const void *_)
4009 {
4010 if (instr->type != nir_instr_type_alu)
4011 return 0;
4012
4013 const struct radv_device *device = _;
4014 enum amd_gfx_level chip = device->physical_device->rad_info.gfx_level;
4015 if (chip < GFX9)
4016 return 1;
4017
4018 const nir_alu_instr *alu = nir_instr_as_alu(instr);
4019 const unsigned bit_size = alu->dest.dest.ssa.bit_size;
4020 if (bit_size != 16)
4021 return 1;
4022
4023 switch (alu->op) {
4024 case nir_op_fadd:
4025 case nir_op_fsub:
4026 case nir_op_fmul:
4027 case nir_op_ffma:
4028 case nir_op_fdiv:
4029 case nir_op_flrp:
4030 case nir_op_fabs:
4031 case nir_op_fneg:
4032 case nir_op_fsat:
4033 case nir_op_fmin:
4034 case nir_op_fmax:
4035 case nir_op_iabs:
4036 case nir_op_iadd:
4037 case nir_op_iadd_sat:
4038 case nir_op_uadd_sat:
4039 case nir_op_isub:
4040 case nir_op_isub_sat:
4041 case nir_op_usub_sat:
4042 case nir_op_ineg:
4043 case nir_op_imul:
4044 case nir_op_imin:
4045 case nir_op_imax:
4046 case nir_op_umin:
4047 case nir_op_umax:
4048 return 2;
4049 case nir_op_ishl: /* TODO: in NIR, these have 32bit shift operands */
4050 case nir_op_ishr: /* while Radeon needs 16bit operands when vectorized */
4051 case nir_op_ushr:
4052 default:
4053 return 1;
4054 }
4055 }
4056
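/* Callback used when lowering non-uniform resource access: single-component sources return
 * mask 0x1, otherwise the mask depends on whether the binding can be chased statically.
 */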
4057 static nir_component_mask_t
4058 non_uniform_access_callback(const nir_src *src, void *_)
4059 {
4060 if (src->ssa->num_components == 1)
4061 return 0x1;
4062 return nir_chase_binding(*src).success ? 0x2 : 0x3;
4063 }
4064
4065
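/* Allocate one slab for all shader binaries of the pipeline and upload them, assigning each
 * shader its GPU virtual address.
 */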
4066 VkResult
4067 radv_upload_shaders(struct radv_device *device, struct radv_pipeline *pipeline,
4068 struct radv_shader_binary **binaries, struct radv_shader_binary *gs_copy_binary)
4069 {
4070 uint32_t code_size = 0;
4071
4072 /* Compute the total code size. */
4073 for (int i = 0; i < MESA_VULKAN_SHADER_STAGES; i++) {
4074 struct radv_shader *shader = pipeline->shaders[i];
4075 if (!shader)
4076 continue;
4077
4078 code_size += align(shader->code_size, RADV_SHADER_ALLOC_ALIGNMENT);
4079 }
4080
4081 if (pipeline->gs_copy_shader) {
4082 code_size += align(pipeline->gs_copy_shader->code_size, RADV_SHADER_ALLOC_ALIGNMENT);
4083 }
4084
4085 /* Allocate memory for all shader binaries. */
4086 pipeline->slab = radv_pipeline_slab_create(device, pipeline, code_size);
4087 if (!pipeline->slab)
4088 return VK_ERROR_OUT_OF_DEVICE_MEMORY;
4089
4090 pipeline->slab_bo = pipeline->slab->alloc->arena->bo;
4091
4092 /* Upload shader binaries. */
4093 uint64_t slab_va = radv_buffer_get_va(pipeline->slab_bo);
4094 uint32_t slab_offset = pipeline->slab->alloc->offset;
4095 char *slab_ptr = pipeline->slab->alloc->arena->ptr;
4096
4097 for (int i = 0; i < MESA_VULKAN_SHADER_STAGES; ++i) {
4098 struct radv_shader *shader = pipeline->shaders[i];
4099 if (!shader)
4100 continue;
4101
4102 shader->va = slab_va + slab_offset;
4103
4104 void *dest_ptr = slab_ptr + slab_offset;
4105 if (!radv_shader_binary_upload(device, binaries[i], shader, dest_ptr))
4106 return VK_ERROR_OUT_OF_HOST_MEMORY;
4107
4108 slab_offset += align(shader->code_size, RADV_SHADER_ALLOC_ALIGNMENT);
4109 }
4110
4111 if (pipeline->gs_copy_shader) {
4112 pipeline->gs_copy_shader->va = slab_va + slab_offset;
4113
4114 void *dest_ptr = slab_ptr + slab_offset;
4115 if (!radv_shader_binary_upload(device, gs_copy_binary, pipeline->gs_copy_shader, dest_ptr))
4116 return VK_ERROR_OUT_OF_HOST_MEMORY;
4117 }
4118
4119 return VK_SUCCESS;
4120 }
4121
4122 static bool
4123 radv_consider_force_vrs(const struct radv_pipeline *pipeline, bool noop_fs,
4124 const struct radv_pipeline_stage *stages,
4125 gl_shader_stage last_vgt_api_stage)
4126 {
4127 struct radv_device *device = pipeline->device;
4128
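/* Force-VRS is an opt-in override (e.g. the RADV_FORCE_VRS environment variable, as far
 * as this editor can tell); nothing to do when it is not requested.
 */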
4129 if (!device->force_vrs_enabled)
4130 return false;
4131
4132 if (last_vgt_api_stage != MESA_SHADER_VERTEX &&
4133 last_vgt_api_stage != MESA_SHADER_TESS_EVAL &&
4134 last_vgt_api_stage != MESA_SHADER_GEOMETRY)
4135 return false;
4136
4137 nir_shader *last_vgt_shader = stages[last_vgt_api_stage].nir;
4138 if (last_vgt_shader->info.outputs_written & BITFIELD64_BIT(VARYING_SLOT_PRIMITIVE_SHADING_RATE))
4139 return false;
4140
4141 /* VRS has no effect if there is no pixel shader. */
4142 if (noop_fs)
4143 return false;
4144
4145 /* Do not enable if the PS uses gl_FragCoord because it breaks postprocessing in some games. */
4146 nir_shader *fs_shader = stages[MESA_SHADER_FRAGMENT].nir;
4147 if (fs_shader &&
4148 BITSET_TEST(fs_shader->info.system_values_read, SYSTEM_VALUE_FRAG_COORD)) {
4149 return false;
4150 }
4151
4152 return true;
4153 }
4154
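/* Helper for the signed 2_10_10_10 vertex formats: the raw alpha channel comes back from
 * the fetch without the sign adjustment the format requires, so reconstruct the signed
 * value here (natural sign extension for the integer-like cases, the exponent-LSB trick
 * described below for SNORM).
 */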
4155 static nir_ssa_def *
4156 radv_adjust_vertex_fetch_alpha(nir_builder *b,
4157 enum radv_vs_input_alpha_adjust alpha_adjust,
4158 nir_ssa_def *alpha)
4159 {
4160 if (alpha_adjust == ALPHA_ADJUST_SSCALED)
4161 alpha = nir_f2u32(b, alpha);
4162
4163 /* For the integer-like cases, do a natural sign extension.
4164 *
4165 * For the SNORM case, the values are 0.0, 0.333, 0.666, 1.0 and happen to contain 0, 1, 2, 3 as
4166 * the two LSBs of the exponent.
4167 */
4168 unsigned offset = alpha_adjust == ALPHA_ADJUST_SNORM ? 23u : 0u;
4169
4170 alpha = nir_ibfe_imm(b, alpha, offset, 2u);
4171
4172 /* Convert back to the right type. */
4173 if (alpha_adjust == ALPHA_ADJUST_SNORM) {
4174 alpha = nir_i2f32(b, alpha);
4175 alpha = nir_fmax(b, alpha, nir_imm_float(b, -1.0f));
4176 } else if (alpha_adjust == ALPHA_ADJUST_SSCALED) {
4177 alpha = nir_i2f32(b, alpha);
4178 }
4179
4180 return alpha;
4181 }
4182
4183 static bool
4184 radv_lower_vs_input(nir_shader *nir, const struct radv_pipeline_key *pipeline_key)
4185 {
4186 nir_function_impl *impl = nir_shader_get_entrypoint(nir);
4187 bool progress = false;
4188
4189 if (pipeline_key->vs.dynamic_input_state)
4190 return false;
4191
4192 nir_builder b;
4193 nir_builder_init(&b, impl);
4194
4195 nir_foreach_block(block, impl) {
4196 nir_foreach_instr(instr, block) {
4197 if (instr->type != nir_instr_type_intrinsic)
4198 continue;
4199
4200 nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
4201 if (intrin->intrinsic != nir_intrinsic_load_input)
4202 continue;
4203
4204 unsigned location = nir_intrinsic_base(intrin) - VERT_ATTRIB_GENERIC0;
4205 enum radv_vs_input_alpha_adjust alpha_adjust = pipeline_key->vs.vertex_alpha_adjust[location];
4206 bool post_shuffle = pipeline_key->vs.vertex_post_shuffle & (1 << location);
4207
4208 unsigned component = nir_intrinsic_component(intrin);
4209 unsigned num_components = intrin->dest.ssa.num_components;
4210
4211 unsigned attrib_format = pipeline_key->vs.vertex_attribute_formats[location];
4212 unsigned dfmt = attrib_format & 0xf;
4213 unsigned nfmt = (attrib_format >> 4) & 0x7;
4214 const struct ac_data_format_info *vtx_info = ac_get_data_format_info(dfmt);
4215 bool is_float =
4216 nfmt != V_008F0C_BUF_NUM_FORMAT_UINT && nfmt != V_008F0C_BUF_NUM_FORMAT_SINT;
4217
4218 unsigned mask = nir_ssa_def_components_read(&intrin->dest.ssa) << component;
4219 unsigned num_channels = MIN2(util_last_bit(mask), vtx_info->num_channels);
4220
4221 static const unsigned swizzle_normal[4] = {0, 1, 2, 3};
4222 static const unsigned swizzle_post_shuffle[4] = {2, 1, 0, 3};
4223 const unsigned *swizzle = post_shuffle ? swizzle_post_shuffle : swizzle_normal;
4224
4225 b.cursor = nir_after_instr(instr);
4226 nir_ssa_def *channels[4];
4227
4228 if (post_shuffle) {
4229 /* Expand to load 3 components because it's shuffled like X<->Z. */
4230 intrin->num_components = MAX2(component + num_components, 3);
4231 intrin->dest.ssa.num_components = intrin->num_components;
4232
4233 nir_intrinsic_set_component(intrin, 0);
4234
4235 num_channels = MAX2(num_channels, 3);
4236 }
4237
4238 for (uint32_t i = 0; i < num_components; i++) {
4239 unsigned idx = i + (post_shuffle ? component : 0);
4240
4241 if (swizzle[i + component] < num_channels) {
4242 channels[i] = nir_channel(&b, &intrin->dest.ssa, swizzle[idx]);
4243 } else if (i + component == 3) {
4244 channels[i] = is_float ? nir_imm_floatN_t(&b, 1.0f, intrin->dest.ssa.bit_size)
4245 : nir_imm_intN_t(&b, 1u, intrin->dest.ssa.bit_size);
4246 } else {
4247 channels[i] = nir_imm_zero(&b, 1, intrin->dest.ssa.bit_size);
4248 }
4249 }
4250
4251 if (alpha_adjust != ALPHA_ADJUST_NONE && component + num_components == 4) {
4252 unsigned idx = num_components - 1;
4253 channels[idx] = radv_adjust_vertex_fetch_alpha(&b, alpha_adjust, channels[idx]);
4254 }
4255
4256 nir_ssa_def *new_dest = nir_vec(&b, channels, num_components);
4257
4258 nir_ssa_def_rewrite_uses_after(&intrin->dest.ssa, new_dest,
4259 new_dest->parent_instr);
4260
4261 progress = true;
4262 }
4263 }
4264
4265 if (progress)
4266 nir_metadata_preserve(impl, nir_metadata_block_index | nir_metadata_dominance);
4267 else
4268 nir_metadata_preserve(impl, nir_metadata_all);
4269
4270 return progress;
4271 }
4272
4273 static bool
4274 radv_lower_fs_output(nir_shader *nir, const struct radv_pipeline_key *pipeline_key)
4275 {
4276 nir_function_impl *impl = nir_shader_get_entrypoint(nir);
4277 bool progress = false;
4278
4279 nir_builder b;
4280 nir_builder_init(&b, impl);
4281
4282 nir_foreach_block(block, impl) {
4283 nir_foreach_instr(instr, block) {
4284 if (instr->type != nir_instr_type_intrinsic)
4285 continue;
4286
4287 nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
4288 if (intrin->intrinsic != nir_intrinsic_store_output)
4289 continue;
4290
4291 int slot = nir_intrinsic_base(intrin) - FRAG_RESULT_DATA0;
4292 if (slot < 0)
4293 continue;
4294
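/* Each MRT slot occupies 4 bits of ps.col_format (one SPI_SHADER_* export format per
 * attachment); the per-slot int8/int10 bits below select the clamping range used for the
 * packed integer formats.
 */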
4295 unsigned write_mask = nir_intrinsic_write_mask(intrin);
4296 unsigned col_format = (pipeline_key->ps.col_format >> (4 * slot)) & 0xf;
4297 bool is_int8 = (pipeline_key->ps.is_int8 >> slot) & 1;
4298 bool is_int10 = (pipeline_key->ps.is_int10 >> slot) & 1;
4299 bool enable_mrt_output_nan_fixup = (pipeline_key->ps.enable_mrt_output_nan_fixup >> slot) & 1;
4300 bool is_16bit = intrin->src[0].ssa->bit_size == 16;
4301
4302 if (col_format == V_028714_SPI_SHADER_ZERO)
4303 continue;
4304
4305 b.cursor = nir_before_instr(instr);
4306 nir_ssa_def *values[4];
4307
4308 /* Extract the export values. */
4309 for (unsigned i = 0; i < 4; i++) {
4310 if (write_mask & (1 << i)) {
4311 values[i] = nir_channel(&b, intrin->src[0].ssa, i);
4312 } else {
4313 values[i] = nir_ssa_undef(&b, 1, 32);
4314 }
4315 }
4316
4317 /* Replace NaN by zero (for 32-bit float formats) to fix game bugs if requested. */
4318 if (enable_mrt_output_nan_fixup && !nir->info.internal && !is_16bit) {
4319 u_foreach_bit(i, write_mask) {
4320 const bool save_exact = b.exact;
4321
4322 b.exact = true;
4323 nir_ssa_def *isnan = nir_fneu(&b, values[i], values[i]);
4324 b.exact = save_exact;
4325
4326 values[i] = nir_bcsel(&b, isnan, nir_imm_zero(&b, 1, 32), values[i]);
4327 }
4328 }
4329
4330 if (col_format == V_028714_SPI_SHADER_FP16_ABGR ||
4331 col_format == V_028714_SPI_SHADER_UNORM16_ABGR ||
4332 col_format == V_028714_SPI_SHADER_SNORM16_ABGR ||
4333 col_format == V_028714_SPI_SHADER_UINT16_ABGR ||
4334 col_format == V_028714_SPI_SHADER_SINT16_ABGR) {
4335 /* Convert and/or clamp the export values. */
4336 switch (col_format) {
4337 case V_028714_SPI_SHADER_UINT16_ABGR: {
4338 unsigned max_rgb = is_int8 ? 255 : is_int10 ? 1023 : 0;
4339 u_foreach_bit(i, write_mask) {
4340 if (is_int8 || is_int10) {
4341 values[i] = nir_umin(&b, values[i], i == 3 && is_int10 ? nir_imm_int(&b, 3u)
4342 : nir_imm_int(&b, max_rgb));
4343 } else if (is_16bit) {
4344 values[i] = nir_u2u32(&b, values[i]);
4345 }
4346 }
4347 break;
4348 }
4349 case V_028714_SPI_SHADER_SINT16_ABGR: {
4350 unsigned max_rgb = is_int8 ? 127 : is_int10 ? 511 : 0;
4351 unsigned min_rgb = is_int8 ? -128 : is_int10 ? -512 : 0;
4352 u_foreach_bit(i, write_mask) {
4353 if (is_int8 || is_int10) {
4354 values[i] = nir_imin(&b, values[i], i == 3 && is_int10 ? nir_imm_int(&b, 1u)
4355 : nir_imm_int(&b, max_rgb));
4356 values[i] = nir_imax(&b, values[i], i == 3 && is_int10 ? nir_imm_int(&b, -2u)
4357 : nir_imm_int(&b, min_rgb));
4358 } else if (is_16bit) {
4359 values[i] = nir_i2i32(&b, values[i]);
4360 }
4361 }
4362 break;
4363 }
4364 case V_028714_SPI_SHADER_UNORM16_ABGR:
4365 case V_028714_SPI_SHADER_SNORM16_ABGR:
4366 u_foreach_bit(i, write_mask) {
4367 if (is_16bit) {
4368 values[i] = nir_f2f32(&b, values[i]);
4369 }
4370 }
4371 break;
4372 default:
4373 break;
4374 }
4375
4376 /* Only nir_pack_32_2x16_split needs 16-bit inputs. */
4377 bool input_16_bit = col_format == V_028714_SPI_SHADER_FP16_ABGR && is_16bit;
4378 unsigned new_write_mask = 0;
4379
4380 /* Pack the export values. */
4381 for (unsigned i = 0; i < 2; i++) {
4382 bool enabled = (write_mask >> (i * 2)) & 0x3;
4383
4384 if (!enabled) {
4385 values[i] = nir_ssa_undef(&b, 1, 32);
4386 continue;
4387 }
4388
4389 nir_ssa_def *src0 = values[i * 2];
4390 nir_ssa_def *src1 = values[i * 2 + 1];
4391
4392 if (!(write_mask & (1 << (i * 2))))
4393 src0 = nir_imm_zero(&b, 1, input_16_bit ? 16 : 32);
4394 if (!(write_mask & (1 << (i * 2 + 1))))
4395 src1 = nir_imm_zero(&b, 1, input_16_bit ? 16 : 32);
4396
4397 if (col_format == V_028714_SPI_SHADER_FP16_ABGR) {
4398 if (is_16bit) {
4399 values[i] = nir_pack_32_2x16_split(&b, src0, src1);
4400 } else {
4401 values[i] = nir_pack_half_2x16_split(&b, src0, src1);
4402 }
4403 } else if (col_format == V_028714_SPI_SHADER_UNORM16_ABGR) {
4404 values[i] = nir_pack_unorm_2x16(&b, nir_vec2(&b, src0, src1));
4405 } else if (col_format == V_028714_SPI_SHADER_SNORM16_ABGR) {
4406 values[i] = nir_pack_snorm_2x16(&b, nir_vec2(&b, src0, src1));
4407 } else if (col_format == V_028714_SPI_SHADER_UINT16_ABGR) {
4408 values[i] = nir_pack_uint_2x16(&b, nir_vec2(&b, src0, src1));
4409 } else if (col_format == V_028714_SPI_SHADER_SINT16_ABGR) {
4410 values[i] = nir_pack_sint_2x16(&b, nir_vec2(&b, src0, src1));
4411 }
4412
4413 new_write_mask |= 1 << i;
4414 }
4415
4416 /* Update the write mask for compressed outputs. */
4417 nir_intrinsic_set_write_mask(intrin, new_write_mask);
4418 intrin->num_components = util_last_bit(new_write_mask);
4419 }
4420
4421 nir_ssa_def *new_src = nir_vec(&b, values, intrin->num_components);
4422
4423 nir_instr_rewrite_src(&intrin->instr, &intrin->src[0], nir_src_for_ssa(new_src));
4424
4425 progress = true;
4426 }
4427 }
4428
4429 if (progress)
4430 nir_metadata_preserve(impl, nir_metadata_block_index | nir_metadata_dominance);
4431 else
4432 nir_metadata_preserve(impl, nir_metadata_all);
4433
4434 return progress;
4435 }
4436
4437 void
4438 radv_pipeline_stage_init(const VkPipelineShaderStageCreateInfo *sinfo,
4439 struct radv_pipeline_stage *out_stage, gl_shader_stage stage)
4440 {
4441 const VkShaderModuleCreateInfo *minfo =
4442 vk_find_struct_const(sinfo->pNext, SHADER_MODULE_CREATE_INFO);
4443 const VkPipelineShaderStageModuleIdentifierCreateInfoEXT *iinfo =
4444 vk_find_struct_const(sinfo->pNext, PIPELINE_SHADER_STAGE_MODULE_IDENTIFIER_CREATE_INFO_EXT);
4445
4446 if (sinfo->module == VK_NULL_HANDLE && !minfo && !iinfo)
4447 return;
4448
4449 memset(out_stage, 0, sizeof(*out_stage));
4450
4451 out_stage->stage = stage;
4452 out_stage->entrypoint = sinfo->pName;
4453 out_stage->spec_info = sinfo->pSpecializationInfo;
4454 out_stage->feedback.flags = VK_PIPELINE_CREATION_FEEDBACK_VALID_BIT;
4455
4456 if (sinfo->module != VK_NULL_HANDLE) {
4457 struct vk_shader_module *module = vk_shader_module_from_handle(sinfo->module);
4458 STATIC_ASSERT(sizeof(out_stage->spirv.sha1) == sizeof(module->sha1));
4459
4460 out_stage->spirv.data = module->data;
4461 out_stage->spirv.size = module->size;
4462 out_stage->spirv.object = &module->base;
4463
4464 if (module->nir)
4465 out_stage->internal_nir = module->nir;
4466 } else if (minfo) {
4467 out_stage->spirv.data = (const char *) minfo->pCode;
4468 out_stage->spirv.size = minfo->codeSize;
4469 }
4470
4471 vk_pipeline_hash_shader_stage(sinfo, out_stage->shader_sha1);
4472 }
4473
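/* On the legacy (non-NGG) geometry path the GS writes its outputs to the GS ring in
 * memory, and a separate "GS copy shader" running as a HW VS reads them back and performs
 * the actual vertex exports.
 */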
4474 static struct radv_shader *
4475 radv_pipeline_create_gs_copy_shader(struct radv_pipeline *pipeline,
4476 struct radv_pipeline_stage *stages,
4477 const struct radv_pipeline_key *pipeline_key,
4478 const struct radv_pipeline_layout *pipeline_layout,
4479 bool keep_executable_info, bool keep_statistic_info,
4480 struct radv_shader_binary **gs_copy_binary)
4481 {
4482 struct radv_device *device = pipeline->device;
4483 struct radv_shader_info info = {0};
4484
4485 if (stages[MESA_SHADER_GEOMETRY].info.vs.outinfo.export_clip_dists)
4486 info.vs.outinfo.export_clip_dists = true;
4487
4488 radv_nir_shader_info_pass(device, stages[MESA_SHADER_GEOMETRY].nir, pipeline_layout, pipeline_key,
4489 &info);
4490 info.wave_size = 64; /* Wave32 not supported. */
4491 info.workgroup_size = 64; /* HW VS: separate waves, no workgroups */
4492 info.ballot_bit_size = 64;
4493
4494 struct radv_shader_args gs_copy_args = {0};
4495 gs_copy_args.is_gs_copy_shader = true;
4496 gs_copy_args.explicit_scratch_args = !radv_use_llvm_for_stage(device, MESA_SHADER_VERTEX);
4497 radv_declare_shader_args(device->physical_device->rad_info.gfx_level, pipeline_key, &info,
4498 MESA_SHADER_VERTEX, false, MESA_SHADER_VERTEX, &gs_copy_args);
4499 info.user_sgprs_locs = gs_copy_args.user_sgprs_locs;
4500 info.inline_push_constant_mask = gs_copy_args.ac.inline_push_const_mask;
4501
4502 return radv_create_gs_copy_shader(device, stages[MESA_SHADER_GEOMETRY].nir, &info, &gs_copy_args,
4503 gs_copy_binary, keep_executable_info, keep_statistic_info,
4504 pipeline_key->optimisations_disabled);
4505 }
4506
4507 static void
4508 radv_pipeline_nir_to_asm(struct radv_pipeline *pipeline, struct radv_pipeline_stage *stages,
4509 const struct radv_pipeline_key *pipeline_key,
4510 const struct radv_pipeline_layout *pipeline_layout,
4511 bool keep_executable_info, bool keep_statistic_info,
4512 gl_shader_stage last_vgt_api_stage,
4513 struct radv_shader_binary **binaries,
4514 struct radv_shader_binary **gs_copy_binary)
4515 {
4516 struct radv_device *device = pipeline->device;
4517 unsigned active_stages = 0;
4518
4519 for (int i = 0; i < MESA_VULKAN_SHADER_STAGES; i++) {
4520 if (stages[i].nir)
4521 active_stages |= (1 << i);
4522 }
4523
4524 bool pipeline_has_ngg = last_vgt_api_stage != MESA_SHADER_NONE &&
4525 stages[last_vgt_api_stage].info.is_ngg;
4526
4527 if (stages[MESA_SHADER_GEOMETRY].nir && !pipeline_has_ngg) {
4528 pipeline->gs_copy_shader =
4529 radv_pipeline_create_gs_copy_shader(pipeline, stages, pipeline_key, pipeline_layout,
4530 keep_executable_info, keep_statistic_info,
4531 gs_copy_binary);
4532 }
4533
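/* Compile from the last stage backwards so that on GFX9+ a merged pre-stage (VS or TES)
 * is compiled together with its main stage (TCS or GS) and then skipped once its bit is
 * cleared from active_stages.
 */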
4534 for (int s = MESA_VULKAN_SHADER_STAGES - 1; s >= 0; s--) {
4535 if (!(active_stages & (1 << s)) || pipeline->shaders[s])
4536 continue;
4537
4538 nir_shader *shaders[2] = { stages[s].nir, NULL };
4539 unsigned shader_count = 1;
4540
4541 /* On GFX9+, TES is merged with GS and VS is merged with TCS or GS. */
4542 if (device->physical_device->rad_info.gfx_level >= GFX9 &&
4543 (s == MESA_SHADER_TESS_CTRL || s == MESA_SHADER_GEOMETRY)) {
4544 gl_shader_stage pre_stage;
4545
4546 if (s == MESA_SHADER_GEOMETRY && stages[MESA_SHADER_TESS_EVAL].nir) {
4547 pre_stage = MESA_SHADER_TESS_EVAL;
4548 } else {
4549 pre_stage = MESA_SHADER_VERTEX;
4550 }
4551
4552 shaders[0] = stages[pre_stage].nir;
4553 shaders[1] = stages[s].nir;
4554 shader_count = 2;
4555 }
4556
4557 int64_t stage_start = os_time_get_nano();
4558
4559 pipeline->shaders[s] = radv_shader_nir_to_asm(device, &stages[s], shaders, shader_count,
4560 pipeline_key, keep_executable_info,
4561 keep_statistic_info, &binaries[s]);
4562
4563 stages[s].feedback.duration += os_time_get_nano() - stage_start;
4564
4565 active_stages &= ~(1 << shaders[0]->info.stage);
4566 if (shaders[1])
4567 active_stages &= ~(1 << shaders[1]->info.stage);
4568 }
4569 }
4570
4571 VkResult
4572 radv_create_shaders(struct radv_pipeline *pipeline, struct radv_pipeline_layout *pipeline_layout,
4573 struct radv_device *device, struct radv_pipeline_cache *cache,
4574 const struct radv_pipeline_key *pipeline_key,
4575 const VkPipelineShaderStageCreateInfo *pStages,
4576 uint32_t stageCount,
4577 const VkPipelineCreateFlags flags, const uint8_t *custom_hash,
4578 const VkPipelineCreationFeedbackCreateInfo *creation_feedback,
4579 struct radv_pipeline_shader_stack_size **stack_sizes,
4580 uint32_t *num_stack_sizes,
4581 gl_shader_stage *last_vgt_api_stage)
4582 {
4583 const char *noop_fs_entrypoint = "noop_fs";
4584 struct radv_shader_binary *binaries[MESA_VULKAN_SHADER_STAGES] = {NULL};
4585 struct radv_shader_binary *gs_copy_binary = NULL;
4586 unsigned char hash[20];
4587 bool keep_executable_info =
4588 (flags & VK_PIPELINE_CREATE_CAPTURE_INTERNAL_REPRESENTATIONS_BIT_KHR) ||
4589 device->keep_shader_info;
4590 bool keep_statistic_info = (flags & VK_PIPELINE_CREATE_CAPTURE_STATISTICS_BIT_KHR) ||
4591 (device->instance->debug_flags & RADV_DEBUG_DUMP_SHADER_STATS) ||
4592 device->keep_shader_info;
4593 struct radv_pipeline_stage stages[MESA_VULKAN_SHADER_STAGES] = {0};
4594 VkPipelineCreationFeedback pipeline_feedback = {
4595 .flags = VK_PIPELINE_CREATION_FEEDBACK_VALID_BIT,
4596 };
4597 bool noop_fs = false;
4598 VkResult result = VK_SUCCESS;
4599
4600 int64_t pipeline_start = os_time_get_nano();
4601
4602 for (uint32_t i = 0; i < stageCount; i++) {
4603 const VkPipelineShaderStageCreateInfo *sinfo = &pStages[i];
4604 gl_shader_stage stage = vk_to_mesa_shader_stage(sinfo->stage);
4605
4606 radv_pipeline_stage_init(sinfo, &stages[stage], stage);
4607 }
4608
4609 for (unsigned s = 0; s < MESA_VULKAN_SHADER_STAGES; s++) {
4610 if (!stages[s].entrypoint)
4611 continue;
4612
4613 if (stages[s].stage < MESA_SHADER_FRAGMENT || stages[s].stage == MESA_SHADER_MESH)
4614 *last_vgt_api_stage = stages[s].stage;
4615 }
4616
4617 ASSERTED bool primitive_shading =
4618 stages[MESA_SHADER_VERTEX].entrypoint || stages[MESA_SHADER_TESS_CTRL].entrypoint ||
4619 stages[MESA_SHADER_TESS_EVAL].entrypoint || stages[MESA_SHADER_GEOMETRY].entrypoint;
4620 ASSERTED bool mesh_shading =
4621 stages[MESA_SHADER_MESH].entrypoint;
4622
4623 /* Primitive and mesh shading must not be mixed in the same pipeline. */
4624 assert(!primitive_shading || !mesh_shading);
4625 /* Mesh shaders are mandatory in mesh shading pipelines. */
4626 assert(mesh_shading == !!stages[MESA_SHADER_MESH].entrypoint);
4627 /* Mesh shaders always need NGG. */
4628 assert(!mesh_shading || pipeline_key->use_ngg);
4629
4630 if (custom_hash)
4631 memcpy(hash, custom_hash, 20);
4632 else {
4633 radv_hash_shaders(hash, stages, pipeline_layout, pipeline_key,
4634 radv_get_hash_flags(device, keep_statistic_info));
4635 }
4636
4637 pipeline->pipeline_hash = *(uint64_t *)hash;
4638
4639 bool found_in_application_cache = true;
4640 if (!keep_executable_info &&
4641 radv_create_shaders_from_pipeline_cache(device, cache, hash, pipeline,
4642 stack_sizes, num_stack_sizes,
4643 &found_in_application_cache)) {
4644 if (found_in_application_cache)
4645 pipeline_feedback.flags |= VK_PIPELINE_CREATION_FEEDBACK_APPLICATION_PIPELINE_CACHE_HIT_BIT;
4646 result = VK_SUCCESS;
4647 goto done;
4648 }
4649
4650 if (flags & VK_PIPELINE_CREATE_FAIL_ON_PIPELINE_COMPILE_REQUIRED_BIT) {
4651 if (found_in_application_cache)
4652 pipeline_feedback.flags |= VK_PIPELINE_CREATION_FEEDBACK_APPLICATION_PIPELINE_CACHE_HIT_BIT;
4653 result = VK_PIPELINE_COMPILE_REQUIRED;
4654 goto done;
4655 }
4656
4657 if (pipeline->type == RADV_PIPELINE_GRAPHICS && !stages[MESA_SHADER_FRAGMENT].entrypoint) {
4658 nir_builder fs_b = radv_meta_init_shader(device, MESA_SHADER_FRAGMENT, "noop_fs");
4659
4660 stages[MESA_SHADER_FRAGMENT] = (struct radv_pipeline_stage) {
4661 .stage = MESA_SHADER_FRAGMENT,
4662 .internal_nir = fs_b.shader,
4663 .entrypoint = noop_fs_entrypoint,
4664 .feedback = {
4665 .flags = VK_PIPELINE_CREATION_FEEDBACK_VALID_BIT,
4666 },
4667 };
4668
4669 noop_fs = true;
4670 }
4671
4672 for (unsigned s = 0; s < MESA_VULKAN_SHADER_STAGES; s++) {
4673 if (!stages[s].entrypoint)
4674 continue;
4675
4676 int64_t stage_start = os_time_get_nano();
4677
4678 stages[s].nir = radv_shader_spirv_to_nir(device, &stages[s], pipeline_key);
4679
4680 stages[s].feedback.duration += os_time_get_nano() - stage_start;
4681 }
4682
4683 /* Force per-vertex VRS. */
4684 if (radv_consider_force_vrs(pipeline, noop_fs, stages, *last_vgt_api_stage)) {
4685 assert(*last_vgt_api_stage == MESA_SHADER_VERTEX ||
4686 *last_vgt_api_stage == MESA_SHADER_TESS_EVAL ||
4687 *last_vgt_api_stage == MESA_SHADER_GEOMETRY);
4688 nir_shader *last_vgt_shader = stages[*last_vgt_api_stage].nir;
4689 NIR_PASS(_, last_vgt_shader, radv_force_primitive_shading_rate, device);
4690 }
4691
4692 bool optimize_conservatively = pipeline_key->optimisations_disabled;
4693
4694 /* Determine if shaders use NGG before linking because it's needed by some NIR passes. */
4695 radv_fill_shader_info_ngg(pipeline, pipeline_key, stages);
4696
4697 bool pipeline_has_ngg = (stages[MESA_SHADER_VERTEX].nir && stages[MESA_SHADER_VERTEX].info.is_ngg) ||
4698 (stages[MESA_SHADER_TESS_EVAL].nir && stages[MESA_SHADER_TESS_EVAL].info.is_ngg) ||
4699 (stages[MESA_SHADER_MESH].nir && stages[MESA_SHADER_MESH].info.is_ngg);
4700
4701 if (stages[MESA_SHADER_GEOMETRY].nir) {
4702 unsigned nir_gs_flags = nir_lower_gs_intrinsics_per_stream;
4703
4704 if (pipeline_has_ngg) {
4705 nir_gs_flags |= nir_lower_gs_intrinsics_count_primitives |
4706 nir_lower_gs_intrinsics_count_vertices_per_primitive |
4707 nir_lower_gs_intrinsics_overwrite_incomplete;
4708 }
4709
4710 NIR_PASS(_, stages[MESA_SHADER_GEOMETRY].nir, nir_lower_gs_intrinsics, nir_gs_flags);
4711 }
4712
4713 radv_link_shaders(pipeline, pipeline_key, stages, optimize_conservatively, *last_vgt_api_stage);
4714 radv_set_driver_locations(pipeline, stages, *last_vgt_api_stage);
4715
4716 for (int i = 0; i < MESA_VULKAN_SHADER_STAGES; ++i) {
4717 if (stages[i].nir) {
4718 int64_t stage_start = os_time_get_nano();
4719
4720 radv_optimize_nir(stages[i].nir, optimize_conservatively, false);
4721
4722 /* Gather info again because information such as outputs_read can be out-of-date. */
4723 nir_shader_gather_info(stages[i].nir, nir_shader_get_entrypoint(stages[i].nir));
4724 radv_lower_io(device, stages[i].nir, stages[MESA_SHADER_MESH].nir);
4725
4726 stages[i].feedback.duration += os_time_get_nano() - stage_start;
4727 }
4728 }
4729
4730 if (stages[MESA_SHADER_TESS_CTRL].nir) {
4731 nir_lower_patch_vertices(stages[MESA_SHADER_TESS_EVAL].nir,
4732 stages[MESA_SHADER_TESS_CTRL].nir->info.tess.tcs_vertices_out, NULL);
4733 gather_tess_info(device, stages, pipeline_key);
4734 }
4735
4736 if (stages[MESA_SHADER_VERTEX].nir) {
4737 NIR_PASS(_, stages[MESA_SHADER_VERTEX].nir, radv_lower_vs_input, pipeline_key);
4738 }
4739
4740 if (stages[MESA_SHADER_FRAGMENT].nir && !radv_use_llvm_for_stage(device, MESA_SHADER_FRAGMENT)) {
4741 /* TODO: Convert the LLVM backend. */
4742 NIR_PASS(_, stages[MESA_SHADER_FRAGMENT].nir, radv_lower_fs_output, pipeline_key);
4743 }
4744
4745 radv_fill_shader_info(pipeline, pipeline_layout, pipeline_key, stages, *last_vgt_api_stage);
4746
4747 if (pipeline_has_ngg) {
4748 struct gfx10_ngg_info *ngg_info;
4749
4750 if (stages[MESA_SHADER_GEOMETRY].nir)
4751 ngg_info = &stages[MESA_SHADER_GEOMETRY].info.ngg_info;
4752 else if (stages[MESA_SHADER_TESS_CTRL].nir)
4753 ngg_info = &stages[MESA_SHADER_TESS_EVAL].info.ngg_info;
4754 else if (stages[MESA_SHADER_VERTEX].nir)
4755 ngg_info = &stages[MESA_SHADER_VERTEX].info.ngg_info;
4756 else if (stages[MESA_SHADER_MESH].nir)
4757 ngg_info = &stages[MESA_SHADER_MESH].info.ngg_info;
4758 else
4759 unreachable("Missing NGG shader stage.");
4760
4761 if (*last_vgt_api_stage == MESA_SHADER_MESH)
4762 gfx10_get_ngg_ms_info(&stages[MESA_SHADER_MESH], ngg_info);
4763 else
4764 gfx10_get_ngg_info(pipeline_key, pipeline, stages, ngg_info);
4765 } else if (stages[MESA_SHADER_GEOMETRY].nir) {
4766 struct gfx9_gs_info *gs_info = &stages[MESA_SHADER_GEOMETRY].info.gs_ring_info;
4767
4768 gfx9_get_gs_info(pipeline_key, pipeline, stages, gs_info);
4769 } else {
4770 gl_shader_stage hw_vs_api_stage =
4771 stages[MESA_SHADER_TESS_EVAL].nir ? MESA_SHADER_TESS_EVAL : MESA_SHADER_VERTEX;
4772 stages[hw_vs_api_stage].info.workgroup_size = stages[hw_vs_api_stage].info.wave_size;
4773 }
4774
4775 radv_determine_ngg_settings(pipeline, pipeline_key, stages, *last_vgt_api_stage);
4776
4777 radv_declare_pipeline_args(device, stages, pipeline_key);
4778
4779 if (stages[MESA_SHADER_FRAGMENT].nir) {
4780 NIR_PASS(_, stages[MESA_SHADER_FRAGMENT].nir, radv_lower_fs_intrinsics,
4781 &stages[MESA_SHADER_FRAGMENT], pipeline_key);
4782 }
4783
4784 for (int i = 0; i < MESA_VULKAN_SHADER_STAGES; ++i) {
4785 if (stages[i].nir) {
4786 int64_t stage_start = os_time_get_nano();
4787
4788 /* Wave and workgroup size should already be filled. */
4789 assert(stages[i].info.wave_size && stages[i].info.workgroup_size);
4790
4791 if (!radv_use_llvm_for_stage(device, i)) {
4792 nir_lower_non_uniform_access_options options = {
4793 .types = nir_lower_non_uniform_ubo_access | nir_lower_non_uniform_ssbo_access |
4794 nir_lower_non_uniform_texture_access | nir_lower_non_uniform_image_access,
4795 .callback = &non_uniform_access_callback,
4796 .callback_data = NULL,
4797 };
4798 NIR_PASS(_, stages[i].nir, nir_lower_non_uniform_access, &options);
4799 }
4800 NIR_PASS(_, stages[i].nir, nir_lower_memory_model);
4801
4802 nir_load_store_vectorize_options vectorize_opts = {
4803 .modes = nir_var_mem_ssbo | nir_var_mem_ubo | nir_var_mem_push_const |
4804 nir_var_mem_shared | nir_var_mem_global,
4805 .callback = mem_vectorize_callback,
4806 .robust_modes = 0,
4807 /* On GFX6, read2/write2 is out-of-bounds if the offset register is negative, even if
4808 * the final offset is not.
4809 */
4810 .has_shared2_amd = device->physical_device->rad_info.gfx_level >= GFX7,
4811 };
4812
4813 if (device->robust_buffer_access2) {
4814 vectorize_opts.robust_modes =
4815 nir_var_mem_ubo | nir_var_mem_ssbo | nir_var_mem_push_const;
4816 }
4817
4818 bool progress = false;
4819 NIR_PASS(progress, stages[i].nir, nir_opt_load_store_vectorize, &vectorize_opts);
4820 if (progress) {
4821 NIR_PASS(_, stages[i].nir, nir_copy_prop);
4822 NIR_PASS(_, stages[i].nir, nir_opt_shrink_stores,
4823 !device->instance->disable_shrink_image_store);
4824
4825 /* Gather info again to update whether 8/16-bit types are used. */
4826 nir_shader_gather_info(stages[i].nir, nir_shader_get_entrypoint(stages[i].nir));
4827 }
4828
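/* On GFX9+ merged stages share a single radv_shader_info, so apply the pipeline layout
 * using the info of the stage this one is merged into.
 */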
4829 struct radv_shader_info *info = &stages[i].info;
4830 if (pipeline->device->physical_device->rad_info.gfx_level >= GFX9) {
4831 if (i == MESA_SHADER_VERTEX && stages[MESA_SHADER_TESS_CTRL].nir)
4832 info = &stages[MESA_SHADER_TESS_CTRL].info;
4833 else if (i == MESA_SHADER_VERTEX && stages[MESA_SHADER_GEOMETRY].nir)
4834 info = &stages[MESA_SHADER_GEOMETRY].info;
4835 else if (i == MESA_SHADER_TESS_EVAL && stages[MESA_SHADER_GEOMETRY].nir)
4836 info = &stages[MESA_SHADER_GEOMETRY].info;
4837 }
4838 NIR_PASS(_, stages[i].nir, radv_nir_lower_ycbcr_textures, pipeline_layout);
4839 NIR_PASS_V(stages[i].nir, radv_nir_apply_pipeline_layout, device, pipeline_layout, info,
4840 &stages[i].args);
4841
4842 NIR_PASS(_, stages[i].nir, nir_opt_shrink_vectors);
4843
4844 NIR_PASS(_, stages[i].nir, nir_lower_alu_width, opt_vectorize_callback, device);
4845
4846 /* lower ALU operations */
4847 NIR_PASS(_, stages[i].nir, nir_lower_int64);
4848
4849 NIR_PASS(_, stages[i].nir, nir_opt_idiv_const, 8);
4850
4851 NIR_PASS(_, stages[i].nir, nir_lower_idiv,
4852 &(nir_lower_idiv_options){
4853 .imprecise_32bit_lowering = false,
4854 .allow_fp16 = device->physical_device->rad_info.gfx_level >= GFX9,
4855 });
4856
4857 nir_move_options sink_opts = nir_move_const_undef | nir_move_copies;
4858 if (i != MESA_SHADER_FRAGMENT || !pipeline_key->disable_sinking_load_input_fs)
4859 sink_opts |= nir_move_load_input;
4860
4861 NIR_PASS(_, stages[i].nir, nir_opt_sink, sink_opts);
4862 NIR_PASS(_, stages[i].nir, nir_opt_move,
4863 nir_move_load_input | nir_move_const_undef | nir_move_copies);
4864
4865 /* Lower I/O intrinsics to memory instructions. */
4866 bool io_to_mem = radv_lower_io_to_mem(device, &stages[i], pipeline_key);
4867 bool lowered_ngg = pipeline_has_ngg && i == *last_vgt_api_stage;
4868 if (lowered_ngg)
4869 radv_lower_ngg(device, &stages[i], pipeline_key);
4870
4871 NIR_PASS(_, stages[i].nir, ac_nir_lower_global_access);
4872 NIR_PASS_V(stages[i].nir, radv_nir_lower_abi, device->physical_device->rad_info.gfx_level,
4873 &stages[i].info, &stages[i].args, pipeline_key,
4874 radv_use_llvm_for_stage(device, i));
4875 radv_optimize_nir_algebraic(
4876 stages[i].nir, io_to_mem || lowered_ngg || i == MESA_SHADER_COMPUTE || i == MESA_SHADER_TASK);
4877
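/* nir_lower_bit_size's callback (lower_bit_size_callback) looks at divergence, so on
 * GFX8+ convert to LCSSA and run divergence analysis first; the leftover LCSSA phis are
 * removed again afterwards.
 */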
4878 if (stages[i].nir->info.bit_sizes_int & (8 | 16)) {
4879 if (device->physical_device->rad_info.gfx_level >= GFX8) {
4880 NIR_PASS(_, stages[i].nir, nir_convert_to_lcssa, true, true);
4881 nir_divergence_analysis(stages[i].nir);
4882 }
4883
4884 if (nir_lower_bit_size(stages[i].nir, lower_bit_size_callback, device)) {
4885 NIR_PASS(_, stages[i].nir, nir_opt_constant_folding);
4886 }
4887
4888 if (device->physical_device->rad_info.gfx_level >= GFX8)
4889 NIR_PASS(_, stages[i].nir, nir_opt_remove_phis); /* cleanup LCSSA phis */
4890 }
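/* When 16-bit types are used (GFX9+), try to fold 16-bit conversions into texture/image
 * sources and destinations; derivatives are only folded separately (G16) on GFX10+. Then
 * re-run vectorization on the resulting 16-bit ALU.
 */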
4891 if (((stages[i].nir->info.bit_sizes_int | stages[i].nir->info.bit_sizes_float) & 16) &&
4892 device->physical_device->rad_info.gfx_level >= GFX9) {
4893 bool separate_g16 = device->physical_device->rad_info.gfx_level >= GFX10;
4894 struct nir_fold_tex_srcs_options fold_srcs_options[] = {
4895 {
4896 .sampler_dims =
4897 ~(BITFIELD_BIT(GLSL_SAMPLER_DIM_CUBE) | BITFIELD_BIT(GLSL_SAMPLER_DIM_BUF)),
4898 .src_types = (1 << nir_tex_src_coord) | (1 << nir_tex_src_lod) |
4899 (1 << nir_tex_src_bias) | (1 << nir_tex_src_min_lod) |
4900 (1 << nir_tex_src_ms_index) |
4901 (separate_g16 ? 0 : (1 << nir_tex_src_ddx) | (1 << nir_tex_src_ddy)),
4902 },
4903 {
4904 .sampler_dims = ~BITFIELD_BIT(GLSL_SAMPLER_DIM_CUBE),
4905 .src_types = (1 << nir_tex_src_ddx) | (1 << nir_tex_src_ddy),
4906 },
4907 };
4908 struct nir_fold_16bit_tex_image_options fold_16bit_options = {
4909 .rounding_mode = nir_rounding_mode_rtne,
4910 .fold_tex_dest = true,
4911 .fold_image_load_store_data = true,
4912 .fold_srcs_options_count = separate_g16 ? 2 : 1,
4913 .fold_srcs_options = fold_srcs_options,
4914 };
4915 NIR_PASS(_, stages[i].nir, nir_fold_16bit_tex_image, &fold_16bit_options);
4916
4917 NIR_PASS(_, stages[i].nir, nir_opt_vectorize, opt_vectorize_callback, device);
4918 }
4919
4920 /* cleanup passes */
4921 NIR_PASS(_, stages[i].nir, nir_lower_alu_width, opt_vectorize_callback, device);
4922 NIR_PASS(_, stages[i].nir, nir_lower_load_const_to_scalar);
4923 NIR_PASS(_, stages[i].nir, nir_copy_prop);
4924 NIR_PASS(_, stages[i].nir, nir_opt_dce);
4925
4926 sink_opts |= nir_move_comparisons | nir_move_load_ubo | nir_move_load_ssbo;
4927 NIR_PASS(_, stages[i].nir, nir_opt_sink, sink_opts);
4928
4929 nir_move_options move_opts = nir_move_const_undef | nir_move_load_ubo |
4930 nir_move_load_input | nir_move_comparisons | nir_move_copies;
4931 NIR_PASS(_, stages[i].nir, nir_opt_move, move_opts);
4932
4933 stages[i].feedback.duration += os_time_get_nano() - stage_start;
4934 }
4935 }
4936
4937 for (int i = 0; i < MESA_VULKAN_SHADER_STAGES; ++i) {
4938 if (stages[i].nir) {
4939 if (radv_can_dump_shader(device, stages[i].nir, false))
4940 nir_print_shader(stages[i].nir, stderr);
4941 }
4942 }
4943
4944 /* Compile NIR shaders to AMD assembly. */
4945 radv_pipeline_nir_to_asm(pipeline, stages, pipeline_key, pipeline_layout, keep_executable_info,
4946 keep_statistic_info, *last_vgt_api_stage, binaries, &gs_copy_binary);
4947
4948 if (keep_executable_info) {
4949 for (int i = 0; i < MESA_VULKAN_SHADER_STAGES; ++i) {
4950 struct radv_shader *shader = pipeline->shaders[i];
4951 if (!shader)
4952 continue;
4953
4954 if (!stages[i].spirv.size)
4955 continue;
4956
4957 shader->spirv = malloc(stages[i].spirv.size);
4958 memcpy(shader->spirv, stages[i].spirv.data, stages[i].spirv.size);
4959 shader->spirv_size = stages[i].spirv.size;
4960 }
4961 }
4962
4963 /* Upload shader binaries. */
4964 radv_upload_shaders(device, pipeline, binaries, gs_copy_binary);
4965
4966 if (!keep_executable_info) {
4967 if (pipeline->gs_copy_shader) {
4968 assert(!binaries[MESA_SHADER_COMPUTE] && !pipeline->shaders[MESA_SHADER_COMPUTE]);
4969 binaries[MESA_SHADER_COMPUTE] = gs_copy_binary;
4970 pipeline->shaders[MESA_SHADER_COMPUTE] = pipeline->gs_copy_shader;
4971 }
4972
4973 radv_pipeline_cache_insert_shaders(device, cache, hash, pipeline, binaries,
4974 stack_sizes ? *stack_sizes : NULL,
4975 num_stack_sizes ? *num_stack_sizes : 0);
4976
4977 if (pipeline->gs_copy_shader) {
4978 pipeline->gs_copy_shader = pipeline->shaders[MESA_SHADER_COMPUTE];
4979 pipeline->shaders[MESA_SHADER_COMPUTE] = NULL;
4980 binaries[MESA_SHADER_COMPUTE] = NULL;
4981 }
4982 }
4983
4984 free(gs_copy_binary);
4985 for (int i = 0; i < MESA_VULKAN_SHADER_STAGES; ++i) {
4986 free(binaries[i]);
4987 if (stages[i].nir) {
4988 if (radv_can_dump_shader_stats(device, stages[i].nir) && pipeline->shaders[i]) {
4989 radv_dump_shader_stats(device, pipeline, i, stderr);
4990 }
4991
4992 ralloc_free(stages[i].nir);
4993 }
4994 }
4995
4996 done:
4997 pipeline_feedback.duration = os_time_get_nano() - pipeline_start;
4998
4999 if (creation_feedback) {
5000 *creation_feedback->pPipelineCreationFeedback = pipeline_feedback;
5001
5002 uint32_t stage_count = creation_feedback->pipelineStageCreationFeedbackCount;
5003 assert(stage_count == 0 || stageCount == stage_count);
5004 for (uint32_t i = 0; i < stage_count; i++) {
5005 gl_shader_stage s = vk_to_mesa_shader_stage(pStages[i].stage);
5006 creation_feedback->pPipelineStageCreationFeedbacks[i] = stages[s].feedback;
5007 }
5008 }
5009
5010 return result;
5011 }
5012
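/* Return the first SPI_SHADER_USER_DATA_* register of the HW stage the given API stage
 * runs on; on GFX9+ merged stages and NGG use the user data registers of the HW stage
 * they are merged into.
 */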
5013 static uint32_t
5014 radv_pipeline_stage_to_user_data_0(struct radv_graphics_pipeline *pipeline, gl_shader_stage stage,
5015 enum amd_gfx_level gfx_level)
5016 {
5017 bool has_gs = radv_pipeline_has_stage(pipeline, MESA_SHADER_GEOMETRY);
5018 bool has_tess = radv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_CTRL);
5019 bool has_ngg = radv_pipeline_has_ngg(pipeline);
5020
5021 switch (stage) {
5022 case MESA_SHADER_FRAGMENT:
5023 return R_00B030_SPI_SHADER_USER_DATA_PS_0;
5024 case MESA_SHADER_VERTEX:
5025 if (has_tess) {
5026 if (gfx_level >= GFX10) {
5027 return R_00B430_SPI_SHADER_USER_DATA_HS_0;
5028 } else if (gfx_level == GFX9) {
5029 return R_00B430_SPI_SHADER_USER_DATA_LS_0;
5030 } else {
5031 return R_00B530_SPI_SHADER_USER_DATA_LS_0;
5032 }
5033 }
5034
5035 if (has_gs) {
5036 if (gfx_level >= GFX10) {
5037 return R_00B230_SPI_SHADER_USER_DATA_GS_0;
5038 } else {
5039 return R_00B330_SPI_SHADER_USER_DATA_ES_0;
5040 }
5041 }
5042
5043 if (has_ngg)
5044 return R_00B230_SPI_SHADER_USER_DATA_GS_0;
5045
5046 return R_00B130_SPI_SHADER_USER_DATA_VS_0;
5047 case MESA_SHADER_GEOMETRY:
5048 return gfx_level == GFX9 ? R_00B330_SPI_SHADER_USER_DATA_ES_0
5049 : R_00B230_SPI_SHADER_USER_DATA_GS_0;
5050 case MESA_SHADER_COMPUTE:
5051 case MESA_SHADER_TASK:
5052 return R_00B900_COMPUTE_USER_DATA_0;
5053 case MESA_SHADER_TESS_CTRL:
5054 return gfx_level == GFX9 ? R_00B430_SPI_SHADER_USER_DATA_LS_0
5055 : R_00B430_SPI_SHADER_USER_DATA_HS_0;
5056 case MESA_SHADER_TESS_EVAL:
5057 if (has_gs) {
5058 return gfx_level >= GFX10 ? R_00B230_SPI_SHADER_USER_DATA_GS_0
5059 : R_00B330_SPI_SHADER_USER_DATA_ES_0;
5060 } else if (has_ngg) {
5061 return R_00B230_SPI_SHADER_USER_DATA_GS_0;
5062 } else {
5063 return R_00B130_SPI_SHADER_USER_DATA_VS_0;
5064 }
5065 case MESA_SHADER_MESH:
5066 assert(has_ngg);
5067 return R_00B230_SPI_SHADER_USER_DATA_GS_0;
5068 default:
5069 unreachable("unknown shader");
5070 }
5071 }
5072
5073 struct radv_bin_size_entry {
5074 unsigned bpp;
5075 VkExtent2D extent;
5076 };
5077
5078 static VkExtent2D
5079 radv_gfx9_compute_bin_size(const struct radv_graphics_pipeline *pipeline,
5080 const struct radv_graphics_pipeline_info *info)
5081 {
5082 const struct radv_physical_device *pdevice = pipeline->base.device->physical_device;
5083 static const struct radv_bin_size_entry color_size_table[][3][9] = {
5084 {
5085 /* One RB / SE */
5086 {
5087 /* One shader engine */
5088 {0, {128, 128}},
5089 {1, {64, 128}},
5090 {2, {32, 128}},
5091 {3, {16, 128}},
5092 {17, {0, 0}},
5093 {UINT_MAX, {0, 0}},
5094 },
5095 {
5096 /* Two shader engines */
5097 {0, {128, 128}},
5098 {2, {64, 128}},
5099 {3, {32, 128}},
5100 {5, {16, 128}},
5101 {17, {0, 0}},
5102 {UINT_MAX, {0, 0}},
5103 },
5104 {
5105 /* Four shader engines */
5106 {0, {128, 128}},
5107 {3, {64, 128}},
5108 {5, {16, 128}},
5109 {17, {0, 0}},
5110 {UINT_MAX, {0, 0}},
5111 },
5112 },
5113 {
5114 /* Two RB / SE */
5115 {
5116 /* One shader engine */
5117 {0, {128, 128}},
5118 {2, {64, 128}},
5119 {3, {32, 128}},
5120 {5, {16, 128}},
5121 {33, {0, 0}},
5122 {UINT_MAX, {0, 0}},
5123 },
5124 {
5125 /* Two shader engines */
5126 {0, {128, 128}},
5127 {3, {64, 128}},
5128 {5, {32, 128}},
5129 {9, {16, 128}},
5130 {33, {0, 0}},
5131 {UINT_MAX, {0, 0}},
5132 },
5133 {
5134 /* Four shader engines */
5135 {0, {256, 256}},
5136 {2, {128, 256}},
5137 {3, {128, 128}},
5138 {5, {64, 128}},
5139 {9, {16, 128}},
5140 {33, {0, 0}},
5141 {UINT_MAX, {0, 0}},
5142 },
5143 },
5144 {
5145 /* Four RB / SE */
5146 {
5147 /* One shader engine */
5148 {0, {128, 256}},
5149 {2, {128, 128}},
5150 {3, {64, 128}},
5151 {5, {32, 128}},
5152 {9, {16, 128}},
5153 {33, {0, 0}},
5154 {UINT_MAX, {0, 0}},
5155 },
5156 {
5157 /* Two shader engines */
5158 {0, {256, 256}},
5159 {2, {128, 256}},
5160 {3, {128, 128}},
5161 {5, {64, 128}},
5162 {9, {32, 128}},
5163 {17, {16, 128}},
5164 {33, {0, 0}},
5165 {UINT_MAX, {0, 0}},
5166 },
5167 {
5168 /* Four shader engines */
5169 {0, {256, 512}},
5170 {2, {256, 256}},
5171 {3, {128, 256}},
5172 {5, {128, 128}},
5173 {9, {64, 128}},
5174 {17, {16, 128}},
5175 {33, {0, 0}},
5176 {UINT_MAX, {0, 0}},
5177 },
5178 },
5179 };
5180 static const struct radv_bin_size_entry ds_size_table[][3][9] = {
5181 {
5182 // One RB / SE
5183 {
5184 // One shader engine
5185 {0, {128, 256}},
5186 {2, {128, 128}},
5187 {4, {64, 128}},
5188 {7, {32, 128}},
5189 {13, {16, 128}},
5190 {49, {0, 0}},
5191 {UINT_MAX, {0, 0}},
5192 },
5193 {
5194 // Two shader engines
5195 {0, {256, 256}},
5196 {2, {128, 256}},
5197 {4, {128, 128}},
5198 {7, {64, 128}},
5199 {13, {32, 128}},
5200 {25, {16, 128}},
5201 {49, {0, 0}},
5202 {UINT_MAX, {0, 0}},
5203 },
5204 {
5205 // Four shader engines
5206 {0, {256, 512}},
5207 {2, {256, 256}},
5208 {4, {128, 256}},
5209 {7, {128, 128}},
5210 {13, {64, 128}},
5211 {25, {16, 128}},
5212 {49, {0, 0}},
5213 {UINT_MAX, {0, 0}},
5214 },
5215 },
5216 {
5217 // Two RB / SE
5218 {
5219 // One shader engine
5220 {0, {256, 256}},
5221 {2, {128, 256}},
5222 {4, {128, 128}},
5223 {7, {64, 128}},
5224 {13, {32, 128}},
5225 {25, {16, 128}},
5226 {97, {0, 0}},
5227 {UINT_MAX, {0, 0}},
5228 },
5229 {
5230 // Two shader engines
5231 {0, {256, 512}},
5232 {2, {256, 256}},
5233 {4, {128, 256}},
5234 {7, {128, 128}},
5235 {13, {64, 128}},
5236 {25, {32, 128}},
5237 {49, {16, 128}},
5238 {97, {0, 0}},
5239 {UINT_MAX, {0, 0}},
5240 },
5241 {
5242 // Four shader engines
5243 {0, {512, 512}},
5244 {2, {256, 512}},
5245 {4, {256, 256}},
5246 {7, {128, 256}},
5247 {13, {128, 128}},
5248 {25, {64, 128}},
5249 {49, {16, 128}},
5250 {97, {0, 0}},
5251 {UINT_MAX, {0, 0}},
5252 },
5253 },
5254 {
5255 // Four RB / SE
5256 {
5257 // One shader engine
5258 {0, {256, 512}},
5259 {2, {256, 256}},
5260 {4, {128, 256}},
5261 {7, {128, 128}},
5262 {13, {64, 128}},
5263 {25, {32, 128}},
5264 {49, {16, 128}},
5265 {UINT_MAX, {0, 0}},
5266 },
5267 {
5268 // Two shader engines
5269 {0, {512, 512}},
5270 {2, {256, 512}},
5271 {4, {256, 256}},
5272 {7, {128, 256}},
5273 {13, {128, 128}},
5274 {25, {64, 128}},
5275 {49, {32, 128}},
5276 {97, {16, 128}},
5277 {UINT_MAX, {0, 0}},
5278 },
5279 {
5280 // Four shader engines
5281 {0, {512, 512}},
5282 {4, {256, 512}},
5283 {7, {256, 256}},
5284 {13, {128, 256}},
5285 {25, {128, 128}},
5286 {49, {64, 128}},
5287 {97, {16, 128}},
5288 {UINT_MAX, {0, 0}},
5289 },
5290 },
5291 };
5292
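/* The tables above are indexed by log2(RBs per SE) and log2(SE count); each entry pairs a
 * bytes-per-pixel threshold with the bin extent to use until the next threshold is
 * reached, which is what the lookup loops below walk through.
 */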
5293 VkExtent2D extent = {512, 512};
5294
5295 unsigned log_num_rb_per_se =
5296 util_logbase2_ceil(pdevice->rad_info.max_render_backends / pdevice->rad_info.max_se);
5297 unsigned log_num_se = util_logbase2_ceil(pdevice->rad_info.max_se);
5298
5299 unsigned total_samples = 1u << G_028BE0_MSAA_NUM_SAMPLES(pipeline->ms.pa_sc_aa_config);
5300 unsigned ps_iter_samples = 1u << G_028804_PS_ITER_SAMPLES(pipeline->ms.db_eqaa);
5301 unsigned effective_samples = total_samples;
5302 unsigned color_bytes_per_pixel = 0;
5303
5304 for (unsigned i = 0; i < info->ri.color_att_count; i++) {
5305 if (!info->cb.att[i].color_write_mask)
5306 continue;
5307
5308 if (info->ri.color_att_formats[i] == VK_FORMAT_UNDEFINED)
5309 continue;
5310
5311 color_bytes_per_pixel += vk_format_get_blocksize(info->ri.color_att_formats[i]);
5312 }
5313
5314 /* MSAA images typically don't use all samples all the time. */
5315 if (effective_samples >= 2 && ps_iter_samples <= 1)
5316 effective_samples = 2;
5317 color_bytes_per_pixel *= effective_samples;
5318
5319 const struct radv_bin_size_entry *color_entry = color_size_table[log_num_rb_per_se][log_num_se];
5320 while (color_entry[1].bpp <= color_bytes_per_pixel)
5321 ++color_entry;
5322
5323 extent = color_entry->extent;
5324
5325 if (radv_pipeline_has_ds_attachments(&info->ri)) {
5326 /* Coefficients taken from AMDVLK */
5327 unsigned depth_coeff = info->ri.depth_att_format != VK_FORMAT_UNDEFINED ? 5 : 0;
5328 unsigned stencil_coeff = info->ri.stencil_att_format != VK_FORMAT_UNDEFINED ? 1 : 0;
5329 unsigned ds_bytes_per_pixel = 4 * (depth_coeff + stencil_coeff) * total_samples;
5330
5331 const struct radv_bin_size_entry *ds_entry = ds_size_table[log_num_rb_per_se][log_num_se];
5332 while (ds_entry[1].bpp <= ds_bytes_per_pixel)
5333 ++ds_entry;
5334
5335 if (ds_entry->extent.width * ds_entry->extent.height < extent.width * extent.height)
5336 extent = ds_entry->extent;
5337 }
5338
5339 return extent;
5340 }
5341
5342 static VkExtent2D
5343 radv_gfx10_compute_bin_size(const struct radv_graphics_pipeline *pipeline,
5344 const struct radv_graphics_pipeline_info *info)
5345 {
5346 const struct radv_physical_device *pdevice = pipeline->base.device->physical_device;
5347 VkExtent2D extent = {512, 512};
5348
5349 const unsigned db_tag_size = 64;
5350 const unsigned db_tag_count = 312;
5351 const unsigned color_tag_size = 1024;
5352 const unsigned color_tag_count = 31;
5353 const unsigned fmask_tag_size = 256;
5354 const unsigned fmask_tag_count = 44;
5355
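/* On GFX10+ the bin size is derived from the DB/CB/FMASK cache tag capacity: the tag
 * counts above are distributed across the pipes, and the largest power-of-two extent
 * whose pixel footprint still fits in the tag budget is chosen.
 */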
5356 const unsigned rb_count = pdevice->rad_info.max_render_backends;
5357 const unsigned pipe_count = MAX2(rb_count, pdevice->rad_info.num_tcc_blocks);
5358
5359 const unsigned db_tag_part = (db_tag_count * rb_count / pipe_count) * db_tag_size * pipe_count;
5360 const unsigned color_tag_part =
5361 (color_tag_count * rb_count / pipe_count) * color_tag_size * pipe_count;
5362 const unsigned fmask_tag_part =
5363 (fmask_tag_count * rb_count / pipe_count) * fmask_tag_size * pipe_count;
5364
5365 const unsigned total_samples =
5366 1u << G_028BE0_MSAA_NUM_SAMPLES(pipeline->ms.pa_sc_aa_config);
5367 const unsigned samples_log = util_logbase2_ceil(total_samples);
5368
5369 unsigned color_bytes_per_pixel = 0;
5370 unsigned fmask_bytes_per_pixel = 0;
5371
5372 for (unsigned i = 0; i < info->ri.color_att_count; i++) {
5373 if (!info->cb.att[i].color_write_mask)
5374 continue;
5375
5376 if (info->ri.color_att_formats[i] == VK_FORMAT_UNDEFINED)
5377 continue;
5378
5379 color_bytes_per_pixel += vk_format_get_blocksize(info->ri.color_att_formats[i]);
5380
5381 if (total_samples > 1) {
5382 assert(samples_log <= 3);
5383 const unsigned fmask_array[] = {0, 1, 1, 4};
5384 fmask_bytes_per_pixel += fmask_array[samples_log];
5385 }
5386 }
5387
5388 color_bytes_per_pixel *= total_samples;
5389 color_bytes_per_pixel = MAX2(color_bytes_per_pixel, 1);
5390
5391 const unsigned color_pixel_count_log = util_logbase2(color_tag_part / color_bytes_per_pixel);
5392 extent.width = 1ull << ((color_pixel_count_log + 1) / 2);
5393 extent.height = 1ull << (color_pixel_count_log / 2);
5394
5395 if (fmask_bytes_per_pixel) {
5396 const unsigned fmask_pixel_count_log = util_logbase2(fmask_tag_part / fmask_bytes_per_pixel);
5397
5398 const VkExtent2D fmask_extent =
5399 (VkExtent2D){.width = 1ull << ((fmask_pixel_count_log + 1) / 2),
5400 .height = 1ull << (color_pixel_count_log / 2)};
5401
5402 if (fmask_extent.width * fmask_extent.height < extent.width * extent.height)
5403 extent = fmask_extent;
5404 }
5405
5406 if (radv_pipeline_has_ds_attachments(&info->ri)) {
5407 /* Coefficients taken from AMDVLK */
5408 unsigned depth_coeff = info->ri.depth_att_format != VK_FORMAT_UNDEFINED ? 5 : 0;
5409 unsigned stencil_coeff = info->ri.stencil_att_format != VK_FORMAT_UNDEFINED ? 1 : 0;
5410 unsigned db_bytes_per_pixel = (depth_coeff + stencil_coeff) * total_samples;
5411
5412 const unsigned db_pixel_count_log = util_logbase2(db_tag_part / db_bytes_per_pixel);
5413
5414 const VkExtent2D db_extent = (VkExtent2D){.width = 1ull << ((db_pixel_count_log + 1) / 2),
5415 .height = 1ull << (color_pixel_count_log / 2)};
5416
5417 if (db_extent.width * db_extent.height < extent.width * extent.height)
5418 extent = db_extent;
5419 }
5420
5421 extent.width = MAX2(extent.width, 128);
5422 extent.height = MAX2(extent.height, 64);
5423
5424 return extent;
5425 }
5426
5427 static void
5428 radv_pipeline_init_disabled_binning_state(struct radv_graphics_pipeline *pipeline,
5429 const struct radv_graphics_pipeline_info *info)
5430 {
5431 const struct radv_physical_device *pdevice = pipeline->base.device->physical_device;
5432 uint32_t pa_sc_binner_cntl_0 = S_028C44_BINNING_MODE(V_028C44_DISABLE_BINNING_USE_LEGACY_SC) |
5433 S_028C44_DISABLE_START_OF_PRIM(1);
5434
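/* Even with binning disabled, GFX10+ still wants a bin size programmed: use 128x128, or
 * 128x64 when the smallest written color format is wider than 4 bytes per pixel.
 */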
5435 if (pdevice->rad_info.gfx_level >= GFX10) {
5436 unsigned min_bytes_per_pixel = 0;
5437
5438 for (unsigned i = 0; i < info->ri.color_att_count; i++) {
5439 if (!info->cb.att[i].color_write_mask)
5440 continue;
5441
5442 if (info->ri.color_att_formats[i] == VK_FORMAT_UNDEFINED)
5443 continue;
5444
5445 unsigned bytes = vk_format_get_blocksize(info->ri.color_att_formats[i]);
5446 if (!min_bytes_per_pixel || bytes < min_bytes_per_pixel)
5447 min_bytes_per_pixel = bytes;
5448 }
5449
5450 pa_sc_binner_cntl_0 =
5451 S_028C44_BINNING_MODE(V_028C44_DISABLE_BINNING_USE_NEW_SC) | S_028C44_BIN_SIZE_X(0) |
5452 S_028C44_BIN_SIZE_Y(0) | S_028C44_BIN_SIZE_X_EXTEND(2) | /* 128 */
5453 S_028C44_BIN_SIZE_Y_EXTEND(min_bytes_per_pixel <= 4 ? 2 : 1) | /* 128 or 64 */
5454 S_028C44_DISABLE_START_OF_PRIM(1);
5455 }
5456
5457 pipeline->binning.pa_sc_binner_cntl_0 = pa_sc_binner_cntl_0;
5458 }
5459
5460 struct radv_binning_settings
5461 radv_get_binning_settings(const struct radv_physical_device *pdev)
5462 {
5463 struct radv_binning_settings settings;
5464 if (pdev->rad_info.has_dedicated_vram) {
5465 if (pdev->rad_info.max_render_backends > 4) {
5466 settings.context_states_per_bin = 1;
5467 settings.persistent_states_per_bin = 1;
5468 } else {
5469 settings.context_states_per_bin = 3;
5470 settings.persistent_states_per_bin = 8;
5471 }
5472 settings.fpovs_per_batch = 63;
5473 } else {
5474 /* The context states are affected by the scissor bug. */
5475 settings.context_states_per_bin = 6;
5476 /* 32 causes hangs for RAVEN. */
5477 settings.persistent_states_per_bin = 16;
5478 settings.fpovs_per_batch = 63;
5479 }
5480
5481 if (pdev->rad_info.has_gfx9_scissor_bug)
5482 settings.context_states_per_bin = 1;
5483
5484 return settings;
5485 }
5486
5487 static void
5488 radv_pipeline_init_binning_state(struct radv_graphics_pipeline *pipeline,
5489 const struct radv_blend_state *blend,
5490 const struct radv_graphics_pipeline_info *info)
5491 {
5492 const struct radv_device *device = pipeline->base.device;
5493
5494 if (device->physical_device->rad_info.gfx_level < GFX9)
5495 return;
5496
5497 VkExtent2D bin_size;
5498 if (device->physical_device->rad_info.gfx_level >= GFX10) {
5499 bin_size = radv_gfx10_compute_bin_size(pipeline, info);
5500 } else if (device->physical_device->rad_info.gfx_level == GFX9) {
5501 bin_size = radv_gfx9_compute_bin_size(pipeline, info);
5502 } else
5503 unreachable("Unhandled generation for binning bin size calculation");
5504
5505 if (device->pbb_allowed && bin_size.width && bin_size.height) {
5506 struct radv_binning_settings settings = radv_get_binning_settings(device->physical_device);
5507
5508 const uint32_t pa_sc_binner_cntl_0 =
5509 S_028C44_BINNING_MODE(V_028C44_BINNING_ALLOWED) |
5510 S_028C44_BIN_SIZE_X(bin_size.width == 16) | S_028C44_BIN_SIZE_Y(bin_size.height == 16) |
5511 S_028C44_BIN_SIZE_X_EXTEND(util_logbase2(MAX2(bin_size.width, 32)) - 5) |
5512 S_028C44_BIN_SIZE_Y_EXTEND(util_logbase2(MAX2(bin_size.height, 32)) - 5) |
5513 S_028C44_CONTEXT_STATES_PER_BIN(settings.context_states_per_bin - 1) |
5514 S_028C44_PERSISTENT_STATES_PER_BIN(settings.persistent_states_per_bin - 1) |
5515 S_028C44_DISABLE_START_OF_PRIM(1) |
5516 S_028C44_FPOVS_PER_BATCH(settings.fpovs_per_batch) | S_028C44_OPTIMAL_BIN_SELECTION(1);
5517
5518 pipeline->binning.pa_sc_binner_cntl_0 = pa_sc_binner_cntl_0;
5519 } else
5520 radv_pipeline_init_disabled_binning_state(pipeline, info);
5521 }
5522
5523 static void
5524 radv_pipeline_emit_depth_stencil_state(struct radeon_cmdbuf *ctx_cs,
5525 const struct radv_depth_stencil_state *ds_state)
5526 {
5527 radeon_set_context_reg(ctx_cs, R_028000_DB_RENDER_CONTROL, ds_state->db_render_control);
5528
5529 radeon_set_context_reg_seq(ctx_cs, R_02800C_DB_RENDER_OVERRIDE, 2);
5530 radeon_emit(ctx_cs, ds_state->db_render_override);
5531 radeon_emit(ctx_cs, ds_state->db_render_override2);
5532 }
5533
5534 static void
5535 radv_pipeline_emit_blend_state(struct radeon_cmdbuf *ctx_cs,
5536 const struct radv_graphics_pipeline *pipeline,
5537 const struct radv_blend_state *blend)
5538 {
5539 const struct radv_physical_device *pdevice = pipeline->base.device->physical_device;
5540
5541 radeon_set_context_reg_seq(ctx_cs, R_028780_CB_BLEND0_CONTROL, 8);
5542 radeon_emit_array(ctx_cs, blend->cb_blend_control, 8);
5543 radeon_set_context_reg(ctx_cs, R_028B70_DB_ALPHA_TO_MASK, blend->db_alpha_to_mask);
5544
5545 if (pdevice->rad_info.has_rbplus) {
5546
5547 radeon_set_context_reg_seq(ctx_cs, R_028760_SX_MRT0_BLEND_OPT, 8);
5548 radeon_emit_array(ctx_cs, blend->sx_mrt_blend_opt, 8);
5549 }
5550
5551 radeon_set_context_reg(ctx_cs, R_028714_SPI_SHADER_COL_FORMAT, blend->spi_shader_col_format);
5552
5553 radeon_set_context_reg(ctx_cs, R_02823C_CB_SHADER_MASK, blend->cb_shader_mask);
5554 }
5555
5556 static void
5557 radv_pipeline_emit_raster_state(struct radeon_cmdbuf *ctx_cs,
5558 const struct radv_graphics_pipeline *pipeline,
5559 const struct radv_graphics_pipeline_info *info)
5560 {
5561 const struct radv_physical_device *pdevice = pipeline->base.device->physical_device;
5562 const VkConservativeRasterizationModeEXT mode = info->rs.conservative_mode;
5563 uint32_t pa_sc_conservative_rast = S_028C4C_NULL_SQUAD_AA_MASK_ENABLE(1);
5564
5565 if (pdevice->rad_info.gfx_level >= GFX9) {
5566 /* Conservative rasterization. */
5567 if (mode != VK_CONSERVATIVE_RASTERIZATION_MODE_DISABLED_EXT) {
5568 pa_sc_conservative_rast = S_028C4C_PREZ_AA_MASK_ENABLE(1) | S_028C4C_POSTZ_AA_MASK_ENABLE(1) |
5569 S_028C4C_CENTROID_SAMPLE_OVERRIDE(1);
5570
5571 if (mode == VK_CONSERVATIVE_RASTERIZATION_MODE_OVERESTIMATE_EXT) {
5572 pa_sc_conservative_rast |=
5573 S_028C4C_OVER_RAST_ENABLE(1) | S_028C4C_OVER_RAST_SAMPLE_SELECT(0) |
5574 S_028C4C_UNDER_RAST_ENABLE(0) | S_028C4C_UNDER_RAST_SAMPLE_SELECT(1) |
5575 S_028C4C_PBB_UNCERTAINTY_REGION_ENABLE(1);
5576 } else {
5577 assert(mode == VK_CONSERVATIVE_RASTERIZATION_MODE_UNDERESTIMATE_EXT);
5578 pa_sc_conservative_rast |=
5579 S_028C4C_OVER_RAST_ENABLE(0) | S_028C4C_OVER_RAST_SAMPLE_SELECT(1) |
5580 S_028C4C_UNDER_RAST_ENABLE(1) | S_028C4C_UNDER_RAST_SAMPLE_SELECT(0) |
5581 S_028C4C_PBB_UNCERTAINTY_REGION_ENABLE(0);
5582 }
5583 }
5584
5585 radeon_set_context_reg(ctx_cs, R_028C4C_PA_SC_CONSERVATIVE_RASTERIZATION_CNTL,
5586 pa_sc_conservative_rast);
5587 }
5588 }
5589
5590 static void
5591 radv_pipeline_emit_multisample_state(struct radeon_cmdbuf *ctx_cs,
5592 const struct radv_graphics_pipeline *pipeline)
5593 {
5594 const struct radv_physical_device *pdevice = pipeline->base.device->physical_device;
5595 const struct radv_multisample_state *ms = &pipeline->ms;
5596
5597 radeon_set_context_reg_seq(ctx_cs, R_028C38_PA_SC_AA_MASK_X0Y0_X1Y0, 2);
5598 radeon_emit(ctx_cs, ms->pa_sc_aa_mask[0]);
5599 radeon_emit(ctx_cs, ms->pa_sc_aa_mask[1]);
5600
5601 radeon_set_context_reg(ctx_cs, R_028804_DB_EQAA, ms->db_eqaa);
5602 radeon_set_context_reg(ctx_cs, R_028BE0_PA_SC_AA_CONFIG, ms->pa_sc_aa_config);
5603
5604 radeon_set_context_reg_seq(ctx_cs, R_028A48_PA_SC_MODE_CNTL_0, 2);
5605 radeon_emit(ctx_cs, ms->pa_sc_mode_cntl_0);
5606 radeon_emit(ctx_cs, ms->pa_sc_mode_cntl_1);
5607
5608 /* The exclusion bits can be set to improve rasterization efficiency
5609 * if no sample lies on the pixel boundary (-8 sample offset). It's
5610 * currently always TRUE because the driver doesn't support 16 samples.
5611 */
5612 bool exclusion = pdevice->rad_info.gfx_level >= GFX7;
5613 radeon_set_context_reg(
5614 ctx_cs, R_02882C_PA_SU_PRIM_FILTER_CNTL,
5615 S_02882C_XMAX_RIGHT_EXCLUSION(exclusion) | S_02882C_YMAX_BOTTOM_EXCLUSION(exclusion));
5616 }
5617
5618 static void
5619 radv_pipeline_emit_vgt_gs_mode(struct radeon_cmdbuf *ctx_cs,
5620 const struct radv_graphics_pipeline *pipeline)
5621 {
5622 const struct radv_physical_device *pdevice = pipeline->base.device->physical_device;
5623 const struct radv_vs_output_info *outinfo = get_vs_output_info(pipeline);
5624 const struct radv_shader *vs = pipeline->base.shaders[MESA_SHADER_TESS_EVAL]
5625 ? pipeline->base.shaders[MESA_SHADER_TESS_EVAL]
5626 : pipeline->base.shaders[MESA_SHADER_VERTEX];
5627 unsigned vgt_primitiveid_en = 0;
5628 uint32_t vgt_gs_mode = 0;
5629
5630 if (radv_pipeline_has_ngg(pipeline))
5631 return;
5632
5633 if (radv_pipeline_has_stage(pipeline, MESA_SHADER_GEOMETRY)) {
5634 const struct radv_shader *gs = pipeline->base.shaders[MESA_SHADER_GEOMETRY];
5635
5636 vgt_gs_mode = ac_vgt_gs_mode(gs->info.gs.vertices_out, pdevice->rad_info.gfx_level);
5637 } else if (outinfo->export_prim_id || vs->info.uses_prim_id) {
5638 vgt_gs_mode = S_028A40_MODE(V_028A40_GS_SCENARIO_A);
5639 vgt_primitiveid_en |= S_028A84_PRIMITIVEID_EN(1);
5640 }
5641
5642 radeon_set_context_reg(ctx_cs, R_028A84_VGT_PRIMITIVEID_EN, vgt_primitiveid_en);
5643 radeon_set_context_reg(ctx_cs, R_028A40_VGT_GS_MODE, vgt_gs_mode);
5644 }
5645
5646 static void
5647 radv_pipeline_emit_hw_vs(struct radeon_cmdbuf *ctx_cs, struct radeon_cmdbuf *cs,
5648 const struct radv_graphics_pipeline *pipeline, const struct radv_shader *shader)
5649 {
5650 const struct radv_physical_device *pdevice = pipeline->base.device->physical_device;
5651 uint64_t va = radv_shader_get_va(shader);
5652
5653 radeon_set_sh_reg_seq(cs, R_00B120_SPI_SHADER_PGM_LO_VS, 4);
5654 radeon_emit(cs, va >> 8);
5655 radeon_emit(cs, S_00B124_MEM_BASE(va >> 40));
5656 radeon_emit(cs, shader->config.rsrc1);
5657 radeon_emit(cs, shader->config.rsrc2);
5658
5659 const struct radv_vs_output_info *outinfo = get_vs_output_info(pipeline);
5660 unsigned clip_dist_mask, cull_dist_mask, total_mask;
5661 clip_dist_mask = outinfo->clip_dist_mask;
5662 cull_dist_mask = outinfo->cull_dist_mask;
5663 total_mask = clip_dist_mask | cull_dist_mask;
5664
5665 bool misc_vec_ena = outinfo->writes_pointsize || outinfo->writes_layer ||
5666 outinfo->writes_viewport_index || outinfo->writes_primitive_shading_rate;
5667 unsigned spi_vs_out_config, nparams;
5668
5669 /* VS is required to export at least one param. */
5670 nparams = MAX2(outinfo->param_exports, 1);
5671 spi_vs_out_config = S_0286C4_VS_EXPORT_COUNT(nparams - 1);
5672
5673 if (pdevice->rad_info.gfx_level >= GFX10) {
5674 spi_vs_out_config |= S_0286C4_NO_PC_EXPORT(outinfo->param_exports == 0);
5675 }
5676
5677 radeon_set_context_reg(ctx_cs, R_0286C4_SPI_VS_OUT_CONFIG, spi_vs_out_config);
5678
5679 radeon_set_context_reg(
5680 ctx_cs, R_02870C_SPI_SHADER_POS_FORMAT,
5681 S_02870C_POS0_EXPORT_FORMAT(V_02870C_SPI_SHADER_4COMP) |
5682 S_02870C_POS1_EXPORT_FORMAT(outinfo->pos_exports > 1 ? V_02870C_SPI_SHADER_4COMP
5683 : V_02870C_SPI_SHADER_NONE) |
5684 S_02870C_POS2_EXPORT_FORMAT(outinfo->pos_exports > 2 ? V_02870C_SPI_SHADER_4COMP
5685 : V_02870C_SPI_SHADER_NONE) |
5686 S_02870C_POS3_EXPORT_FORMAT(outinfo->pos_exports > 3 ? V_02870C_SPI_SHADER_4COMP
5687 : V_02870C_SPI_SHADER_NONE));
5688
5689 radeon_set_context_reg(ctx_cs, R_02881C_PA_CL_VS_OUT_CNTL,
5690 S_02881C_USE_VTX_POINT_SIZE(outinfo->writes_pointsize) |
5691 S_02881C_USE_VTX_RENDER_TARGET_INDX(outinfo->writes_layer) |
5692 S_02881C_USE_VTX_VIEWPORT_INDX(outinfo->writes_viewport_index) |
5693 S_02881C_USE_VTX_VRS_RATE(outinfo->writes_primitive_shading_rate) |
5694 S_02881C_VS_OUT_MISC_VEC_ENA(misc_vec_ena) |
5695 S_02881C_VS_OUT_MISC_SIDE_BUS_ENA(misc_vec_ena) |
5696 S_02881C_VS_OUT_CCDIST0_VEC_ENA((total_mask & 0x0f) != 0) |
5697 S_02881C_VS_OUT_CCDIST1_VEC_ENA((total_mask & 0xf0) != 0) |
5698 total_mask << 8 | clip_dist_mask);
5699
5700 if (pdevice->rad_info.gfx_level <= GFX8)
5701 radeon_set_context_reg(ctx_cs, R_028AB4_VGT_REUSE_OFF, outinfo->writes_viewport_index);
5702
5703 unsigned late_alloc_wave64, cu_mask;
5704 ac_compute_late_alloc(&pdevice->rad_info, false, false, shader->config.scratch_bytes_per_wave > 0,
5705 &late_alloc_wave64, &cu_mask);
5706
5707 if (pdevice->rad_info.gfx_level >= GFX7) {
5708 if (pdevice->rad_info.gfx_level >= GFX10) {
5709 ac_set_reg_cu_en(cs, R_00B118_SPI_SHADER_PGM_RSRC3_VS,
5710 S_00B118_CU_EN(cu_mask) | S_00B118_WAVE_LIMIT(0x3F),
5711 C_00B118_CU_EN, 0, &pdevice->rad_info,
5712 (void*)gfx10_set_sh_reg_idx3);
5713 } else {
5714 radeon_set_sh_reg_idx(pdevice, cs, R_00B118_SPI_SHADER_PGM_RSRC3_VS, 3,
5715 S_00B118_CU_EN(cu_mask) | S_00B118_WAVE_LIMIT(0x3F));
5716 }
5717 radeon_set_sh_reg(cs, R_00B11C_SPI_SHADER_LATE_ALLOC_VS, S_00B11C_LIMIT(late_alloc_wave64));
5718 }
5719 if (pdevice->rad_info.gfx_level >= GFX10) {
5720 uint32_t oversub_pc_lines = late_alloc_wave64 ? pdevice->rad_info.pc_lines / 4 : 0;
5721 gfx10_emit_ge_pc_alloc(cs, pdevice->rad_info.gfx_level, oversub_pc_lines);
5722 }
5723 }
5724
5725 static void
5726 radv_pipeline_emit_hw_es(struct radeon_cmdbuf *cs, const struct radv_graphics_pipeline *pipeline,
5727 const struct radv_shader *shader)
5728 {
5729 uint64_t va = radv_shader_get_va(shader);
5730
5731 radeon_set_sh_reg_seq(cs, R_00B320_SPI_SHADER_PGM_LO_ES, 4);
5732 radeon_emit(cs, va >> 8);
5733 radeon_emit(cs, S_00B324_MEM_BASE(va >> 40));
5734 radeon_emit(cs, shader->config.rsrc1);
5735 radeon_emit(cs, shader->config.rsrc2);
5736 }
5737
5738 static void
5739 radv_pipeline_emit_hw_ls(struct radeon_cmdbuf *cs, const struct radv_graphics_pipeline *pipeline,
5740 const struct radv_shader *shader)
5741 {
5742 const struct radv_physical_device *pdevice = pipeline->base.device->physical_device;
5743 unsigned num_lds_blocks = pipeline->base.shaders[MESA_SHADER_TESS_CTRL]->info.tcs.num_lds_blocks;
5744 uint64_t va = radv_shader_get_va(shader);
5745 uint32_t rsrc2 = shader->config.rsrc2;
5746
5747 radeon_set_sh_reg(cs, R_00B520_SPI_SHADER_PGM_LO_LS, va >> 8);
5748
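   /* Fold the TCS LDS allocation into RSRC2. GFX7 parts other than Hawaii also get
    * RSRC2_LS written on its own here, before the RSRC1/RSRC2 pair below. */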
5749 rsrc2 |= S_00B52C_LDS_SIZE(num_lds_blocks);
5750 if (pdevice->rad_info.gfx_level == GFX7 && pdevice->rad_info.family != CHIP_HAWAII)
5751 radeon_set_sh_reg(cs, R_00B52C_SPI_SHADER_PGM_RSRC2_LS, rsrc2);
5752
5753 radeon_set_sh_reg_seq(cs, R_00B528_SPI_SHADER_PGM_RSRC1_LS, 2);
5754 radeon_emit(cs, shader->config.rsrc1);
5755 radeon_emit(cs, rsrc2);
5756 }
5757
5758 static void
5759 radv_pipeline_emit_hw_ngg(struct radeon_cmdbuf *ctx_cs, struct radeon_cmdbuf *cs,
5760 const struct radv_graphics_pipeline *pipeline,
5761 const struct radv_shader *shader)
5762 {
5763 const struct radv_physical_device *pdevice = pipeline->base.device->physical_device;
5764 uint64_t va = radv_shader_get_va(shader);
5765 gl_shader_stage es_type =
5766 radv_pipeline_has_stage(pipeline, MESA_SHADER_MESH) ? MESA_SHADER_MESH :
5767 radv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_CTRL) ? MESA_SHADER_TESS_EVAL : MESA_SHADER_VERTEX;
5768 struct radv_shader *es = pipeline->base.shaders[es_type];
5769 const struct gfx10_ngg_info *ngg_state = &shader->info.ngg_info;
5770
5771 radeon_set_sh_reg(cs, R_00B320_SPI_SHADER_PGM_LO_ES, va >> 8);
5772
5773 radeon_set_sh_reg_seq(cs, R_00B228_SPI_SHADER_PGM_RSRC1_GS, 2);
5774 radeon_emit(cs, shader->config.rsrc1);
5775 radeon_emit(cs, shader->config.rsrc2);
5776
5777 const struct radv_vs_output_info *outinfo = get_vs_output_info(pipeline);
5778 unsigned clip_dist_mask, cull_dist_mask, total_mask;
5779 clip_dist_mask = outinfo->clip_dist_mask;
5780 cull_dist_mask = outinfo->cull_dist_mask;
5781 total_mask = clip_dist_mask | cull_dist_mask;
5782
5783 bool misc_vec_ena = outinfo->writes_pointsize || outinfo->writes_layer ||
5784 outinfo->writes_viewport_index || outinfo->writes_primitive_shading_rate;
5785 bool es_enable_prim_id = outinfo->export_prim_id || (es && es->info.uses_prim_id);
5786 bool break_wave_at_eoi = false;
5787 unsigned ge_cntl;
5788
5789 if (es_type == MESA_SHADER_TESS_EVAL) {
5790 struct radv_shader *gs = pipeline->base.shaders[MESA_SHADER_GEOMETRY];
5791
5792 if (es_enable_prim_id || (gs && gs->info.uses_prim_id))
5793 break_wave_at_eoi = true;
5794 }
5795
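   /* NGG exports both per-vertex and per-primitive parameters; when neither exists the
    * parameter cache can be skipped entirely via NO_PC_EXPORT. */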
5796 bool no_pc_export = outinfo->param_exports == 0 && outinfo->prim_param_exports == 0;
5797 unsigned num_params = MAX2(outinfo->param_exports, 1);
5798 unsigned num_prim_params = outinfo->prim_param_exports;
5799 radeon_set_context_reg(
5800 ctx_cs, R_0286C4_SPI_VS_OUT_CONFIG,
5801 S_0286C4_VS_EXPORT_COUNT(num_params - 1) |
5802 S_0286C4_PRIM_EXPORT_COUNT(num_prim_params) |
5803 S_0286C4_NO_PC_EXPORT(no_pc_export));
5804
5805 unsigned idx_format = V_028708_SPI_SHADER_1COMP;
5806 if (outinfo->writes_layer_per_primitive ||
5807 outinfo->writes_viewport_index_per_primitive ||
5808 outinfo->writes_primitive_shading_rate_per_primitive)
5809 idx_format = V_028708_SPI_SHADER_2COMP;
5810
5811 radeon_set_context_reg(ctx_cs, R_028708_SPI_SHADER_IDX_FORMAT,
5812 S_028708_IDX0_EXPORT_FORMAT(idx_format));
5813 radeon_set_context_reg(
5814 ctx_cs, R_02870C_SPI_SHADER_POS_FORMAT,
5815 S_02870C_POS0_EXPORT_FORMAT(V_02870C_SPI_SHADER_4COMP) |
5816 S_02870C_POS1_EXPORT_FORMAT(outinfo->pos_exports > 1 ? V_02870C_SPI_SHADER_4COMP
5817 : V_02870C_SPI_SHADER_NONE) |
5818 S_02870C_POS2_EXPORT_FORMAT(outinfo->pos_exports > 2 ? V_02870C_SPI_SHADER_4COMP
5819 : V_02870C_SPI_SHADER_NONE) |
5820 S_02870C_POS3_EXPORT_FORMAT(outinfo->pos_exports > 3 ? V_02870C_SPI_SHADER_4COMP
5821 : V_02870C_SPI_SHADER_NONE));
5822
5823 radeon_set_context_reg(ctx_cs, R_02881C_PA_CL_VS_OUT_CNTL,
5824 S_02881C_USE_VTX_POINT_SIZE(outinfo->writes_pointsize) |
5825 S_02881C_USE_VTX_RENDER_TARGET_INDX(outinfo->writes_layer) |
5826 S_02881C_USE_VTX_VIEWPORT_INDX(outinfo->writes_viewport_index) |
5827 S_02881C_USE_VTX_VRS_RATE(outinfo->writes_primitive_shading_rate) |
5828 S_02881C_VS_OUT_MISC_VEC_ENA(misc_vec_ena) |
5829 S_02881C_VS_OUT_MISC_SIDE_BUS_ENA(misc_vec_ena) |
5830 S_02881C_VS_OUT_CCDIST0_VEC_ENA((total_mask & 0x0f) != 0) |
5831 S_02881C_VS_OUT_CCDIST1_VEC_ENA((total_mask & 0xf0) != 0) |
5832 total_mask << 8 | clip_dist_mask);
5833
5834 radeon_set_context_reg(ctx_cs, R_028A84_VGT_PRIMITIVEID_EN,
5835 S_028A84_PRIMITIVEID_EN(es_enable_prim_id) |
5836 S_028A84_NGG_DISABLE_PROVOK_REUSE(outinfo->export_prim_id));
5837
5838 radeon_set_context_reg(ctx_cs, R_028AAC_VGT_ESGS_RING_ITEMSIZE,
5839 ngg_state->vgt_esgs_ring_itemsize);
5840
5841 /* NGG specific registers. */
5842 struct radv_shader *gs = pipeline->base.shaders[MESA_SHADER_GEOMETRY];
5843 uint32_t gs_num_invocations = gs ? gs->info.gs.invocations : 1;
5844
5845 if (pdevice->rad_info.gfx_level < GFX11) {
5846 radeon_set_context_reg(
5847 ctx_cs, R_028A44_VGT_GS_ONCHIP_CNTL,
5848 S_028A44_ES_VERTS_PER_SUBGRP(ngg_state->hw_max_esverts) |
5849 S_028A44_GS_PRIMS_PER_SUBGRP(ngg_state->max_gsprims) |
5850 S_028A44_GS_INST_PRIMS_IN_SUBGRP(ngg_state->max_gsprims * gs_num_invocations));
5851 }
5852
5853 radeon_set_context_reg(ctx_cs, R_0287FC_GE_MAX_OUTPUT_PER_SUBGROUP,
5854 S_0287FC_MAX_VERTS_PER_SUBGROUP(ngg_state->max_out_verts));
5855 radeon_set_context_reg(ctx_cs, R_028B4C_GE_NGG_SUBGRP_CNTL,
5856 S_028B4C_PRIM_AMP_FACTOR(ngg_state->prim_amp_factor) |
5857 S_028B4C_THDS_PER_SUBGRP(0)); /* for fast launch */
5858 radeon_set_context_reg(
5859 ctx_cs, R_028B90_VGT_GS_INSTANCE_CNT,
5860 S_028B90_CNT(gs_num_invocations) | S_028B90_ENABLE(gs_num_invocations > 1) |
5861 S_028B90_EN_MAX_VERT_OUT_PER_GS_INSTANCE(ngg_state->max_vert_out_per_gs_instance));
5862
5863 if (pdevice->rad_info.gfx_level >= GFX11) {
5864 ge_cntl = S_03096C_PRIMS_PER_SUBGRP(ngg_state->max_gsprims) |
5865 S_03096C_VERTS_PER_SUBGRP(ngg_state->enable_vertex_grouping
5866 ? ngg_state->hw_max_esverts
5867 : 256) | /* 256 = disable vertex grouping */
5868 S_03096C_BREAK_PRIMGRP_AT_EOI(break_wave_at_eoi) |
5869 S_03096C_PRIM_GRP_SIZE_GFX11(256);
5870 } else {
5871 ge_cntl = S_03096C_PRIM_GRP_SIZE_GFX10(ngg_state->max_gsprims) |
5872 S_03096C_VERT_GRP_SIZE(ngg_state->enable_vertex_grouping
5873 ? ngg_state->hw_max_esverts
5874 : 256) | /* 256 = disable vertex grouping */
5875 S_03096C_BREAK_WAVE_AT_EOI(break_wave_at_eoi);
5876 }
5877
5878 /* Bug workaround for a possible hang with non-tessellation cases.
5879 * Tessellation always sets GE_CNTL.VERT_GRP_SIZE = 0
5880 *
5881 * Requirement: GE_CNTL.VERT_GRP_SIZE = VGT_GS_ONCHIP_CNTL.ES_VERTS_PER_SUBGRP - 5
5882 */
5883 if (pdevice->rad_info.gfx_level == GFX10 &&
5884 !radv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_CTRL) && ngg_state->hw_max_esverts != 256) {
5885 ge_cntl &= C_03096C_VERT_GRP_SIZE;
5886
5887 if (ngg_state->hw_max_esverts > 5) {
5888 ge_cntl |= S_03096C_VERT_GRP_SIZE(ngg_state->hw_max_esverts - 5);
5889 }
5890 }
5891
5892 radeon_set_uconfig_reg(ctx_cs, R_03096C_GE_CNTL, ge_cntl);
5893
5894 unsigned late_alloc_wave64, cu_mask;
5895 ac_compute_late_alloc(&pdevice->rad_info, true, shader->info.has_ngg_culling,
5896 shader->config.scratch_bytes_per_wave > 0, &late_alloc_wave64, &cu_mask);
5897
5898 if (pdevice->rad_info.gfx_level >= GFX11) {
5899 /* TODO: figure out how S_00B204_CU_EN_GFX11 interacts with ac_set_reg_cu_en */
5900 gfx10_set_sh_reg_idx3(cs, R_00B21C_SPI_SHADER_PGM_RSRC3_GS,
5901 S_00B21C_CU_EN(cu_mask) | S_00B21C_WAVE_LIMIT(0x3F));
5902 gfx10_set_sh_reg_idx3(
5903 cs, R_00B204_SPI_SHADER_PGM_RSRC4_GS,
5904 S_00B204_CU_EN_GFX11(0x1) | S_00B204_SPI_SHADER_LATE_ALLOC_GS_GFX10(late_alloc_wave64));
5905 } else if (pdevice->rad_info.gfx_level >= GFX10) {
5906 ac_set_reg_cu_en(cs, R_00B21C_SPI_SHADER_PGM_RSRC3_GS,
5907 S_00B21C_CU_EN(cu_mask) | S_00B21C_WAVE_LIMIT(0x3F),
5908 C_00B21C_CU_EN, 0, &pdevice->rad_info, (void*)gfx10_set_sh_reg_idx3);
5909 ac_set_reg_cu_en(cs, R_00B204_SPI_SHADER_PGM_RSRC4_GS,
5910 S_00B204_CU_EN_GFX10(0xffff) | S_00B204_SPI_SHADER_LATE_ALLOC_GS_GFX10(late_alloc_wave64),
5911 C_00B204_CU_EN_GFX10, 16, &pdevice->rad_info,
5912 (void*)gfx10_set_sh_reg_idx3);
5913 } else {
5914 radeon_set_sh_reg_idx(
5915 pdevice, cs, R_00B21C_SPI_SHADER_PGM_RSRC3_GS, 3,
5916 S_00B21C_CU_EN(cu_mask) | S_00B21C_WAVE_LIMIT(0x3F));
5917 radeon_set_sh_reg_idx(
5918 pdevice, cs, R_00B204_SPI_SHADER_PGM_RSRC4_GS, 3,
5919 S_00B204_CU_EN_GFX10(0xffff) | S_00B204_SPI_SHADER_LATE_ALLOC_GS_GFX10(late_alloc_wave64));
5920 }
5921
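   /* Scale parameter-cache oversubscription with late alloc; NGG culling pipelines get a
    * larger factor the more parameters they export. */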
5922 uint32_t oversub_pc_lines = late_alloc_wave64 ? pdevice->rad_info.pc_lines / 4 : 0;
5923 if (shader->info.has_ngg_culling) {
5924 unsigned oversub_factor = 2;
5925
5926 if (outinfo->param_exports > 4)
5927 oversub_factor = 4;
5928 else if (outinfo->param_exports > 2)
5929 oversub_factor = 3;
5930
5931 oversub_pc_lines *= oversub_factor;
5932 }
5933
5934 gfx10_emit_ge_pc_alloc(cs, pdevice->rad_info.gfx_level, oversub_pc_lines);
5935 }
5936
5937 static void
5938 radv_pipeline_emit_hw_hs(struct radeon_cmdbuf *cs, const struct radv_graphics_pipeline *pipeline,
5939 const struct radv_shader *shader)
5940 {
5941 const struct radv_physical_device *pdevice = pipeline->base.device->physical_device;
5942 uint64_t va = radv_shader_get_va(shader);
5943
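   /* On GFX9+ the HS is merged with the LS, so only PGM_LO_LS and RSRC1/RSRC2 are
    * programmed; older generations use the full four-dword HS sequence. */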
5944 if (pdevice->rad_info.gfx_level >= GFX9) {
5945 if (pdevice->rad_info.gfx_level >= GFX10) {
5946 radeon_set_sh_reg(cs, R_00B520_SPI_SHADER_PGM_LO_LS, va >> 8);
5947 } else {
5948 radeon_set_sh_reg(cs, R_00B410_SPI_SHADER_PGM_LO_LS, va >> 8);
5949 }
5950
5951 radeon_set_sh_reg_seq(cs, R_00B428_SPI_SHADER_PGM_RSRC1_HS, 2);
5952 radeon_emit(cs, shader->config.rsrc1);
5953 radeon_emit(cs, shader->config.rsrc2);
5954 } else {
5955 radeon_set_sh_reg_seq(cs, R_00B420_SPI_SHADER_PGM_LO_HS, 4);
5956 radeon_emit(cs, va >> 8);
5957 radeon_emit(cs, S_00B424_MEM_BASE(va >> 40));
5958 radeon_emit(cs, shader->config.rsrc1);
5959 radeon_emit(cs, shader->config.rsrc2);
5960 }
5961 }
5962
5963 static void
5964 radv_pipeline_emit_vertex_shader(struct radeon_cmdbuf *ctx_cs, struct radeon_cmdbuf *cs,
5965 const struct radv_graphics_pipeline *pipeline)
5966 {
5967 struct radv_shader *vs;
5968
5969 /* Skip shaders merged into HS/GS */
5970 vs = pipeline->base.shaders[MESA_SHADER_VERTEX];
5971 if (!vs)
5972 return;
5973
5974 if (vs->info.vs.as_ls)
5975 radv_pipeline_emit_hw_ls(cs, pipeline, vs);
5976 else if (vs->info.vs.as_es)
5977 radv_pipeline_emit_hw_es(cs, pipeline, vs);
5978 else if (vs->info.is_ngg)
5979 radv_pipeline_emit_hw_ngg(ctx_cs, cs, pipeline, vs);
5980 else
5981 radv_pipeline_emit_hw_vs(ctx_cs, cs, pipeline, vs);
5982 }
5983
5984 static void
5985 radv_pipeline_emit_tess_shaders(struct radeon_cmdbuf *ctx_cs, struct radeon_cmdbuf *cs,
5986 const struct radv_graphics_pipeline *pipeline)
5987 {
5988 const struct radv_physical_device *pdevice = pipeline->base.device->physical_device;
5989 struct radv_shader *tes, *tcs;
5990
5991 tcs = pipeline->base.shaders[MESA_SHADER_TESS_CTRL];
5992 tes = pipeline->base.shaders[MESA_SHADER_TESS_EVAL];
5993
5994 if (tes) {
5995 if (tes->info.is_ngg) {
5996 radv_pipeline_emit_hw_ngg(ctx_cs, cs, pipeline, tes);
5997 } else if (tes->info.tes.as_es)
5998 radv_pipeline_emit_hw_es(cs, pipeline, tes);
5999 else
6000 radv_pipeline_emit_hw_vs(ctx_cs, cs, pipeline, tes);
6001 }
6002
6003 radv_pipeline_emit_hw_hs(cs, pipeline, tcs);
6004
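   /* Legacy (non-NGG, no GS) tessellation on GFX10+ still programs conservative on-chip
    * GS subgroup limits. */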
6005 if (pdevice->rad_info.gfx_level >= GFX10 &&
6006 !radv_pipeline_has_stage(pipeline, MESA_SHADER_GEOMETRY) && !radv_pipeline_has_ngg(pipeline)) {
6007 radeon_set_context_reg(ctx_cs, R_028A44_VGT_GS_ONCHIP_CNTL,
6008 S_028A44_ES_VERTS_PER_SUBGRP(250) | S_028A44_GS_PRIMS_PER_SUBGRP(126) |
6009 S_028A44_GS_INST_PRIMS_IN_SUBGRP(126));
6010 }
6011 }
6012
6013 static void
6014 radv_pipeline_emit_tess_state(struct radeon_cmdbuf *ctx_cs,
6015 const struct radv_graphics_pipeline *pipeline,
6016 const struct radv_graphics_pipeline_info *info)
6017 {
6018 const struct radv_physical_device *pdevice = pipeline->base.device->physical_device;
6019 struct radv_shader *tes = radv_get_shader(&pipeline->base, MESA_SHADER_TESS_EVAL);
6020 unsigned type = 0, partitioning = 0, topology = 0, distribution_mode = 0;
6021 unsigned num_tcs_input_cp, num_tcs_output_cp, num_patches;
6022 unsigned ls_hs_config;
6023
6024 num_tcs_input_cp = info->ts.patch_control_points;
6025 num_tcs_output_cp =
6026 pipeline->base.shaders[MESA_SHADER_TESS_CTRL]->info.tcs.tcs_vertices_out; // TCS VERTICES OUT
6027 num_patches = pipeline->base.shaders[MESA_SHADER_TESS_CTRL]->info.num_tess_patches;
6028
6029 ls_hs_config = S_028B58_NUM_PATCHES(num_patches) | S_028B58_HS_NUM_INPUT_CP(num_tcs_input_cp) |
6030 S_028B58_HS_NUM_OUTPUT_CP(num_tcs_output_cp);
6031
6032 if (pdevice->rad_info.gfx_level >= GFX7) {
6033 radeon_set_context_reg_idx(ctx_cs, R_028B58_VGT_LS_HS_CONFIG, 2, ls_hs_config);
6034 } else {
6035 radeon_set_context_reg(ctx_cs, R_028B58_VGT_LS_HS_CONFIG, ls_hs_config);
6036 }
6037
6038 switch (tes->info.tes._primitive_mode) {
6039 case TESS_PRIMITIVE_TRIANGLES:
6040 type = V_028B6C_TESS_TRIANGLE;
6041 break;
6042 case TESS_PRIMITIVE_QUADS:
6043 type = V_028B6C_TESS_QUAD;
6044 break;
6045 case TESS_PRIMITIVE_ISOLINES:
6046 type = V_028B6C_TESS_ISOLINE;
6047 break;
6048 default:
6049 break;
6050 }
6051
6052 switch (tes->info.tes.spacing) {
6053 case TESS_SPACING_EQUAL:
6054 partitioning = V_028B6C_PART_INTEGER;
6055 break;
6056 case TESS_SPACING_FRACTIONAL_ODD:
6057 partitioning = V_028B6C_PART_FRAC_ODD;
6058 break;
6059 case TESS_SPACING_FRACTIONAL_EVEN:
6060 partitioning = V_028B6C_PART_FRAC_EVEN;
6061 break;
6062 default:
6063 break;
6064 }
6065
6066 bool ccw = tes->info.tes.ccw;
6067 if (info->ts.domain_origin != VK_TESSELLATION_DOMAIN_ORIGIN_UPPER_LEFT)
6068 ccw = !ccw;
6069
6070 if (tes->info.tes.point_mode)
6071 topology = V_028B6C_OUTPUT_POINT;
6072 else if (tes->info.tes._primitive_mode == TESS_PRIMITIVE_ISOLINES)
6073 topology = V_028B6C_OUTPUT_LINE;
6074 else if (ccw)
6075 topology = V_028B6C_OUTPUT_TRIANGLE_CCW;
6076 else
6077 topology = V_028B6C_OUTPUT_TRIANGLE_CW;
6078
6079 if (pdevice->rad_info.has_distributed_tess) {
6080 if (pdevice->rad_info.family == CHIP_FIJI || pdevice->rad_info.family >= CHIP_POLARIS10)
6081 distribution_mode = V_028B6C_TRAPEZOIDS;
6082 else
6083 distribution_mode = V_028B6C_DONUTS;
6084 } else
6085 distribution_mode = V_028B6C_NO_DIST;
6086
6087 radeon_set_context_reg(ctx_cs, R_028B6C_VGT_TF_PARAM,
6088 S_028B6C_TYPE(type) | S_028B6C_PARTITIONING(partitioning) |
6089 S_028B6C_TOPOLOGY(topology) |
6090 S_028B6C_DISTRIBUTION_MODE(distribution_mode));
6091 }
6092
6093 static void
6094 radv_pipeline_emit_hw_gs(struct radeon_cmdbuf *ctx_cs, struct radeon_cmdbuf *cs,
6095 const struct radv_graphics_pipeline *pipeline, const struct radv_shader *gs)
6096 {
6097 const struct radv_physical_device *pdevice = pipeline->base.device->physical_device;
6098 const struct gfx9_gs_info *gs_state = &gs->info.gs_ring_info;
6099 unsigned gs_max_out_vertices;
6100 const uint8_t *num_components;
6101 uint8_t max_stream;
6102 unsigned offset;
6103 uint64_t va;
6104
6105 gs_max_out_vertices = gs->info.gs.vertices_out;
6106 max_stream = gs->info.gs.max_stream;
6107 num_components = gs->info.gs.num_stream_output_components;
6108
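   /* The GSVS ring packs the streams back to back, so each stream's ring offset
    * accumulates components * max output vertices of the streams before it. */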
6109 offset = num_components[0] * gs_max_out_vertices;
6110
6111 radeon_set_context_reg_seq(ctx_cs, R_028A60_VGT_GSVS_RING_OFFSET_1, 3);
6112 radeon_emit(ctx_cs, offset);
6113 if (max_stream >= 1)
6114 offset += num_components[1] * gs_max_out_vertices;
6115 radeon_emit(ctx_cs, offset);
6116 if (max_stream >= 2)
6117 offset += num_components[2] * gs_max_out_vertices;
6118 radeon_emit(ctx_cs, offset);
6119 if (max_stream >= 3)
6120 offset += num_components[3] * gs_max_out_vertices;
6121 radeon_set_context_reg(ctx_cs, R_028AB0_VGT_GSVS_RING_ITEMSIZE, offset);
6122
6123 radeon_set_context_reg_seq(ctx_cs, R_028B5C_VGT_GS_VERT_ITEMSIZE, 4);
6124 radeon_emit(ctx_cs, num_components[0]);
6125 radeon_emit(ctx_cs, (max_stream >= 1) ? num_components[1] : 0);
6126 radeon_emit(ctx_cs, (max_stream >= 2) ? num_components[2] : 0);
6127 radeon_emit(ctx_cs, (max_stream >= 3) ? num_components[3] : 0);
6128
6129 uint32_t gs_num_invocations = gs->info.gs.invocations;
6130 radeon_set_context_reg(
6131 ctx_cs, R_028B90_VGT_GS_INSTANCE_CNT,
6132 S_028B90_CNT(MIN2(gs_num_invocations, 127)) | S_028B90_ENABLE(gs_num_invocations > 0));
6133
6134 radeon_set_context_reg(ctx_cs, R_028AAC_VGT_ESGS_RING_ITEMSIZE,
6135 gs_state->vgt_esgs_ring_itemsize);
6136
6137 va = radv_shader_get_va(gs);
6138
6139 if (pdevice->rad_info.gfx_level >= GFX9) {
6140 if (pdevice->rad_info.gfx_level >= GFX10) {
6141 radeon_set_sh_reg(cs, R_00B320_SPI_SHADER_PGM_LO_ES, va >> 8);
6142 } else {
6143 radeon_set_sh_reg(cs, R_00B210_SPI_SHADER_PGM_LO_ES, va >> 8);
6144 }
6145
6146 radeon_set_sh_reg_seq(cs, R_00B228_SPI_SHADER_PGM_RSRC1_GS, 2);
6147 radeon_emit(cs, gs->config.rsrc1);
6148 radeon_emit(cs, gs->config.rsrc2 | S_00B22C_LDS_SIZE(gs_state->lds_size));
6149
6150 radeon_set_context_reg(ctx_cs, R_028A44_VGT_GS_ONCHIP_CNTL, gs_state->vgt_gs_onchip_cntl);
6151 radeon_set_context_reg(ctx_cs, R_028A94_VGT_GS_MAX_PRIMS_PER_SUBGROUP,
6152 gs_state->vgt_gs_max_prims_per_subgroup);
6153 } else {
6154 radeon_set_sh_reg_seq(cs, R_00B220_SPI_SHADER_PGM_LO_GS, 4);
6155 radeon_emit(cs, va >> 8);
6156 radeon_emit(cs, S_00B224_MEM_BASE(va >> 40));
6157 radeon_emit(cs, gs->config.rsrc1);
6158 radeon_emit(cs, gs->config.rsrc2);
6159 }
6160
6161 if (pdevice->rad_info.gfx_level >= GFX10) {
6162 ac_set_reg_cu_en(cs, R_00B21C_SPI_SHADER_PGM_RSRC3_GS,
6163 S_00B21C_CU_EN(0xffff) | S_00B21C_WAVE_LIMIT(0x3F),
6164 C_00B21C_CU_EN, 0, &pdevice->rad_info,
6165 (void*)gfx10_set_sh_reg_idx3);
6166 ac_set_reg_cu_en(cs, R_00B204_SPI_SHADER_PGM_RSRC4_GS,
6167 S_00B204_CU_EN_GFX10(0xffff) | S_00B204_SPI_SHADER_LATE_ALLOC_GS_GFX10(0),
6168 C_00B204_CU_EN_GFX10, 16, &pdevice->rad_info,
6169 (void*)gfx10_set_sh_reg_idx3);
6170 } else if (pdevice->rad_info.gfx_level >= GFX7) {
6171 radeon_set_sh_reg_idx(
6172 pdevice, cs, R_00B21C_SPI_SHADER_PGM_RSRC3_GS, 3,
6173 S_00B21C_CU_EN(0xffff) | S_00B21C_WAVE_LIMIT(0x3F));
6174
6175 if (pdevice->rad_info.gfx_level >= GFX10) {
6176 radeon_set_sh_reg_idx(
6177 pdevice, cs, R_00B204_SPI_SHADER_PGM_RSRC4_GS, 3,
6178 S_00B204_CU_EN_GFX10(0xffff) | S_00B204_SPI_SHADER_LATE_ALLOC_GS_GFX10(0));
6179 }
6180 }
6181
6182 radv_pipeline_emit_hw_vs(ctx_cs, cs, pipeline, pipeline->base.gs_copy_shader);
6183 }
6184
6185 static void
6186 radv_pipeline_emit_geometry_shader(struct radeon_cmdbuf *ctx_cs, struct radeon_cmdbuf *cs,
6187 const struct radv_graphics_pipeline *pipeline)
6188 {
6189 struct radv_shader *gs;
6190
6191 gs = pipeline->base.shaders[MESA_SHADER_GEOMETRY];
6192 if (!gs)
6193 return;
6194
6195 if (gs->info.is_ngg)
6196 radv_pipeline_emit_hw_ngg(ctx_cs, cs, pipeline, gs);
6197 else
6198 radv_pipeline_emit_hw_gs(ctx_cs, cs, pipeline, gs);
6199
6200 radeon_set_context_reg(ctx_cs, R_028B38_VGT_GS_MAX_VERT_OUT, gs->info.gs.vertices_out);
6201 }
6202
6203 static void
6204 radv_pipeline_emit_mesh_shader(struct radeon_cmdbuf *ctx_cs, struct radeon_cmdbuf *cs,
6205 const struct radv_graphics_pipeline *pipeline)
6206 {
6207 const struct radv_physical_device *pdevice = pipeline->base.device->physical_device;
6208 struct radv_shader *ms = pipeline->base.shaders[MESA_SHADER_MESH];
6209 if (!ms)
6210 return;
6211
6212 radv_pipeline_emit_hw_ngg(ctx_cs, cs, pipeline, ms);
6213 radeon_set_context_reg(ctx_cs, R_028B38_VGT_GS_MAX_VERT_OUT, ms->info.workgroup_size);
6214 radeon_set_uconfig_reg_idx(pdevice, ctx_cs,
6215 R_030908_VGT_PRIMITIVE_TYPE, 1, V_008958_DI_PT_POINTLIST);
6216 }
6217
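/* Translate a parameter-cache export offset into an SPI_PS_INPUT_CNTL value. Offsets
 * above AC_EXP_PARAM_OFFSET_31 encode DEFAULT_VAL constants rather than real exports. */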
6218 static uint32_t
6219 offset_to_ps_input(uint32_t offset, bool flat_shade, bool explicit, bool float16)
6220 {
6221 uint32_t ps_input_cntl;
6222 if (offset <= AC_EXP_PARAM_OFFSET_31) {
6223 ps_input_cntl = S_028644_OFFSET(offset);
6224 if (flat_shade || explicit)
6225 ps_input_cntl |= S_028644_FLAT_SHADE(1);
6226 if (explicit) {
6227 /* Force parameter cache to be read in passthrough
6228 * mode.
6229 */
6230 ps_input_cntl |= S_028644_OFFSET(1 << 5);
6231 }
6232 if (float16) {
6233 ps_input_cntl |= S_028644_FP16_INTERP_MODE(1) | S_028644_ATTR0_VALID(1);
6234 }
6235 } else {
6236 /* The input is a DEFAULT_VAL constant. */
6237 assert(offset >= AC_EXP_PARAM_DEFAULT_VAL_0000 && offset <= AC_EXP_PARAM_DEFAULT_VAL_1111);
6238 offset -= AC_EXP_PARAM_DEFAULT_VAL_0000;
6239 ps_input_cntl = S_028644_OFFSET(0x20) | S_028644_DEFAULT_VAL(offset);
6240 }
6241 return ps_input_cntl;
6242 }
6243
6244 static void
6245 single_slot_to_ps_input(const struct radv_vs_output_info *outinfo,
6246 unsigned slot, uint32_t *ps_input_cntl, unsigned *ps_offset,
6247 bool skip_undef, bool use_default_0, bool flat_shade)
6248 {
6249 unsigned vs_offset = outinfo->vs_output_param_offset[slot];
6250
6251 if (vs_offset == AC_EXP_PARAM_UNDEFINED) {
6252 if (skip_undef)
6253 return;
6254 else if (use_default_0)
6255 vs_offset = AC_EXP_PARAM_DEFAULT_VAL_0000;
6256 else
6257 unreachable("vs_offset should not be AC_EXP_PARAM_UNDEFINED.");
6258 }
6259
6260 ps_input_cntl[*ps_offset] = offset_to_ps_input(vs_offset, flat_shade, false, false);
6261 ++(*ps_offset);
6262 }
6263
6264 static void
6265 input_mask_to_ps_inputs(const struct radv_vs_output_info *outinfo, const struct radv_shader *ps,
6266 uint32_t input_mask, uint32_t *ps_input_cntl, unsigned *ps_offset)
6267 {
6268 u_foreach_bit(i, input_mask) {
6269 unsigned vs_offset = outinfo->vs_output_param_offset[VARYING_SLOT_VAR0 + i];
6270 if (vs_offset == AC_EXP_PARAM_UNDEFINED) {
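         /* The VS didn't export this varying; make the PS read the (0,0,0,0) default
          * value instead of the parameter cache. */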
6271 ps_input_cntl[*ps_offset] = S_028644_OFFSET(0x20);
6272 ++(*ps_offset);
6273 continue;
6274 }
6275
6276 bool flat_shade = !!(ps->info.ps.flat_shaded_mask & (1u << *ps_offset));
6277 bool explicit = !!(ps->info.ps.explicit_shaded_mask & (1u << *ps_offset));
6278 bool float16 = !!(ps->info.ps.float16_shaded_mask & (1u << *ps_offset));
6279
6280 ps_input_cntl[*ps_offset] = offset_to_ps_input(vs_offset, flat_shade, explicit, float16);
6281 ++(*ps_offset);
6282 }
6283 }
6284
6285 static void
6286 radv_pipeline_emit_ps_inputs(struct radeon_cmdbuf *ctx_cs,
6287 const struct radv_graphics_pipeline *pipeline)
6288 {
6289 struct radv_shader *ps = pipeline->base.shaders[MESA_SHADER_FRAGMENT];
6290 const struct radv_vs_output_info *outinfo = get_vs_output_info(pipeline);
6291 bool mesh = radv_pipeline_has_stage(pipeline, MESA_SHADER_MESH);
6292 uint32_t ps_input_cntl[32];
6293
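   /* Build SPI_PS_INPUT_CNTL entries in the order the PS consumes them: built-ins first,
    * then generic varyings, with per-primitive inputs last. */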
6294 unsigned ps_offset = 0;
6295
6296 if (ps->info.ps.prim_id_input && !mesh)
6297 single_slot_to_ps_input(outinfo, VARYING_SLOT_PRIMITIVE_ID, ps_input_cntl, &ps_offset,
6298 true, false, true);
6299
6300 if (ps->info.ps.layer_input && !mesh)
6301 single_slot_to_ps_input(outinfo, VARYING_SLOT_LAYER, ps_input_cntl, &ps_offset,
6302 false, true, true);
6303
6304 if (ps->info.ps.viewport_index_input && !mesh)
6305 single_slot_to_ps_input(outinfo, VARYING_SLOT_VIEWPORT, ps_input_cntl, &ps_offset,
6306 false, false, true);
6307
6308 if (ps->info.ps.has_pcoord)
6309 ps_input_cntl[ps_offset++] = S_028644_PT_SPRITE_TEX(1) | S_028644_OFFSET(0x20);
6310
6311 if (ps->info.ps.num_input_clips_culls) {
6312 single_slot_to_ps_input(outinfo, VARYING_SLOT_CLIP_DIST0, ps_input_cntl, &ps_offset,
6313 true, false, false);
6314
6315 if (ps->info.ps.num_input_clips_culls > 4)
6316 single_slot_to_ps_input(outinfo, VARYING_SLOT_CLIP_DIST1, ps_input_cntl, &ps_offset,
6317 true, false, false);
6318 }
6319
6320 input_mask_to_ps_inputs(outinfo, ps, ps->info.ps.input_mask,
6321 ps_input_cntl, &ps_offset);
6322
6323 /* Per-primitive PS inputs: the HW needs these to be last. */
6324
6325 if (ps->info.ps.prim_id_input && mesh)
6326 single_slot_to_ps_input(outinfo, VARYING_SLOT_PRIMITIVE_ID, ps_input_cntl, &ps_offset,
6327 true, false, false);
6328
6329 if (ps->info.ps.layer_input && mesh)
6330 single_slot_to_ps_input(outinfo, VARYING_SLOT_LAYER, ps_input_cntl, &ps_offset,
6331 false, true, false);
6332
6333 if (ps->info.ps.viewport_index_input && mesh)
6334 single_slot_to_ps_input(outinfo, VARYING_SLOT_VIEWPORT, ps_input_cntl, &ps_offset,
6335 false, false, false);
6336
6337 input_mask_to_ps_inputs(outinfo, ps, ps->info.ps.input_per_primitive_mask,
6338 ps_input_cntl, &ps_offset);
6339
6340 if (ps_offset) {
6341 radeon_set_context_reg_seq(ctx_cs, R_028644_SPI_PS_INPUT_CNTL_0, ps_offset);
6342 for (unsigned i = 0; i < ps_offset; i++) {
6343 radeon_emit(ctx_cs, ps_input_cntl[i]);
6344 }
6345 }
6346 }
6347
6348 static uint32_t
6349 radv_compute_db_shader_control(const struct radv_physical_device *pdevice,
6350 const struct radv_graphics_pipeline *pipeline,
6351 const struct radv_shader *ps)
6352 {
6353 unsigned conservative_z_export = V_02880C_EXPORT_ANY_Z;
6354 unsigned z_order;
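   /* Late Z is required when the PS writes memory without early fragment tests;
    * otherwise early-Z-then-late-Z is preferred. */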
6355 if (ps->info.ps.early_fragment_test || !ps->info.ps.writes_memory)
6356 z_order = V_02880C_EARLY_Z_THEN_LATE_Z;
6357 else
6358 z_order = V_02880C_LATE_Z;
6359
6360 if (ps->info.ps.depth_layout == FRAG_DEPTH_LAYOUT_GREATER)
6361 conservative_z_export = V_02880C_EXPORT_GREATER_THAN_Z;
6362 else if (ps->info.ps.depth_layout == FRAG_DEPTH_LAYOUT_LESS)
6363 conservative_z_export = V_02880C_EXPORT_LESS_THAN_Z;
6364
6365 bool disable_rbplus = pdevice->rad_info.has_rbplus && !pdevice->rad_info.rbplus_allowed;
6366
6367 /* It shouldn't be needed to export gl_SampleMask when MSAA is disabled
6368 * but this appears to break Project Cars (DXVK). See
6369 * https://bugs.freedesktop.org/show_bug.cgi?id=109401
6370 */
6371 bool mask_export_enable = ps->info.ps.writes_sample_mask;
6372
6373 return S_02880C_Z_EXPORT_ENABLE(ps->info.ps.writes_z) |
6374 S_02880C_STENCIL_TEST_VAL_EXPORT_ENABLE(ps->info.ps.writes_stencil) |
6375 S_02880C_KILL_ENABLE(!!ps->info.ps.can_discard) |
6376 S_02880C_MASK_EXPORT_ENABLE(mask_export_enable) |
6377 S_02880C_CONSERVATIVE_Z_EXPORT(conservative_z_export) | S_02880C_Z_ORDER(z_order) |
6378 S_02880C_DEPTH_BEFORE_SHADER(ps->info.ps.early_fragment_test) |
6379 S_02880C_PRE_SHADER_DEPTH_COVERAGE_ENABLE(ps->info.ps.post_depth_coverage) |
6380 S_02880C_EXEC_ON_HIER_FAIL(ps->info.ps.writes_memory) |
6381 S_02880C_EXEC_ON_NOOP(ps->info.ps.writes_memory) |
6382 S_02880C_DUAL_QUAD_DISABLE(disable_rbplus);
6383 }
6384
6385 static void
6386 radv_pipeline_emit_fragment_shader(struct radeon_cmdbuf *ctx_cs, struct radeon_cmdbuf *cs,
6387 const struct radv_graphics_pipeline *pipeline)
6388 {
6389 const struct radv_physical_device *pdevice = pipeline->base.device->physical_device;
6390 struct radv_shader *ps;
6391 bool param_gen;
6392 uint64_t va;
6393 assert(pipeline->base.shaders[MESA_SHADER_FRAGMENT]);
6394
6395 ps = pipeline->base.shaders[MESA_SHADER_FRAGMENT];
6396 va = radv_shader_get_va(ps);
6397
6398 radeon_set_sh_reg_seq(cs, R_00B020_SPI_SHADER_PGM_LO_PS, 4);
6399 radeon_emit(cs, va >> 8);
6400 radeon_emit(cs, S_00B024_MEM_BASE(va >> 40));
6401 radeon_emit(cs, ps->config.rsrc1);
6402 radeon_emit(cs, ps->config.rsrc2);
6403
6404 radeon_set_context_reg(ctx_cs, R_02880C_DB_SHADER_CONTROL,
6405 radv_compute_db_shader_control(pdevice, pipeline, ps));
6406
6407 radeon_set_context_reg_seq(ctx_cs, R_0286CC_SPI_PS_INPUT_ENA, 2);
6408 radeon_emit(ctx_cs, ps->config.spi_ps_input_ena);
6409 radeon_emit(ctx_cs, ps->config.spi_ps_input_addr);
6410
6411 /* Workaround when there are no PS inputs but LDS is used. */
6412 param_gen = pdevice->rad_info.gfx_level >= GFX11 &&
6413 !ps->info.ps.num_interp && ps->config.lds_size;
6414
6415 radeon_set_context_reg(
6416 ctx_cs, R_0286D8_SPI_PS_IN_CONTROL,
6417 S_0286D8_NUM_INTERP(ps->info.ps.num_interp) |
6418 S_0286D8_NUM_PRIM_INTERP(ps->info.ps.num_prim_interp) |
6419 S_0286D8_PS_W32_EN(ps->info.wave_size == 32) |
6420 S_0286D8_PARAM_GEN(param_gen));
6421
6422 radeon_set_context_reg(ctx_cs, R_0286E0_SPI_BARYC_CNTL, pipeline->spi_baryc_cntl);
6423
6424 radeon_set_context_reg(
6425 ctx_cs, R_028710_SPI_SHADER_Z_FORMAT,
6426 ac_get_spi_shader_z_format(ps->info.ps.writes_z, ps->info.ps.writes_stencil,
6427 ps->info.ps.writes_sample_mask, false));
6428 }
6429
6430 static void
6431 radv_pipeline_emit_vgt_vertex_reuse(struct radeon_cmdbuf *ctx_cs,
6432 const struct radv_graphics_pipeline *pipeline)
6433 {
6434 const struct radv_physical_device *pdevice = pipeline->base.device->physical_device;
6435
6436 if (pdevice->rad_info.family < CHIP_POLARIS10 || pdevice->rad_info.gfx_level >= GFX10)
6437 return;
6438
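   /* Polaris and newer (pre-GFX10) parts use a smaller vertex reuse depth when the TES
    * uses fractional-odd spacing. */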
6439 unsigned vtx_reuse_depth = 30;
6440 if (radv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_CTRL) &&
6441 radv_get_shader(&pipeline->base, MESA_SHADER_TESS_EVAL)->info.tes.spacing ==
6442 TESS_SPACING_FRACTIONAL_ODD) {
6443 vtx_reuse_depth = 14;
6444 }
6445 radeon_set_context_reg(ctx_cs, R_028C58_VGT_VERTEX_REUSE_BLOCK_CNTL,
6446 S_028C58_VTX_REUSE_DEPTH(vtx_reuse_depth));
6447 }
6448
6449 static void
6450 radv_pipeline_emit_vgt_shader_config(struct radeon_cmdbuf *ctx_cs,
6451 const struct radv_graphics_pipeline *pipeline)
6452 {
6453 const struct radv_physical_device *pdevice = pipeline->base.device->physical_device;
6454 uint32_t stages = 0;
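   /* Map API stages onto HW stages: with tessellation the VS runs on the LS/HS path and
    * the TES feeds the GS, NGG or the HW VS; the branches below encode that routing. */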
6455 if (radv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_CTRL)) {
6456 stages |= S_028B54_LS_EN(V_028B54_LS_STAGE_ON) | S_028B54_HS_EN(1) | S_028B54_DYNAMIC_HS(1);
6457
6458 if (radv_pipeline_has_stage(pipeline, MESA_SHADER_GEOMETRY))
6459 stages |= S_028B54_ES_EN(V_028B54_ES_STAGE_DS) | S_028B54_GS_EN(1);
6460 else if (radv_pipeline_has_ngg(pipeline))
6461 stages |= S_028B54_ES_EN(V_028B54_ES_STAGE_DS);
6462 else
6463 stages |= S_028B54_VS_EN(V_028B54_VS_STAGE_DS);
6464 } else if (radv_pipeline_has_stage(pipeline, MESA_SHADER_GEOMETRY)) {
6465 stages |= S_028B54_ES_EN(V_028B54_ES_STAGE_REAL) | S_028B54_GS_EN(1);
6466 } else if (radv_pipeline_has_stage(pipeline, MESA_SHADER_MESH)) {
6467 assert(!radv_pipeline_has_ngg_passthrough(pipeline));
6468 stages |= S_028B54_GS_EN(1) | S_028B54_GS_FAST_LAUNCH(1);
6469
6470 if (pipeline->base.shaders[MESA_SHADER_MESH]->info.ms.needs_ms_scratch_ring)
6471 stages |= S_028B54_NGG_WAVE_ID_EN(1);
6472 } else if (radv_pipeline_has_ngg(pipeline)) {
6473 stages |= S_028B54_ES_EN(V_028B54_ES_STAGE_REAL);
6474 }
6475
6476 if (radv_pipeline_has_ngg(pipeline)) {
6477 stages |= S_028B54_PRIMGEN_EN(1);
6478 if (pipeline->streamout_shader)
6479 stages |= S_028B54_NGG_WAVE_ID_EN(1);
6480 if (radv_pipeline_has_ngg_passthrough(pipeline)) {
6481 stages |= S_028B54_PRIMGEN_PASSTHRU_EN(1);
6482 if (pdevice->rad_info.family >= CHIP_NAVI23)
6483 stages |= S_028B54_PRIMGEN_PASSTHRU_NO_MSG(1);
6484 }
6485 } else if (radv_pipeline_has_stage(pipeline, MESA_SHADER_GEOMETRY)) {
6486 stages |= S_028B54_VS_EN(V_028B54_VS_STAGE_COPY_SHADER);
6487 }
6488
6489 if (pdevice->rad_info.gfx_level >= GFX9)
6490 stages |= S_028B54_MAX_PRIMGRP_IN_WAVE(2);
6491
6492 if (pdevice->rad_info.gfx_level >= GFX10) {
6493 uint8_t hs_size = 64, gs_size = 64, vs_size = 64;
6494
6495 if (radv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_CTRL))
6496 hs_size = pipeline->base.shaders[MESA_SHADER_TESS_CTRL]->info.wave_size;
6497
6498 if (pipeline->base.shaders[MESA_SHADER_GEOMETRY]) {
6499 vs_size = gs_size = pipeline->base.shaders[MESA_SHADER_GEOMETRY]->info.wave_size;
6500 if (radv_pipeline_has_gs_copy_shader(&pipeline->base))
6501 vs_size = pipeline->base.gs_copy_shader->info.wave_size;
6502 } else if (pipeline->base.shaders[MESA_SHADER_TESS_EVAL])
6503 vs_size = pipeline->base.shaders[MESA_SHADER_TESS_EVAL]->info.wave_size;
6504 else if (pipeline->base.shaders[MESA_SHADER_VERTEX])
6505 vs_size = pipeline->base.shaders[MESA_SHADER_VERTEX]->info.wave_size;
6506 else if (pipeline->base.shaders[MESA_SHADER_MESH])
6507 vs_size = gs_size = pipeline->base.shaders[MESA_SHADER_MESH]->info.wave_size;
6508
6509 if (radv_pipeline_has_ngg(pipeline)) {
6510 assert(!radv_pipeline_has_gs_copy_shader(&pipeline->base));
6511 gs_size = vs_size;
6512 }
6513
6514 /* legacy GS only supports Wave64 */
6515 stages |= S_028B54_HS_W32_EN(hs_size == 32 ? 1 : 0) |
6516 S_028B54_GS_W32_EN(gs_size == 32 ? 1 : 0) |
6517 S_028B54_VS_W32_EN(vs_size == 32 ? 1 : 0);
6518 }
6519
6520 radeon_set_context_reg(ctx_cs, R_028B54_VGT_SHADER_STAGES_EN, stages);
6521 }
6522
6523 static void
6524 radv_pipeline_emit_cliprect_rule(struct radeon_cmdbuf *ctx_cs,
6525 const struct radv_graphics_pipeline_info *info)
6526 {
6527 uint32_t cliprect_rule = 0;
6528
6529 if (!info->dr.count) {
6530 cliprect_rule = 0xffff;
6531 } else {
6532 for (unsigned i = 0; i < (1u << MAX_DISCARD_RECTANGLES); ++i) {
6533 /* Interpret i as a bitmask of rectangles that contain the
6534 * pixel, and set the corresponding bit in the mask if that
6535 * combination of rectangles should pass the cliprect
6536 * test.
6537 */
6538 unsigned relevant_subset = i & ((1u << info->dr.count) - 1);
6539
6540 if (info->dr.mode == VK_DISCARD_RECTANGLE_MODE_INCLUSIVE_EXT && !relevant_subset)
6541 continue;
6542
6543 if (info->dr.mode == VK_DISCARD_RECTANGLE_MODE_EXCLUSIVE_EXT && relevant_subset)
6544 continue;
6545
6546 cliprect_rule |= 1u << i;
6547 }
6548 }
6549
6550 radeon_set_context_reg(ctx_cs, R_02820C_PA_SC_CLIPRECT_RULE, cliprect_rule);
6551 }
6552
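/* Legacy (non-NGG) GE_CNTL programming; NGG pipelines set GE_CNTL from
 * radv_pipeline_emit_hw_ngg instead. */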
6553 static void
6554 gfx10_pipeline_emit_ge_cntl(struct radeon_cmdbuf *ctx_cs,
6555 const struct radv_graphics_pipeline *pipeline)
6556 {
6557 bool break_wave_at_eoi = false;
6558 unsigned primgroup_size;
6559 unsigned vertgroup_size = 256; /* 256 = disable vertex grouping */
6560
6561 if (radv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_CTRL)) {
6562 primgroup_size = pipeline->base.shaders[MESA_SHADER_TESS_CTRL]->info.num_tess_patches;
6563 } else if (radv_pipeline_has_stage(pipeline, MESA_SHADER_GEOMETRY)) {
6564 const struct gfx9_gs_info *gs_state =
6565 &pipeline->base.shaders[MESA_SHADER_GEOMETRY]->info.gs_ring_info;
6566 unsigned vgt_gs_onchip_cntl = gs_state->vgt_gs_onchip_cntl;
6567 primgroup_size = G_028A44_GS_PRIMS_PER_SUBGRP(vgt_gs_onchip_cntl);
6568 } else {
6569 primgroup_size = 128; /* recommended without a GS and tess */
6570 }
6571
6572 if (radv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_CTRL)) {
6573 if (pipeline->base.shaders[MESA_SHADER_TESS_CTRL]->info.uses_prim_id ||
6574 radv_get_shader(&pipeline->base, MESA_SHADER_TESS_EVAL)->info.uses_prim_id)
6575 break_wave_at_eoi = true;
6576 }
6577
6578 radeon_set_uconfig_reg(ctx_cs, R_03096C_GE_CNTL,
6579 S_03096C_PRIM_GRP_SIZE_GFX10(primgroup_size) |
6580 S_03096C_VERT_GRP_SIZE(vertgroup_size) |
6581 S_03096C_PACKET_TO_ONE_PA(0) /* line stipple */ |
6582 S_03096C_BREAK_WAVE_AT_EOI(break_wave_at_eoi));
6583 }
6584
6585 static void
6586 radv_pipeline_emit_vgt_gs_out(struct radeon_cmdbuf *ctx_cs,
6587 const struct radv_graphics_pipeline *pipeline,
6588 uint32_t vgt_gs_out_prim_type)
6589 {
6590 const struct radv_physical_device *pdevice = pipeline->base.device->physical_device;
6591
6592 if (pdevice->rad_info.gfx_level >= GFX11) {
6593 radeon_set_uconfig_reg(ctx_cs, R_030998_VGT_GS_OUT_PRIM_TYPE, vgt_gs_out_prim_type);
6594 } else {
6595 radeon_set_context_reg(ctx_cs, R_028A6C_VGT_GS_OUT_PRIM_TYPE, vgt_gs_out_prim_type);
6596 }
6597 }
6598
6599 static void
6600 gfx103_pipeline_emit_vgt_draw_payload_cntl(struct radeon_cmdbuf *ctx_cs,
6601 const struct radv_graphics_pipeline *pipeline,
6602 const struct radv_graphics_pipeline_info *info)
6603 {
6604 const struct radv_vs_output_info *outinfo = get_vs_output_info(pipeline);
6605
6606 bool enable_vrs = radv_is_vrs_enabled(pipeline, info);
6607
6608 /* Enables the second channel of the primitive export instruction.
6609 * This channel contains: VRS rate x, y, viewport and layer.
6610 */
6611 bool enable_prim_payload =
6612 outinfo &&
6613 (outinfo->writes_viewport_index_per_primitive ||
6614 outinfo->writes_layer_per_primitive ||
6615 outinfo->writes_primitive_shading_rate_per_primitive);
6616
6617 radeon_set_context_reg(ctx_cs, R_028A98_VGT_DRAW_PAYLOAD_CNTL,
6618 S_028A98_EN_VRS_RATE(enable_vrs) |
6619 S_028A98_EN_PRIM_PAYLOAD(enable_prim_payload));
6620 }
6621
6622 static bool
6623 gfx103_pipeline_vrs_coarse_shading(const struct radv_graphics_pipeline *pipeline)
6624 {
6625 struct radv_shader *ps = pipeline->base.shaders[MESA_SHADER_FRAGMENT];
6626 struct radv_device *device = pipeline->base.device;
6627
6628 if (device->instance->debug_flags & RADV_DEBUG_NO_VRS_FLAT_SHADING)
6629 return false;
6630
6631 if (!ps->info.ps.allow_flat_shading)
6632 return false;
6633
6634 return true;
6635 }
6636
6637 static void
6638 gfx103_pipeline_emit_vrs_state(struct radeon_cmdbuf *ctx_cs,
6639 const struct radv_graphics_pipeline *pipeline,
6640 const struct radv_graphics_pipeline_info *info)
6641 {
6642 const struct radv_physical_device *pdevice = pipeline->base.device->physical_device;
6643 uint32_t mode = V_028064_VRS_COMB_MODE_PASSTHRU;
6644 uint8_t rate_x = 0, rate_y = 0;
6645 bool enable_vrs = radv_is_vrs_enabled(pipeline, info);
6646
6647 if (!enable_vrs && gfx103_pipeline_vrs_coarse_shading(pipeline)) {
6648 /* When per-draw VRS is not enabled at all, try enabling VRS coarse shading 2x2 if the driver
6649 * determined that it's safe to enable.
6650 */
6651 mode = V_028064_VRS_COMB_MODE_OVERRIDE;
6652 rate_x = rate_y = 1;
6653 } else if (!radv_is_static_vrs_enabled(pipeline, info) && pipeline->force_vrs_per_vertex &&
6654 get_vs_output_info(pipeline)->writes_primitive_shading_rate) {
6655 /* Otherwise, if per-draw VRS is not enabled statically, try forcing per-vertex VRS if
6656 * requested by the user. Note that vkd3d-proton always has to declare VRS as dynamic because
6657 * in DX12 it's fully dynamic.
6658 */
6659 radeon_set_context_reg(ctx_cs, R_028848_PA_CL_VRS_CNTL,
6660 S_028848_SAMPLE_ITER_COMBINER_MODE(V_028848_VRS_COMB_MODE_OVERRIDE) |
6661 S_028848_VERTEX_RATE_COMBINER_MODE(V_028848_VRS_COMB_MODE_OVERRIDE));
6662
6663 /* If the shader is using discard, turn off coarse shading because discard at 2x2 pixel
6664 * granularity degrades quality too much. MIN allows sample shading but not coarse shading.
6665 */
6666 struct radv_shader *ps = pipeline->base.shaders[MESA_SHADER_FRAGMENT];
6667
6668 mode = ps->info.ps.can_discard ? V_028064_VRS_COMB_MODE_MIN : V_028064_VRS_COMB_MODE_PASSTHRU;
6669 }
6670
6671 if (pdevice->rad_info.gfx_level >= GFX11) {
6672 radeon_set_context_reg(ctx_cs, R_0283D0_PA_SC_VRS_OVERRIDE_CNTL,
6673 S_0283D0_VRS_OVERRIDE_RATE_COMBINER_MODE(mode) |
6674 S_0283D0_VRS_RATE((rate_x << 2) | rate_y));
6675 } else {
6676 radeon_set_context_reg(ctx_cs, R_028064_DB_VRS_OVERRIDE_CNTL,
6677 S_028064_VRS_OVERRIDE_RATE_COMBINER_MODE(mode) |
6678 S_028064_VRS_OVERRIDE_RATE_X(rate_x) |
6679 S_028064_VRS_OVERRIDE_RATE_Y(rate_y));
6680 }
6681 }
6682
6683 static void
6684 radv_pipeline_emit_pm4(struct radv_graphics_pipeline *pipeline,
6685 const struct radv_blend_state *blend,
6686 const struct radv_depth_stencil_state *ds_state,
6687 uint32_t vgt_gs_out_prim_type,
6688 const struct radv_graphics_pipeline_info *info)
6689 {
6690 const struct radv_physical_device *pdevice = pipeline->base.device->physical_device;
6691 struct radeon_cmdbuf *ctx_cs = &pipeline->base.ctx_cs;
6692 struct radeon_cmdbuf *cs = &pipeline->base.cs;
6693
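   /* Both PM4 streams share a single allocation: the SH-register stream (cs) first,
    * the context-register stream (ctx_cs) immediately after it. */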
6694 cs->max_dw = 64;
6695 ctx_cs->max_dw = 256;
6696 cs->buf = malloc(4 * (cs->max_dw + ctx_cs->max_dw));
6697 ctx_cs->buf = cs->buf + cs->max_dw;
6698
6699 radv_pipeline_emit_depth_stencil_state(ctx_cs, ds_state);
6700 radv_pipeline_emit_blend_state(ctx_cs, pipeline, blend);
6701 radv_pipeline_emit_raster_state(ctx_cs, pipeline, info);
6702 radv_pipeline_emit_multisample_state(ctx_cs, pipeline);
6703 radv_pipeline_emit_vgt_gs_mode(ctx_cs, pipeline);
6704 radv_pipeline_emit_vertex_shader(ctx_cs, cs, pipeline);
6705 radv_pipeline_emit_mesh_shader(ctx_cs, cs, pipeline);
6706
6707 if (radv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_CTRL)) {
6708 radv_pipeline_emit_tess_shaders(ctx_cs, cs, pipeline);
6709 radv_pipeline_emit_tess_state(ctx_cs, pipeline, info);
6710 }
6711
6712 radv_pipeline_emit_geometry_shader(ctx_cs, cs, pipeline);
6713 radv_pipeline_emit_fragment_shader(ctx_cs, cs, pipeline);
6714 radv_pipeline_emit_ps_inputs(ctx_cs, pipeline);
6715 radv_pipeline_emit_vgt_vertex_reuse(ctx_cs, pipeline);
6716 radv_pipeline_emit_vgt_shader_config(ctx_cs, pipeline);
6717 radv_pipeline_emit_cliprect_rule(ctx_cs, info);
6718 radv_pipeline_emit_vgt_gs_out(ctx_cs, pipeline, vgt_gs_out_prim_type);
6719
6720 if (pdevice->rad_info.gfx_level >= GFX10 && !radv_pipeline_has_ngg(pipeline))
6721 gfx10_pipeline_emit_ge_cntl(ctx_cs, pipeline);
6722
6723 if (pdevice->rad_info.gfx_level >= GFX10_3) {
6724 gfx103_pipeline_emit_vgt_draw_payload_cntl(ctx_cs, pipeline, info);
6725 gfx103_pipeline_emit_vrs_state(ctx_cs, pipeline, info);
6726 }
6727
6728 pipeline->base.ctx_cs_hash = _mesa_hash_data(ctx_cs->buf, ctx_cs->cdw * 4);
6729
6730 assert(ctx_cs->cdw <= ctx_cs->max_dw);
6731 assert(cs->cdw <= cs->max_dw);
6732 }
6733
6734 static void
6735 radv_pipeline_init_vertex_input_state(struct radv_graphics_pipeline *pipeline,
6736 const struct radv_graphics_pipeline_info *info)
6737 {
6738 const struct radv_physical_device *pdevice = pipeline->base.device->physical_device;
6739 const struct radv_shader_info *vs_info = &radv_get_shader(&pipeline->base, MESA_SHADER_VERTEX)->info;
6740
6741 for (uint32_t i = 0; i < MAX_VERTEX_ATTRIBS; i++) {
6742 pipeline->attrib_ends[i] = info->vi.attrib_ends[i];
6743 pipeline->attrib_index_offset[i] = info->vi.attrib_index_offset[i];
6744 pipeline->attrib_bindings[i] = info->vi.attrib_bindings[i];
6745 }
6746
6747 for (uint32_t i = 0; i < MAX_VBS; i++) {
6748 pipeline->binding_stride[i] = info->vi.binding_stride[i];
6749 }
6750
6751 pipeline->use_per_attribute_vb_descs = vs_info->vs.use_per_attribute_vb_descs;
6752 pipeline->last_vertex_attrib_bit = util_last_bit(vs_info->vs.vb_desc_usage_mask);
6753 if (pipeline->base.shaders[MESA_SHADER_VERTEX])
6754 pipeline->next_vertex_stage = MESA_SHADER_VERTEX;
6755 else if (pipeline->base.shaders[MESA_SHADER_TESS_CTRL])
6756 pipeline->next_vertex_stage = MESA_SHADER_TESS_CTRL;
6757 else
6758 pipeline->next_vertex_stage = MESA_SHADER_GEOMETRY;
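   /* The fast vertex-input path is only usable when the VS was compiled with the
    * device-wide NGG mode and geometry wave size. */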
6759 if (pipeline->next_vertex_stage == MESA_SHADER_VERTEX) {
6760 const struct radv_shader *vs_shader = pipeline->base.shaders[MESA_SHADER_VERTEX];
6761 pipeline->can_use_simple_input = vs_shader->info.is_ngg == pdevice->use_ngg &&
6762 vs_shader->info.wave_size == pdevice->ge_wave_size;
6763 } else {
6764 pipeline->can_use_simple_input = false;
6765 }
6766 if (vs_info->vs.dynamic_inputs)
6767 pipeline->vb_desc_usage_mask = BITFIELD_MASK(pipeline->last_vertex_attrib_bit);
6768 else
6769 pipeline->vb_desc_usage_mask = vs_info->vs.vb_desc_usage_mask;
6770 pipeline->vb_desc_alloc_size = util_bitcount(pipeline->vb_desc_usage_mask) * 16;
6771 }
6772
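/* Return the last pre-rasterization stage (GS down to VS) that writes streamout
 * outputs, if any. */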
6773 static struct radv_shader *
6774 radv_pipeline_get_streamout_shader(struct radv_graphics_pipeline *pipeline)
6775 {
6776 int i;
6777
6778 for (i = MESA_SHADER_GEOMETRY; i >= MESA_SHADER_VERTEX; i--) {
6779 struct radv_shader *shader = radv_get_shader(&pipeline->base, i);
6780
6781 if (shader && shader->info.so.num_outputs > 0)
6782 return shader;
6783 }
6784
6785 return NULL;
6786 }
6787
6788 static bool
6789 radv_shader_need_indirect_descriptor_sets(struct radv_pipeline *pipeline, gl_shader_stage stage)
6790 {
6791 struct radv_userdata_info *loc =
6792 radv_lookup_user_sgpr(pipeline, stage, AC_UD_INDIRECT_DESCRIPTOR_SETS);
6793 return loc->sgpr_idx != -1;
6794 }
6795
6796 static void
6797 radv_pipeline_init_shader_stages_state(struct radv_graphics_pipeline *pipeline)
6798 {
6799 struct radv_device *device = pipeline->base.device;
6800
6801 for (unsigned i = 0; i < MESA_VULKAN_SHADER_STAGES; i++) {
6802 bool shader_exists = !!pipeline->base.shaders[i];
6803 if (shader_exists || i < MESA_SHADER_COMPUTE) {
6804 /* We need this info for some stages even when the shader doesn't exist. */
6805 pipeline->base.user_data_0[i] = radv_pipeline_stage_to_user_data_0(
6806 pipeline, i, device->physical_device->rad_info.gfx_level);
6807
6808 if (shader_exists)
6809 pipeline->base.need_indirect_descriptor_sets |=
6810 radv_shader_need_indirect_descriptor_sets(&pipeline->base, i);
6811 }
6812 }
6813
6814 gl_shader_stage first_stage =
6815 radv_pipeline_has_stage(pipeline, MESA_SHADER_MESH) ? MESA_SHADER_MESH : MESA_SHADER_VERTEX;
6816
6817 struct radv_userdata_info *loc =
6818 radv_lookup_user_sgpr(&pipeline->base, first_stage, AC_UD_VS_BASE_VERTEX_START_INSTANCE);
6819 if (loc->sgpr_idx != -1) {
6820 pipeline->vtx_base_sgpr = pipeline->base.user_data_0[first_stage];
6821 pipeline->vtx_base_sgpr += loc->sgpr_idx * 4;
6822 pipeline->vtx_emit_num = loc->num_sgprs;
6823 pipeline->uses_drawid =
6824 radv_get_shader(&pipeline->base, first_stage)->info.vs.needs_draw_id;
6825 pipeline->uses_baseinstance =
6826 radv_get_shader(&pipeline->base, first_stage)->info.vs.needs_base_instance;
6827
6828 assert(first_stage != MESA_SHADER_MESH || !pipeline->uses_baseinstance);
6829 }
6830 }
6831
6832 static uint32_t
6833 radv_pipeline_init_vgt_gs_out(struct radv_graphics_pipeline *pipeline,
6834 const struct radv_graphics_pipeline_info *info)
6835 {
6836 uint32_t gs_out;
6837
6838 if (radv_pipeline_has_stage(pipeline, MESA_SHADER_GEOMETRY)) {
6839 gs_out =
6840 si_conv_gl_prim_to_gs_out(pipeline->base.shaders[MESA_SHADER_GEOMETRY]->info.gs.output_prim);
6841 } else if (radv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_CTRL)) {
6842 if (pipeline->base.shaders[MESA_SHADER_TESS_EVAL]->info.tes.point_mode) {
6843 gs_out = V_028A6C_POINTLIST;
6844 } else {
6845 gs_out = si_conv_tess_prim_to_gs_out(
6846 pipeline->base.shaders[MESA_SHADER_TESS_EVAL]->info.tes._primitive_mode);
6847 }
6848 } else if (radv_pipeline_has_stage(pipeline, MESA_SHADER_MESH)) {
6849 gs_out =
6850 si_conv_gl_prim_to_gs_out(pipeline->base.shaders[MESA_SHADER_MESH]->info.ms.output_prim);
6851 } else {
6852 gs_out = si_conv_prim_to_gs_out(info->ia.primitive_topology);
6853 }
6854
6855 return gs_out;
6856 }
6857
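/* Apply driver-internal (meta) pipeline overrides: the custom CB blend modes
 * used by clears/resolves/decompressions, the RECTLIST topology, and extra
 * DB_RENDER_CONTROL bits for depth/stencil clears and decompressions.
 */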
6858 static void
6859 radv_pipeline_init_extra(struct radv_graphics_pipeline *pipeline,
6860 const struct radv_graphics_pipeline_create_info *extra,
6861 struct radv_blend_state *blend_state,
6862 struct radv_depth_stencil_state *ds_state,
6863 const struct radv_graphics_pipeline_info *info,
6864 uint32_t *vgt_gs_out_prim_type)
6865 {
6866 if (extra->custom_blend_mode == V_028808_CB_ELIMINATE_FAST_CLEAR ||
6867 extra->custom_blend_mode == V_028808_CB_FMASK_DECOMPRESS ||
6868 extra->custom_blend_mode == V_028808_CB_DCC_DECOMPRESS_GFX8 ||
6869 extra->custom_blend_mode == V_028808_CB_DCC_DECOMPRESS_GFX11 ||
6870 extra->custom_blend_mode == V_028808_CB_RESOLVE) {
6871 /* The CB spec states that CB_SHADER_MASK should be set to enable writes to all four
6872 * channels of MRT0.
6873 */
6874 blend_state->cb_shader_mask = 0xf;
6875
6876 if (extra->custom_blend_mode == V_028808_CB_RESOLVE)
6877 pipeline->cb_color_control |= S_028808_DISABLE_DUAL_QUAD(1);
6878
6879 pipeline->cb_color_control &= C_028808_MODE;
6880 pipeline->cb_color_control |= S_028808_MODE(extra->custom_blend_mode);
6881 }
6882
6883 if (extra->use_rectlist) {
6884 struct radv_dynamic_state *dynamic = &pipeline->dynamic_state;
6885 dynamic->primitive_topology = V_008958_DI_PT_RECTLIST;
6886
6887 *vgt_gs_out_prim_type = V_028A6C_TRISTRIP;
6888 if (radv_pipeline_has_ngg(pipeline))
6889 *vgt_gs_out_prim_type = V_028A6C_RECTLIST;
6890
6891 pipeline->rast_prim = *vgt_gs_out_prim_type;
6892 }
6893
6894 if (radv_pipeline_has_ds_attachments(&info->ri)) {
6895 ds_state->db_render_control |= S_028000_DEPTH_CLEAR_ENABLE(extra->db_depth_clear);
6896 ds_state->db_render_control |= S_028000_STENCIL_CLEAR_ENABLE(extra->db_stencil_clear);
6897 ds_state->db_render_control |= S_028000_RESUMMARIZE_ENABLE(extra->resummarize_enable);
6898 ds_state->db_render_control |= S_028000_DEPTH_COMPRESS_DISABLE(extra->depth_compress_disable);
6899 ds_state->db_render_control |= S_028000_STENCIL_COMPRESS_DISABLE(extra->stencil_compress_disable);
6900 }
6901 }
6902
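/* Base initialization shared by every pipeline type. */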
6903 void
6904 radv_pipeline_init(struct radv_device *device, struct radv_pipeline *pipeline,
6905 enum radv_pipeline_type type)
6906 {
6907 vk_object_base_init(&device->vk, &pipeline->base, VK_OBJECT_TYPE_PIPELINE);
6908
6909 pipeline->device = device;
6910 pipeline->type = type;
6911 }
6912
6913 static VkResult
6914 radv_graphics_pipeline_init(struct radv_graphics_pipeline *pipeline, struct radv_device *device,
6915 struct radv_pipeline_cache *cache,
6916 const VkGraphicsPipelineCreateInfo *pCreateInfo,
6917 const struct radv_graphics_pipeline_create_info *extra)
6918 {
6919 RADV_FROM_HANDLE(radv_pipeline_layout, pipeline_layout, pCreateInfo->layout);
6920 VkResult result;
6921
6922 pipeline->last_vgt_api_stage = MESA_SHADER_NONE;
6923
6924 /* Mark all states declared dynamic at pipeline creation. */
6925 if (pCreateInfo->pDynamicState) {
6926 uint32_t count = pCreateInfo->pDynamicState->dynamicStateCount;
6927 for (uint32_t s = 0; s < count; s++) {
6928 pipeline->dynamic_states |=
6929 radv_dynamic_state_mask(pCreateInfo->pDynamicState->pDynamicStates[s]);
6930 }
6931 }
6932
6933 /* Mark all active stages at pipeline creation. */
6934 for (uint32_t i = 0; i < pCreateInfo->stageCount; i++) {
6935 const VkPipelineShaderStageCreateInfo *sinfo = &pCreateInfo->pStages[i];
6936
6937 pipeline->active_stages |= sinfo->stage;
6938 }
6939
6940 struct radv_graphics_pipeline_info info = radv_pipeline_init_graphics_info(pipeline, pCreateInfo);
6941
6942 struct radv_blend_state blend = radv_pipeline_init_blend_state(pipeline, pCreateInfo, &info);
6943
6944 const VkPipelineCreationFeedbackCreateInfo *creation_feedback =
6945 vk_find_struct_const(pCreateInfo->pNext, PIPELINE_CREATION_FEEDBACK_CREATE_INFO);
6946
6947 struct radv_pipeline_key key =
6948 radv_generate_graphics_pipeline_key(pipeline, pCreateInfo, &info, &blend);
6949
6950 result = radv_create_shaders(&pipeline->base, pipeline_layout, device, cache, &key, pCreateInfo->pStages,
6951 pCreateInfo->stageCount, pCreateInfo->flags, NULL,
6952 creation_feedback, NULL, NULL, &pipeline->last_vgt_api_stage);
6953 if (result != VK_SUCCESS)
6954 return result;
6955
6956 pipeline->spi_baryc_cntl = S_0286E0_FRONT_FACE_ALL_BITS(1);
6957
6958 uint32_t vgt_gs_out_prim_type = radv_pipeline_init_vgt_gs_out(pipeline, &info);
6959
6960 radv_pipeline_init_multisample_state(pipeline, &blend, &info, vgt_gs_out_prim_type);
6961
6962 if (!radv_pipeline_has_stage(pipeline, MESA_SHADER_MESH))
6963 radv_pipeline_init_input_assembly_state(pipeline, &info);
6964 radv_pipeline_init_dynamic_state(pipeline, &info);
6965
6966 pipeline->negative_one_to_one = info.vp.negative_one_to_one;
6967
6968 radv_pipeline_init_raster_state(pipeline, &info);
6969
6970 struct radv_depth_stencil_state ds_state =
6971 radv_pipeline_init_depth_stencil_state(pipeline, &info);
6972
6973 if (device->physical_device->rad_info.gfx_level >= GFX10_3)
6974 gfx103_pipeline_init_vrs_state(pipeline, &info);
6975
6976 /* Ensure that some export memory is always allocated, for two reasons:
6977 *
6978 * 1) Correctness: The hardware ignores the EXEC mask if no export
6979 * memory is allocated, so KILL and alpha test do not work correctly
6980 * without this.
6981 * 2) Performance: Every shader needs at least a NULL export, even when
6982 * it writes no color/depth output. The NULL export instruction
6983 * stalls without this setting.
6984 *
6985 * Don't add this to CB_SHADER_MASK.
6986 *
6987 * GFX10 supports pixel shaders without exports by setting both the
6988 * color and Z formats to SPI_SHADER_ZERO. The hw will skip export
6989 * instructions if any are present.
6990 */
6991 struct radv_shader *ps = pipeline->base.shaders[MESA_SHADER_FRAGMENT];
6992 if ((device->physical_device->rad_info.gfx_level <= GFX9 || ps->info.ps.can_discard) &&
6993 !blend.spi_shader_col_format) {
6994 if (!ps->info.ps.writes_z && !ps->info.ps.writes_stencil && !ps->info.ps.writes_sample_mask)
6995 blend.spi_shader_col_format = V_028714_SPI_SHADER_32_R;
6996 }
6997
6998 pipeline->col_format = blend.spi_shader_col_format;
6999 pipeline->cb_target_mask = blend.cb_target_mask;
7000
7001 if (radv_pipeline_has_stage(pipeline, MESA_SHADER_GEOMETRY) && !radv_pipeline_has_ngg(pipeline)) {
7002 struct radv_shader *gs = pipeline->base.shaders[MESA_SHADER_GEOMETRY];
7003
7004 radv_pipeline_init_gs_ring_state(pipeline, &gs->info.gs_ring_info);
7005 }
7006
7007 if (radv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_CTRL)) {
7008 pipeline->tess_patch_control_points = info.ts.patch_control_points;
7009 }
7010
7011 if (!radv_pipeline_has_stage(pipeline, MESA_SHADER_MESH))
7012 radv_pipeline_init_vertex_input_state(pipeline, &info);
7013
7014 radv_pipeline_init_binning_state(pipeline, &blend, &info);
7015 radv_pipeline_init_shader_stages_state(pipeline);
7016 radv_pipeline_init_scratch(device, &pipeline->base);
7017
7018 /* Find the last pre-rasterization shader stage that writes streamout outputs. */
7019 pipeline->streamout_shader = radv_pipeline_get_streamout_shader(pipeline);
7020
7021 pipeline->is_ngg = radv_pipeline_has_ngg(pipeline);
7022 pipeline->has_ngg_culling =
7023 pipeline->is_ngg &&
7024 pipeline->base.shaders[pipeline->last_vgt_api_stage]->info.has_ngg_culling;
7025 pipeline->force_vrs_per_vertex =
7026 pipeline->base.shaders[pipeline->last_vgt_api_stage]->info.force_vrs_per_vertex;
7027 pipeline->uses_user_sample_locations = info.ms.sample_locs_enable;
7028 pipeline->rast_prim = vgt_gs_out_prim_type;
7029
7030 if (!(pipeline->dynamic_states & RADV_DYNAMIC_LINE_WIDTH)) {
7031 pipeline->line_width = info.rs.line_width;
7032 }
7033
7034 pipeline->base.push_constant_size = pipeline_layout->push_constant_size;
7035 pipeline->base.dynamic_offset_count = pipeline_layout->dynamic_offset_count;
7036
7037 if (extra) {
7038 radv_pipeline_init_extra(pipeline, extra, &blend, &ds_state, &info, &vgt_gs_out_prim_type);
7039 }
7040
7041 radv_pipeline_emit_pm4(pipeline, &blend, &ds_state, vgt_gs_out_prim_type, &info);
7042
7043 return result;
7044 }
7045
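/* Allocate and initialize a graphics pipeline from create info that already
 * uses dynamic-rendering style structures (no legacy render pass).
 */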
7046 static VkResult
7047 radv_graphics_pipeline_create_nonlegacy(VkDevice _device, VkPipelineCache _cache,
7048 const VkGraphicsPipelineCreateInfo *pCreateInfo,
7049 const struct radv_graphics_pipeline_create_info *extra,
7050 const VkAllocationCallbacks *pAllocator,
7051 VkPipeline *pPipeline)
7052 {
7053 RADV_FROM_HANDLE(radv_device, device, _device);
7054 RADV_FROM_HANDLE(radv_pipeline_cache, cache, _cache);
7055 struct radv_graphics_pipeline *pipeline;
7056 VkResult result;
7057
7058 pipeline = vk_zalloc2(&device->vk.alloc, pAllocator, sizeof(*pipeline), 8,
7059 VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
7060 if (pipeline == NULL)
7061 return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
7062
7063 radv_pipeline_init(device, &pipeline->base, RADV_PIPELINE_GRAPHICS);
7064
7065 result = radv_graphics_pipeline_init(pipeline, device, cache, pCreateInfo, extra);
7066 if (result != VK_SUCCESS) {
7067 radv_pipeline_destroy(device, &pipeline->base, pAllocator);
7068 return result;
7069 }
7070
7071 *pPipeline = radv_pipeline_to_handle(&pipeline->base);
7072
7073 return VK_SUCCESS;
7074 }
7075
7076 /* This is a wrapper for radv_graphics_pipeline_create_nonlegacy that does all legacy conversions
7077 * for the VkGraphicsPipelineCreateInfo data. */
7078 VkResult
7079 radv_graphics_pipeline_create(VkDevice _device, VkPipelineCache _cache,
7080 const VkGraphicsPipelineCreateInfo *pCreateInfo,
7081 const struct radv_graphics_pipeline_create_info *extra,
7082 const VkAllocationCallbacks *pAllocator, VkPipeline *pPipeline)
7083 {
7084 VkGraphicsPipelineCreateInfo create_info = *pCreateInfo;
7085
7086 VkPipelineRenderingCreateInfo rendering_create_info;
7087 VkFormat color_formats[MAX_RTS];
7088 VkAttachmentSampleCountInfoAMD sample_info;
7089 VkSampleCountFlagBits samples[MAX_RTS];
7090 if (pCreateInfo->renderPass != VK_NULL_HANDLE) {
7091 RADV_FROM_HANDLE(radv_render_pass, pass, pCreateInfo->renderPass);
7092 struct radv_subpass *subpass = pass->subpasses + pCreateInfo->subpass;
7093
7094 rendering_create_info.sType = VK_STRUCTURE_TYPE_PIPELINE_RENDERING_CREATE_INFO;
7095 rendering_create_info.pNext = create_info.pNext;
7096 create_info.pNext = &rendering_create_info;
7097
7098 rendering_create_info.viewMask = subpass->view_mask;
7099
7100 VkFormat ds_format =
7101 subpass->depth_stencil_attachment
7102 ? pass->attachments[subpass->depth_stencil_attachment->attachment].format
7103 : VK_FORMAT_UNDEFINED;
7104
7105 rendering_create_info.depthAttachmentFormat =
7106 vk_format_has_depth(ds_format) ? ds_format : VK_FORMAT_UNDEFINED;
7107 rendering_create_info.stencilAttachmentFormat =
7108 vk_format_has_stencil(ds_format) ? ds_format : VK_FORMAT_UNDEFINED;
7109
7110 rendering_create_info.colorAttachmentCount = subpass->color_count;
7111 rendering_create_info.pColorAttachmentFormats = color_formats;
7112 for (unsigned i = 0; i < rendering_create_info.colorAttachmentCount; ++i) {
7113 if (subpass->color_attachments[i].attachment != VK_ATTACHMENT_UNUSED)
7114 color_formats[i] = pass->attachments[subpass->color_attachments[i].attachment].format;
7115 else
7116 color_formats[i] = VK_FORMAT_UNDEFINED;
7117 }
7118
7119 create_info.renderPass = VK_NULL_HANDLE;
7120
7121 sample_info.sType = VK_STRUCTURE_TYPE_ATTACHMENT_SAMPLE_COUNT_INFO_AMD;
7122 sample_info.pNext = create_info.pNext;
7123 create_info.pNext = &sample_info;
7124
7125 sample_info.colorAttachmentCount = rendering_create_info.colorAttachmentCount;
7126 sample_info.pColorAttachmentSamples = samples;
7127 for (unsigned i = 0; i < sample_info.colorAttachmentCount; ++i) {
7128 if (subpass->color_attachments[i].attachment != VK_ATTACHMENT_UNUSED) {
7129 samples[i] = pass->attachments[subpass->color_attachments[i].attachment].samples;
7130 } else
7131 samples[i] = 1;
7132 }
7133 sample_info.depthStencilAttachmentSamples = subpass->depth_sample_count;
7134 }
7135
7136 return radv_graphics_pipeline_create_nonlegacy(_device, _cache, &create_info, extra, pAllocator,
7137 pPipeline);
7138 }
7139
7140 VKAPI_ATTR VkResult VKAPI_CALL
7141 radv_CreateGraphicsPipelines(VkDevice _device, VkPipelineCache pipelineCache, uint32_t count,
7142 const VkGraphicsPipelineCreateInfo *pCreateInfos,
7143 const VkAllocationCallbacks *pAllocator, VkPipeline *pPipelines)
7144 {
7145 VkResult result = VK_SUCCESS;
7146 unsigned i = 0;
7147
7148 for (; i < count; i++) {
7149 VkResult r;
7150 r = radv_graphics_pipeline_create(_device, pipelineCache, &pCreateInfos[i], NULL, pAllocator,
7151 &pPipelines[i]);
7152 if (r != VK_SUCCESS) {
7153 result = r;
7154 pPipelines[i] = VK_NULL_HANDLE;
7155
7156 if (pCreateInfos[i].flags & VK_PIPELINE_CREATE_EARLY_RETURN_ON_FAILURE_BIT)
7157 break;
7158 }
7159 }
7160
7161 for (; i < count; ++i)
7162 pPipelines[i] = VK_NULL_HANDLE;
7163
7164 return result;
7165 }
7166
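/* Emit the compute shader program address and resource registers
 * (COMPUTE_PGM_LO, COMPUTE_PGM_RSRC1/RSRC2, plus RSRC3 on GFX10+).
 */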
7167 void
7168 radv_pipeline_emit_hw_cs(const struct radv_physical_device *pdevice, struct radeon_cmdbuf *cs,
7169 const struct radv_shader *shader)
7170 {
7171 uint64_t va = radv_shader_get_va(shader);
7172
7173 radeon_set_sh_reg(cs, R_00B830_COMPUTE_PGM_LO, va >> 8);
7174
7175 radeon_set_sh_reg_seq(cs, R_00B848_COMPUTE_PGM_RSRC1, 2);
7176 radeon_emit(cs, shader->config.rsrc1);
7177 radeon_emit(cs, shader->config.rsrc2);
7178 if (pdevice->rad_info.gfx_level >= GFX10) {
7179 radeon_set_sh_reg(cs, R_00B8A0_COMPUTE_PGM_RSRC3, shader->config.rsrc3);
7180 }
7181 }
7182
7183 void
7184 radv_pipeline_emit_compute_state(const struct radv_physical_device *pdevice,
7185 struct radeon_cmdbuf *cs, const struct radv_shader *shader)
7186 {
7187 unsigned threads_per_threadgroup;
7188 unsigned threadgroups_per_cu = 1;
7189 unsigned waves_per_threadgroup;
7190 unsigned max_waves_per_sh = 0;
7191
7192 /* Calculate best compute resource limits. */
7193 threads_per_threadgroup =
7194 shader->info.cs.block_size[0] * shader->info.cs.block_size[1] * shader->info.cs.block_size[2];
7195 waves_per_threadgroup = DIV_ROUND_UP(threads_per_threadgroup, shader->info.wave_size);
7196
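/* On GFX10+, a threadgroup that fits in a single wave can share a CU with
 * another one; request two threadgroups per CU in that case (assumed to help
 * occupancy for small dispatches).
 */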
7197 if (pdevice->rad_info.gfx_level >= GFX10 && waves_per_threadgroup == 1)
7198 threadgroups_per_cu = 2;
7199
7200 radeon_set_sh_reg(
7201 cs, R_00B854_COMPUTE_RESOURCE_LIMITS,
7202 ac_get_compute_resource_limits(&pdevice->rad_info, waves_per_threadgroup,
7203 max_waves_per_sh, threadgroups_per_cu));
7204
7205 radeon_set_sh_reg_seq(cs, R_00B81C_COMPUTE_NUM_THREAD_X, 3);
7206 radeon_emit(cs, S_00B81C_NUM_THREAD_FULL(shader->info.cs.block_size[0]));
7207 radeon_emit(cs, S_00B81C_NUM_THREAD_FULL(shader->info.cs.block_size[1]));
7208 radeon_emit(cs, S_00B81C_NUM_THREAD_FULL(shader->info.cs.block_size[2]));
7209 }
7210
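/* Pre-build the PM4 command stream that configures this compute pipeline so
 * it can be replayed cheaply when the pipeline is bound.
 */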
7211 static void
7212 radv_compute_generate_pm4(struct radv_compute_pipeline *pipeline)
7213 {
7214 struct radv_physical_device *pdevice = pipeline->base.device->physical_device;
7215 struct radv_shader *shader = pipeline->base.shaders[MESA_SHADER_COMPUTE];
7216 struct radeon_cmdbuf *cs = &pipeline->base.cs;
7217
7218 cs->max_dw = pdevice->rad_info.gfx_level >= GFX10 ? 19 : 16;
7219 cs->buf = malloc(cs->max_dw * 4);
7220
7221 radv_pipeline_emit_hw_cs(pdevice, cs, shader);
7222 radv_pipeline_emit_compute_state(pdevice, cs, shader);
7223
7224 assert(pipeline->base.cs.cdw <= pipeline->base.cs.max_dw);
7225 }
7226
7227 static struct radv_pipeline_key
7228 radv_generate_compute_pipeline_key(struct radv_compute_pipeline *pipeline,
7229 const VkComputePipelineCreateInfo *pCreateInfo)
7230 {
7231 const VkPipelineShaderStageCreateInfo *stage = &pCreateInfo->stage;
7232 struct radv_pipeline_key key = radv_generate_pipeline_key(&pipeline->base, pCreateInfo->flags);
7233
7234 const VkPipelineShaderStageRequiredSubgroupSizeCreateInfo *subgroup_size =
7235 vk_find_struct_const(stage->pNext,
7236 PIPELINE_SHADER_STAGE_REQUIRED_SUBGROUP_SIZE_CREATE_INFO);
7237
7238 if (subgroup_size) {
7239 assert(subgroup_size->requiredSubgroupSize == 32 ||
7240 subgroup_size->requiredSubgroupSize == 64);
7241 key.cs.compute_subgroup_size = subgroup_size->requiredSubgroupSize;
7242 } else if (stage->flags & VK_PIPELINE_SHADER_STAGE_CREATE_REQUIRE_FULL_SUBGROUPS_BIT) {
7243 key.cs.require_full_subgroups = true;
7244 }
7245
7246 return key;
7247 }
7248
7249 VkResult
7250 radv_compute_pipeline_create(VkDevice _device, VkPipelineCache _cache,
7251 const VkComputePipelineCreateInfo *pCreateInfo,
7252 const VkAllocationCallbacks *pAllocator, const uint8_t *custom_hash,
7253 struct radv_pipeline_shader_stack_size *rt_stack_sizes,
7254 uint32_t rt_group_count, VkPipeline *pPipeline)
7255 {
7256 RADV_FROM_HANDLE(radv_device, device, _device);
7257 RADV_FROM_HANDLE(radv_pipeline_cache, cache, _cache);
7258 RADV_FROM_HANDLE(radv_pipeline_layout, pipeline_layout, pCreateInfo->layout);
7259 struct radv_compute_pipeline *pipeline;
7260 VkResult result;
7261
7262 pipeline = vk_zalloc2(&device->vk.alloc, pAllocator, sizeof(*pipeline), 8,
7263 VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
7264 if (pipeline == NULL) {
7265 free(rt_stack_sizes);
7266 return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
7267 }
7268
7269 radv_pipeline_init(device, &pipeline->base, RADV_PIPELINE_COMPUTE);
7270
7271 pipeline->rt_stack_sizes = rt_stack_sizes;
7272 pipeline->group_count = rt_group_count;
7273
7274 const VkPipelineCreationFeedbackCreateInfo *creation_feedback =
7275 vk_find_struct_const(pCreateInfo->pNext, PIPELINE_CREATION_FEEDBACK_CREATE_INFO);
7276
7277 struct radv_pipeline_key key = radv_generate_compute_pipeline_key(pipeline, pCreateInfo);
7278
7279 UNUSED gl_shader_stage last_vgt_api_stage = MESA_SHADER_NONE;
7280 result = radv_create_shaders(&pipeline->base, pipeline_layout, device, cache, &key, &pCreateInfo->stage,
7281 1, pCreateInfo->flags, custom_hash, creation_feedback,
7282 &pipeline->rt_stack_sizes, &pipeline->group_count,
7283 &last_vgt_api_stage);
7284 if (result != VK_SUCCESS) {
7285 radv_pipeline_destroy(device, &pipeline->base, pAllocator);
7286 return result;
7287 }
7288
7289 pipeline->base.user_data_0[MESA_SHADER_COMPUTE] = R_00B900_COMPUTE_USER_DATA_0;
7290 pipeline->base.need_indirect_descriptor_sets |=
7291 radv_shader_need_indirect_descriptor_sets(&pipeline->base, MESA_SHADER_COMPUTE);
7292 radv_pipeline_init_scratch(device, &pipeline->base);
7293
7294 pipeline->base.push_constant_size = pipeline_layout->push_constant_size;
7295 pipeline->base.dynamic_offset_count = pipeline_layout->dynamic_offset_count;
7296
7297 if (device->physical_device->rad_info.has_cs_regalloc_hang_bug) {
7298 struct radv_shader *compute_shader = pipeline->base.shaders[MESA_SHADER_COMPUTE];
7299 unsigned *cs_block_size = compute_shader->info.cs.block_size;
7300
7301 pipeline->cs_regalloc_hang_bug = cs_block_size[0] * cs_block_size[1] * cs_block_size[2] > 256;
7302 }
7303
7304 radv_compute_generate_pm4(pipeline);
7305
7306 *pPipeline = radv_pipeline_to_handle(&pipeline->base);
7307
7308 return VK_SUCCESS;
7309 }
7310
7311 VKAPI_ATTR VkResult VKAPI_CALL
7312 radv_CreateComputePipelines(VkDevice _device, VkPipelineCache pipelineCache, uint32_t count,
7313 const VkComputePipelineCreateInfo *pCreateInfos,
7314 const VkAllocationCallbacks *pAllocator, VkPipeline *pPipelines)
7315 {
7316 VkResult result = VK_SUCCESS;
7317
7318 unsigned i = 0;
7319 for (; i < count; i++) {
7320 VkResult r;
7321 r = radv_compute_pipeline_create(_device, pipelineCache, &pCreateInfos[i], pAllocator, NULL,
7322 NULL, 0, &pPipelines[i]);
7323 if (r != VK_SUCCESS) {
7324 result = r;
7325 pPipelines[i] = VK_NULL_HANDLE;
7326
7327 if (pCreateInfos[i].flags & VK_PIPELINE_CREATE_EARLY_RETURN_ON_FAILURE_BIT)
7328 break;
7329 }
7330 }
7331
7332 for (; i < count; ++i)
7333 pPipelines[i] = VK_NULL_HANDLE;
7334
7335 return result;
7336 }
7337
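/* Number of pipeline executables reported for
 * VK_KHR_pipeline_executable_properties. A legacy (non-NGG) GS counts twice
 * because its separate GS copy shader is reported as its own executable.
 */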
7338 static uint32_t
7339 radv_get_executable_count(struct radv_pipeline *pipeline)
7340 {
7341 uint32_t ret = 0;
7342 for (int i = 0; i < MESA_VULKAN_SHADER_STAGES; ++i) {
7343 if (!pipeline->shaders[i])
7344 continue;
7345
7346 if (i == MESA_SHADER_GEOMETRY &&
7347 !radv_pipeline_has_ngg(radv_pipeline_to_graphics(pipeline))) {
7348 ret += 2u;
7349 } else {
7350 ret += 1u;
7351 }
7352 }
7353 return ret;
7354 }
7355
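/* Map an executable index back to the corresponding shader, accounting for
 * the extra GS copy shader entry that follows a legacy (non-NGG) GS.
 */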
7356 static struct radv_shader *
7357 radv_get_shader_from_executable_index(struct radv_pipeline *pipeline, int index,
7358 gl_shader_stage *stage)
7359 {
7360 for (int i = 0; i < MESA_VULKAN_SHADER_STAGES; ++i) {
7361 if (!pipeline->shaders[i])
7362 continue;
7363 if (!index) {
7364 *stage = i;
7365 return pipeline->shaders[i];
7366 }
7367
7368 --index;
7369
7370 if (i == MESA_SHADER_GEOMETRY &&
7371 !radv_pipeline_has_ngg(radv_pipeline_to_graphics(pipeline))) {
7372 if (!index) {
7373 *stage = i;
7374 return pipeline->gs_copy_shader;
7375 }
7376 --index;
7377 }
7378 }
7379
7380 *stage = -1;
7381 return NULL;
7382 }
7383
7384 /* Basically strlcpy (which is not available in glibc on Linux), specialized
7385 * for fixed-size description fields. */
7386 static void
7387 desc_copy(char *desc, const char *src)
7388 {
7389 int len = strlen(src);
7390 assert(len < VK_MAX_DESCRIPTION_SIZE);
7391 memcpy(desc, src, len);
7392 memset(desc + len, 0, VK_MAX_DESCRIPTION_SIZE - len);
7393 }
7394
7395 VKAPI_ATTR VkResult VKAPI_CALL
7396 radv_GetPipelineExecutablePropertiesKHR(VkDevice _device, const VkPipelineInfoKHR *pPipelineInfo,
7397 uint32_t *pExecutableCount,
7398 VkPipelineExecutablePropertiesKHR *pProperties)
7399 {
7400 RADV_FROM_HANDLE(radv_pipeline, pipeline, pPipelineInfo->pipeline);
7401 const uint32_t total_count = radv_get_executable_count(pipeline);
7402
7403 if (!pProperties) {
7404 *pExecutableCount = total_count;
7405 return VK_SUCCESS;
7406 }
7407
7408 const uint32_t count = MIN2(total_count, *pExecutableCount);
7409 for (unsigned i = 0, executable_idx = 0; i < MESA_VULKAN_SHADER_STAGES && executable_idx < count; ++i) {
7410 if (!pipeline->shaders[i])
7411 continue;
7412 pProperties[executable_idx].stages = mesa_to_vk_shader_stage(i);
7413 const char *name = NULL;
7414 const char *description = NULL;
7415 switch (i) {
7416 case MESA_SHADER_VERTEX:
7417 name = "Vertex Shader";
7418 description = "Vulkan Vertex Shader";
7419 break;
7420 case MESA_SHADER_TESS_CTRL:
7421 if (!pipeline->shaders[MESA_SHADER_VERTEX]) {
7422 pProperties[executable_idx].stages |= VK_SHADER_STAGE_VERTEX_BIT;
7423 name = "Vertex + Tessellation Control Shaders";
7424 description = "Combined Vulkan Vertex and Tessellation Control Shaders";
7425 } else {
7426 name = "Tessellation Control Shader";
7427 description = "Vulkan Tessellation Control Shader";
7428 }
7429 break;
7430 case MESA_SHADER_TESS_EVAL:
7431 name = "Tessellation Evaluation Shader";
7432 description = "Vulkan Tessellation Evaluation Shader";
7433 break;
7434 case MESA_SHADER_GEOMETRY:
7435 if (pipeline->shaders[MESA_SHADER_TESS_CTRL] && !pipeline->shaders[MESA_SHADER_TESS_EVAL]) {
7436 pProperties[executable_idx].stages |= VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT;
7437 name = "Tessellation Evaluation + Geometry Shaders";
7438 description = "Combined Vulkan Tessellation Evaluation and Geometry Shaders";
7439 } else if (!pipeline->shaders[MESA_SHADER_TESS_CTRL] && !pipeline->shaders[MESA_SHADER_VERTEX]) {
7440 pProperties[executable_idx].stages |= VK_SHADER_STAGE_VERTEX_BIT;
7441 name = "Vertex + Geometry Shader";
7442 description = "Combined Vulkan Vertex and Geometry Shaders";
7443 } else {
7444 name = "Geometry Shader";
7445 description = "Vulkan Geometry Shader";
7446 }
7447 break;
7448 case MESA_SHADER_FRAGMENT:
7449 name = "Fragment Shader";
7450 description = "Vulkan Fragment Shader";
7451 break;
7452 case MESA_SHADER_COMPUTE:
7453 name = "Compute Shader";
7454 description = "Vulkan Compute Shader";
7455 break;
7456 case MESA_SHADER_MESH:
7457 name = "Mesh Shader";
7458 description = "Vulkan Mesh Shader";
7459 break;
7460 case MESA_SHADER_TASK:
7461 name = "Task Shader";
7462 description = "Vulkan Task Shader";
7463 break;
7464 }
7465
7466 pProperties[executable_idx].subgroupSize = pipeline->shaders[i]->info.wave_size;
7467 desc_copy(pProperties[executable_idx].name, name);
7468 desc_copy(pProperties[executable_idx].description, description);
7469
7470 ++executable_idx;
7471 if (i == MESA_SHADER_GEOMETRY &&
7472 !radv_pipeline_has_ngg(radv_pipeline_to_graphics(pipeline))) {
7473 assert(pipeline->gs_copy_shader);
7474 if (executable_idx >= count)
7475 break;
7476
7477 pProperties[executable_idx].stages = VK_SHADER_STAGE_GEOMETRY_BIT;
7478 pProperties[executable_idx].subgroupSize = 64;
7479 desc_copy(pProperties[executable_idx].name, "GS Copy Shader");
7480 desc_copy(pProperties[executable_idx].description,
7481 "Extra shader stage that loads the GS output ringbuffer into the rasterizer");
7482
7483 ++executable_idx;
7484 }
7485 }
7486
7487 VkResult result = *pExecutableCount < total_count ? VK_INCOMPLETE : VK_SUCCESS;
7488 *pExecutableCount = count;
7489 return result;
7490 }
7491
7492 VKAPI_ATTR VkResult VKAPI_CALL
7493 radv_GetPipelineExecutableStatisticsKHR(VkDevice _device,
7494 const VkPipelineExecutableInfoKHR *pExecutableInfo,
7495 uint32_t *pStatisticCount,
7496 VkPipelineExecutableStatisticKHR *pStatistics)
7497 {
7498 RADV_FROM_HANDLE(radv_device, device, _device);
7499 RADV_FROM_HANDLE(radv_pipeline, pipeline, pExecutableInfo->pipeline);
7500 gl_shader_stage stage;
7501 struct radv_shader *shader =
7502 radv_get_shader_from_executable_index(pipeline, pExecutableInfo->executableIndex, &stage);
7503
7504 const struct radv_physical_device *pdevice = device->physical_device;
7505
7506 unsigned lds_increment = pdevice->rad_info.gfx_level >= GFX11 && stage == MESA_SHADER_FRAGMENT
7507 ? 1024 : pdevice->rad_info.lds_encode_granularity;
7508 unsigned max_waves = radv_get_max_waves(device, shader, stage);
7509
7510 VkPipelineExecutableStatisticKHR *s = pStatistics;
7511 VkPipelineExecutableStatisticKHR *end = s + (pStatistics ? *pStatisticCount : 0);
7512 VkResult result = VK_SUCCESS;
7513
7514 if (s < end) {
7515 desc_copy(s->name, "Driver pipeline hash");
7516 desc_copy(s->description, "Driver pipeline hash used by RGP");
7517 s->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
7518 s->value.u64 = pipeline->pipeline_hash;
7519 }
7520 ++s;
7521
7522 if (s < end) {
7523 desc_copy(s->name, "SGPRs");
7524 desc_copy(s->description, "Number of SGPR registers allocated per subgroup");
7525 s->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
7526 s->value.u64 = shader->config.num_sgprs;
7527 }
7528 ++s;
7529
7530 if (s < end) {
7531 desc_copy(s->name, "VGPRs");
7532 desc_copy(s->description, "Number of VGPR registers allocated per subgroup");
7533 s->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
7534 s->value.u64 = shader->config.num_vgprs;
7535 }
7536 ++s;
7537
7538 if (s < end) {
7539 desc_copy(s->name, "Spilled SGPRs");
7540 desc_copy(s->description, "Number of SGPR registers spilled per subgroup");
7541 s->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
7542 s->value.u64 = shader->config.spilled_sgprs;
7543 }
7544 ++s;
7545
7546 if (s < end) {
7547 desc_copy(s->name, "Spilled VGPRs");
7548 desc_copy(s->description, "Number of VGPR registers spilled per subgroup");
7549 s->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
7550 s->value.u64 = shader->config.spilled_vgprs;
7551 }
7552 ++s;
7553
7554 if (s < end) {
7555 desc_copy(s->name, "Code size");
7556 desc_copy(s->description, "Code size in bytes");
7557 s->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
7558 s->value.u64 = shader->exec_size;
7559 }
7560 ++s;
7561
7562 if (s < end) {
7563 desc_copy(s->name, "LDS size");
7564 desc_copy(s->description, "LDS size in bytes per workgroup");
7565 s->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
7566 s->value.u64 = shader->config.lds_size * lds_increment;
7567 }
7568 ++s;
7569
7570 if (s < end) {
7571 desc_copy(s->name, "Scratch size");
7572 desc_copy(s->description, "Private memory in bytes per subgroup");
7573 s->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
7574 s->value.u64 = shader->config.scratch_bytes_per_wave;
7575 }
7576 ++s;
7577
7578 if (s < end) {
7579 desc_copy(s->name, "Subgroups per SIMD");
7580 desc_copy(s->description, "The maximum number of subgroups in flight on a SIMD unit");
7581 s->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
7582 s->value.u64 = max_waves;
7583 }
7584 ++s;
7585
7586 if (shader->statistics) {
7587 for (unsigned i = 0; i < aco_num_statistics; i++) {
7588 const struct aco_compiler_statistic_info *info = &aco_statistic_infos[i];
7589 if (s < end) {
7590 desc_copy(s->name, info->name);
7591 desc_copy(s->description, info->desc);
7592 s->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
7593 s->value.u64 = shader->statistics[i];
7594 }
7595 ++s;
7596 }
7597 }
7598
7599 if (!pStatistics)
7600 *pStatisticCount = s - pStatistics;
7601 else if (s > end) {
7602 *pStatisticCount = end - pStatistics;
7603 result = VK_INCOMPLETE;
7604 } else {
7605 *pStatisticCount = s - pStatistics;
7606 }
7607
7608 return result;
7609 }
7610
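/* Copy a NUL-terminated representation string using the standard Vulkan
 * two-call idiom: when data is NULL only the required size is returned, and a
 * too-small buffer yields a truncated, still NUL-terminated copy plus
 * VK_INCOMPLETE.
 */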
7611 static VkResult
7612 radv_copy_representation(void *data, size_t *data_size, const char *src)
7613 {
7614 size_t total_size = strlen(src) + 1;
7615
7616 if (!data) {
7617 *data_size = total_size;
7618 return VK_SUCCESS;
7619 }
7620
7621 size_t size = MIN2(total_size, *data_size);
7622
7623 memcpy(data, src, size);
7624 if (size)
7625 *((char *)data + size - 1) = 0;
7626 return size < total_size ? VK_INCOMPLETE : VK_SUCCESS;
7627 }
7628
7629 VKAPI_ATTR VkResult VKAPI_CALL
7630 radv_GetPipelineExecutableInternalRepresentationsKHR(
7631 VkDevice _device, const VkPipelineExecutableInfoKHR *pExecutableInfo,
7632 uint32_t *pInternalRepresentationCount,
7633 VkPipelineExecutableInternalRepresentationKHR *pInternalRepresentations)
7634 {
7635 RADV_FROM_HANDLE(radv_device, device, _device);
7636 RADV_FROM_HANDLE(radv_pipeline, pipeline, pExecutableInfo->pipeline);
7637 gl_shader_stage stage;
7638 struct radv_shader *shader =
7639 radv_get_shader_from_executable_index(pipeline, pExecutableInfo->executableIndex, &stage);
7640
7641 VkPipelineExecutableInternalRepresentationKHR *p = pInternalRepresentations;
7642 VkPipelineExecutableInternalRepresentationKHR *end =
7643 p + (pInternalRepresentations ? *pInternalRepresentationCount : 0);
7644 VkResult result = VK_SUCCESS;
7645 /* optimized NIR */
7646 if (p < end) {
7647 p->isText = true;
7648 desc_copy(p->name, "NIR Shader(s)");
7649 desc_copy(p->description, "The optimized NIR shader(s)");
7650 if (radv_copy_representation(p->pData, &p->dataSize, shader->nir_string) != VK_SUCCESS)
7651 result = VK_INCOMPLETE;
7652 }
7653 ++p;
7654
7655 /* backend IR */
7656 if (p < end) {
7657 p->isText = true;
7658 if (radv_use_llvm_for_stage(device, stage)) {
7659 desc_copy(p->name, "LLVM IR");
7660 desc_copy(p->description, "The LLVM IR after some optimizations");
7661 } else {
7662 desc_copy(p->name, "ACO IR");
7663 desc_copy(p->description, "The ACO IR after some optimizations");
7664 }
7665 if (radv_copy_representation(p->pData, &p->dataSize, shader->ir_string) != VK_SUCCESS)
7666 result = VK_INCOMPLETE;
7667 }
7668 ++p;
7669
7670 /* Disassembler */
7671 if (p < end && shader->disasm_string) {
7672 p->isText = true;
7673 desc_copy(p->name, "Assembly");
7674 desc_copy(p->description, "Final Assembly");
7675 if (radv_copy_representation(p->pData, &p->dataSize, shader->disasm_string) != VK_SUCCESS)
7676 result = VK_INCOMPLETE;
7677 }
7678 ++p;
7679
7680 if (!pInternalRepresentations)
7681 *pInternalRepresentationCount = p - pInternalRepresentations;
7682 else if (p > end) {
7683 result = VK_INCOMPLETE;
7684 *pInternalRepresentationCount = end - pInternalRepresentations;
7685 } else {
7686 *pInternalRepresentationCount = p - pInternalRepresentations;
7687 }
7688
7689 return result;
7690 }
7691