1 /*
2  * Copyright © 2016 Red Hat.
3  * Copyright © 2016 Bas Nieuwenhuizen
4  *
5  * based in part on anv driver which is:
6  * Copyright © 2015 Intel Corporation
7  *
8  * Permission is hereby granted, free of charge, to any person obtaining a
9  * copy of this software and associated documentation files (the "Software"),
10  * to deal in the Software without restriction, including without limitation
11  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
12  * and/or sell copies of the Software, and to permit persons to whom the
13  * Software is furnished to do so, subject to the following conditions:
14  *
15  * The above copyright notice and this permission notice (including the next
16  * paragraph) shall be included in all copies or substantial portions of the
17  * Software.
18  *
19  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
20  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
22  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
24  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
25  * IN THE SOFTWARE.
26  */
27 
28 #include "nir/nir.h"
29 #include "nir/nir_builder.h"
30 #include "spirv/nir_spirv.h"
31 #include "util/disk_cache.h"
32 #include "util/mesa-sha1.h"
33 #include "util/os_time.h"
34 #include "util/u_atomic.h"
35 #include "radv_cs.h"
36 #include "radv_debug.h"
37 #include "radv_meta.h"
38 #include "radv_private.h"
39 #include "radv_shader.h"
40 #include "radv_shader_args.h"
41 #include "vk_pipeline.h"
42 #include "vk_util.h"
43 
44 #include "util/debug.h"
45 #include "ac_binary.h"
46 #include "ac_nir.h"
47 #include "ac_shader_util.h"
48 #include "aco_interface.h"
49 #include "sid.h"
50 #include "vk_format.h"
51 
52 struct radv_blend_state {
53    uint32_t blend_enable_4bit;
54    uint32_t need_src_alpha;
55 
56    uint32_t cb_target_mask;
57    uint32_t cb_target_enabled_4bit;
58    uint32_t sx_mrt_blend_opt[8];
59    uint32_t cb_blend_control[8];
60 
61    uint32_t spi_shader_col_format;
62    uint32_t col_format_is_int8;
63    uint32_t col_format_is_int10;
64    uint32_t col_format_is_float32;
65    uint32_t cb_shader_mask;
66    uint32_t db_alpha_to_mask;
67 
68    uint32_t commutative_4bit;
69 
70    bool mrt0_is_dual_src;
71 };
72 
73 struct radv_depth_stencil_state {
74    uint32_t db_render_control;
75    uint32_t db_render_override;
76    uint32_t db_render_override2;
77 };
78 
79 struct radv_dsa_order_invariance {
80    /* Whether the final result in Z/S buffers is guaranteed to be
81     * invariant under changes to the order in which fragments arrive.
82     */
83    bool zs;
84 
85    /* Whether the set of fragments that pass the combined Z/S test is
86     * guaranteed to be invariant under changes to the order in which
87     * fragments arrive.
88     */
89    bool pass_set;
90 };
91 
92 static bool
93 radv_is_raster_enabled(const struct radv_graphics_pipeline *pipeline,
94                        const VkGraphicsPipelineCreateInfo *pCreateInfo)
95 {
96    return !pCreateInfo->pRasterizationState->rasterizerDiscardEnable ||
97           (pipeline->dynamic_states & RADV_DYNAMIC_RASTERIZER_DISCARD_ENABLE);
98 }
99 
100 static bool
101 radv_is_static_vrs_enabled(const struct radv_graphics_pipeline *pipeline,
102                            const struct radv_graphics_pipeline_info *info)
103 {
104    return info->fsr.size.width != 1 || info->fsr.size.height != 1 ||
105           info->fsr.combiner_ops[0] != VK_FRAGMENT_SHADING_RATE_COMBINER_OP_KEEP_KHR ||
106           info->fsr.combiner_ops[1] != VK_FRAGMENT_SHADING_RATE_COMBINER_OP_KEEP_KHR;
107 }
108 
109 static bool
110 radv_is_vrs_enabled(const struct radv_graphics_pipeline *pipeline,
111                     const struct radv_graphics_pipeline_info *info)
112 {
113    return radv_is_static_vrs_enabled(pipeline, info) ||
114           (pipeline->dynamic_states & RADV_DYNAMIC_FRAGMENT_SHADING_RATE);
115 }
116 
117 static bool
118 radv_pipeline_has_ds_attachments(const struct radv_rendering_info *ri_info)
119 {
120    return ri_info->depth_att_format != VK_FORMAT_UNDEFINED ||
121           ri_info->stencil_att_format != VK_FORMAT_UNDEFINED;
122 }
123 
124 static bool
125 radv_pipeline_has_color_attachments(const struct radv_rendering_info *ri_info)
126 {
127    for (uint32_t i = 0; i < ri_info->color_att_count; ++i) {
128       if (ri_info->color_att_formats[i] != VK_FORMAT_UNDEFINED)
129          return true;
130    }
131 
132    return false;
133 }
134 
135 static bool
136 radv_pipeline_has_ngg(const struct radv_graphics_pipeline *pipeline)
137 {
138    struct radv_shader *shader = pipeline->base.shaders[pipeline->last_vgt_api_stage];
139 
140    return shader->info.is_ngg;
141 }
142 
143 bool
144 radv_pipeline_has_ngg_passthrough(const struct radv_graphics_pipeline *pipeline)
145 {
146    assert(radv_pipeline_has_ngg(pipeline));
147 
148    struct radv_shader *shader = pipeline->base.shaders[pipeline->last_vgt_api_stage];
149 
150    return shader->info.is_ngg_passthrough;
151 }
152 
153 bool
154 radv_pipeline_has_gs_copy_shader(const struct radv_pipeline *pipeline)
155 {
156    return !!pipeline->gs_copy_shader;
157 }
158 
159 static struct radv_pipeline_slab *
160 radv_pipeline_slab_create(struct radv_device *device, struct radv_pipeline *pipeline,
161                           uint32_t code_size)
162 {
163    struct radv_pipeline_slab *slab;
164 
165    slab = calloc(1, sizeof(*slab));
166    if (!slab)
167       return NULL;
168 
169    slab->ref_count = 1;
170 
171    slab->alloc = radv_alloc_shader_memory(device, code_size, pipeline);
172    if (!slab->alloc) {
173       free(slab);
174       return NULL;
175    }
176 
177    return slab;
178 }
179 
180 void
181 radv_pipeline_slab_destroy(struct radv_device *device, struct radv_pipeline_slab *slab)
182 {
183    if (!p_atomic_dec_zero(&slab->ref_count))
184       return;
185 
186    radv_free_shader_memory(device, slab->alloc);
187    free(slab);
188 }
189 
190 void
191 radv_pipeline_destroy(struct radv_device *device, struct radv_pipeline *pipeline,
192                       const VkAllocationCallbacks *allocator)
193 {
194    if (pipeline->type == RADV_PIPELINE_COMPUTE) {
195       struct radv_compute_pipeline *compute_pipeline = radv_pipeline_to_compute(pipeline);
196 
197       free(compute_pipeline->rt_group_handles);
198       free(compute_pipeline->rt_stack_sizes);
199    } else if (pipeline->type == RADV_PIPELINE_LIBRARY) {
200       struct radv_library_pipeline *library_pipeline = radv_pipeline_to_library(pipeline);
201 
202       free(library_pipeline->groups);
203       for (uint32_t i = 0; i < library_pipeline->stage_count; i++) {
204          RADV_FROM_HANDLE(vk_shader_module, module, library_pipeline->stages[i].module);
205          if (module) {
206             vk_object_base_finish(&module->base);
207             ralloc_free(module);
208          }
209       }
210       free(library_pipeline->stages);
211       free(library_pipeline->identifiers);
212       free(library_pipeline->hashes);
213    }
214 
215    if (pipeline->slab)
216       radv_pipeline_slab_destroy(device, pipeline->slab);
217 
218    for (unsigned i = 0; i < MESA_VULKAN_SHADER_STAGES; ++i)
219       if (pipeline->shaders[i])
220          radv_shader_destroy(device, pipeline->shaders[i]);
221 
222    if (pipeline->gs_copy_shader)
223       radv_shader_destroy(device, pipeline->gs_copy_shader);
224 
225    if (pipeline->cs.buf)
226       free(pipeline->cs.buf);
227 
228    vk_object_base_finish(&pipeline->base);
229    vk_free2(&device->vk.alloc, allocator, pipeline);
230 }
231 
232 VKAPI_ATTR void VKAPI_CALL
233 radv_DestroyPipeline(VkDevice _device, VkPipeline _pipeline,
234                      const VkAllocationCallbacks *pAllocator)
235 {
236    RADV_FROM_HANDLE(radv_device, device, _device);
237    RADV_FROM_HANDLE(radv_pipeline, pipeline, _pipeline);
238 
239    if (!_pipeline)
240       return;
241 
242    radv_pipeline_destroy(device, pipeline, pAllocator);
243 }
244 
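/* Gather the device-wide settings that affect shader compilation (wave sizes, NGG culling,
 * robust buffer access, debug/perftest flags, LLVM backend) so they can be folded into the
 * shader hash used by the pipeline cache.
 */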
245 uint32_t
246 radv_get_hash_flags(const struct radv_device *device, bool stats)
247 {
248    uint32_t hash_flags = 0;
249 
250    if (device->physical_device->use_ngg_culling)
251       hash_flags |= RADV_HASH_SHADER_USE_NGG_CULLING;
252    if (device->instance->perftest_flags & RADV_PERFTEST_EMULATE_RT)
253       hash_flags |= RADV_HASH_SHADER_EMULATE_RT;
254    if (device->physical_device->rt_wave_size == 64)
255       hash_flags |= RADV_HASH_SHADER_RT_WAVE64;
256    if (device->physical_device->cs_wave_size == 32)
257       hash_flags |= RADV_HASH_SHADER_CS_WAVE32;
258    if (device->physical_device->ps_wave_size == 32)
259       hash_flags |= RADV_HASH_SHADER_PS_WAVE32;
260    if (device->physical_device->ge_wave_size == 32)
261       hash_flags |= RADV_HASH_SHADER_GE_WAVE32;
262    if (device->physical_device->use_llvm)
263       hash_flags |= RADV_HASH_SHADER_LLVM;
264    if (stats)
265       hash_flags |= RADV_HASH_SHADER_KEEP_STATISTICS;
266    if (device->robust_buffer_access) /* forces per-attribute vertex descriptors */
267       hash_flags |= RADV_HASH_SHADER_ROBUST_BUFFER_ACCESS;
268    if (device->robust_buffer_access2) /* affects load/store vectorizer */
269       hash_flags |= RADV_HASH_SHADER_ROBUST_BUFFER_ACCESS2;
270    if (device->instance->debug_flags & RADV_DEBUG_SPLIT_FMA)
271       hash_flags |= RADV_HASH_SHADER_SPLIT_FMA;
272    return hash_flags;
273 }
274 
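/* Derive the pipeline's scratch requirements: the largest per-wave scratch size across all
 * stages, and the maximum number of scratch waves the device may keep in flight.
 */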
275 static void
276 radv_pipeline_init_scratch(const struct radv_device *device, struct radv_pipeline *pipeline)
277 {
278    unsigned scratch_bytes_per_wave = 0;
279    unsigned max_waves = 0;
280 
281    for (int i = 0; i < MESA_VULKAN_SHADER_STAGES; ++i) {
282       if (pipeline->shaders[i] && pipeline->shaders[i]->config.scratch_bytes_per_wave) {
283          unsigned max_stage_waves = device->scratch_waves;
284 
285          scratch_bytes_per_wave =
286             MAX2(scratch_bytes_per_wave, pipeline->shaders[i]->config.scratch_bytes_per_wave);
287 
288          max_stage_waves =
289             MIN2(max_stage_waves, 4 * device->physical_device->rad_info.num_cu *
290                  radv_get_max_waves(device, pipeline->shaders[i], i));
291          max_waves = MAX2(max_waves, max_stage_waves);
292       }
293    }
294 
295    pipeline->scratch_bytes_per_wave = scratch_bytes_per_wave;
296    pipeline->max_waves = max_waves;
297 }
298 
299 static uint32_t
300 si_translate_blend_function(VkBlendOp op)
301 {
302    switch (op) {
303    case VK_BLEND_OP_ADD:
304       return V_028780_COMB_DST_PLUS_SRC;
305    case VK_BLEND_OP_SUBTRACT:
306       return V_028780_COMB_SRC_MINUS_DST;
307    case VK_BLEND_OP_REVERSE_SUBTRACT:
308       return V_028780_COMB_DST_MINUS_SRC;
309    case VK_BLEND_OP_MIN:
310       return V_028780_COMB_MIN_DST_SRC;
311    case VK_BLEND_OP_MAX:
312       return V_028780_COMB_MAX_DST_SRC;
313    default:
314       return 0;
315    }
316 }
317 
318 static uint32_t
319 si_translate_blend_factor(enum amd_gfx_level gfx_level, VkBlendFactor factor)
320 {
321    switch (factor) {
322    case VK_BLEND_FACTOR_ZERO:
323       return V_028780_BLEND_ZERO;
324    case VK_BLEND_FACTOR_ONE:
325       return V_028780_BLEND_ONE;
326    case VK_BLEND_FACTOR_SRC_COLOR:
327       return V_028780_BLEND_SRC_COLOR;
328    case VK_BLEND_FACTOR_ONE_MINUS_SRC_COLOR:
329       return V_028780_BLEND_ONE_MINUS_SRC_COLOR;
330    case VK_BLEND_FACTOR_DST_COLOR:
331       return V_028780_BLEND_DST_COLOR;
332    case VK_BLEND_FACTOR_ONE_MINUS_DST_COLOR:
333       return V_028780_BLEND_ONE_MINUS_DST_COLOR;
334    case VK_BLEND_FACTOR_SRC_ALPHA:
335       return V_028780_BLEND_SRC_ALPHA;
336    case VK_BLEND_FACTOR_ONE_MINUS_SRC_ALPHA:
337       return V_028780_BLEND_ONE_MINUS_SRC_ALPHA;
338    case VK_BLEND_FACTOR_DST_ALPHA:
339       return V_028780_BLEND_DST_ALPHA;
340    case VK_BLEND_FACTOR_ONE_MINUS_DST_ALPHA:
341       return V_028780_BLEND_ONE_MINUS_DST_ALPHA;
342    case VK_BLEND_FACTOR_CONSTANT_COLOR:
343       return gfx_level >= GFX11 ? V_028780_BLEND_CONSTANT_COLOR_GFX11
344                                 : V_028780_BLEND_CONSTANT_COLOR_GFX6;
345    case VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_COLOR:
346       return gfx_level >= GFX11 ? V_028780_BLEND_ONE_MINUS_CONSTANT_COLOR_GFX11
347                                  : V_028780_BLEND_ONE_MINUS_CONSTANT_COLOR_GFX6;
348    case VK_BLEND_FACTOR_CONSTANT_ALPHA:
349       return gfx_level >= GFX11 ? V_028780_BLEND_CONSTANT_ALPHA_GFX11
350                                  : V_028780_BLEND_CONSTANT_ALPHA_GFX6;
351    case VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_ALPHA:
352       return gfx_level >= GFX11 ? V_028780_BLEND_ONE_MINUS_CONSTANT_ALPHA_GFX11
353                                  : V_028780_BLEND_ONE_MINUS_CONSTANT_ALPHA_GFX6;
354    case VK_BLEND_FACTOR_SRC_ALPHA_SATURATE:
355       return V_028780_BLEND_SRC_ALPHA_SATURATE;
356    case VK_BLEND_FACTOR_SRC1_COLOR:
357       return gfx_level >= GFX11 ? V_028780_BLEND_SRC1_COLOR_GFX11 : V_028780_BLEND_SRC1_COLOR_GFX6;
358    case VK_BLEND_FACTOR_ONE_MINUS_SRC1_COLOR:
359       return gfx_level >= GFX11 ? V_028780_BLEND_INV_SRC1_COLOR_GFX11
360                                  : V_028780_BLEND_INV_SRC1_COLOR_GFX6;
361    case VK_BLEND_FACTOR_SRC1_ALPHA:
362       return gfx_level >= GFX11 ? V_028780_BLEND_SRC1_ALPHA_GFX11 : V_028780_BLEND_SRC1_ALPHA_GFX6;
363    case VK_BLEND_FACTOR_ONE_MINUS_SRC1_ALPHA:
364       return gfx_level >= GFX11 ? V_028780_BLEND_INV_SRC1_ALPHA_GFX11
365                                  : V_028780_BLEND_INV_SRC1_ALPHA_GFX6;
366    default:
367       return 0;
368    }
369 }
370 
371 static uint32_t
372 si_translate_blend_opt_function(unsigned op)
373 {
374    switch (op) {
375    case V_028780_COMB_DST_PLUS_SRC:
376       return V_028760_OPT_COMB_ADD;
377    case V_028780_COMB_SRC_MINUS_DST:
378       return V_028760_OPT_COMB_SUBTRACT;
379    case V_028780_COMB_DST_MINUS_SRC:
380       return V_028760_OPT_COMB_REVSUBTRACT;
381    case V_028780_COMB_MIN_DST_SRC:
382       return V_028760_OPT_COMB_MIN;
383    case V_028780_COMB_MAX_DST_SRC:
384       return V_028760_OPT_COMB_MAX;
385    default:
386       return V_028760_OPT_COMB_BLEND_DISABLED;
387    }
388 }
389 
390 static uint32_t
391 si_translate_blend_opt_factor(unsigned factor, bool is_alpha)
392 {
393    switch (factor) {
394    case V_028780_BLEND_ZERO:
395       return V_028760_BLEND_OPT_PRESERVE_NONE_IGNORE_ALL;
396    case V_028780_BLEND_ONE:
397       return V_028760_BLEND_OPT_PRESERVE_ALL_IGNORE_NONE;
398    case V_028780_BLEND_SRC_COLOR:
399       return is_alpha ? V_028760_BLEND_OPT_PRESERVE_A1_IGNORE_A0
400                       : V_028760_BLEND_OPT_PRESERVE_C1_IGNORE_C0;
401    case V_028780_BLEND_ONE_MINUS_SRC_COLOR:
402       return is_alpha ? V_028760_BLEND_OPT_PRESERVE_A0_IGNORE_A1
403                       : V_028760_BLEND_OPT_PRESERVE_C0_IGNORE_C1;
404    case V_028780_BLEND_SRC_ALPHA:
405       return V_028760_BLEND_OPT_PRESERVE_A1_IGNORE_A0;
406    case V_028780_BLEND_ONE_MINUS_SRC_ALPHA:
407       return V_028760_BLEND_OPT_PRESERVE_A0_IGNORE_A1;
408    case V_028780_BLEND_SRC_ALPHA_SATURATE:
409       return is_alpha ? V_028760_BLEND_OPT_PRESERVE_ALL_IGNORE_NONE
410                       : V_028760_BLEND_OPT_PRESERVE_NONE_IGNORE_A0;
411    default:
412       return V_028760_BLEND_OPT_PRESERVE_NONE_IGNORE_NONE;
413    }
414 }
415 
416 /**
417  * Get rid of DST in the blend factors by commuting the operands:
418  *    func(src * DST, dst * 0) ---> func(src * 0, dst * SRC)
419  */
420 static void
421 si_blend_remove_dst(unsigned *func, unsigned *src_factor, unsigned *dst_factor,
422                     unsigned expected_dst, unsigned replacement_src)
423 {
424    if (*src_factor == expected_dst && *dst_factor == V_028780_BLEND_ZERO) {
425       *src_factor = V_028780_BLEND_ZERO;
426       *dst_factor = replacement_src;
427 
428       /* Commuting the operands requires reversing subtractions. */
429       if (*func == V_028780_COMB_SRC_MINUS_DST)
430          *func = V_028780_COMB_DST_MINUS_SRC;
431       else if (*func == V_028780_COMB_DST_MINUS_SRC)
432          *func = V_028780_COMB_SRC_MINUS_DST;
433    }
434 }
435 
436 static bool
437 si_blend_factor_uses_dst(unsigned factor)
438 {
439    return factor == V_028780_BLEND_DST_COLOR ||
440           factor == V_028780_BLEND_DST_ALPHA ||
441           factor == V_028780_BLEND_SRC_ALPHA_SATURATE ||
442           factor == V_028780_BLEND_ONE_MINUS_DST_ALPHA ||
443           factor == V_028780_BLEND_ONE_MINUS_DST_COLOR;
444 }
445 
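/* Return whether a hardware blend factor reads the second color output (dual-source blending).
 * The register encodings differ between GFX11 and earlier generations, hence the two switches.
 */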
446 static bool
447 is_dual_src(enum amd_gfx_level gfx_level, unsigned factor)
448 {
449    if (gfx_level >= GFX11) {
450       switch (factor) {
451       case V_028780_BLEND_SRC1_COLOR_GFX11:
452       case V_028780_BLEND_INV_SRC1_COLOR_GFX11:
453       case V_028780_BLEND_SRC1_ALPHA_GFX11:
454       case V_028780_BLEND_INV_SRC1_ALPHA_GFX11:
455          return true;
456       default:
457          return false;
458       }
459    } else {
460       switch (factor) {
461       case V_028780_BLEND_SRC1_COLOR_GFX6:
462       case V_028780_BLEND_INV_SRC1_COLOR_GFX6:
463       case V_028780_BLEND_SRC1_ALPHA_GFX6:
464       case V_028780_BLEND_INV_SRC1_ALPHA_GFX6:
465          return true;
466       default:
467          return false;
468       }
469    }
470 }
471 
472 static unsigned
473 radv_choose_spi_color_format(const struct radv_device *device, VkFormat vk_format,
474                              bool blend_enable, bool blend_need_alpha)
475 {
476    const struct util_format_description *desc = vk_format_description(vk_format);
477    bool use_rbplus = device->physical_device->rad_info.rbplus_allowed;
478    struct ac_spi_color_formats formats = {0};
479    unsigned format, ntype, swap;
480 
481    format = radv_translate_colorformat(vk_format);
482    ntype = radv_translate_color_numformat(vk_format, desc,
483                                           vk_format_get_first_non_void_channel(vk_format));
484    swap = radv_translate_colorswap(vk_format, false);
485 
486    ac_choose_spi_color_formats(format, swap, ntype, false, use_rbplus, &formats);
487 
488    if (blend_enable && blend_need_alpha)
489       return formats.blend_alpha;
490    else if (blend_need_alpha)
491       return formats.alpha;
492    else if (blend_enable)
493       return formats.blend;
494    else
495       return formats.normal;
496 }
497 
498 static bool
499 format_is_int8(VkFormat format)
500 {
501    const struct util_format_description *desc = vk_format_description(format);
502    int channel = vk_format_get_first_non_void_channel(format);
503 
504    return channel >= 0 && desc->channel[channel].pure_integer && desc->channel[channel].size == 8;
505 }
506 
507 static bool
508 format_is_int10(VkFormat format)
509 {
510    const struct util_format_description *desc = vk_format_description(format);
511 
512    if (desc->nr_channels != 4)
513       return false;
514    for (unsigned i = 0; i < 4; i++) {
515       if (desc->channel[i].pure_integer && desc->channel[i].size == 10)
516          return true;
517    }
518    return false;
519 }
520 
521 static bool
522 format_is_float32(VkFormat format)
523 {
524    const struct util_format_description *desc = vk_format_description(format);
525    int channel = vk_format_get_first_non_void_channel(format);
526 
527    return channel >= 0 &&
528           desc->channel[channel].type == UTIL_FORMAT_TYPE_FLOAT && desc->channel[channel].size == 32;
529 }
530 
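/* Compute SPI_SHADER_COL_FORMAT and CB_SHADER_MASK from the bound color attachment formats and
 * the blend state, and record which MRTs use int8/int10/float32 formats.
 */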
531 static void
532 radv_pipeline_compute_spi_color_formats(const struct radv_graphics_pipeline *pipeline,
533                                         const VkGraphicsPipelineCreateInfo *pCreateInfo,
534                                         struct radv_blend_state *blend,
535                                         const struct radv_graphics_pipeline_info *info)
536 {
537    unsigned col_format = 0, is_int8 = 0, is_int10 = 0, is_float32 = 0;
538    unsigned num_targets;
539 
540    for (unsigned i = 0; i < info->ri.color_att_count; ++i) {
541       unsigned cf;
542       VkFormat fmt = info->ri.color_att_formats[i];
543 
544       if (fmt == VK_FORMAT_UNDEFINED || !(blend->cb_target_mask & (0xfu << (i * 4)))) {
545          cf = V_028714_SPI_SHADER_ZERO;
546       } else {
547          bool blend_enable = blend->blend_enable_4bit & (0xfu << (i * 4));
548 
549          cf = radv_choose_spi_color_format(pipeline->base.device, fmt, blend_enable,
550                                            blend->need_src_alpha & (1 << i));
551 
552          if (format_is_int8(fmt))
553             is_int8 |= 1 << i;
554          if (format_is_int10(fmt))
555             is_int10 |= 1 << i;
556          if (format_is_float32(fmt))
557             is_float32 |= 1 << i;
558       }
559 
560       col_format |= cf << (4 * i);
561    }
562 
563    if (!(col_format & 0xf) && blend->need_src_alpha & (1 << 0)) {
564       /* When a subpass doesn't have any color attachments, write the
565        * alpha channel of MRT0 when alpha coverage is enabled because
566        * the depth attachment needs it.
567        */
568       col_format |= V_028714_SPI_SHADER_32_AR;
569    }
570 
571    /* If the i-th target format is set, all previous target formats must
572     * be non-zero to avoid hangs.
573     */
574    num_targets = (util_last_bit(col_format) + 3) / 4;
575    for (unsigned i = 0; i < num_targets; i++) {
576       if (!(col_format & (0xfu << (i * 4)))) {
577          col_format |= V_028714_SPI_SHADER_32_R << (i * 4);
578       }
579    }
580 
581    /* The output for dual source blending should have the same format as
582     * the first output.
583     */
584    if (blend->mrt0_is_dual_src) {
585       assert(!(col_format >> 4));
586       col_format |= (col_format & 0xf) << 4;
587    }
588 
589    blend->cb_shader_mask = ac_get_cb_shader_mask(col_format);
590    blend->spi_shader_col_format = col_format;
591    blend->col_format_is_int8 = is_int8;
592    blend->col_format_is_int10 = is_int10;
593    blend->col_format_is_float32 = is_float32;
594 }
595 
596 /*
597  * Ordered so that for each i,
598  * radv_format_meta_fs_key(radv_fs_key_format_exemplars[i]) == i.
599  */
600 const VkFormat radv_fs_key_format_exemplars[NUM_META_FS_KEYS] = {
601    VK_FORMAT_R32_SFLOAT,
602    VK_FORMAT_R32G32_SFLOAT,
603    VK_FORMAT_R8G8B8A8_UNORM,
604    VK_FORMAT_R16G16B16A16_UNORM,
605    VK_FORMAT_R16G16B16A16_SNORM,
606    VK_FORMAT_R16G16B16A16_UINT,
607    VK_FORMAT_R16G16B16A16_SINT,
608    VK_FORMAT_R32G32B32A32_SFLOAT,
609    VK_FORMAT_R8G8B8A8_UINT,
610    VK_FORMAT_R8G8B8A8_SINT,
611    VK_FORMAT_A2R10G10B10_UINT_PACK32,
612    VK_FORMAT_A2R10G10B10_SINT_PACK32,
613 };
614 
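/* Map a color format to the index of its exemplar in radv_fs_key_format_exemplars, i.e. the
 * meta fragment-shader key (see the ordering comment above the array).
 */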
615 unsigned
616 radv_format_meta_fs_key(struct radv_device *device, VkFormat format)
617 {
618    unsigned col_format = radv_choose_spi_color_format(device, format, false, false);
619    assert(col_format != V_028714_SPI_SHADER_32_AR);
620 
621    bool is_int8 = format_is_int8(format);
622    bool is_int10 = format_is_int10(format);
623 
624    if (col_format == V_028714_SPI_SHADER_UINT16_ABGR && is_int8)
625       return 8;
626    else if (col_format == V_028714_SPI_SHADER_SINT16_ABGR && is_int8)
627       return 9;
628    else if (col_format == V_028714_SPI_SHADER_UINT16_ABGR && is_int10)
629       return 10;
630    else if (col_format == V_028714_SPI_SHADER_SINT16_ABGR && is_int10)
631       return 11;
632    else {
633       if (col_format >= V_028714_SPI_SHADER_32_AR)
634          --col_format; /* Skip V_028714_SPI_SHADER_32_AR  since there is no such VkFormat */
635 
636       --col_format; /* Skip V_028714_SPI_SHADER_ZERO */
637       return col_format;
638    }
639 }
640 
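/* Mark in commutative_4bit the channels whose blend equation is order independent; this is later
 * used to decide whether out-of-order rasterization is safe while blending is enabled.
 */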
641 static void
642 radv_blend_check_commutativity(enum amd_gfx_level gfx_level, struct radv_blend_state *blend,
643                                unsigned op, unsigned src, unsigned dst, unsigned chanmask)
644 {
645    bool is_src_allowed = false;
646 
647    /* Src factor is allowed when it does not depend on Dst. */
648    if (src == V_028780_BLEND_ZERO ||
649        src == V_028780_BLEND_ONE ||
650        src == V_028780_BLEND_SRC_COLOR ||
651        src == V_028780_BLEND_SRC_ALPHA ||
652        src == V_028780_BLEND_SRC_ALPHA_SATURATE ||
653        src == V_028780_BLEND_ONE_MINUS_SRC_COLOR ||
654        src == V_028780_BLEND_ONE_MINUS_SRC_ALPHA) {
655       is_src_allowed = true;
656    }
657 
658    if (gfx_level >= GFX11) {
659       if (src == V_028780_BLEND_CONSTANT_COLOR_GFX11 ||
660           src == V_028780_BLEND_CONSTANT_ALPHA_GFX11 ||
661           src == V_028780_BLEND_SRC1_COLOR_GFX11 ||
662           src == V_028780_BLEND_SRC1_ALPHA_GFX11 ||
663           src == V_028780_BLEND_ONE_MINUS_CONSTANT_COLOR_GFX11 ||
664           src == V_028780_BLEND_ONE_MINUS_CONSTANT_ALPHA_GFX11 ||
665           src == V_028780_BLEND_INV_SRC1_COLOR_GFX11 ||
666           src == V_028780_BLEND_INV_SRC1_ALPHA_GFX11) {
667          is_src_allowed = true;
668       }
669    } else {
670       if (src == V_028780_BLEND_CONSTANT_COLOR_GFX6 ||
671           src == V_028780_BLEND_CONSTANT_ALPHA_GFX6 ||
672           src == V_028780_BLEND_SRC1_COLOR_GFX6 ||
673           src == V_028780_BLEND_SRC1_ALPHA_GFX6 ||
674           src == V_028780_BLEND_ONE_MINUS_CONSTANT_COLOR_GFX6 ||
675           src == V_028780_BLEND_ONE_MINUS_CONSTANT_ALPHA_GFX6 ||
676           src == V_028780_BLEND_INV_SRC1_COLOR_GFX6 ||
677           src == V_028780_BLEND_INV_SRC1_ALPHA_GFX6) {
678          is_src_allowed = true;
679       }
680    }
681 
682    if (dst == V_028780_BLEND_ONE && is_src_allowed) {
683       /* Addition is commutative, but floating point addition isn't
684        * associative: subtle changes can be introduced via different
685        * rounding. Be conservative, only enable for min and max.
686        */
687       if (op == V_028780_COMB_MAX_DST_SRC || op == V_028780_COMB_MIN_DST_SRC)
688          blend->commutative_4bit |= chanmask;
689    }
690 }
691 
692 static struct radv_blend_state
693 radv_pipeline_init_blend_state(struct radv_graphics_pipeline *pipeline,
694                                const VkGraphicsPipelineCreateInfo *pCreateInfo,
695                                const struct radv_graphics_pipeline_info *info)
696 {
697    const struct radv_device *device = pipeline->base.device;
698    struct radv_blend_state blend = {0};
699    unsigned cb_color_control = 0;
700    const enum amd_gfx_level gfx_level = device->physical_device->rad_info.gfx_level;
701    int i;
702 
703    if (info->cb.logic_op_enable)
704       cb_color_control |= S_028808_ROP3(info->cb.logic_op);
705    else
706       cb_color_control |= S_028808_ROP3(V_028808_ROP3_COPY);
707 
708    if (device->instance->debug_flags & RADV_DEBUG_NO_ATOC_DITHERING)
709    {
710       blend.db_alpha_to_mask = S_028B70_ALPHA_TO_MASK_OFFSET0(2) | S_028B70_ALPHA_TO_MASK_OFFSET1(2) |
711                                S_028B70_ALPHA_TO_MASK_OFFSET2(2) | S_028B70_ALPHA_TO_MASK_OFFSET3(2) |
712                                S_028B70_OFFSET_ROUND(0);
713    }
714    else
715    {
716       blend.db_alpha_to_mask = S_028B70_ALPHA_TO_MASK_OFFSET0(3) | S_028B70_ALPHA_TO_MASK_OFFSET1(1) |
717                                S_028B70_ALPHA_TO_MASK_OFFSET2(0) | S_028B70_ALPHA_TO_MASK_OFFSET3(2) |
718                                S_028B70_OFFSET_ROUND(1);
719    }
720 
721    if (info->ms.alpha_to_coverage_enable) {
722       blend.db_alpha_to_mask |= S_028B70_ALPHA_TO_MASK_ENABLE(1);
723       blend.need_src_alpha |= 0x1;
724    }
725 
726    blend.cb_target_mask = 0;
727    for (i = 0; i < info->cb.att_count; i++) {
728       unsigned blend_cntl = 0;
729       unsigned srcRGB_opt, dstRGB_opt, srcA_opt, dstA_opt;
730       unsigned eqRGB = info->cb.att[i].color_blend_op;
731       unsigned srcRGB = info->cb.att[i].src_color_blend_factor;
732       unsigned dstRGB = info->cb.att[i].dst_color_blend_factor;
733       unsigned eqA = info->cb.att[i].alpha_blend_op;
734       unsigned srcA = info->cb.att[i].src_alpha_blend_factor;
735       unsigned dstA = info->cb.att[i].dst_alpha_blend_factor;
736 
737       blend.sx_mrt_blend_opt[i] = S_028760_COLOR_COMB_FCN(V_028760_OPT_COMB_BLEND_DISABLED) |
738                                   S_028760_ALPHA_COMB_FCN(V_028760_OPT_COMB_BLEND_DISABLED);
739 
740       if (!info->cb.att[i].color_write_mask)
741          continue;
742 
743       /* Ignore other blend targets if dual-source blending
744        * is enabled to prevent wrong behaviour.
745        */
746       if (blend.mrt0_is_dual_src)
747          continue;
748 
749       blend.cb_target_mask |= (unsigned)info->cb.att[i].color_write_mask << (4 * i);
750       blend.cb_target_enabled_4bit |= 0xfu << (4 * i);
751       if (!info->cb.att[i].blend_enable) {
752          blend.cb_blend_control[i] = blend_cntl;
753          continue;
754       }
755 
756       if (is_dual_src(gfx_level, srcRGB) || is_dual_src(gfx_level, dstRGB) ||
757           is_dual_src(gfx_level, srcA) || is_dual_src(gfx_level, dstA))
758          if (i == 0)
759             blend.mrt0_is_dual_src = true;
760 
761 
762       if (eqRGB == V_028780_COMB_MIN_DST_SRC || eqRGB == V_028780_COMB_MAX_DST_SRC) {
763          srcRGB = V_028780_BLEND_ONE;
764          dstRGB = V_028780_BLEND_ONE;
765       }
766       if (eqA == V_028780_COMB_MIN_DST_SRC || eqA == V_028780_COMB_MAX_DST_SRC) {
767          srcA = V_028780_BLEND_ONE;
768          dstA = V_028780_BLEND_ONE;
769       }
770 
771       radv_blend_check_commutativity(gfx_level, &blend, eqRGB, srcRGB, dstRGB, 0x7u << (4 * i));
772       radv_blend_check_commutativity(gfx_level, &blend, eqA, srcA, dstA, 0x8u << (4 * i));
773 
774       /* Blending optimizations for RB+.
775        * These transformations don't change the behavior.
776        *
777        * First, get rid of DST in the blend factors:
778        *    func(src * DST, dst * 0) ---> func(src * 0, dst * SRC)
779        */
780       si_blend_remove_dst(&eqRGB, &srcRGB, &dstRGB, V_028780_BLEND_DST_COLOR,
781                           V_028780_BLEND_SRC_COLOR);
782 
783       si_blend_remove_dst(&eqA, &srcA, &dstA, V_028780_BLEND_DST_COLOR,
784                           V_028780_BLEND_SRC_COLOR);
785 
786       si_blend_remove_dst(&eqA, &srcA, &dstA, V_028780_BLEND_DST_ALPHA,
787                           V_028780_BLEND_SRC_ALPHA);
788 
789       /* Look up the ideal settings from tables. */
790       srcRGB_opt = si_translate_blend_opt_factor(srcRGB, false);
791       dstRGB_opt = si_translate_blend_opt_factor(dstRGB, false);
792       srcA_opt = si_translate_blend_opt_factor(srcA, true);
793       dstA_opt = si_translate_blend_opt_factor(dstA, true);
794 
795       /* Handle interdependencies. */
796       if (si_blend_factor_uses_dst(srcRGB))
797          dstRGB_opt = V_028760_BLEND_OPT_PRESERVE_NONE_IGNORE_NONE;
798       if (si_blend_factor_uses_dst(srcA))
799          dstA_opt = V_028760_BLEND_OPT_PRESERVE_NONE_IGNORE_NONE;
800 
801       if (srcRGB == V_028780_BLEND_SRC_ALPHA_SATURATE &&
802           (dstRGB == V_028780_BLEND_ZERO || dstRGB == V_028780_BLEND_SRC_ALPHA ||
803            dstRGB == V_028780_BLEND_SRC_ALPHA_SATURATE))
804          dstRGB_opt = V_028760_BLEND_OPT_PRESERVE_NONE_IGNORE_A0;
805 
806       /* Set the final value. */
807       blend.sx_mrt_blend_opt[i] =
808          S_028760_COLOR_SRC_OPT(srcRGB_opt) | S_028760_COLOR_DST_OPT(dstRGB_opt) |
809          S_028760_COLOR_COMB_FCN(si_translate_blend_opt_function(eqRGB)) |
810          S_028760_ALPHA_SRC_OPT(srcA_opt) | S_028760_ALPHA_DST_OPT(dstA_opt) |
811          S_028760_ALPHA_COMB_FCN(si_translate_blend_opt_function(eqA));
812       blend_cntl |= S_028780_ENABLE(1);
813 
814       blend_cntl |= S_028780_COLOR_COMB_FCN(eqRGB);
815       blend_cntl |= S_028780_COLOR_SRCBLEND(srcRGB);
816       blend_cntl |= S_028780_COLOR_DESTBLEND(dstRGB);
817       if (srcA != srcRGB || dstA != dstRGB || eqA != eqRGB) {
818          blend_cntl |= S_028780_SEPARATE_ALPHA_BLEND(1);
819          blend_cntl |= S_028780_ALPHA_COMB_FCN(eqA);
820          blend_cntl |= S_028780_ALPHA_SRCBLEND(srcA);
821          blend_cntl |= S_028780_ALPHA_DESTBLEND(dstA);
822       }
823       blend.cb_blend_control[i] = blend_cntl;
824 
825       blend.blend_enable_4bit |= 0xfu << (i * 4);
826 
827       if (srcRGB == V_028780_BLEND_SRC_ALPHA || dstRGB == V_028780_BLEND_SRC_ALPHA ||
828           srcRGB == V_028780_BLEND_SRC_ALPHA_SATURATE ||
829           dstRGB == V_028780_BLEND_SRC_ALPHA_SATURATE ||
830           srcRGB == V_028780_BLEND_ONE_MINUS_SRC_ALPHA ||
831           dstRGB == V_028780_BLEND_ONE_MINUS_SRC_ALPHA)
832          blend.need_src_alpha |= 1 << i;
833    }
834    for (i = info->cb.att_count; i < 8; i++) {
835       blend.cb_blend_control[i] = 0;
836       blend.sx_mrt_blend_opt[i] = S_028760_COLOR_COMB_FCN(V_028760_OPT_COMB_BLEND_DISABLED) |
837                                   S_028760_ALPHA_COMB_FCN(V_028760_OPT_COMB_BLEND_DISABLED);
838    }
839 
840    if (device->physical_device->rad_info.has_rbplus) {
841       /* Disable RB+ blend optimizations for dual source blending. */
842       if (blend.mrt0_is_dual_src) {
843          for (i = 0; i < 8; i++) {
844             blend.sx_mrt_blend_opt[i] = S_028760_COLOR_COMB_FCN(V_028760_OPT_COMB_NONE) |
845                                         S_028760_ALPHA_COMB_FCN(V_028760_OPT_COMB_NONE);
846          }
847       }
848 
849       /* RB+ doesn't work with dual source blending, logic op and
850        * RESOLVE.
851        */
852       if (blend.mrt0_is_dual_src || info->cb.logic_op_enable ||
853           (device->physical_device->rad_info.gfx_level >= GFX11 && blend.blend_enable_4bit))
854          cb_color_control |= S_028808_DISABLE_DUAL_QUAD(1);
855    }
856 
857    if (blend.cb_target_mask)
858       cb_color_control |= S_028808_MODE(V_028808_CB_NORMAL);
859    else
860       cb_color_control |= S_028808_MODE(V_028808_CB_DISABLE);
861 
862    radv_pipeline_compute_spi_color_formats(pipeline, pCreateInfo, &blend, info);
863 
864    pipeline->cb_color_control = cb_color_control;
865 
866    return blend;
867 }
868 
869 static uint32_t
870 si_translate_fill(VkPolygonMode func)
871 {
872    switch (func) {
873    case VK_POLYGON_MODE_FILL:
874       return V_028814_X_DRAW_TRIANGLES;
875    case VK_POLYGON_MODE_LINE:
876       return V_028814_X_DRAW_LINES;
877    case VK_POLYGON_MODE_POINT:
878       return V_028814_X_DRAW_POINTS;
879    default:
880       assert(0);
881       return V_028814_X_DRAW_POINTS;
882    }
883 }
884 
885 static unsigned
886 radv_pipeline_color_samples(const struct radv_graphics_pipeline_info *info)
887 {
888    if (info->color_att_samples && radv_pipeline_has_color_attachments(&info->ri)) {
889       return info->color_att_samples;
890    }
891 
892    return info->ms.raster_samples;
893 }
894 
895 static unsigned
896 radv_pipeline_depth_samples(const struct radv_graphics_pipeline_info *info)
897 {
898    if (info->ds_att_samples && radv_pipeline_has_ds_attachments(&info->ri)) {
899       return info->ds_att_samples;
900    }
901 
902    return info->ms.raster_samples;
903 }
904 
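/* Number of samples the fragment shader runs per pixel: when sample shading is requested,
 * round minSampleShading * color samples up to the next power of two, otherwise 1.
 */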
905 static uint8_t
906 radv_pipeline_get_ps_iter_samples(const struct radv_graphics_pipeline_info *info)
907 {
908    uint32_t ps_iter_samples = 1;
909    uint32_t num_samples = radv_pipeline_color_samples(info);
910 
911    if (info->ms.sample_shading_enable) {
912       ps_iter_samples = ceilf(info->ms.min_sample_shading * num_samples);
913       ps_iter_samples = util_next_power_of_two(ps_iter_samples);
914    }
915    return ps_iter_samples;
916 }
917 
918 static bool
919 radv_is_depth_write_enabled(const struct radv_depth_stencil_info *ds_info)
920 {
921    return ds_info->depth_test_enable && ds_info->depth_write_enable &&
922           ds_info->depth_compare_op != VK_COMPARE_OP_NEVER;
923 }
924 
925 static bool
926 radv_writes_stencil(const struct radv_stencil_op_info *info)
927 {
928    return info->write_mask &&
929           (info->fail_op != VK_STENCIL_OP_KEEP || info->pass_op != VK_STENCIL_OP_KEEP ||
930            info->depth_fail_op != VK_STENCIL_OP_KEEP);
931 }
932 
933 static bool
934 radv_is_stencil_write_enabled(const struct radv_depth_stencil_info *ds_info)
935 {
936    return ds_info->stencil_test_enable &&
937           (radv_writes_stencil(&ds_info->front) || radv_writes_stencil(&ds_info->back));
938 }
939 
940 static bool
941 radv_order_invariant_stencil_op(VkStencilOp op)
942 {
943    /* REPLACE is normally order invariant, except when the stencil
944     * reference value is written by the fragment shader. Tracking this
945     * interaction does not seem worth the effort, so be conservative.
946     */
947    return op != VK_STENCIL_OP_INCREMENT_AND_CLAMP && op != VK_STENCIL_OP_DECREMENT_AND_CLAMP &&
948           op != VK_STENCIL_OP_REPLACE;
949 }
950 
951 static bool
952 radv_order_invariant_stencil_state(const struct radv_stencil_op_info *info)
953 {
954    /* Compute whether, assuming Z writes are disabled, this stencil state
955     * is order invariant in the sense that the set of passing fragments as
956     * well as the final stencil buffer result does not depend on the order
957     * of fragments.
958     */
959    return !info->write_mask ||
960           /* The following assumes that Z writes are disabled. */
961           (info->compare_op == VK_COMPARE_OP_ALWAYS &&
962            radv_order_invariant_stencil_op(info->pass_op) &&
963            radv_order_invariant_stencil_op(info->depth_fail_op)) ||
964           (info->compare_op == VK_COMPARE_OP_NEVER &&
965            radv_order_invariant_stencil_op(info->fail_op));
966 }
967 
968 static bool
969 radv_pipeline_has_dynamic_ds_states(const struct radv_graphics_pipeline *pipeline)
970 {
971    return !!(pipeline->dynamic_states & (RADV_DYNAMIC_DEPTH_TEST_ENABLE |
972                                          RADV_DYNAMIC_DEPTH_WRITE_ENABLE |
973                                          RADV_DYNAMIC_DEPTH_COMPARE_OP |
974                                          RADV_DYNAMIC_STENCIL_TEST_ENABLE |
975                                          RADV_DYNAMIC_STENCIL_WRITE_MASK |
976                                          RADV_DYNAMIC_STENCIL_OP));
977 }
978 
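/* Decide whether the driver can enable out-of-order rasterization on its own, based on
 * depth/stencil order invariance and on blending being limited to commutative equations.
 */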
979 static bool
980 radv_pipeline_out_of_order_rast(struct radv_graphics_pipeline *pipeline,
981                                 const struct radv_blend_state *blend,
982                                 const struct radv_graphics_pipeline_info *info)
983 {
984    unsigned colormask = blend->cb_target_enabled_4bit;
985 
986    if (!pipeline->base.device->physical_device->out_of_order_rast_allowed)
987       return false;
988 
989    /* Be conservative if a logic operation is enabled with color buffers. */
990    if (colormask && info->cb.logic_op_enable)
991       return false;
992 
993    /* Be conservative if an extended dynamic depth/stencil state is
994     * enabled because the driver can't update out-of-order rasterization
995     * dynamically.
996     */
997    if (radv_pipeline_has_dynamic_ds_states(pipeline))
998       return false;
999 
1000    /* Default depth/stencil invariance when no attachment is bound. */
1001    struct radv_dsa_order_invariance dsa_order_invariant = {.zs = true, .pass_set = true};
1002 
1003    bool has_stencil = info->ri.stencil_att_format != VK_FORMAT_UNDEFINED;
1004    struct radv_dsa_order_invariance order_invariance[2];
1005    struct radv_shader *ps = pipeline->base.shaders[MESA_SHADER_FRAGMENT];
1006 
1007    /* Compute depth/stencil order invariance in order to know if
1008     * it's safe to enable out-of-order.
1009     */
1010    bool zfunc_is_ordered = info->ds.depth_compare_op == VK_COMPARE_OP_NEVER ||
1011                            info->ds.depth_compare_op == VK_COMPARE_OP_LESS ||
1012                            info->ds.depth_compare_op == VK_COMPARE_OP_LESS_OR_EQUAL ||
1013                            info->ds.depth_compare_op == VK_COMPARE_OP_GREATER ||
1014                            info->ds.depth_compare_op == VK_COMPARE_OP_GREATER_OR_EQUAL;
1015    bool depth_write_enabled = radv_is_depth_write_enabled(&info->ds);
1016    bool stencil_write_enabled = radv_is_stencil_write_enabled(&info->ds);
1017    bool ds_write_enabled = depth_write_enabled || stencil_write_enabled;
1018 
1019    bool nozwrite_and_order_invariant_stencil =
1020       !ds_write_enabled ||
1021       (!depth_write_enabled && radv_order_invariant_stencil_state(&info->ds.front) &&
1022        radv_order_invariant_stencil_state(&info->ds.back));
1023 
1024    order_invariance[1].zs = nozwrite_and_order_invariant_stencil ||
1025                             (!stencil_write_enabled && zfunc_is_ordered);
1026    order_invariance[0].zs = !depth_write_enabled || zfunc_is_ordered;
1027 
1028    order_invariance[1].pass_set =
1029       nozwrite_and_order_invariant_stencil ||
1030       (!stencil_write_enabled &&
1031        (info->ds.depth_compare_op == VK_COMPARE_OP_ALWAYS ||
1032         info->ds.depth_compare_op == VK_COMPARE_OP_NEVER));
1033    order_invariance[0].pass_set =
1034       !depth_write_enabled ||
1035       (info->ds.depth_compare_op == VK_COMPARE_OP_ALWAYS ||
1036        info->ds.depth_compare_op == VK_COMPARE_OP_NEVER);
1037 
1038    dsa_order_invariant = order_invariance[has_stencil];
1039    if (!dsa_order_invariant.zs)
1040       return false;
1041 
1042    /* The set of PS invocations is always order invariant,
1043     * except when early Z/S tests are requested.
1044     */
1045    if (ps && ps->info.ps.writes_memory && ps->info.ps.early_fragment_test &&
1046        !dsa_order_invariant.pass_set)
1047       return false;
1048 
1049    /* Determine if out-of-order rasterization should be disabled when occlusion queries are used. */
1050    pipeline->disable_out_of_order_rast_for_occlusion = !dsa_order_invariant.pass_set;
1051 
1052    /* No color buffers are enabled for writing. */
1053    if (!colormask)
1054       return true;
1055 
1056    unsigned blendmask = colormask & blend->blend_enable_4bit;
1057 
1058    if (blendmask) {
1059       /* Only commutative blending. */
1060       if (blendmask & ~blend->commutative_4bit)
1061          return false;
1062 
1063       if (!dsa_order_invariant.pass_set)
1064          return false;
1065    }
1066 
1067    if (colormask & ~blendmask)
1068       return false;
1069 
1070    return true;
1071 }
1072 
1073 static void
1074 radv_pipeline_init_multisample_state(struct radv_graphics_pipeline *pipeline,
1075                                      const struct radv_blend_state *blend,
1076                                      const struct radv_graphics_pipeline_info *info,
1077                                      unsigned rast_prim)
1078 {
1079    const struct radv_physical_device *pdevice = pipeline->base.device->physical_device;
1080    struct radv_multisample_state *ms = &pipeline->ms;
1081    unsigned num_tile_pipes = pdevice->rad_info.num_tile_pipes;
1082    const VkConservativeRasterizationModeEXT mode = info->rs.conservative_mode;
1083    bool out_of_order_rast = false;
1084    int ps_iter_samples = 1;
1085 
1086    ms->num_samples = info->ms.raster_samples;
1087 
1088    /* From the Vulkan 1.1.129 spec, 26.7. Sample Shading:
1089     *
1090     * "Sample shading is enabled for a graphics pipeline:
1091     *
1092     * - If the interface of the fragment shader entry point of the
1093     *   graphics pipeline includes an input variable decorated
1094     *   with SampleId or SamplePosition. In this case
1095     *   minSampleShadingFactor takes the value 1.0.
1096     * - Else if the sampleShadingEnable member of the
1097     *   VkPipelineMultisampleStateCreateInfo structure specified
1098     *   when creating the graphics pipeline is set to VK_TRUE. In
1099     *   this case minSampleShadingFactor takes the value of
1100     *   VkPipelineMultisampleStateCreateInfo::minSampleShading.
1101     *
1102     * Otherwise, sample shading is considered disabled."
1103     */
1104    if (pipeline->base.shaders[MESA_SHADER_FRAGMENT]->info.ps.uses_sample_shading) {
1105       ps_iter_samples = ms->num_samples;
1106    } else {
1107       ps_iter_samples = radv_pipeline_get_ps_iter_samples(info);
1108    }
1109 
1110    if (info->rs.order == VK_RASTERIZATION_ORDER_RELAXED_AMD) {
1111       /* Out-of-order rasterization is explicitly enabled by the
1112        * application.
1113        */
1114       out_of_order_rast = true;
1115    } else {
1116       /* Determine if the driver can enable out-of-order
1117        * rasterization internally.
1118        */
1119       out_of_order_rast = radv_pipeline_out_of_order_rast(pipeline, blend, info);
1120    }
1121 
1122    ms->pa_sc_aa_config = 0;
1123    ms->db_eqaa = S_028804_HIGH_QUALITY_INTERSECTIONS(1) | S_028804_INCOHERENT_EQAA_READS(1) |
1124                  S_028804_INTERPOLATE_COMP_Z(1) | S_028804_STATIC_ANCHOR_ASSOCIATIONS(1);
1125 
1126    /* Adjust MSAA state if conservative rasterization is enabled. */
1127    if (mode != VK_CONSERVATIVE_RASTERIZATION_MODE_DISABLED_EXT) {
1128       ms->pa_sc_aa_config |= S_028BE0_AA_MASK_CENTROID_DTMN(1);
1129 
1130       ms->db_eqaa |=
1131          S_028804_ENABLE_POSTZ_OVERRASTERIZATION(1) | S_028804_OVERRASTERIZATION_AMOUNT(4);
1132    }
1133 
1134    ms->pa_sc_mode_cntl_1 =
1135       S_028A4C_WALK_FENCE_ENABLE(1) | // TODO linear dst fixes
1136       S_028A4C_WALK_FENCE_SIZE(num_tile_pipes == 2 ? 2 : 3) |
1137       S_028A4C_OUT_OF_ORDER_PRIMITIVE_ENABLE(out_of_order_rast) |
1138       S_028A4C_OUT_OF_ORDER_WATER_MARK(0x7) |
1139       /* always 1: */
1140       S_028A4C_WALK_ALIGN8_PRIM_FITS_ST(1) | S_028A4C_SUPERTILE_WALK_ORDER_ENABLE(1) |
1141       S_028A4C_TILE_WALK_ORDER_ENABLE(1) | S_028A4C_MULTI_SHADER_ENGINE_PRIM_DISCARD_ENABLE(1) |
1142       S_028A4C_FORCE_EOV_CNTDWN_ENABLE(1) | S_028A4C_FORCE_EOV_REZ_ENABLE(1);
1143    ms->pa_sc_mode_cntl_0 = S_028A48_ALTERNATE_RBS_PER_TILE(pdevice->rad_info.gfx_level >= GFX9) |
1144                            S_028A48_VPORT_SCISSOR_ENABLE(1) |
1145                            S_028A48_LINE_STIPPLE_ENABLE(info->rs.stippled_line_enable);
1146 
1147    if (info->rs.line_raster_mode == VK_LINE_RASTERIZATION_MODE_BRESENHAM_EXT &&
1148        radv_rast_prim_is_line(rast_prim)) {
1149       /* From the Vulkan spec 1.3.221:
1150        *
1151        * "When Bresenham lines are being rasterized, sample locations may all be treated as being at
1152        * the pixel center (this may affect attribute and depth interpolation)."
1153        *
1154        * "One consequence of this is that Bresenham lines cover the same pixels regardless of the
1155        * number of rasterization samples, and cover all samples in those pixels (unless masked out
1156        * or killed)."
1157        */
1158       ms->num_samples = 1;
1159    }
1160 
1161    if (ms->num_samples > 1) {
1162       uint32_t z_samples = radv_pipeline_depth_samples(info);
1163       unsigned log_samples = util_logbase2(ms->num_samples);
1164       unsigned log_z_samples = util_logbase2(z_samples);
1165       unsigned log_ps_iter_samples = util_logbase2(ps_iter_samples);
1166       ms->pa_sc_mode_cntl_0 |= S_028A48_MSAA_ENABLE(1);
1167       ms->db_eqaa |= S_028804_MAX_ANCHOR_SAMPLES(log_z_samples) |
1168                      S_028804_PS_ITER_SAMPLES(log_ps_iter_samples) |
1169                      S_028804_MASK_EXPORT_NUM_SAMPLES(log_samples) |
1170                      S_028804_ALPHA_TO_MASK_NUM_SAMPLES(log_samples);
1171       ms->pa_sc_aa_config |=
1172          S_028BE0_MSAA_NUM_SAMPLES(log_samples) |
1173          S_028BE0_MAX_SAMPLE_DIST(radv_get_default_max_sample_dist(log_samples)) |
1174          S_028BE0_MSAA_EXPOSED_SAMPLES(log_samples) | /* CM_R_028BE0_PA_SC_AA_CONFIG */
1175          S_028BE0_COVERED_CENTROID_IS_CENTER(pdevice->rad_info.gfx_level >= GFX10_3);
1176       ms->pa_sc_mode_cntl_1 |= S_028A4C_PS_ITER_SAMPLE(ps_iter_samples > 1);
1177       if (ps_iter_samples > 1)
1178          pipeline->spi_baryc_cntl |= S_0286E0_POS_FLOAT_LOCATION(2);
1179    }
1180 
1181    ms->pa_sc_aa_mask[0] = info->ms.sample_mask | ((uint32_t)info->ms.sample_mask << 16);
1182    ms->pa_sc_aa_mask[1] = info->ms.sample_mask | ((uint32_t)info->ms.sample_mask << 16);
1183 }
1184 
1185 static void
1186 gfx103_pipeline_init_vrs_state(struct radv_graphics_pipeline *pipeline,
1187                                const struct radv_graphics_pipeline_info *info)
1188 {
1189    struct radv_shader *ps = pipeline->base.shaders[MESA_SHADER_FRAGMENT];
1190    struct radv_multisample_state *ms = &pipeline->ms;
1191    struct radv_vrs_state *vrs = &pipeline->vrs;
1192 
1193    if (info->ms.sample_shading_enable ||
1194        ps->info.ps.uses_sample_shading || ps->info.ps.reads_sample_mask_in) {
1195       /* Disable VRS and use the rates from PS_ITER_SAMPLES if:
1196        *
1197        * 1) sample shading is enabled or per-sample interpolation is
1198        *    used by the fragment shader
1199        * 2) the fragment shader reads gl_SampleMaskIn because the
1200        *    16-bit sample coverage mask isn't enough for MSAA8x and
1201        *    2x2 coarse shading isn't enough.
1202        */
1203       vrs->pa_cl_vrs_cntl = S_028848_SAMPLE_ITER_COMBINER_MODE(V_028848_VRS_COMB_MODE_OVERRIDE);
1204 
1205       /* Make sure sample shading is enabled even if only MSAA1x is
1206        * used because the SAMPLE_ITER combiner is in passthrough
1207        * mode if PS_ITER_SAMPLE is 0, and it uses the per-draw rate.
1208        * The default VRS rate when sample shading is enabled is 1x1.
1209        */
1210       if (!G_028A4C_PS_ITER_SAMPLE(ms->pa_sc_mode_cntl_1))
1211          ms->pa_sc_mode_cntl_1 |= S_028A4C_PS_ITER_SAMPLE(1);
1212    } else {
1213       vrs->pa_cl_vrs_cntl = S_028848_SAMPLE_ITER_COMBINER_MODE(V_028848_VRS_COMB_MODE_PASSTHRU);
1214    }
1215 }
1216 
1217 static uint32_t
1218 si_conv_tess_prim_to_gs_out(enum tess_primitive_mode prim)
1219 {
1220    switch (prim) {
1221    case TESS_PRIMITIVE_TRIANGLES:
1222    case TESS_PRIMITIVE_QUADS:
1223       return V_028A6C_TRISTRIP;
1224    case TESS_PRIMITIVE_ISOLINES:
1225       return V_028A6C_LINESTRIP;
1226    default:
1227       assert(0);
1228       return 0;
1229    }
1230 }
1231 
1232 static uint32_t
1233 si_conv_gl_prim_to_gs_out(unsigned gl_prim)
1234 {
1235    switch (gl_prim) {
1236    case SHADER_PRIM_POINTS:
1237       return V_028A6C_POINTLIST;
1238    case SHADER_PRIM_LINES:
1239    case SHADER_PRIM_LINE_STRIP:
1240    case SHADER_PRIM_LINES_ADJACENCY:
1241       return V_028A6C_LINESTRIP;
1242 
1243    case SHADER_PRIM_TRIANGLES:
1244    case SHADER_PRIM_TRIANGLE_STRIP_ADJACENCY:
1245    case SHADER_PRIM_TRIANGLE_STRIP:
1246    case SHADER_PRIM_QUADS:
1247       return V_028A6C_TRISTRIP;
1248    default:
1249       assert(0);
1250       return 0;
1251    }
1252 }
1253 
1254 static uint64_t
1255 radv_dynamic_state_mask(VkDynamicState state)
1256 {
1257    switch (state) {
1258    case VK_DYNAMIC_STATE_VIEWPORT:
1259    case VK_DYNAMIC_STATE_VIEWPORT_WITH_COUNT:
1260       return RADV_DYNAMIC_VIEWPORT;
1261    case VK_DYNAMIC_STATE_SCISSOR:
1262    case VK_DYNAMIC_STATE_SCISSOR_WITH_COUNT:
1263       return RADV_DYNAMIC_SCISSOR;
1264    case VK_DYNAMIC_STATE_LINE_WIDTH:
1265       return RADV_DYNAMIC_LINE_WIDTH;
1266    case VK_DYNAMIC_STATE_DEPTH_BIAS:
1267       return RADV_DYNAMIC_DEPTH_BIAS;
1268    case VK_DYNAMIC_STATE_BLEND_CONSTANTS:
1269       return RADV_DYNAMIC_BLEND_CONSTANTS;
1270    case VK_DYNAMIC_STATE_DEPTH_BOUNDS:
1271       return RADV_DYNAMIC_DEPTH_BOUNDS;
1272    case VK_DYNAMIC_STATE_STENCIL_COMPARE_MASK:
1273       return RADV_DYNAMIC_STENCIL_COMPARE_MASK;
1274    case VK_DYNAMIC_STATE_STENCIL_WRITE_MASK:
1275       return RADV_DYNAMIC_STENCIL_WRITE_MASK;
1276    case VK_DYNAMIC_STATE_STENCIL_REFERENCE:
1277       return RADV_DYNAMIC_STENCIL_REFERENCE;
1278    case VK_DYNAMIC_STATE_DISCARD_RECTANGLE_EXT:
1279       return RADV_DYNAMIC_DISCARD_RECTANGLE;
1280    case VK_DYNAMIC_STATE_SAMPLE_LOCATIONS_EXT:
1281       return RADV_DYNAMIC_SAMPLE_LOCATIONS;
1282    case VK_DYNAMIC_STATE_LINE_STIPPLE_EXT:
1283       return RADV_DYNAMIC_LINE_STIPPLE;
1284    case VK_DYNAMIC_STATE_CULL_MODE:
1285       return RADV_DYNAMIC_CULL_MODE;
1286    case VK_DYNAMIC_STATE_FRONT_FACE:
1287       return RADV_DYNAMIC_FRONT_FACE;
1288    case VK_DYNAMIC_STATE_PRIMITIVE_TOPOLOGY:
1289       return RADV_DYNAMIC_PRIMITIVE_TOPOLOGY;
1290    case VK_DYNAMIC_STATE_DEPTH_TEST_ENABLE:
1291       return RADV_DYNAMIC_DEPTH_TEST_ENABLE;
1292    case VK_DYNAMIC_STATE_DEPTH_WRITE_ENABLE:
1293       return RADV_DYNAMIC_DEPTH_WRITE_ENABLE;
1294    case VK_DYNAMIC_STATE_DEPTH_COMPARE_OP:
1295       return RADV_DYNAMIC_DEPTH_COMPARE_OP;
1296    case VK_DYNAMIC_STATE_DEPTH_BOUNDS_TEST_ENABLE:
1297       return RADV_DYNAMIC_DEPTH_BOUNDS_TEST_ENABLE;
1298    case VK_DYNAMIC_STATE_STENCIL_TEST_ENABLE:
1299       return RADV_DYNAMIC_STENCIL_TEST_ENABLE;
1300    case VK_DYNAMIC_STATE_STENCIL_OP:
1301       return RADV_DYNAMIC_STENCIL_OP;
1302    case VK_DYNAMIC_STATE_VERTEX_INPUT_BINDING_STRIDE:
1303       return RADV_DYNAMIC_VERTEX_INPUT_BINDING_STRIDE;
1304    case VK_DYNAMIC_STATE_FRAGMENT_SHADING_RATE_KHR:
1305       return RADV_DYNAMIC_FRAGMENT_SHADING_RATE;
1306    case VK_DYNAMIC_STATE_PATCH_CONTROL_POINTS_EXT:
1307       return RADV_DYNAMIC_PATCH_CONTROL_POINTS;
1308    case VK_DYNAMIC_STATE_RASTERIZER_DISCARD_ENABLE:
1309       return RADV_DYNAMIC_RASTERIZER_DISCARD_ENABLE;
1310    case VK_DYNAMIC_STATE_DEPTH_BIAS_ENABLE:
1311       return RADV_DYNAMIC_DEPTH_BIAS_ENABLE;
1312    case VK_DYNAMIC_STATE_LOGIC_OP_EXT:
1313       return RADV_DYNAMIC_LOGIC_OP;
1314    case VK_DYNAMIC_STATE_PRIMITIVE_RESTART_ENABLE:
1315       return RADV_DYNAMIC_PRIMITIVE_RESTART_ENABLE;
1316    case VK_DYNAMIC_STATE_COLOR_WRITE_ENABLE_EXT:
1317       return RADV_DYNAMIC_COLOR_WRITE_ENABLE;
1318    case VK_DYNAMIC_STATE_VERTEX_INPUT_EXT:
1319       return RADV_DYNAMIC_VERTEX_INPUT;
1320    default:
1321       unreachable("Unhandled dynamic state");
1322    }
1323 }
1324 
1325 static bool
1326 radv_pipeline_is_blend_enabled(const struct radv_graphics_pipeline *pipeline,
1327                                const struct radv_color_blend_info *cb_info)
1328 {
1329    for (uint32_t i = 0; i < cb_info->att_count; i++) {
1330       if (cb_info->att[i].color_write_mask && cb_info->att[i].blend_enable)
1331          return true;
1332    }
1333 
1334    return false;
1335 }
1336 
1337 static uint64_t
1338 radv_pipeline_needed_dynamic_state(const struct radv_graphics_pipeline *pipeline,
1339                                    const struct radv_graphics_pipeline_info *info)
1340 {
1341    bool has_color_att = radv_pipeline_has_color_attachments(&info->ri);
1342    bool raster_enabled = !info->rs.discard_enable ||
1343                          (pipeline->dynamic_states & RADV_DYNAMIC_RASTERIZER_DISCARD_ENABLE);
1344    uint64_t states = RADV_DYNAMIC_ALL;
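   /* Start from the full mask and prune states whose values this pipeline can never consume. */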
1345 
1346    /* Disable dynamic states that are useless to mesh shading. */
1347    if (radv_pipeline_has_stage(pipeline, MESA_SHADER_MESH)) {
1348       if (!raster_enabled)
1349          return RADV_DYNAMIC_RASTERIZER_DISCARD_ENABLE;
1350 
1351       states &= ~(RADV_DYNAMIC_VERTEX_INPUT | RADV_DYNAMIC_VERTEX_INPUT_BINDING_STRIDE |
1352                   RADV_DYNAMIC_PRIMITIVE_RESTART_ENABLE | RADV_DYNAMIC_PRIMITIVE_TOPOLOGY);
1353    }
1354 
1355    /* If rasterization is disabled we do not care about any of the
1356     * dynamic states, since they only affect rasterization, except for
1357     * primitive topology, primitive restart enable, vertex input,
1358     * vertex binding stride and rasterizer discard itself.
1359     */
1360    if (!raster_enabled) {
1361       return RADV_DYNAMIC_PRIMITIVE_TOPOLOGY | RADV_DYNAMIC_VERTEX_INPUT_BINDING_STRIDE |
1362              RADV_DYNAMIC_PRIMITIVE_RESTART_ENABLE | RADV_DYNAMIC_RASTERIZER_DISCARD_ENABLE |
1363              RADV_DYNAMIC_VERTEX_INPUT;
1364    }
1365 
1366    if (!info->rs.depth_bias_enable &&
1367        !(pipeline->dynamic_states & RADV_DYNAMIC_DEPTH_BIAS_ENABLE))
1368       states &= ~RADV_DYNAMIC_DEPTH_BIAS;
1369 
1370    if (!info->ds.depth_bounds_test_enable &&
1371        !(pipeline->dynamic_states & RADV_DYNAMIC_DEPTH_BOUNDS_TEST_ENABLE))
1372       states &= ~RADV_DYNAMIC_DEPTH_BOUNDS;
1373 
1374    if (!info->ds.stencil_test_enable &&
1375        !(pipeline->dynamic_states & RADV_DYNAMIC_STENCIL_TEST_ENABLE))
1376       states &= ~(RADV_DYNAMIC_STENCIL_COMPARE_MASK | RADV_DYNAMIC_STENCIL_WRITE_MASK |
1377                   RADV_DYNAMIC_STENCIL_REFERENCE | RADV_DYNAMIC_STENCIL_OP);
1378 
1379    if (!info->dr.count)
1380       states &= ~RADV_DYNAMIC_DISCARD_RECTANGLE;
1381 
1382    if (!info->ms.sample_locs_enable)
1383       states &= ~RADV_DYNAMIC_SAMPLE_LOCATIONS;
1384 
1385    if (!info->rs.stippled_line_enable)
1386       states &= ~RADV_DYNAMIC_LINE_STIPPLE;
1387 
1388    if (!radv_is_vrs_enabled(pipeline, info))
1389       states &= ~RADV_DYNAMIC_FRAGMENT_SHADING_RATE;
1390 
1391    if (!has_color_att || !radv_pipeline_is_blend_enabled(pipeline, &info->cb))
1392       states &= ~RADV_DYNAMIC_BLEND_CONSTANTS;
1393 
1394    if (!has_color_att)
1395       states &= ~RADV_DYNAMIC_COLOR_WRITE_ENABLE;
1396 
1397    return states;
1398 }
1399 
1400 static struct radv_ia_multi_vgt_param_helpers
1401 radv_compute_ia_multi_vgt_param_helpers(struct radv_graphics_pipeline *pipeline)
1402 {
1403    const struct radv_physical_device *pdevice = pipeline->base.device->physical_device;
1404    struct radv_ia_multi_vgt_param_helpers ia_multi_vgt_param = {0};
1405 
1406    if (radv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_CTRL))
1407       ia_multi_vgt_param.primgroup_size =
1408          pipeline->base.shaders[MESA_SHADER_TESS_CTRL]->info.num_tess_patches;
1409    else if (radv_pipeline_has_stage(pipeline, MESA_SHADER_GEOMETRY))
1410       ia_multi_vgt_param.primgroup_size = 64;
1411    else
1412       ia_multi_vgt_param.primgroup_size = 128; /* recommended without a GS */
1413 
1414    /* GS requirement. */
1415    ia_multi_vgt_param.partial_es_wave = false;
1416    if (radv_pipeline_has_stage(pipeline, MESA_SHADER_GEOMETRY) && pdevice->rad_info.gfx_level <= GFX8)
1417       if (SI_GS_PER_ES / ia_multi_vgt_param.primgroup_size >= pdevice->gs_table_depth - 3)
1418          ia_multi_vgt_param.partial_es_wave = true;
1419 
1420    ia_multi_vgt_param.ia_switch_on_eoi = false;
1421    if (pipeline->base.shaders[MESA_SHADER_FRAGMENT]->info.ps.prim_id_input)
1422       ia_multi_vgt_param.ia_switch_on_eoi = true;
1423    if (radv_pipeline_has_stage(pipeline, MESA_SHADER_GEOMETRY) && pipeline->base.shaders[MESA_SHADER_GEOMETRY]->info.uses_prim_id)
1424       ia_multi_vgt_param.ia_switch_on_eoi = true;
1425    if (radv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_CTRL)) {
1426       /* SWITCH_ON_EOI must be set if PrimID is used. */
1427       if (pipeline->base.shaders[MESA_SHADER_TESS_CTRL]->info.uses_prim_id ||
1428           radv_get_shader(&pipeline->base, MESA_SHADER_TESS_EVAL)->info.uses_prim_id)
1429          ia_multi_vgt_param.ia_switch_on_eoi = true;
1430    }
1431 
1432    ia_multi_vgt_param.partial_vs_wave = false;
1433    if (radv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_CTRL)) {
1434       /* Bug with tessellation and GS on Bonaire and older 2 SE chips. */
1435       if ((pdevice->rad_info.family == CHIP_TAHITI ||
1436            pdevice->rad_info.family == CHIP_PITCAIRN ||
1437            pdevice->rad_info.family == CHIP_BONAIRE) &&
1438           radv_pipeline_has_stage(pipeline, MESA_SHADER_GEOMETRY))
1439          ia_multi_vgt_param.partial_vs_wave = true;
1440       /* Needed for 028B6C_DISTRIBUTION_MODE != 0 */
1441       if (pdevice->rad_info.has_distributed_tess) {
1442          if (radv_pipeline_has_stage(pipeline, MESA_SHADER_GEOMETRY)) {
1443             if (pdevice->rad_info.gfx_level <= GFX8)
1444                ia_multi_vgt_param.partial_es_wave = true;
1445          } else {
1446             ia_multi_vgt_param.partial_vs_wave = true;
1447          }
1448       }
1449    }
1450 
1451    if (radv_pipeline_has_stage(pipeline, MESA_SHADER_GEOMETRY)) {
1452       /* On these chips there is the possibility of a hang if the
1453        * pipeline uses a GS and partial_vs_wave is not set.
1454        *
1455        * This mostly does not hit 4-SE chips, as those typically set
1456        * ia_switch_on_eoi and then partial_vs_wave is set for pipelines
1457        * with GS due to another workaround.
1458        *
1459        * Reproducer: https://bugs.freedesktop.org/show_bug.cgi?id=109242
1460        */
1461       if (pdevice->rad_info.family == CHIP_TONGA ||
1462           pdevice->rad_info.family == CHIP_FIJI ||
1463           pdevice->rad_info.family == CHIP_POLARIS10 ||
1464           pdevice->rad_info.family == CHIP_POLARIS11 ||
1465           pdevice->rad_info.family == CHIP_POLARIS12 ||
1466           pdevice->rad_info.family == CHIP_VEGAM) {
1467          ia_multi_vgt_param.partial_vs_wave = true;
1468       }
1469    }
1470 
1471    ia_multi_vgt_param.base =
1472       S_028AA8_PRIMGROUP_SIZE(ia_multi_vgt_param.primgroup_size - 1) |
1473       /* The following field was moved to VGT_SHADER_STAGES_EN in GFX9. */
1474       S_028AA8_MAX_PRIMGRP_IN_WAVE(pdevice->rad_info.gfx_level == GFX8 ? 2 : 0) |
1475       S_030960_EN_INST_OPT_BASIC(pdevice->rad_info.gfx_level >= GFX9) |
1476       S_030960_EN_INST_OPT_ADV(pdevice->rad_info.gfx_level >= GFX9);
1477 
1478    return ia_multi_vgt_param;
1479 }
1480 
1481 static uint32_t
1482 radv_get_attrib_stride(const VkPipelineVertexInputStateCreateInfo *vi, uint32_t attrib_binding)
1483 {
1484    for (uint32_t i = 0; i < vi->vertexBindingDescriptionCount; i++) {
1485       const VkVertexInputBindingDescription *input_binding = &vi->pVertexBindingDescriptions[i];
1486 
1487       if (input_binding->binding == attrib_binding)
1488          return input_binding->stride;
1489    }
1490 
1491    return 0;
1492 }
1493 
1494 static struct radv_vertex_input_info
1495 radv_pipeline_init_vertex_input_info(struct radv_graphics_pipeline *pipeline,
1496                                      const VkGraphicsPipelineCreateInfo *pCreateInfo)
1497 {
1498    const struct radv_physical_device *pdevice = pipeline->base.device->physical_device;
1499    const VkPipelineVertexInputStateCreateInfo *vi = pCreateInfo->pVertexInputState;
1500    struct radv_vertex_input_info info = {0};
1501 
1502    if (!(pipeline->dynamic_states & RADV_DYNAMIC_VERTEX_INPUT)) {
1503       /* Vertex input */
1504       const VkPipelineVertexInputDivisorStateCreateInfoEXT *divisor_state =
1505          vk_find_struct_const(vi->pNext, PIPELINE_VERTEX_INPUT_DIVISOR_STATE_CREATE_INFO_EXT);
1506 
1507       uint32_t binding_input_rate = 0;
1508       uint32_t instance_rate_divisors[MAX_VERTEX_ATTRIBS];
1509       for (unsigned i = 0; i < vi->vertexBindingDescriptionCount; ++i) {
1510          const VkVertexInputBindingDescription *desc = &vi->pVertexBindingDescriptions[i];
1511 
1512          if (desc->inputRate) {
1513             unsigned binding = vi->pVertexBindingDescriptions[i].binding;
1514             binding_input_rate |= 1u << binding;
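            /* Instanced attributes default to a divisor of 1 (advance once per
             * instance); the VK_EXT_vertex_attribute_divisor state below may
             * override this.
             */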
1515             instance_rate_divisors[binding] = 1;
1516          }
1517 
1518          info.binding_stride[desc->binding] = desc->stride;
1519       }
1520 
1521       if (divisor_state) {
1522          for (unsigned i = 0; i < divisor_state->vertexBindingDivisorCount; ++i) {
1523             instance_rate_divisors[divisor_state->pVertexBindingDivisors[i].binding] =
1524                divisor_state->pVertexBindingDivisors[i].divisor;
1525          }
1526       }
1527 
1528       for (unsigned i = 0; i < vi->vertexAttributeDescriptionCount; ++i) {
1529          const VkVertexInputAttributeDescription *desc = &vi->pVertexAttributeDescriptions[i];
1530          const struct util_format_description *format_desc;
1531          unsigned location = desc->location;
1532          unsigned binding = desc->binding;
1533          unsigned num_format, data_format;
1534          bool post_shuffle;
1535 
1536          if (binding_input_rate & (1u << binding)) {
1537             info.instance_rate_inputs |= 1u << location;
1538             info.instance_rate_divisors[location] = instance_rate_divisors[binding];
1539          }
1540 
1541          format_desc = vk_format_description(desc->format);
1542          radv_translate_vertex_format(pdevice, desc->format, format_desc, &data_format, &num_format,
1543                                       &post_shuffle, &info.vertex_alpha_adjust[location]);
1544 
1545          info.vertex_attribute_formats[location] = data_format | (num_format << 4);
1546          info.vertex_attribute_bindings[location] = desc->binding;
1547          info.vertex_attribute_offsets[location] = desc->offset;
1548 
1549          const struct ac_data_format_info *dfmt_info = ac_get_data_format_info(data_format);
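         /* Require per-channel alignment when the format has byte-sized channels;
          * otherwise fall back to aligning on the whole element size.
          */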
1550          unsigned attrib_align =
1551             dfmt_info->chan_byte_size ? dfmt_info->chan_byte_size : dfmt_info->element_size;
1552 
1553          /* If desc->offset is misaligned, then the buffer offset must be too. Just
1554           * skip updating vertex_binding_align in this case.
1555           */
1556          if (desc->offset % attrib_align == 0)
1557             info.vertex_binding_align[desc->binding] =
1558                MAX2(info.vertex_binding_align[desc->binding], attrib_align);
1559 
1560          if (!(pipeline->dynamic_states & RADV_DYNAMIC_VERTEX_INPUT_BINDING_STRIDE)) {
1561             /* From the Vulkan spec 1.2.157:
1562              *
1563              * "If the bound pipeline state object was created
1564              *  with the
1565              *  VK_DYNAMIC_STATE_VERTEX_INPUT_BINDING_STRIDE
1566              *  dynamic state enabled then pStrides[i] specifies
1567              *  the distance in bytes between two consecutive
1568              *  elements within the corresponding buffer. In this
1569              *  case the VkVertexInputBindingDescription::stride
1570              *  state from the pipeline state object is ignored."
1571              *
1572              * Make sure the vertex attribute stride stays zero in
1573              * that case, to avoid computing a wrong offset from a
1574              * value the spec requires us to ignore.
1575              */
1576             info.vertex_attribute_strides[location] = radv_get_attrib_stride(vi, desc->binding);
1577          }
1578 
1579          if (post_shuffle)
1580             info.vertex_post_shuffle |= 1 << location;
1581 
1582          uint32_t end = desc->offset + vk_format_get_blocksize(desc->format);
1583          info.attrib_ends[desc->location] = end;
1584          if (info.binding_stride[desc->binding])
1585             info.attrib_index_offset[desc->location] =
1586                desc->offset / info.binding_stride[desc->binding];
1587          info.attrib_bindings[desc->location] = desc->binding;
1588       }
1589    }
1590 
1591    return info;
1592 }
1593 
1594 static struct radv_input_assembly_info
1595 radv_pipeline_init_input_assembly_info(struct radv_graphics_pipeline *pipeline,
1596                                        const VkGraphicsPipelineCreateInfo *pCreateInfo)
1597 {
1598    const VkPipelineInputAssemblyStateCreateInfo *ia = pCreateInfo->pInputAssemblyState;
1599    struct radv_input_assembly_info info = {0};
1600 
1601    info.primitive_topology = si_translate_prim(ia->topology);
1602    info.primitive_restart_enable = !!ia->primitiveRestartEnable;
1603 
1604    return info;
1605 }
1606 
1607 static struct radv_tessellation_info
1608 radv_pipeline_init_tessellation_info(struct radv_graphics_pipeline *pipeline,
1609                                      const VkGraphicsPipelineCreateInfo *pCreateInfo)
1610 {
1611    const VkPipelineTessellationStateCreateInfo *ts = pCreateInfo->pTessellationState;
1612    const VkShaderStageFlagBits tess_stages = VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT |
1613                                              VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT;
1614    struct radv_tessellation_info info = {0};
1615 
1616    if ((pipeline->active_stages & tess_stages) == tess_stages) {
1617       info.patch_control_points = ts->patchControlPoints;
1618 
1619       const VkPipelineTessellationDomainOriginStateCreateInfo *domain_origin_state =
1620          vk_find_struct_const(ts->pNext, PIPELINE_TESSELLATION_DOMAIN_ORIGIN_STATE_CREATE_INFO);
1621       if (domain_origin_state) {
1622          info.domain_origin = domain_origin_state->domainOrigin;
1623       }
1624    }
1625 
1626    return info;
1627 }
1628 
1629 static struct radv_viewport_info
1630 radv_pipeline_init_viewport_info(struct radv_graphics_pipeline *pipeline,
1631                                  const VkGraphicsPipelineCreateInfo *pCreateInfo)
1632 {
1633    const VkPipelineViewportStateCreateInfo *vp = pCreateInfo->pViewportState;
1634    struct radv_viewport_info info = {0};
1635 
1636    if (radv_is_raster_enabled(pipeline, pCreateInfo)) {
1637       if (!(pipeline->dynamic_states & RADV_DYNAMIC_VIEWPORT)) {
1638          typed_memcpy(info.viewports, vp->pViewports, vp->viewportCount);
1639       }
1640       info.viewport_count = vp->viewportCount;
1641 
1642       if (!(pipeline->dynamic_states & RADV_DYNAMIC_SCISSOR)) {
1643          typed_memcpy(info.scissors, vp->pScissors, vp->scissorCount);
1644       }
1645       info.scissor_count = vp->scissorCount;
1646 
1647       const VkPipelineViewportDepthClipControlCreateInfoEXT *depth_clip_control =
1648          vk_find_struct_const(vp->pNext, PIPELINE_VIEWPORT_DEPTH_CLIP_CONTROL_CREATE_INFO_EXT);
1649       if (depth_clip_control) {
1650          info.negative_one_to_one = !!depth_clip_control->negativeOneToOne;
1651       }
1652    }
1653 
1654    return info;
1655 }
1656 
1657 static struct radv_rasterization_info
1658 radv_pipeline_init_rasterization_info(struct radv_graphics_pipeline *pipeline,
1659                                       const VkGraphicsPipelineCreateInfo *pCreateInfo)
1660 {
1661    const VkPipelineRasterizationStateCreateInfo *rs = pCreateInfo->pRasterizationState;
1662    struct radv_rasterization_info info = {0};
1663 
1664    info.discard_enable = rs->rasterizerDiscardEnable;
1665    info.front_face = rs->frontFace;
1666    info.cull_mode = rs->cullMode;
1667    info.polygon_mode = si_translate_fill(rs->polygonMode);
1668    info.depth_bias_enable = rs->depthBiasEnable;
1669    info.depth_clamp_enable = rs->depthClampEnable;
1670    info.line_width = rs->lineWidth;
1671    info.depth_bias_constant_factor = rs->depthBiasConstantFactor;
1672    info.depth_bias_clamp = rs->depthBiasClamp;
1673    info.depth_bias_slope_factor = rs->depthBiasSlopeFactor;
1674    info.depth_clip_disable = rs->depthClampEnable;
1675 
1676    const VkPipelineRasterizationProvokingVertexStateCreateInfoEXT *provoking_vtx_info =
1677       vk_find_struct_const(rs->pNext, PIPELINE_RASTERIZATION_PROVOKING_VERTEX_STATE_CREATE_INFO_EXT);
1678    if (provoking_vtx_info &&
1679        provoking_vtx_info->provokingVertexMode == VK_PROVOKING_VERTEX_MODE_LAST_VERTEX_EXT) {
1680       info.provoking_vtx_last = true;
1681    }
1682 
1683    const VkPipelineRasterizationConservativeStateCreateInfoEXT *conservative_raster =
1684       vk_find_struct_const(rs->pNext, PIPELINE_RASTERIZATION_CONSERVATIVE_STATE_CREATE_INFO_EXT);
1685    if (conservative_raster) {
1686       info.conservative_mode = conservative_raster->conservativeRasterizationMode;
1687    }
1688 
1689    const VkPipelineRasterizationLineStateCreateInfoEXT *rast_line_info =
1690       vk_find_struct_const(rs->pNext, PIPELINE_RASTERIZATION_LINE_STATE_CREATE_INFO_EXT);
1691    if (rast_line_info) {
1692       info.stippled_line_enable = rast_line_info->stippledLineEnable;
1693       info.line_raster_mode = rast_line_info->lineRasterizationMode;
1694       info.line_stipple_factor = rast_line_info->lineStippleFactor;
1695       info.line_stipple_pattern = rast_line_info->lineStipplePattern;
1696    }
1697 
1698    const VkPipelineRasterizationDepthClipStateCreateInfoEXT *depth_clip_state =
1699       vk_find_struct_const(rs->pNext, PIPELINE_RASTERIZATION_DEPTH_CLIP_STATE_CREATE_INFO_EXT);
1700    if (depth_clip_state) {
1701       info.depth_clip_disable = !depth_clip_state->depthClipEnable;
1702    }
1703 
1704    const VkPipelineRasterizationStateRasterizationOrderAMD *raster_order =
1705       vk_find_struct_const(rs->pNext, PIPELINE_RASTERIZATION_STATE_RASTERIZATION_ORDER_AMD);
1706    if (raster_order) {
1707       info.order = raster_order->rasterizationOrder;
1708    }
1709 
1710    return info;
1711 }
1712 
1713 static struct radv_discard_rectangle_info
1714 radv_pipeline_init_discard_rectangle_info(struct radv_graphics_pipeline *pipeline,
1715                                           const VkGraphicsPipelineCreateInfo *pCreateInfo)
1716 {
1717    const VkPipelineDiscardRectangleStateCreateInfoEXT *discard_rectangle_info =
1718       vk_find_struct_const(pCreateInfo->pNext, PIPELINE_DISCARD_RECTANGLE_STATE_CREATE_INFO_EXT);
1719    struct radv_discard_rectangle_info info = {0};
1720 
1721    if (discard_rectangle_info) {
1722       info.mode = discard_rectangle_info->discardRectangleMode;
1723       if (!(pipeline->dynamic_states & RADV_DYNAMIC_DISCARD_RECTANGLE)) {
1724          typed_memcpy(info.rects, discard_rectangle_info->pDiscardRectangles,
1725                       discard_rectangle_info->discardRectangleCount);
1726       }
1727       info.count = discard_rectangle_info->discardRectangleCount;
1728    }
1729 
1730    return info;
1731 }
1732 
1733 static struct radv_multisample_info
1734 radv_pipeline_init_multisample_info(struct radv_graphics_pipeline *pipeline,
1735                                     const VkGraphicsPipelineCreateInfo *pCreateInfo)
1736 {
1737    const VkPipelineMultisampleStateCreateInfo *ms = pCreateInfo->pMultisampleState;
1738    struct radv_multisample_info info = {0};
1739 
1740    if (radv_is_raster_enabled(pipeline, pCreateInfo)) {
1741       info.raster_samples = ms->rasterizationSamples;
1742       info.sample_shading_enable = ms->sampleShadingEnable;
1743       info.min_sample_shading = ms->minSampleShading;
1744       info.alpha_to_coverage_enable = ms->alphaToCoverageEnable;
1745       if (ms->pSampleMask) {
1746          info.sample_mask = ms->pSampleMask[0] & 0xffff;
1747       } else {
1748          info.sample_mask = 0xffff;
1749       }
1750 
1751       const VkPipelineSampleLocationsStateCreateInfoEXT *sample_location_info =
1752          vk_find_struct_const(ms->pNext, PIPELINE_SAMPLE_LOCATIONS_STATE_CREATE_INFO_EXT);
1753       if (sample_location_info) {
1754          /* If sampleLocationsEnable is VK_FALSE, the default sample locations are used and the
1755           * values specified in sampleLocationsInfo are ignored.
1756           */
1757          info.sample_locs_enable = sample_location_info->sampleLocationsEnable;
1758          if (sample_location_info->sampleLocationsEnable) {
1759             const VkSampleLocationsInfoEXT *pSampleLocationsInfo =
1760                &sample_location_info->sampleLocationsInfo;
1761             assert(pSampleLocationsInfo->sampleLocationsCount <= MAX_SAMPLE_LOCATIONS);
1762 
1763             info.sample_locs_per_pixel = pSampleLocationsInfo->sampleLocationsPerPixel;
1764             info.sample_locs_grid_size = pSampleLocationsInfo->sampleLocationGridSize;
1765             for (uint32_t i = 0; i < pSampleLocationsInfo->sampleLocationsCount; i++) {
1766                info.sample_locs[i] = pSampleLocationsInfo->pSampleLocations[i];
1767             }
1768             info.sample_locs_count = pSampleLocationsInfo->sampleLocationsCount;
1769          }
1770       }
1771    } else {
1772       info.raster_samples = VK_SAMPLE_COUNT_1_BIT;
1773    }
1774 
1775    return info;
1776 }
1777 
1778 static struct radv_depth_stencil_info
1779 radv_pipeline_init_depth_stencil_info(struct radv_graphics_pipeline *pipeline,
1780                                       const VkGraphicsPipelineCreateInfo *pCreateInfo)
1781 {
1782    const VkPipelineDepthStencilStateCreateInfo *ds = pCreateInfo->pDepthStencilState;
1783    const VkPipelineRenderingCreateInfo *ri =
1784       vk_find_struct_const(pCreateInfo->pNext, PIPELINE_RENDERING_CREATE_INFO);
1785    struct radv_depth_stencil_info info = {0};
1786 
1787    if (radv_is_raster_enabled(pipeline, pCreateInfo) &&
1788        (ri->depthAttachmentFormat != VK_FORMAT_UNDEFINED ||
1789         ri->stencilAttachmentFormat != VK_FORMAT_UNDEFINED)) {
1790       info.depth_bounds_test_enable = ds->depthBoundsTestEnable;
1791       info.depth_bounds.min = ds->minDepthBounds;
1792       info.depth_bounds.max = ds->maxDepthBounds;
1793       info.stencil_test_enable = ds->stencilTestEnable;
1794       info.front.fail_op = ds->front.failOp;
1795       info.front.pass_op = ds->front.passOp;
1796       info.front.depth_fail_op = ds->front.depthFailOp;
1797       info.front.compare_op = ds->front.compareOp;
1798       info.front.compare_mask = ds->front.compareMask;
1799       info.front.write_mask = ds->front.writeMask;
1800       info.front.reference = ds->front.reference;
1801       info.back.fail_op = ds->back.failOp;
1802       info.back.pass_op = ds->back.passOp;
1803       info.back.depth_fail_op = ds->back.depthFailOp;
1804       info.back.compare_op = ds->back.compareOp;
1805       info.back.compare_mask = ds->back.compareMask;
1806       info.back.write_mask = ds->back.writeMask;
1807       info.back.reference = ds->back.reference;
1808       info.depth_test_enable = ds->depthTestEnable;
1809       info.depth_write_enable = ds->depthWriteEnable;
1810       info.depth_compare_op = ds->depthCompareOp;
1811    }
1812 
1813    return info;
1814 }
1815 
1816 static struct radv_rendering_info
1817 radv_pipeline_init_rendering_info(struct radv_graphics_pipeline *pipeline,
1818                                   const VkGraphicsPipelineCreateInfo *pCreateInfo)
1819 {
1820    const VkPipelineRenderingCreateInfo *ri =
1821       vk_find_struct_const(pCreateInfo->pNext, PIPELINE_RENDERING_CREATE_INFO);
1822    struct radv_rendering_info info = {0};
1823 
1824    info.view_mask = ri->viewMask;
1825    for (uint32_t i = 0; i < ri->colorAttachmentCount; i++) {
1826       info.color_att_formats[i] = ri->pColorAttachmentFormats[i];
1827    }
1828    info.color_att_count = ri->colorAttachmentCount;
1829    info.depth_att_format = ri->depthAttachmentFormat;
1830    info.stencil_att_format = ri->stencilAttachmentFormat;
1831 
1832    return info;
1833 }
1834 
1835 static struct radv_color_blend_info
1836 radv_pipeline_init_color_blend_info(struct radv_graphics_pipeline *pipeline,
1837                                     const VkGraphicsPipelineCreateInfo *pCreateInfo)
1838 {
1839    const struct radv_physical_device *pdevice = pipeline->base.device->physical_device;
1840    const VkPipelineColorBlendStateCreateInfo *cb = pCreateInfo->pColorBlendState;
1841    const VkPipelineRenderingCreateInfo *ri =
1842       vk_find_struct_const(pCreateInfo->pNext, PIPELINE_RENDERING_CREATE_INFO);
1843    struct radv_color_blend_info info = {0};
1844    bool has_color_att = false;
1845 
1846    for (uint32_t i = 0; i < ri->colorAttachmentCount; ++i) {
1847       if (ri->pColorAttachmentFormats[i] != VK_FORMAT_UNDEFINED) {
1848          has_color_att = true;
1849          break;
1850       }
1851    }
1852 
1853    if (radv_is_raster_enabled(pipeline, pCreateInfo) && has_color_att) {
1854       for (uint32_t i = 0; i < cb->attachmentCount; i++) {
1855          const VkPipelineColorBlendAttachmentState *att = &cb->pAttachments[i];
1856 
1857          info.att[i].color_write_mask = att->colorWriteMask;
1858          info.att[i].blend_enable = att->blendEnable;
1859          info.att[i].color_blend_op = si_translate_blend_function(att->colorBlendOp);
1860          info.att[i].alpha_blend_op = si_translate_blend_function(att->alphaBlendOp);
1861          info.att[i].src_color_blend_factor =
1862             si_translate_blend_factor(pdevice->rad_info.gfx_level, att->srcColorBlendFactor);
1863          info.att[i].dst_color_blend_factor =
1864             si_translate_blend_factor(pdevice->rad_info.gfx_level, att->dstColorBlendFactor);
1865          info.att[i].src_alpha_blend_factor =
1866             si_translate_blend_factor(pdevice->rad_info.gfx_level, att->srcAlphaBlendFactor);
1867          info.att[i].dst_alpha_blend_factor =
1868             si_translate_blend_factor(pdevice->rad_info.gfx_level, att->dstAlphaBlendFactor);
1869       }
1870       info.att_count = cb->attachmentCount;
1871 
1872       for (uint32_t i = 0; i < 4; i++) {
1873          info.blend_constants[i] = cb->blendConstants[i];
1874       }
1875 
1876       info.logic_op_enable = cb->logicOpEnable;
1877       if (info.logic_op_enable)
1878          info.logic_op = si_translate_blend_logic_op(cb->logicOp);
1879 
1880       const VkPipelineColorWriteCreateInfoEXT *color_write_info =
1881          vk_find_struct_const(cb->pNext, PIPELINE_COLOR_WRITE_CREATE_INFO_EXT);
1882       if (color_write_info) {
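         /* Pack one 4-bit enable group per color attachment, matching the
          * per-channel RGBA write-mask layout; without the struct all writes
          * stay enabled (see the else branch).
          */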
1883          for (uint32_t i = 0; i < color_write_info->attachmentCount; i++) {
1884             info.color_write_enable |=
1885                color_write_info->pColorWriteEnables[i] ? (0xfu << (i * 4)) : 0;
1886          }
1887       } else {
1888          info.color_write_enable = 0xffffffffu;
1889       }
1890    }
1891 
1892    return info;
1893 }
1894 
1895 static struct radv_fragment_shading_rate_info
1896 radv_pipeline_init_fragment_shading_rate_info(struct radv_graphics_pipeline *pipeline,
1897                                               const VkGraphicsPipelineCreateInfo *pCreateInfo)
1898 {
1899    const VkPipelineFragmentShadingRateStateCreateInfoKHR *shading_rate =
1900       vk_find_struct_const(pCreateInfo->pNext, PIPELINE_FRAGMENT_SHADING_RATE_STATE_CREATE_INFO_KHR);
1901    struct radv_fragment_shading_rate_info info = {0};
1902 
1903    if (shading_rate && !(pipeline->dynamic_states & RADV_DYNAMIC_FRAGMENT_SHADING_RATE)) {
1904       info.size = shading_rate->fragmentSize;
1905       for (int i = 0; i < 2; i++)
1906          info.combiner_ops[i] = shading_rate->combinerOps[i];
1907    } else {
1908       info.size = (VkExtent2D){ 1, 1 };
1909       info.combiner_ops[0] = VK_FRAGMENT_SHADING_RATE_COMBINER_OP_KEEP_KHR;
1910       info.combiner_ops[1] = VK_FRAGMENT_SHADING_RATE_COMBINER_OP_KEEP_KHR;
1911    }
1912 
1913    return info;
1914 }
1915 
1916 static struct radv_graphics_pipeline_info
1917 radv_pipeline_init_graphics_info(struct radv_graphics_pipeline *pipeline,
1918                                  const VkGraphicsPipelineCreateInfo *pCreateInfo)
1919 {
1920    struct radv_graphics_pipeline_info info = {0};
1921 
1922    /* Vertex input interface structs have to be ignored if the pipeline includes a mesh shader. */
1923    if (!(pipeline->active_stages & VK_SHADER_STAGE_MESH_BIT_NV)) {
1924       info.vi = radv_pipeline_init_vertex_input_info(pipeline, pCreateInfo);
1925       info.ia = radv_pipeline_init_input_assembly_info(pipeline, pCreateInfo);
1926    }
1927 
1928    info.ts = radv_pipeline_init_tessellation_info(pipeline, pCreateInfo);
1929    info.vp = radv_pipeline_init_viewport_info(pipeline, pCreateInfo);
1930    info.rs = radv_pipeline_init_rasterization_info(pipeline, pCreateInfo);
1931    info.dr = radv_pipeline_init_discard_rectangle_info(pipeline, pCreateInfo);
1932 
1933    info.ms = radv_pipeline_init_multisample_info(pipeline, pCreateInfo);
1934    info.ds = radv_pipeline_init_depth_stencil_info(pipeline, pCreateInfo);
1935    info.ri = radv_pipeline_init_rendering_info(pipeline, pCreateInfo);
1936    info.cb = radv_pipeline_init_color_blend_info(pipeline, pCreateInfo);
1937 
1938    info.fsr = radv_pipeline_init_fragment_shading_rate_info(pipeline, pCreateInfo);
1939 
1940    /* VK_AMD_mixed_attachment_samples */
1941    const VkAttachmentSampleCountInfoAMD *sample_info =
1942       vk_find_struct_const(pCreateInfo->pNext, ATTACHMENT_SAMPLE_COUNT_INFO_AMD);
1943    if (sample_info) {
1944       for (uint32_t i = 0; i < sample_info->colorAttachmentCount; ++i) {
1945          if (info.ri.color_att_formats[i] != VK_FORMAT_UNDEFINED) {
1946             info.color_att_samples = MAX2(info.color_att_samples, sample_info->pColorAttachmentSamples[i]);
1947          }
1948       }
1949       info.ds_att_samples = sample_info->depthStencilAttachmentSamples;
1950    }
1951 
1952    return info;
1953 }
1954 
1955 static void
1956 radv_pipeline_init_input_assembly_state(struct radv_graphics_pipeline *pipeline,
1957                                         const struct radv_graphics_pipeline_info *info)
1958 {
1959    pipeline->ia_multi_vgt_param = radv_compute_ia_multi_vgt_param_helpers(pipeline);
1960 }
1961 
1962 static void
1963 radv_pipeline_init_dynamic_state(struct radv_graphics_pipeline *pipeline,
1964                                  const struct radv_graphics_pipeline_info *info)
1965 {
1966    uint64_t needed_states = radv_pipeline_needed_dynamic_state(pipeline, info);
1967    uint64_t states = needed_states;
1968 
1969    pipeline->dynamic_state = default_dynamic_state;
1970    pipeline->needed_dynamic_state = needed_states;
1971 
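   /* From here on, "states" only contains the needed states that are not
    * declared dynamic, i.e. the ones whose values must be baked in from the
    * create info; "needed_states" still tracks everything the pipeline consumes.
    */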
1972    states &= ~pipeline->dynamic_states;
1973 
1974    struct radv_dynamic_state *dynamic = &pipeline->dynamic_state;
1975 
1976    if (needed_states & RADV_DYNAMIC_VIEWPORT) {
1977       dynamic->viewport.count = info->vp.viewport_count;
1978       if (states & RADV_DYNAMIC_VIEWPORT) {
1979          typed_memcpy(dynamic->viewport.viewports, info->vp.viewports, info->vp.viewport_count);
1980          for (unsigned i = 0; i < dynamic->viewport.count; i++)
1981             radv_get_viewport_xform(&dynamic->viewport.viewports[i],
1982                                     dynamic->viewport.xform[i].scale, dynamic->viewport.xform[i].translate);
1983       }
1984    }
1985 
1986    if (needed_states & RADV_DYNAMIC_SCISSOR) {
1987       dynamic->scissor.count = info->vp.scissor_count;
1988       if (states & RADV_DYNAMIC_SCISSOR) {
1989          typed_memcpy(dynamic->scissor.scissors, info->vp.scissors, info->vp.scissor_count);
1990       }
1991    }
1992 
1993    if (states & RADV_DYNAMIC_LINE_WIDTH) {
1994       dynamic->line_width = info->rs.line_width;
1995    }
1996 
1997    if (states & RADV_DYNAMIC_DEPTH_BIAS) {
1998       dynamic->depth_bias.bias = info->rs.depth_bias_constant_factor;
1999       dynamic->depth_bias.clamp = info->rs.depth_bias_clamp;
2000       dynamic->depth_bias.slope = info->rs.depth_bias_slope_factor;
2001    }
2002 
2003    /* Section 9.2 of the Vulkan 1.0.15 spec says:
2004     *
2005     *    pColorBlendState is [...] NULL if the pipeline has rasterization
2006     *    disabled or if the subpass of the render pass the pipeline is
2007     *    created against does not use any color attachments.
2008     */
2009    if (states & RADV_DYNAMIC_BLEND_CONSTANTS) {
2010       typed_memcpy(dynamic->blend_constants, info->cb.blend_constants, 4);
2011    }
2012 
2013    if (states & RADV_DYNAMIC_CULL_MODE) {
2014       dynamic->cull_mode = info->rs.cull_mode;
2015    }
2016 
2017    if (states & RADV_DYNAMIC_FRONT_FACE) {
2018       dynamic->front_face = info->rs.front_face;
2019    }
2020 
2021    if (states & RADV_DYNAMIC_PRIMITIVE_TOPOLOGY) {
2022       dynamic->primitive_topology = info->ia.primitive_topology;
2023    }
2024 
2025    /* If there is no depthstencil attachment, then don't read
2026     * pDepthStencilState. The Vulkan spec states that pDepthStencilState may
2027     * be NULL in this case. Even if pDepthStencilState is non-NULL, there is
2028     * no need to override the depthstencil defaults in
2029     * radv_pipeline::dynamic_state when there is no depthstencil attachment.
2030     *
2031     * Section 9.2 of the Vulkan 1.0.15 spec says:
2032     *
2033     *    pDepthStencilState is [...] NULL if the pipeline has rasterization
2034     *    disabled or if the subpass of the render pass the pipeline is created
2035     *    against does not use a depth/stencil attachment.
2036     */
2037    if (needed_states && radv_pipeline_has_ds_attachments(&info->ri)) {
2038       if (states & RADV_DYNAMIC_DEPTH_BOUNDS) {
2039          dynamic->depth_bounds.min = info->ds.depth_bounds.min;
2040          dynamic->depth_bounds.max = info->ds.depth_bounds.max;
2041       }
2042 
2043       if (states & RADV_DYNAMIC_STENCIL_COMPARE_MASK) {
2044          dynamic->stencil_compare_mask.front = info->ds.front.compare_mask;
2045          dynamic->stencil_compare_mask.back = info->ds.back.compare_mask;
2046       }
2047 
2048       if (states & RADV_DYNAMIC_STENCIL_WRITE_MASK) {
2049          dynamic->stencil_write_mask.front = info->ds.front.write_mask;
2050          dynamic->stencil_write_mask.back = info->ds.back.write_mask;
2051       }
2052 
2053       if (states & RADV_DYNAMIC_STENCIL_REFERENCE) {
2054          dynamic->stencil_reference.front = info->ds.front.reference;
2055          dynamic->stencil_reference.back = info->ds.back.reference;
2056       }
2057 
2058       if (states & RADV_DYNAMIC_DEPTH_TEST_ENABLE) {
2059          dynamic->depth_test_enable = info->ds.depth_test_enable;
2060       }
2061 
2062       if (states & RADV_DYNAMIC_DEPTH_WRITE_ENABLE) {
2063          dynamic->depth_write_enable = info->ds.depth_write_enable;
2064       }
2065 
2066       if (states & RADV_DYNAMIC_DEPTH_COMPARE_OP) {
2067          dynamic->depth_compare_op = info->ds.depth_compare_op;
2068       }
2069 
2070       if (states & RADV_DYNAMIC_DEPTH_BOUNDS_TEST_ENABLE) {
2071          dynamic->depth_bounds_test_enable = info->ds.depth_bounds_test_enable;
2072       }
2073 
2074       if (states & RADV_DYNAMIC_STENCIL_TEST_ENABLE) {
2075          dynamic->stencil_test_enable = info->ds.stencil_test_enable;
2076       }
2077 
2078       if (states & RADV_DYNAMIC_STENCIL_OP) {
2079          dynamic->stencil_op.front.compare_op = info->ds.front.compare_op;
2080          dynamic->stencil_op.front.fail_op = info->ds.front.fail_op;
2081          dynamic->stencil_op.front.pass_op = info->ds.front.pass_op;
2082          dynamic->stencil_op.front.depth_fail_op = info->ds.front.depth_fail_op;
2083 
2084          dynamic->stencil_op.back.compare_op = info->ds.back.compare_op;
2085          dynamic->stencil_op.back.fail_op = info->ds.back.fail_op;
2086          dynamic->stencil_op.back.pass_op = info->ds.back.pass_op;
2087          dynamic->stencil_op.back.depth_fail_op = info->ds.back.depth_fail_op;
2088       }
2089    }
2090 
2091    if (needed_states & RADV_DYNAMIC_DISCARD_RECTANGLE) {
2092       dynamic->discard_rectangle.count = info->dr.count;
2093       if (states & RADV_DYNAMIC_DISCARD_RECTANGLE) {
2094          typed_memcpy(dynamic->discard_rectangle.rectangles, info->dr.rects, info->dr.count);
2095       }
2096    }
2097 
2098    if (needed_states & RADV_DYNAMIC_SAMPLE_LOCATIONS) {
2099       if (info->ms.sample_locs_enable) {
2100          dynamic->sample_location.per_pixel = info->ms.sample_locs_per_pixel;
2101          dynamic->sample_location.grid_size = info->ms.sample_locs_grid_size;
2102          dynamic->sample_location.count = info->ms.sample_locs_count;
2103          typed_memcpy(&dynamic->sample_location.locations[0], info->ms.sample_locs,
2104                       info->ms.sample_locs_count);
2105       }
2106    }
2107 
2108    if (needed_states & RADV_DYNAMIC_LINE_STIPPLE) {
2109       dynamic->line_stipple.factor = info->rs.line_stipple_factor;
2110       dynamic->line_stipple.pattern = info->rs.line_stipple_pattern;
2111    }
2112 
2113    if (!(states & RADV_DYNAMIC_VERTEX_INPUT_BINDING_STRIDE) ||
2114        !(states & RADV_DYNAMIC_VERTEX_INPUT))
2115       pipeline->uses_dynamic_stride = true;
2116 
2117    if (states & RADV_DYNAMIC_FRAGMENT_SHADING_RATE) {
2118       dynamic->fragment_shading_rate.size = info->fsr.size;
2119       for (int i = 0; i < 2; i++)
2120          dynamic->fragment_shading_rate.combiner_ops[i] = info->fsr.combiner_ops[i];
2121    }
2122 
2123    if (states & RADV_DYNAMIC_DEPTH_BIAS_ENABLE) {
2124       dynamic->depth_bias_enable = info->rs.depth_bias_enable;
2125    }
2126 
2127    if (states & RADV_DYNAMIC_PRIMITIVE_RESTART_ENABLE) {
2128       dynamic->primitive_restart_enable = info->ia.primitive_restart_enable;
2129    }
2130 
2131    if (states & RADV_DYNAMIC_RASTERIZER_DISCARD_ENABLE) {
2132       dynamic->rasterizer_discard_enable = info->rs.discard_enable;
2133    }
2134 
2135    if (radv_pipeline_has_color_attachments(&info->ri) && states & RADV_DYNAMIC_LOGIC_OP) {
2136       if (info->cb.logic_op_enable) {
2137          dynamic->logic_op = info->cb.logic_op;
2138       } else {
2139          dynamic->logic_op = V_028808_ROP3_COPY;
2140       }
2141    }
2142 
2143    if (states & RADV_DYNAMIC_COLOR_WRITE_ENABLE) {
2144       dynamic->color_write_enable = info->cb.color_write_enable;
2145    }
2146 
2147    pipeline->dynamic_state.mask = states;
2148 }
2149 
2150 static void
2151 radv_pipeline_init_raster_state(struct radv_graphics_pipeline *pipeline,
2152                                 const struct radv_graphics_pipeline_info *info)
2153 {
2154    const struct radv_device *device = pipeline->base.device;
2155 
2156    pipeline->pa_su_sc_mode_cntl =
2157       S_028814_FACE(info->rs.front_face) |
2158       S_028814_CULL_FRONT(!!(info->rs.cull_mode & VK_CULL_MODE_FRONT_BIT)) |
2159       S_028814_CULL_BACK(!!(info->rs.cull_mode & VK_CULL_MODE_BACK_BIT)) |
2160       S_028814_POLY_MODE(info->rs.polygon_mode != V_028814_X_DRAW_TRIANGLES) |
2161       S_028814_POLYMODE_FRONT_PTYPE(info->rs.polygon_mode) |
2162       S_028814_POLYMODE_BACK_PTYPE(info->rs.polygon_mode) |
2163       S_028814_POLY_OFFSET_FRONT_ENABLE(info->rs.depth_bias_enable) |
2164       S_028814_POLY_OFFSET_BACK_ENABLE(info->rs.depth_bias_enable) |
2165       S_028814_POLY_OFFSET_PARA_ENABLE(info->rs.depth_bias_enable) |
2166       S_028814_PROVOKING_VTX_LAST(info->rs.provoking_vtx_last);
2167 
2168    if (device->physical_device->rad_info.gfx_level >= GFX10) {
2169       /* It should also be set if PERPENDICULAR_ENDCAP_ENA is set. */
2170       pipeline->pa_su_sc_mode_cntl |=
2171          S_028814_KEEP_TOGETHER_ENABLE(info->rs.polygon_mode != V_028814_X_DRAW_TRIANGLES);
2172    }
2173 
2174    pipeline->pa_cl_clip_cntl =
2175       S_028810_DX_CLIP_SPACE_DEF(!pipeline->negative_one_to_one) |
2176       S_028810_ZCLIP_NEAR_DISABLE(info->rs.depth_clip_disable) |
2177       S_028810_ZCLIP_FAR_DISABLE(info->rs.depth_clip_disable) |
2178       S_028810_DX_RASTERIZATION_KILL(info->rs.discard_enable) |
2179       S_028810_DX_LINEAR_ATTR_CLIP_ENA(1);
2180 
2181    pipeline->uses_conservative_overestimate =
2182       info->rs.conservative_mode == VK_CONSERVATIVE_RASTERIZATION_MODE_OVERESTIMATE_EXT;
2183 
2184    pipeline->depth_clamp_mode = RADV_DEPTH_CLAMP_MODE_VIEWPORT;
2185    if (!info->rs.depth_clamp_enable) {
2186       /* For optimal performance, depth clamping should always be enabled except if the
2187        * application disables clamping explicitly or uses depth values outside of the [0.0, 1.0]
2188        * range.
2189        */
2190       if (info->rs.depth_clip_disable ||
2191           device->vk.enabled_extensions.EXT_depth_range_unrestricted) {
2192          pipeline->depth_clamp_mode = RADV_DEPTH_CLAMP_MODE_DISABLED;
2193       } else {
2194          pipeline->depth_clamp_mode = RADV_DEPTH_CLAMP_MODE_ZERO_TO_ONE;
2195       }
2196    }
2197 }
2198 
2199 static struct radv_depth_stencil_state
2200 radv_pipeline_init_depth_stencil_state(struct radv_graphics_pipeline *pipeline,
2201                                        const struct radv_graphics_pipeline_info *info)
2202 {
2203    const struct radv_physical_device *pdevice = pipeline->base.device->physical_device;
2204    struct radv_depth_stencil_state ds_state = {0};
2205    uint32_t db_depth_control = 0;
2206 
2207    bool has_depth_attachment = info->ri.depth_att_format != VK_FORMAT_UNDEFINED;
2208    bool has_stencil_attachment = info->ri.stencil_att_format != VK_FORMAT_UNDEFINED;
2209 
2210    if (has_depth_attachment) {
2211       /* from amdvlk: For 4xAA and 8xAA need to decompress on flush for better performance */
2212       ds_state.db_render_override2 |= S_028010_DECOMPRESS_Z_ON_FLUSH(info->ms.raster_samples > 2);
2213 
2214       if (pdevice->rad_info.gfx_level >= GFX10_3)
2215          ds_state.db_render_override2 |= S_028010_CENTROID_COMPUTATION_MODE(1);
2216 
2217       db_depth_control = S_028800_Z_ENABLE(info->ds.depth_test_enable) |
2218                          S_028800_Z_WRITE_ENABLE(info->ds.depth_write_enable) |
2219                          S_028800_ZFUNC(info->ds.depth_compare_op) |
2220                          S_028800_DEPTH_BOUNDS_ENABLE(info->ds.depth_bounds_test_enable);
2221    }
2222 
2223    if (has_stencil_attachment && info->ds.stencil_test_enable) {
2224       db_depth_control |= S_028800_STENCIL_ENABLE(1) | S_028800_BACKFACE_ENABLE(1);
2225       db_depth_control |= S_028800_STENCILFUNC(info->ds.front.compare_op);
2226       db_depth_control |= S_028800_STENCILFUNC_BF(info->ds.back.compare_op);
2227    }
2228 
2229    ds_state.db_render_override |= S_02800C_FORCE_HIS_ENABLE0(V_02800C_FORCE_DISABLE) |
2230                                   S_02800C_FORCE_HIS_ENABLE1(V_02800C_FORCE_DISABLE);
2231 
2232    if (pipeline->depth_clamp_mode == RADV_DEPTH_CLAMP_MODE_DISABLED)
2233       ds_state.db_render_override |= S_02800C_DISABLE_VIEWPORT_CLAMP(1);
2234 
2235    if (pdevice->rad_info.gfx_level >= GFX11) {
2236       unsigned max_allowed_tiles_in_wave = 0;
2237       unsigned num_samples = MAX2(radv_pipeline_color_samples(info),
2238                                   radv_pipeline_depth_samples(info));
2239 
2240       if (pdevice->rad_info.has_dedicated_vram) {
2241          if (num_samples == 8)
2242             max_allowed_tiles_in_wave = 7;
2243          else if (num_samples == 4)
2244             max_allowed_tiles_in_wave = 14;
2245       } else {
2246          if (num_samples == 8)
2247             max_allowed_tiles_in_wave = 8;
2248       }
2249 
2250       /* TODO: We may want to disable this workaround for future chips. */
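      /* A value of 0 leaves the limit unset; when this workaround needs an
       * explicit cap, fall back to the largest encodable value, otherwise
       * tighten the computed limit by one.
       */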
2251       if (num_samples >= 4) {
2252          if (max_allowed_tiles_in_wave)
2253             max_allowed_tiles_in_wave--;
2254          else
2255             max_allowed_tiles_in_wave = 15;
2256       }
2257 
2258       ds_state.db_render_control |= S_028000_OREO_MODE(V_028000_OMODE_O_THEN_B) |
2259                                     S_028000_MAX_ALLOWED_TILES_IN_WAVE(max_allowed_tiles_in_wave);
2260    }
2261 
2262    pipeline->db_depth_control = db_depth_control;
2263 
2264    return ds_state;
2265 }
2266 
2267 static void
2268 gfx9_get_gs_info(const struct radv_pipeline_key *key, const struct radv_pipeline *pipeline,
2269                  struct radv_pipeline_stage *stages, struct gfx9_gs_info *out)
2270 {
2271    const struct radv_physical_device *pdevice = pipeline->device->physical_device;
2272    struct radv_shader_info *gs_info = &stages[MESA_SHADER_GEOMETRY].info;
2273    struct radv_es_output_info *es_info;
2274    bool has_tess = !!stages[MESA_SHADER_TESS_CTRL].nir;
2275 
2276    if (pdevice->rad_info.gfx_level >= GFX9)
2277       es_info = has_tess ? &gs_info->tes.es_info : &gs_info->vs.es_info;
2278    else
2279       es_info = has_tess ? &stages[MESA_SHADER_TESS_EVAL].info.tes.es_info
2280                          : &stages[MESA_SHADER_VERTEX].info.vs.es_info;
2281 
2282    unsigned gs_num_invocations = MAX2(gs_info->gs.invocations, 1);
2283    bool uses_adjacency;
2284    switch (key->vs.topology) {
2285    case V_008958_DI_PT_LINELIST_ADJ:
2286    case V_008958_DI_PT_LINESTRIP_ADJ:
2287    case V_008958_DI_PT_TRILIST_ADJ:
2288    case V_008958_DI_PT_TRISTRIP_ADJ:
2289       uses_adjacency = true;
2290       break;
2291    default:
2292       uses_adjacency = false;
2293       break;
2294    }
2295 
2296    /* All these are in dwords: */
2297    /* We can't allow using the whole LDS, because GS waves compete with
2298     * other shader stages for LDS space. */
2299    const unsigned max_lds_size = 8 * 1024;
2300    const unsigned esgs_itemsize = es_info->esgs_itemsize / 4;
2301    unsigned esgs_lds_size;
2302 
2303    /* All these are per subgroup: */
2304    const unsigned max_out_prims = 32 * 1024;
2305    const unsigned max_es_verts = 255;
2306    const unsigned ideal_gs_prims = 64;
2307    unsigned max_gs_prims, gs_prims;
2308    unsigned min_es_verts, es_verts, worst_case_es_verts;
2309 
2310    if (uses_adjacency || gs_num_invocations > 1)
2311       max_gs_prims = 127 / gs_num_invocations;
2312    else
2313       max_gs_prims = 255;
2314 
2315    /* MAX_PRIMS_PER_SUBGROUP = gs_prims * max_vert_out * gs_invocations.
2316     * Make sure we don't go over the maximum value.
2317     */
2318    if (gs_info->gs.vertices_out > 0) {
2319       max_gs_prims =
2320          MIN2(max_gs_prims, max_out_prims / (gs_info->gs.vertices_out * gs_num_invocations));
2321    }
2322    assert(max_gs_prims > 0);
2323 
2324    /* If the primitive has adjacency, halve the number of vertices
2325     * that will be reused in multiple primitives.
2326     */
2327    min_es_verts = gs_info->gs.vertices_in / (uses_adjacency ? 2 : 1);
2328 
2329    gs_prims = MIN2(ideal_gs_prims, max_gs_prims);
2330    worst_case_es_verts = MIN2(min_es_verts * gs_prims, max_es_verts);
2331 
2332    /* Compute ESGS LDS size based on the worst case number of ES vertices
2333     * needed to create the target number of GS prims per subgroup.
2334     */
2335    esgs_lds_size = esgs_itemsize * worst_case_es_verts;
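   /* For example, a 4-dword ES->GS item with the 255-vertex worst case yields
    * 1020 dwords, well under the 8K-dword cap; larger itemsizes trigger the
    * repartitioning below.
    */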
2336 
2337    /* If total LDS usage is too big, refactor partitions based on ratio
2338     * of ESGS item sizes.
2339     */
2340    if (esgs_lds_size > max_lds_size) {
2341       /* Our target GS Prims Per Subgroup was too large. Calculate
2342        * the maximum number of GS Prims Per Subgroup that will fit
2343        * into LDS, capped by the maximum that the hardware can support.
2344        */
2345       gs_prims = MIN2((max_lds_size / (esgs_itemsize * min_es_verts)), max_gs_prims);
2346       assert(gs_prims > 0);
2347       worst_case_es_verts = MIN2(min_es_verts * gs_prims, max_es_verts);
2348 
2349       esgs_lds_size = esgs_itemsize * worst_case_es_verts;
2350       assert(esgs_lds_size <= max_lds_size);
2351    }
2352 
2353    /* Now calculate remaining ESGS information. */
2354    if (esgs_lds_size)
2355       es_verts = MIN2(esgs_lds_size / esgs_itemsize, max_es_verts);
2356    else
2357       es_verts = max_es_verts;
2358 
2359    /* Vertices for adjacency primitives are not always reused, so restore
2360     * it for ES_VERTS_PER_SUBGRP.
2361     */
2362    min_es_verts = gs_info->gs.vertices_in;
2363 
2364    /* For normal primitives, the VGT only checks if they are past the ES
2365     * verts per subgroup after allocating a full GS primitive and if they
2366     * are, kick off a new subgroup.  But if those additional ES verts are
2367     * unique (e.g. not reused) we need to make sure there is enough LDS
2368     * space to account for those ES verts beyond ES_VERTS_PER_SUBGRP.
2369     */
2370    es_verts -= min_es_verts - 1;
2371 
2372    uint32_t es_verts_per_subgroup = es_verts;
2373    uint32_t gs_prims_per_subgroup = gs_prims;
2374    uint32_t gs_inst_prims_in_subgroup = gs_prims * gs_num_invocations;
2375    uint32_t max_prims_per_subgroup = gs_inst_prims_in_subgroup * gs_info->gs.vertices_out;
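   /* LDS is allocated in 128-dword granules, hence the rounding below. */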
2376    out->lds_size = align(esgs_lds_size, 128) / 128;
2377    out->vgt_gs_onchip_cntl = S_028A44_ES_VERTS_PER_SUBGRP(es_verts_per_subgroup) |
2378                              S_028A44_GS_PRIMS_PER_SUBGRP(gs_prims_per_subgroup) |
2379                              S_028A44_GS_INST_PRIMS_IN_SUBGRP(gs_inst_prims_in_subgroup);
2380    out->vgt_gs_max_prims_per_subgroup = S_028A94_MAX_PRIMS_PER_SUBGROUP(max_prims_per_subgroup);
2381    out->vgt_esgs_ring_itemsize = esgs_itemsize;
2382    assert(max_prims_per_subgroup <= max_out_prims);
2383 
2384    gl_shader_stage es_stage = has_tess ? MESA_SHADER_TESS_EVAL : MESA_SHADER_VERTEX;
2385    unsigned workgroup_size = ac_compute_esgs_workgroup_size(
2386       pdevice->rad_info.gfx_level, stages[es_stage].info.wave_size,
2387       es_verts_per_subgroup, gs_inst_prims_in_subgroup);
2388    stages[es_stage].info.workgroup_size = workgroup_size;
2389    stages[MESA_SHADER_GEOMETRY].info.workgroup_size = workgroup_size;
2390 }
2391 
2392 static void
2393 clamp_gsprims_to_esverts(unsigned *max_gsprims, unsigned max_esverts, unsigned min_verts_per_prim,
2394                          bool use_adjacency)
2395 {
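   /* With strip-like reuse, the first primitive consumes min_verts_per_prim
    * fresh vertices and each further primitive needs at least one more;
    * adjacency roughly halves how much reuse is possible.
    */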
2396    unsigned max_reuse = max_esverts - min_verts_per_prim;
2397    if (use_adjacency)
2398       max_reuse /= 2;
2399    *max_gsprims = MIN2(*max_gsprims, 1 + max_reuse);
2400 }
2401 
2402 static unsigned
2403 radv_get_num_input_vertices(const struct radv_pipeline_stage *stages)
2404 {
2405    if (stages[MESA_SHADER_GEOMETRY].nir) {
2406       nir_shader *gs = stages[MESA_SHADER_GEOMETRY].nir;
2407 
2408       return gs->info.gs.vertices_in;
2409    }
2410 
2411    if (stages[MESA_SHADER_TESS_CTRL].nir) {
2412       nir_shader *tes = stages[MESA_SHADER_TESS_EVAL].nir;
2413 
2414       if (tes->info.tess.point_mode)
2415          return 1;
2416       if (tes->info.tess._primitive_mode == TESS_PRIMITIVE_ISOLINES)
2417          return 2;
2418       return 3;
2419    }
2420 
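   /* No GS or tessellation: conservatively assume up to 3 vertices per input primitive. */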
2421    return 3;
2422 }
2423 
2424 static void
2425 gfx10_emit_ge_pc_alloc(struct radeon_cmdbuf *cs, enum amd_gfx_level gfx_level,
2426                        uint32_t oversub_pc_lines)
2427 {
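   /* When oversub_pc_lines is 0, OVERSUB_EN is 0 and NUM_PC_LINES (which would
    * otherwise underflow here) has no effect.
    */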
2428    radeon_set_uconfig_reg(
2429       cs, R_030980_GE_PC_ALLOC,
2430       S_030980_OVERSUB_EN(oversub_pc_lines > 0) | S_030980_NUM_PC_LINES(oversub_pc_lines - 1));
2431 }
2432 
2433 static void
2434 gfx10_get_ngg_ms_info(struct radv_pipeline_stage *stage, struct gfx10_ngg_info *ngg)
2435 {
2436    /* Special case for mesh shader workgroups.
2437     *
2438     * Mesh shaders don't have any real vertex input, but they can produce
2439     * an arbitrary number of vertices and primitives (up to 256).
2440     * We need to precisely control the number of mesh shader workgroups
2441     * that are launched from draw calls.
2442     *
2443     * To achieve that, we set:
2444     * - input primitive topology to point list
2445     * - input vertex and primitive count to 1
2446     * - max output vertex count and primitive amplification factor
2447     *   to the boundaries of the shader
2448     *
2449     * With that, in the draw call:
2450     * - drawing 1 input vertex ~ launching 1 mesh shader workgroup
2451     *
2452     * In the shader:
2453     * - base vertex ~ first workgroup index (firstTask in NV_mesh_shader)
2454     * - input vertex id ~ workgroup id (in 1D - shader needs to calculate in 3D)
2455     *
2456     * Notes:
2457     * - without GS_EN=1 PRIM_AMP_FACTOR and MAX_VERTS_PER_SUBGROUP don't seem to work
2458     * - with GS_EN=1 we must also set VGT_GS_MAX_VERT_OUT (otherwise the GPU hangs)
2459     * - with GS_FAST_LAUNCH=1 every lane's VGPRs are initialized to the same input vertex index
2460     *
2461     */
2462    nir_shader *ms = stage->nir;
2463 
2464    ngg->enable_vertex_grouping = true;
2465    ngg->esgs_ring_size = 1;
2466    ngg->hw_max_esverts = 1;
2467    ngg->max_gsprims = 1;
2468    ngg->max_out_verts = ms->info.mesh.max_vertices_out;
2469    ngg->max_vert_out_per_gs_instance = false;
2470    ngg->ngg_emit_size = 0;
2471    ngg->prim_amp_factor = ms->info.mesh.max_primitives_out;
2472    ngg->vgt_esgs_ring_itemsize = 1;
2473 
2474    unsigned min_ngg_workgroup_size =
2475       ac_compute_ngg_workgroup_size(ngg->hw_max_esverts, ngg->max_gsprims,
2476                                     ngg->max_out_verts, ngg->prim_amp_factor);
2477 
2478    unsigned api_workgroup_size =
2479       ac_compute_cs_workgroup_size(ms->info.workgroup_size, false, UINT32_MAX);
2480 
2481    stage->info.workgroup_size = MAX2(min_ngg_workgroup_size, api_workgroup_size);
2482 }
2483 
2484 static void
2485 gfx10_get_ngg_info(const struct radv_pipeline_key *key, struct radv_pipeline *pipeline,
2486                    struct radv_pipeline_stage *stages, struct gfx10_ngg_info *ngg)
2487 {
2488    const struct radv_physical_device *pdevice = pipeline->device->physical_device;
2489    struct radv_shader_info *gs_info = &stages[MESA_SHADER_GEOMETRY].info;
2490    struct radv_es_output_info *es_info =
2491       stages[MESA_SHADER_TESS_CTRL].nir ? &gs_info->tes.es_info : &gs_info->vs.es_info;
2492    unsigned gs_type = stages[MESA_SHADER_GEOMETRY].nir ? MESA_SHADER_GEOMETRY : MESA_SHADER_VERTEX;
2493    unsigned max_verts_per_prim = radv_get_num_input_vertices(stages);
2494    unsigned min_verts_per_prim = gs_type == MESA_SHADER_GEOMETRY ? max_verts_per_prim : 1;
2495    unsigned gs_num_invocations = stages[MESA_SHADER_GEOMETRY].nir ? MAX2(gs_info->gs.invocations, 1) : 1;
2496    bool uses_adjacency;
2497    switch (key->vs.topology) {
2498    case V_008958_DI_PT_LINELIST_ADJ:
2499    case V_008958_DI_PT_LINESTRIP_ADJ:
2500    case V_008958_DI_PT_TRILIST_ADJ:
2501    case V_008958_DI_PT_TRISTRIP_ADJ:
2502       uses_adjacency = true;
2503       break;
2504    default:
2505       uses_adjacency = false;
2506       break;
2507    }
2508 
2509    /* All these are in dwords: */
2510    /* We can't allow using the whole LDS, because GS waves compete with
2511     * other shader stages for LDS space.
2512     *
2513     * TODO: We should really take the shader's internal LDS use into
2514     *       account. The linker will fail if the size is greater than
2515     *       8K dwords.
2516     */
2517    const unsigned max_lds_size = 8 * 1024 - 768;
2518    const unsigned target_lds_size = max_lds_size;
2519    unsigned esvert_lds_size = 0;
2520    unsigned gsprim_lds_size = 0;
2521 
2522    /* All these are per subgroup: */
2523    const unsigned min_esverts = pdevice->rad_info.gfx_level >= GFX10_3 ? 29 : 24;
2524    bool max_vert_out_per_gs_instance = false;
2525    unsigned max_esverts_base = 128;
2526    unsigned max_gsprims_base = 128; /* default prim group size clamp */
2527 
2528    /* Hardware has the following non-natural restrictions on the value
2529     * of GE_CNTL.VERT_GRP_SIZE based on the primitive type of
2530     * the draw:
2531     *  - at most 252 for any line input primitive type
2532     *  - at most 251 for any quad input primitive type
2533     *  - at most 251 for triangle strips with adjacency (this happens to
2534     *    be the natural limit for triangle *lists* with adjacency)
2535     */
2536    max_esverts_base = MIN2(max_esverts_base, 251 + max_verts_per_prim - 1);
2537 
2538    if (gs_type == MESA_SHADER_GEOMETRY) {
2539       unsigned max_out_verts_per_gsprim = gs_info->gs.vertices_out * gs_num_invocations;
2540 
2541       if (max_out_verts_per_gsprim <= 256) {
2542          if (max_out_verts_per_gsprim) {
2543             max_gsprims_base = MIN2(max_gsprims_base, 256 / max_out_verts_per_gsprim);
2544          }
2545       } else {
2546          /* Use special multi-cycling mode in which each GS
2547           * instance gets its own subgroup. Does not work with
2548           * tessellation. */
2549          max_vert_out_per_gs_instance = true;
2550          max_gsprims_base = 1;
2551          max_out_verts_per_gsprim = gs_info->gs.vertices_out;
2552       }
2553 
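      /* esgs_itemsize is in bytes; dividing by 4 gives the per-vertex LDS size in dwords,
       * consistent with the dword units used for the LDS budget above.
       */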
2554       esvert_lds_size = es_info->esgs_itemsize / 4;
2555       gsprim_lds_size = (gs_info->gs.gsvs_vertex_size / 4 + 1) * max_out_verts_per_gsprim;
2556    } else {
2557       /* VS and TES. */
2558       /* LDS size for passing data from GS to ES. */
2559       struct radv_streamout_info *so_info = stages[MESA_SHADER_TESS_CTRL].nir
2560                                                ? &stages[MESA_SHADER_TESS_EVAL].info.so
2561                                                : &stages[MESA_SHADER_VERTEX].info.so;
2562 
2563       if (so_info->num_outputs)
2564          esvert_lds_size = 4 * so_info->num_outputs + 1;
2565 
2566       /* GS stores Primitive IDs (one DWORD) into LDS at the address
2567        * corresponding to the ES thread of the provoking vertex. All
2568        * ES threads load and export PrimitiveID for their thread.
2569        */
2570       if (!stages[MESA_SHADER_TESS_CTRL].nir && stages[MESA_SHADER_VERTEX].info.vs.outinfo.export_prim_id)
2571          esvert_lds_size = MAX2(esvert_lds_size, 1);
2572    }
2573 
2574    unsigned max_gsprims = max_gsprims_base;
2575    unsigned max_esverts = max_esverts_base;
2576 
2577    if (esvert_lds_size)
2578       max_esverts = MIN2(max_esverts, target_lds_size / esvert_lds_size);
2579    if (gsprim_lds_size)
2580       max_gsprims = MIN2(max_gsprims, target_lds_size / gsprim_lds_size);
2581 
2582    max_esverts = MIN2(max_esverts, max_gsprims * max_verts_per_prim);
2583    clamp_gsprims_to_esverts(&max_gsprims, max_esverts, min_verts_per_prim, uses_adjacency);
2584    assert(max_esverts >= max_verts_per_prim && max_gsprims >= 1);
2585 
2586    if (esvert_lds_size || gsprim_lds_size) {
2587       /* Now that we have a rough proportionality between esverts
2588        * and gsprims based on the primitive type, scale both of them
2589        * down simultaneously based on required LDS space.
2590        *
2591        * We could be smarter about this if we knew how much vertex
2592        * reuse to expect.
2593        */
2594       unsigned lds_total = max_esverts * esvert_lds_size + max_gsprims * gsprim_lds_size;
2595       if (lds_total > target_lds_size) {
2596          max_esverts = max_esverts * target_lds_size / lds_total;
2597          max_gsprims = max_gsprims * target_lds_size / lds_total;
2598 
2599          max_esverts = MIN2(max_esverts, max_gsprims * max_verts_per_prim);
2600          clamp_gsprims_to_esverts(&max_gsprims, max_esverts, min_verts_per_prim, uses_adjacency);
2601          assert(max_esverts >= max_verts_per_prim && max_gsprims >= 1);
2602       }
2603    }
2604 
2605    /* Round up towards full wave sizes for better ALU utilization. */
2606    if (!max_vert_out_per_gs_instance) {
2607       unsigned orig_max_esverts;
2608       unsigned orig_max_gsprims;
2609       unsigned wavesize;
2610 
2611       if (gs_type == MESA_SHADER_GEOMETRY) {
2612          wavesize = gs_info->wave_size;
2613       } else {
2614          wavesize = stages[MESA_SHADER_TESS_CTRL].nir ? stages[MESA_SHADER_TESS_EVAL].info.wave_size
2615                                                       : stages[MESA_SHADER_VERTEX].info.wave_size;
2616       }
2617 
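      /* Iterate until the rounded values stabilize: aligning one limit can shrink the LDS budget
       * available to the other, so both are recomputed until a fixed point is reached.
       */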
2618       do {
2619          orig_max_esverts = max_esverts;
2620          orig_max_gsprims = max_gsprims;
2621 
2622          max_esverts = align(max_esverts, wavesize);
2623          max_esverts = MIN2(max_esverts, max_esverts_base);
2624          if (esvert_lds_size)
2625             max_esverts =
2626                MIN2(max_esverts, (max_lds_size - max_gsprims * gsprim_lds_size) / esvert_lds_size);
2627          max_esverts = MIN2(max_esverts, max_gsprims * max_verts_per_prim);
2628 
2629          /* Hardware restriction: minimum value of max_esverts */
2630          if (pdevice->rad_info.gfx_level == GFX10)
2631             max_esverts = MAX2(max_esverts, min_esverts - 1 + max_verts_per_prim);
2632          else
2633             max_esverts = MAX2(max_esverts, min_esverts);
2634 
2635          max_gsprims = align(max_gsprims, wavesize);
2636          max_gsprims = MIN2(max_gsprims, max_gsprims_base);
2637          if (gsprim_lds_size) {
2638             /* Don't count unusable vertices to the LDS
2639              * size. Those are vertices above the maximum
2640              * number of vertices that can occur in the
2641              * workgroup, which is e.g. max_gsprims * 3
2642              * for triangles.
2643              */
2644             unsigned usable_esverts = MIN2(max_esverts, max_gsprims * max_verts_per_prim);
2645             max_gsprims = MIN2(max_gsprims,
2646                                (max_lds_size - usable_esverts * esvert_lds_size) / gsprim_lds_size);
2647          }
2648          clamp_gsprims_to_esverts(&max_gsprims, max_esverts, min_verts_per_prim, uses_adjacency);
2649          assert(max_esverts >= max_verts_per_prim && max_gsprims >= 1);
2650       } while (orig_max_esverts != max_esverts || orig_max_gsprims != max_gsprims);
2651 
2652       /* Verify the restriction. */
2653       if (pdevice->rad_info.gfx_level == GFX10)
2654          assert(max_esverts >= min_esverts - 1 + max_verts_per_prim);
2655       else
2656          assert(max_esverts >= min_esverts);
2657    } else {
2658       /* Hardware restriction: minimum value of max_esverts */
2659       if (pdevice->rad_info.gfx_level == GFX10)
2660          max_esverts = MAX2(max_esverts, min_esverts - 1 + max_verts_per_prim);
2661       else
2662          max_esverts = MAX2(max_esverts, min_esverts);
2663    }
2664 
2665    unsigned max_out_vertices = max_vert_out_per_gs_instance ? gs_info->gs.vertices_out
2666                                : gs_type == MESA_SHADER_GEOMETRY
2667                                   ? max_gsprims * gs_num_invocations * gs_info->gs.vertices_out
2668                                   : max_esverts;
2669    assert(max_out_vertices <= 256);
2670 
2671    unsigned prim_amp_factor = 1;
2672    if (gs_type == MESA_SHADER_GEOMETRY) {
2673       /* Number of output primitives per GS input primitive after
2674        * GS instancing. */
2675       prim_amp_factor = gs_info->gs.vertices_out;
2676    }
2677 
2678    /* On Gfx10, the GE only checks against the maximum number of ES verts
2679     * after allocating a full GS primitive. So we need to ensure that
2680     * whenever this check passes, there is enough space for a full
2681     * primitive without vertex reuse.
2682     */
2683    if (pdevice->rad_info.gfx_level == GFX10)
2684       ngg->hw_max_esverts = max_esverts - max_verts_per_prim + 1;
2685    else
2686       ngg->hw_max_esverts = max_esverts;
2687 
2688    ngg->max_gsprims = max_gsprims;
2689    ngg->max_out_verts = max_out_vertices;
2690    ngg->prim_amp_factor = prim_amp_factor;
2691    ngg->max_vert_out_per_gs_instance = max_vert_out_per_gs_instance;
2692    ngg->ngg_emit_size = max_gsprims * gsprim_lds_size;
2693    ngg->enable_vertex_grouping = true;
2694 
2695    /* Don't count unusable vertices. */
2696    ngg->esgs_ring_size = MIN2(max_esverts, max_gsprims * max_verts_per_prim) * esvert_lds_size * 4;
2697 
2698    if (gs_type == MESA_SHADER_GEOMETRY) {
2699       ngg->vgt_esgs_ring_itemsize = es_info->esgs_itemsize / 4;
2700    } else {
2701       ngg->vgt_esgs_ring_itemsize = 1;
2702    }
2703 
2704    assert(ngg->hw_max_esverts >= min_esverts); /* HW limitation */
2705 
2706    gl_shader_stage es_stage = stages[MESA_SHADER_TESS_CTRL].nir ? MESA_SHADER_TESS_EVAL : MESA_SHADER_VERTEX;
2707    unsigned workgroup_size =
2708       ac_compute_ngg_workgroup_size(
2709          max_esverts, max_gsprims * gs_num_invocations, max_out_vertices, prim_amp_factor);
2710    stages[MESA_SHADER_GEOMETRY].info.workgroup_size = workgroup_size;
2711    stages[es_stage].info.workgroup_size = workgroup_size;
2712 }
2713 
2714 static void
2715 radv_pipeline_init_gs_ring_state(struct radv_graphics_pipeline *pipeline, const struct gfx9_gs_info *gs)
2716 {
2717    const struct radv_physical_device *pdevice = pipeline->base.device->physical_device;
2718    unsigned num_se = pdevice->rad_info.max_se;
2719    unsigned wave_size = 64;
2720    unsigned max_gs_waves = 32 * num_se; /* max 32 per SE on GCN */
2721    /* On GFX6-GFX7, the value comes from VGT_GS_VERTEX_REUSE = 16.
2722     * On GFX8+, the value comes from VGT_VERTEX_REUSE_BLOCK_CNTL = 30 (+2).
2723     */
2724    unsigned gs_vertex_reuse = (pdevice->rad_info.gfx_level >= GFX8 ? 32 : 16) * num_se;
2725    unsigned alignment = 256 * num_se;
2726    /* The maximum size is 63.999 MB per SE. */
2727    unsigned max_size = ((unsigned)(63.999 * 1024 * 1024) & ~255) * num_se;
2728    struct radv_shader_info *gs_info = &pipeline->base.shaders[MESA_SHADER_GEOMETRY]->info;
2729 
2730    /* Calculate the minimum size. */
2731    unsigned min_esgs_ring_size =
2732       align(gs->vgt_esgs_ring_itemsize * 4 * gs_vertex_reuse * wave_size, alignment);
2733    /* These are recommended sizes, not minimum sizes. */
2734    unsigned esgs_ring_size =
2735       max_gs_waves * 2 * wave_size * gs->vgt_esgs_ring_itemsize * 4 * gs_info->gs.vertices_in;
2736    unsigned gsvs_ring_size = max_gs_waves * 2 * wave_size * gs_info->gs.max_gsvs_emit_size;
2737 
2738    min_esgs_ring_size = align(min_esgs_ring_size, alignment);
2739    esgs_ring_size = align(esgs_ring_size, alignment);
2740    gsvs_ring_size = align(gsvs_ring_size, alignment);
2741 
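   /* On GFX9+ the ESGS data lives in LDS (ES and GS are merged), so only GFX6-GFX8 need an ESGS
    * ring allocated in memory. The GSVS ring is sized on all generations.
    */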
2742    if (pdevice->rad_info.gfx_level <= GFX8)
2743       pipeline->esgs_ring_size = CLAMP(esgs_ring_size, min_esgs_ring_size, max_size);
2744 
2745    pipeline->gsvs_ring_size = MIN2(gsvs_ring_size, max_size);
2746 }
2747 
2748 struct radv_shader *
2749 radv_get_shader(const struct radv_pipeline *pipeline, gl_shader_stage stage)
2750 {
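   /* With merged shader stages (GFX9+), the VS or TES binary may live in the stage it was merged
    * into (TCS or GS), so fall through to the next stage that could contain it.
    */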
2751    if (stage == MESA_SHADER_VERTEX) {
2752       if (pipeline->shaders[MESA_SHADER_VERTEX])
2753          return pipeline->shaders[MESA_SHADER_VERTEX];
2754       if (pipeline->shaders[MESA_SHADER_TESS_CTRL])
2755          return pipeline->shaders[MESA_SHADER_TESS_CTRL];
2756       if (pipeline->shaders[MESA_SHADER_GEOMETRY])
2757          return pipeline->shaders[MESA_SHADER_GEOMETRY];
2758    } else if (stage == MESA_SHADER_TESS_EVAL) {
2759       if (!pipeline->shaders[MESA_SHADER_TESS_CTRL])
2760          return NULL;
2761       if (pipeline->shaders[MESA_SHADER_TESS_EVAL])
2762          return pipeline->shaders[MESA_SHADER_TESS_EVAL];
2763       if (pipeline->shaders[MESA_SHADER_GEOMETRY])
2764          return pipeline->shaders[MESA_SHADER_GEOMETRY];
2765    }
2766    return pipeline->shaders[stage];
2767 }
2768 
2769 static const struct radv_vs_output_info *
2770 get_vs_output_info(const struct radv_graphics_pipeline *pipeline)
2771 {
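   /* For legacy (non-NGG) GS, the outputs are exported by the separate GS copy shader rather than
    * by the GS itself.
    */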
2772    if (radv_pipeline_has_stage(pipeline, MESA_SHADER_GEOMETRY))
2773       if (radv_pipeline_has_ngg(pipeline))
2774          return &pipeline->base.shaders[MESA_SHADER_GEOMETRY]->info.vs.outinfo;
2775       else
2776          return &pipeline->base.gs_copy_shader->info.vs.outinfo;
2777    else if (radv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_CTRL))
2778       return &pipeline->base.shaders[MESA_SHADER_TESS_EVAL]->info.tes.outinfo;
2779    else if (radv_pipeline_has_stage(pipeline, MESA_SHADER_MESH))
2780       return &pipeline->base.shaders[MESA_SHADER_MESH]->info.ms.outinfo;
2781    else
2782       return &pipeline->base.shaders[MESA_SHADER_VERTEX]->info.vs.outinfo;
2783 }
2784 
2785 static bool
2786 radv_lower_viewport_to_zero(nir_shader *nir)
2787 {
2788    nir_function_impl *impl = nir_shader_get_entrypoint(nir);
2789    bool progress = false;
2790 
2791    nir_builder b;
2792    nir_builder_init(&b, impl);
2793 
2794    /* There should be only one deref load for VIEWPORT after lower_io_to_temporaries. */
2795    nir_foreach_block(block, impl) {
2796       nir_foreach_instr(instr, block) {
2797          if (instr->type != nir_instr_type_intrinsic)
2798             continue;
2799 
2800          nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
2801          if (intr->intrinsic != nir_intrinsic_load_deref)
2802             continue;
2803 
2804          nir_variable *var = nir_intrinsic_get_var(intr, 0);
2805          if (var->data.mode != nir_var_shader_in ||
2806              var->data.location != VARYING_SLOT_VIEWPORT)
2807             continue;
2808 
2809          b.cursor = nir_before_instr(instr);
2810 
2811          nir_ssa_def_rewrite_uses(&intr->dest.ssa, nir_imm_zero(&b, 1, 32));
2812          progress = true;
2813          break;
2814       }
2815       if (progress)
2816          break;
2817    }
2818 
2819    if (progress)
2820       nir_metadata_preserve(impl, nir_metadata_block_index | nir_metadata_dominance);
2821    else
2822       nir_metadata_preserve(impl, nir_metadata_all);
2823 
2824    return progress;
2825 }
2826 
2827 static nir_variable *
2828 find_layer_out_var(nir_shader *nir)
2829 {
2830    nir_variable *var = nir_find_variable_with_location(nir, nir_var_shader_out, VARYING_SLOT_LAYER);
2831    if (var != NULL)
2832       return var;
2833 
2834    var = nir_variable_create(nir, nir_var_shader_out, glsl_int_type(), "layer id");
2835    var->data.location = VARYING_SLOT_LAYER;
2836    var->data.interpolation = INTERP_MODE_NONE;
2837 
2838    return var;
2839 }
2840 
2841 static bool
2842 radv_lower_multiview(nir_shader *nir)
2843 {
2844    /* This pass is not suitable for mesh shaders, because it can't know
2845     * the mapping between API mesh shader invocations and output primitives.
2846     * Needs to be handled in ac_nir_lower_ngg.
2847     */
2848    if (nir->info.stage == MESA_SHADER_MESH)
2849       return false;
2850 
2851    nir_function_impl *impl = nir_shader_get_entrypoint(nir);
2852    bool progress = false;
2853 
2854    nir_builder b;
2855    nir_builder_init(&b, impl);
2856 
2857    /* Iterate in reverse order since there should be only one deref store to POS after
2858     * lower_io_to_temporaries for vertex shaders and inject the layer there. For geometry shaders,
2859     * the layer is injected right before every emit_vertex_with_counter.
2860     */
2861    nir_variable *layer = NULL;
2862    nir_foreach_block_reverse(block, impl) {
2863       nir_foreach_instr_reverse(instr, block) {
2864          if (instr->type != nir_instr_type_intrinsic)
2865             continue;
2866 
2867          if (nir->info.stage == MESA_SHADER_GEOMETRY) {
2868             nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
2869             if (intr->intrinsic != nir_intrinsic_emit_vertex_with_counter)
2870                continue;
2871 
2872             b.cursor = nir_before_instr(instr);
2873          } else {
2874             nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
2875             if (intr->intrinsic != nir_intrinsic_store_deref)
2876                continue;
2877 
2878             nir_variable *var = nir_intrinsic_get_var(intr, 0);
2879             if (var->data.mode != nir_var_shader_out || var->data.location != VARYING_SLOT_POS)
2880                continue;
2881 
2882             b.cursor = nir_after_instr(instr);
2883          }
2884 
2885          if (!layer)
2886             layer = find_layer_out_var(nir);
2887 
2888          nir_store_var(&b, layer, nir_load_view_index(&b), 1);
2889 
2890          /* Update outputs_written to reflect that the pass added a new output. */
2891          nir->info.outputs_written |= BITFIELD64_BIT(VARYING_SLOT_LAYER);
2892 
2893          progress = true;
2894          if (nir->info.stage == MESA_SHADER_VERTEX)
2895             break;
2896       }
2897       if (nir->info.stage == MESA_SHADER_VERTEX && progress)
2898          break;
2899    }
2900 
2901    if (progress)
2902       nir_metadata_preserve(impl, nir_metadata_block_index | nir_metadata_dominance);
2903    else
2904       nir_metadata_preserve(impl, nir_metadata_all);
2905 
2906    return progress;
2907 }
2908 
2909 static bool
2910 radv_export_implicit_primitive_id(nir_shader *nir)
2911 {
2912    nir_function_impl *impl = nir_shader_get_entrypoint(nir);
2913    nir_builder b;
2914    nir_builder_init(&b, impl);
2915 
2916    b.cursor = nir_after_cf_list(&impl->body);
2917 
2918    nir_variable *var = nir_variable_create(nir, nir_var_shader_out, glsl_int_type(), NULL);
2919    var->data.location = VARYING_SLOT_PRIMITIVE_ID;
2920    var->data.interpolation = INTERP_MODE_NONE;
2921 
2922    nir_store_var(&b, var, nir_load_primitive_id(&b), 1);
2923 
2924    /* Update outputs_written to reflect that the pass added a new output. */
2925    nir->info.outputs_written |= BITFIELD64_BIT(VARYING_SLOT_PRIMITIVE_ID);
2926 
2927    nir_metadata_preserve(impl, nir_metadata_block_index | nir_metadata_dominance);
2928 
2929    return true;
2930 }
2931 
2932 static void
2933 radv_link_shaders(struct radv_pipeline *pipeline,
2934                   const struct radv_pipeline_key *pipeline_key,
2935                   const struct radv_pipeline_stage *stages,
2936                   bool optimize_conservatively,
2937                   gl_shader_stage last_vgt_api_stage)
2938 {
2939    const struct radv_physical_device *pdevice = pipeline->device->physical_device;
2940    nir_shader *ordered_shaders[MESA_VULKAN_SHADER_STAGES];
2941    int shader_count = 0;
2942 
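   /* Gather the active stages in reverse pipeline order: ordered_shaders[0] is the last stage
    * (FS if present) and ordered_shaders[shader_count - 1] is the first.
    */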
2943    if (stages[MESA_SHADER_FRAGMENT].nir) {
2944       ordered_shaders[shader_count++] = stages[MESA_SHADER_FRAGMENT].nir;
2945    }
2946    if (stages[MESA_SHADER_GEOMETRY].nir) {
2947       ordered_shaders[shader_count++] = stages[MESA_SHADER_GEOMETRY].nir;
2948    }
2949    if (stages[MESA_SHADER_TESS_EVAL].nir) {
2950       ordered_shaders[shader_count++] = stages[MESA_SHADER_TESS_EVAL].nir;
2951    }
2952    if (stages[MESA_SHADER_TESS_CTRL].nir) {
2953       ordered_shaders[shader_count++] = stages[MESA_SHADER_TESS_CTRL].nir;
2954    }
2955    if (stages[MESA_SHADER_VERTEX].nir) {
2956       ordered_shaders[shader_count++] = stages[MESA_SHADER_VERTEX].nir;
2957    }
2958    if (stages[MESA_SHADER_MESH].nir) {
2959       ordered_shaders[shader_count++] = stages[MESA_SHADER_MESH].nir;
2960    }
2961    if (stages[MESA_SHADER_TASK].nir) {
2962       ordered_shaders[shader_count++] = stages[MESA_SHADER_TASK].nir;
2963    }
2964    if (stages[MESA_SHADER_COMPUTE].nir) {
2965       ordered_shaders[shader_count++] = stages[MESA_SHADER_COMPUTE].nir;
2966    }
2967 
2968    if (stages[MESA_SHADER_MESH].nir && stages[MESA_SHADER_FRAGMENT].nir) {
2969       nir_shader *ps = stages[MESA_SHADER_FRAGMENT].nir;
2970 
2971       nir_foreach_shader_in_variable(var, ps) {
2972          /* These variables are per-primitive when used with a mesh shader. */
2973          if (var->data.location == VARYING_SLOT_PRIMITIVE_ID ||
2974              var->data.location == VARYING_SLOT_VIEWPORT ||
2975              var->data.location == VARYING_SLOT_LAYER)
2976             var->data.per_primitive = true;
2977       }
2978    }
2979 
2980    bool has_geom_tess = stages[MESA_SHADER_GEOMETRY].nir || stages[MESA_SHADER_TESS_CTRL].nir;
2981    bool merged_gs = stages[MESA_SHADER_GEOMETRY].nir && pdevice->rad_info.gfx_level >= GFX9;
2982 
2983    if (!optimize_conservatively && shader_count > 1) {
2984       unsigned first = ordered_shaders[shader_count - 1]->info.stage;
2985       unsigned last = ordered_shaders[0]->info.stage;
2986 
2987       if (ordered_shaders[0]->info.stage == MESA_SHADER_FRAGMENT &&
2988           ordered_shaders[1]->info.has_transform_feedback_varyings)
2989          nir_link_xfb_varyings(ordered_shaders[1], ordered_shaders[0]);
2990 
2991       for (int i = 1; i < shader_count; ++i) {
2992          nir_lower_io_arrays_to_elements(ordered_shaders[i], ordered_shaders[i - 1]);
2993          nir_validate_shader(ordered_shaders[i], "after nir_lower_io_arrays_to_elements");
2994          nir_validate_shader(ordered_shaders[i - 1], "after nir_lower_io_arrays_to_elements");
2995       }
2996 
2997       for (int i = 0; i < shader_count; ++i) {
2998          nir_variable_mode mask = 0;
2999 
3000          if (ordered_shaders[i]->info.stage != first)
3001             mask = mask | nir_var_shader_in;
3002 
3003          if (ordered_shaders[i]->info.stage != last)
3004             mask = mask | nir_var_shader_out;
3005 
3006          bool progress = false;
3007          NIR_PASS(progress, ordered_shaders[i], nir_lower_io_to_scalar_early, mask);
3008          if (progress) {
3009             /* Optimize the new vector code and then remove dead vars */
3010             NIR_PASS(_, ordered_shaders[i], nir_copy_prop);
3011             NIR_PASS(_, ordered_shaders[i], nir_opt_shrink_vectors);
3012 
3013             if (ordered_shaders[i]->info.stage != last) {
3014                /* Optimize swizzled movs of load_const for
3015                 * nir_link_opt_varyings's constant propagation
3016                 */
3017                NIR_PASS(_, ordered_shaders[i], nir_opt_constant_folding);
3018                /* For nir_link_opt_varyings's duplicate input opt */
3019                NIR_PASS(_, ordered_shaders[i], nir_opt_cse);
3020             }
3021 
3022             /* Run copy-propagation to help remove dead
3023              * output variables (some shaders have useless
3024              * copies to/from an output), so compaction
3025              * later will be more effective.
3026              *
3027              * This will have been done earlier but it might
3028              * not have worked because the outputs were vectors.
3029              */
3030             if (ordered_shaders[i]->info.stage == MESA_SHADER_TESS_CTRL)
3031                NIR_PASS(_, ordered_shaders[i], nir_opt_copy_prop_vars);
3032 
3033             NIR_PASS(_, ordered_shaders[i], nir_opt_dce);
3034             NIR_PASS(_, ordered_shaders[i], nir_remove_dead_variables,
3035                      nir_var_function_temp | nir_var_shader_in | nir_var_shader_out, NULL);
3036          }
3037       }
3038    }
3039 
3040    /* Export the primitive ID implicitly when the fragment shader reads it but VS or TES doesn't
3041     * write it; GS and MS must export it explicitly. For NGG, the primitive ID is added during lowering.
3042     */
3043    if (stages[MESA_SHADER_FRAGMENT].nir &&
3044        (stages[MESA_SHADER_FRAGMENT].nir->info.inputs_read & VARYING_BIT_PRIMITIVE_ID) &&
3045        !(stages[last_vgt_api_stage].nir->info.outputs_written & VARYING_BIT_PRIMITIVE_ID) &&
3046        ((last_vgt_api_stage == MESA_SHADER_VERTEX && !stages[MESA_SHADER_VERTEX].info.is_ngg) ||
3047         (last_vgt_api_stage == MESA_SHADER_TESS_EVAL && !stages[MESA_SHADER_TESS_EVAL].info.is_ngg))) {
3048       radv_export_implicit_primitive_id(stages[last_vgt_api_stage].nir);
3049    }
3050 
3051    if (!optimize_conservatively) {
3052       bool uses_xfb = last_vgt_api_stage != -1 &&
3053                       stages[last_vgt_api_stage].nir->xfb_info;
3054 
3055       for (unsigned i = 0; i < shader_count; ++i) {
3056          shader_info *info = &ordered_shaders[i]->info;
3057 
3058          /* Remove exports without color attachment or writemask. */
3059          if (info->stage == MESA_SHADER_FRAGMENT) {
3060             bool fixup_derefs = false;
3061             nir_foreach_variable_with_modes(var, ordered_shaders[i], nir_var_shader_out) {
3062                int idx = var->data.location;
3063                idx -= FRAG_RESULT_DATA0;
3064                if (idx < 0)
3065                   continue;
3066 
3067                unsigned col_format = (pipeline_key->ps.col_format >> (4 * idx)) & 0xf;
3068                unsigned cb_target_mask = (pipeline_key->ps.cb_target_mask >> (4 * idx)) & 0xf;
3069 
3070                if (col_format == V_028714_SPI_SHADER_ZERO ||
3071                    (col_format == V_028714_SPI_SHADER_32_R && !cb_target_mask &&
3072                     !pipeline_key->ps.mrt0_is_dual_src)) {
3073                   /* Remove the color export if it's unused or in the presence of holes. */
3074                   info->outputs_written &= ~BITFIELD64_BIT(var->data.location);
3075                   var->data.location = 0;
3076                   var->data.mode = nir_var_shader_temp;
3077                   fixup_derefs = true;
3078                }
3079             }
3080             if (fixup_derefs) {
3081                NIR_PASS_V(ordered_shaders[i], nir_fixup_deref_modes);
3082                NIR_PASS(_, ordered_shaders[i], nir_remove_dead_variables, nir_var_shader_temp,
3083                         NULL);
3084                NIR_PASS(_, ordered_shaders[i], nir_opt_dce);
3085             }
3086             continue;
3087          }
3088 
3089          /* Remove PSIZ from shaders when it's not needed.
3090           * This is typically produced by translation layers like Zink or D9VK.
3091           */
3092          if (uses_xfb || !(info->outputs_written & VARYING_BIT_PSIZ))
3093             continue;
3094 
3095          bool next_stage_needs_psiz =
3096             i != 0 && /* ordered_shaders is backwards, so next stage is: i - 1 */
3097             ordered_shaders[i - 1]->info.inputs_read & VARYING_BIT_PSIZ;
3098          bool topology_uses_psiz =
3099             info->stage == last_vgt_api_stage &&
3100             ((info->stage == MESA_SHADER_VERTEX && pipeline_key->vs.topology == V_008958_DI_PT_POINTLIST) ||
3101              (info->stage == MESA_SHADER_TESS_EVAL && info->tess.point_mode) ||
3102              (info->stage == MESA_SHADER_GEOMETRY && info->gs.output_primitive == SHADER_PRIM_POINTS) ||
3103              (info->stage == MESA_SHADER_MESH && info->mesh.primitive_type == SHADER_PRIM_POINTS));
3104 
3105          nir_variable *psiz_var =
3106                nir_find_variable_with_location(ordered_shaders[i], nir_var_shader_out, VARYING_SLOT_PSIZ);
3107 
3108          if (!next_stage_needs_psiz && !topology_uses_psiz && psiz_var) {
3109             /* Change PSIZ to a global variable which allows it to be DCE'd. */
3110             psiz_var->data.location = 0;
3111             psiz_var->data.mode = nir_var_shader_temp;
3112 
3113             info->outputs_written &= ~VARYING_BIT_PSIZ;
3114             NIR_PASS_V(ordered_shaders[i], nir_fixup_deref_modes);
3115             NIR_PASS(_, ordered_shaders[i], nir_remove_dead_variables, nir_var_shader_temp, NULL);
3116             NIR_PASS(_, ordered_shaders[i], nir_opt_dce);
3117          }
3118       }
3119    }
3120 
3121    /* Lower the viewport index to zero when the last vertex stage doesn't export it. */
3122    if (stages[MESA_SHADER_FRAGMENT].nir &&
3123        (stages[MESA_SHADER_FRAGMENT].nir->info.inputs_read & VARYING_BIT_VIEWPORT) &&
3124        !(stages[last_vgt_api_stage].nir->info.outputs_written & VARYING_BIT_VIEWPORT)) {
3125       NIR_PASS(_, stages[MESA_SHADER_FRAGMENT].nir, radv_lower_viewport_to_zero);
3126    }
3127 
3128    /* Export the layer in the last VGT stage if multiview is used. */
3129    if (pipeline_key->has_multiview_view_index && last_vgt_api_stage != -1 &&
3130        !(stages[last_vgt_api_stage].nir->info.outputs_written &
3131          VARYING_BIT_LAYER)) {
3132       nir_shader *last_vgt_shader = stages[last_vgt_api_stage].nir;
3133       NIR_PASS(_, last_vgt_shader, radv_lower_multiview);
3134    }
3135 
3136    for (int i = 1; !optimize_conservatively && (i < shader_count); ++i) {
3137       if (nir_link_opt_varyings(ordered_shaders[i], ordered_shaders[i - 1])) {
3138          nir_validate_shader(ordered_shaders[i], "after nir_link_opt_varyings");
3139          nir_validate_shader(ordered_shaders[i - 1], "after nir_link_opt_varyings");
3140 
3141          NIR_PASS(_, ordered_shaders[i - 1], nir_opt_constant_folding);
3142          NIR_PASS(_, ordered_shaders[i - 1], nir_opt_algebraic);
3143          NIR_PASS(_, ordered_shaders[i - 1], nir_opt_dce);
3144       }
3145 
3146       NIR_PASS(_, ordered_shaders[i], nir_remove_dead_variables, nir_var_shader_out, NULL);
3147       NIR_PASS(_, ordered_shaders[i - 1], nir_remove_dead_variables, nir_var_shader_in, NULL);
3148 
3149       bool progress = nir_remove_unused_varyings(ordered_shaders[i], ordered_shaders[i - 1]);
3150 
3151       nir_compact_varyings(ordered_shaders[i], ordered_shaders[i - 1], true);
3152       nir_validate_shader(ordered_shaders[i], "after nir_compact_varyings");
3153       nir_validate_shader(ordered_shaders[i - 1], "after nir_compact_varyings");
3154       if (ordered_shaders[i]->info.stage == MESA_SHADER_MESH) {
3155          /* nir_compact_varyings can change the location of per-vertex and per-primitive outputs */
3156          nir_shader_gather_info(ordered_shaders[i], nir_shader_get_entrypoint(ordered_shaders[i]));
3157       }
3158 
3159       if (ordered_shaders[i]->info.stage == MESA_SHADER_TESS_CTRL ||
3160           ordered_shaders[i]->info.stage == MESA_SHADER_MESH ||
3161           (ordered_shaders[i]->info.stage == MESA_SHADER_VERTEX && has_geom_tess) ||
3162           (ordered_shaders[i]->info.stage == MESA_SHADER_TESS_EVAL && merged_gs)) {
3163          NIR_PASS(_, ordered_shaders[i], nir_lower_io_to_vector, nir_var_shader_out);
3164          if (ordered_shaders[i]->info.stage == MESA_SHADER_TESS_CTRL)
3165             NIR_PASS(_, ordered_shaders[i], nir_vectorize_tess_levels);
3166          NIR_PASS(_, ordered_shaders[i], nir_opt_combine_stores, nir_var_shader_out);
3167       }
3168       if (ordered_shaders[i - 1]->info.stage == MESA_SHADER_GEOMETRY ||
3169           ordered_shaders[i - 1]->info.stage == MESA_SHADER_TESS_CTRL ||
3170           ordered_shaders[i - 1]->info.stage == MESA_SHADER_TESS_EVAL) {
3171          NIR_PASS(_, ordered_shaders[i - 1], nir_lower_io_to_vector, nir_var_shader_in);
3172       }
3173 
3174       if (progress) {
3175          progress = false;
3176          NIR_PASS(progress, ordered_shaders[i], nir_lower_global_vars_to_local);
3177          if (progress) {
3178             ac_nir_lower_indirect_derefs(ordered_shaders[i], pdevice->rad_info.gfx_level);
3179             /* remove dead writes, which can remove input loads */
3180             NIR_PASS(_, ordered_shaders[i], nir_lower_vars_to_ssa);
3181             NIR_PASS(_, ordered_shaders[i], nir_opt_dce);
3182          }
3183 
3184          progress = false;
3185          NIR_PASS(progress, ordered_shaders[i - 1], nir_lower_global_vars_to_local);
3186          if (progress) {
3187             ac_nir_lower_indirect_derefs(ordered_shaders[i - 1], pdevice->rad_info.gfx_level);
3188          }
3189       }
3190    }
3191 }
3192 
3193 static void
3194 radv_set_driver_locations(struct radv_pipeline *pipeline, struct radv_pipeline_stage *stages,
3195                           gl_shader_stage last_vgt_api_stage)
3196 {
3197    const struct radv_physical_device *pdevice = pipeline->device->physical_device;
3198 
3199    if (stages[MESA_SHADER_FRAGMENT].nir) {
3200       nir_foreach_shader_out_variable(var, stages[MESA_SHADER_FRAGMENT].nir)
3201       {
3202          var->data.driver_location = var->data.location + var->data.index;
3203       }
3204    }
3205 
3206    if (stages[MESA_SHADER_MESH].nir) {
3207       /* ac_nir_lower_ngg ignores driver locations for mesh shaders,
3208        * but set them to all zero just to be on the safe side.
3209        */
3210       nir_foreach_shader_out_variable(var, stages[MESA_SHADER_MESH].nir) {
3211          var->data.driver_location = 0;
3212       }
3213       return;
3214    }
3215 
3216    if (!stages[MESA_SHADER_VERTEX].nir)
3217       return;
3218 
3219    bool has_tess = stages[MESA_SHADER_TESS_CTRL].nir;
3220    bool has_gs = stages[MESA_SHADER_GEOMETRY].nir;
3221 
3222    /* Merged stage for VS and TES */
3223    unsigned vs_info_idx = MESA_SHADER_VERTEX;
3224    unsigned tes_info_idx = MESA_SHADER_TESS_EVAL;
3225 
3226    if (pdevice->rad_info.gfx_level >= GFX9) {
3227       /* These are merged into the next stage */
3228       vs_info_idx = has_tess ? MESA_SHADER_TESS_CTRL : MESA_SHADER_GEOMETRY;
3229       tes_info_idx = has_gs ? MESA_SHADER_GEOMETRY : MESA_SHADER_TESS_EVAL;
3230    }
3231 
3232    nir_foreach_shader_in_variable (var, stages[MESA_SHADER_VERTEX].nir) {
3233       var->data.driver_location = var->data.location;
3234    }
3235 
3236    if (has_tess) {
3237       nir_linked_io_var_info vs2tcs = nir_assign_linked_io_var_locations(
3238          stages[MESA_SHADER_VERTEX].nir, stages[MESA_SHADER_TESS_CTRL].nir);
3239       nir_linked_io_var_info tcs2tes = nir_assign_linked_io_var_locations(
3240          stages[MESA_SHADER_TESS_CTRL].nir, stages[MESA_SHADER_TESS_EVAL].nir);
3241 
3242       stages[MESA_SHADER_VERTEX].info.vs.num_linked_outputs = vs2tcs.num_linked_io_vars;
3243       stages[MESA_SHADER_TESS_CTRL].info.tcs.num_linked_inputs = vs2tcs.num_linked_io_vars;
3244       stages[MESA_SHADER_TESS_CTRL].info.tcs.num_linked_outputs = tcs2tes.num_linked_io_vars;
3245       stages[MESA_SHADER_TESS_CTRL].info.tcs.num_linked_patch_outputs = tcs2tes.num_linked_patch_io_vars;
3246       stages[MESA_SHADER_TESS_EVAL].info.tes.num_linked_inputs = tcs2tes.num_linked_io_vars;
3247       stages[MESA_SHADER_TESS_EVAL].info.tes.num_linked_patch_inputs = tcs2tes.num_linked_patch_io_vars;
3248 
3249       /* Copy data to merged stage */
3250       stages[vs_info_idx].info.vs.num_linked_outputs = vs2tcs.num_linked_io_vars;
3251       stages[tes_info_idx].info.tes.num_linked_inputs = tcs2tes.num_linked_io_vars;
3252       stages[tes_info_idx].info.tes.num_linked_patch_inputs = tcs2tes.num_linked_patch_io_vars;
3253 
3254       if (has_gs) {
3255          nir_linked_io_var_info tes2gs = nir_assign_linked_io_var_locations(
3256             stages[MESA_SHADER_TESS_EVAL].nir, stages[MESA_SHADER_GEOMETRY].nir);
3257 
3258          stages[MESA_SHADER_TESS_EVAL].info.tes.num_linked_outputs = tes2gs.num_linked_io_vars;
3259          stages[MESA_SHADER_GEOMETRY].info.gs.num_linked_inputs = tes2gs.num_linked_io_vars;
3260 
3261          /* Copy data to merged stage */
3262          stages[tes_info_idx].info.tes.num_linked_outputs = tes2gs.num_linked_io_vars;
3263       }
3264    } else if (has_gs) {
3265       nir_linked_io_var_info vs2gs = nir_assign_linked_io_var_locations(
3266          stages[MESA_SHADER_VERTEX].nir, stages[MESA_SHADER_GEOMETRY].nir);
3267 
3268       stages[MESA_SHADER_VERTEX].info.vs.num_linked_outputs = vs2gs.num_linked_io_vars;
3269       stages[MESA_SHADER_GEOMETRY].info.gs.num_linked_inputs = vs2gs.num_linked_io_vars;
3270 
3271       /* Copy data to merged stage */
3272       stages[vs_info_idx].info.vs.num_linked_outputs = vs2gs.num_linked_io_vars;
3273    }
3274 
3275    assert(last_vgt_api_stage != MESA_SHADER_NONE);
3276    nir_foreach_shader_out_variable(var, stages[last_vgt_api_stage].nir)
3277    {
3278       var->data.driver_location = var->data.location;
3279    }
3280 }
3281 
3282 static struct radv_pipeline_key
3283 radv_generate_pipeline_key(const struct radv_pipeline *pipeline, VkPipelineCreateFlags flags)
3284 {
3285    struct radv_device *device = pipeline->device;
3286    struct radv_pipeline_key key;
3287 
3288    memset(&key, 0, sizeof(key));
3289 
3290    if (flags & VK_PIPELINE_CREATE_DISABLE_OPTIMIZATION_BIT)
3291       key.optimisations_disabled = 1;
3292 
3293    key.disable_aniso_single_level = device->instance->disable_aniso_single_level &&
3294                                     device->physical_device->rad_info.gfx_level < GFX8;
3295 
3296    key.image_2d_view_of_3d = device->image_2d_view_of_3d &&
3297                              device->physical_device->rad_info.gfx_level == GFX9;
3298 
3299    return key;
3300 }
3301 
3302 static struct radv_pipeline_key
3303 radv_generate_graphics_pipeline_key(const struct radv_graphics_pipeline *pipeline,
3304                                     const VkGraphicsPipelineCreateInfo *pCreateInfo,
3305                                     const struct radv_graphics_pipeline_info *info,
3306                                     const struct radv_blend_state *blend)
3307 {
3308    struct radv_device *device = pipeline->base.device;
3309    struct radv_pipeline_key key = radv_generate_pipeline_key(&pipeline->base, pCreateInfo->flags);
3310 
3311    key.has_multiview_view_index = !!info->ri.view_mask;
3312 
3313    if (pipeline->dynamic_states & RADV_DYNAMIC_VERTEX_INPUT) {
3314       key.vs.dynamic_input_state = true;
3315    }
3316 
3317    /* Vertex input state */
3318    key.vs.instance_rate_inputs = info->vi.instance_rate_inputs;
3319    key.vs.vertex_post_shuffle = info->vi.vertex_post_shuffle;
3320 
3321    for (uint32_t i = 0; i < MAX_VERTEX_ATTRIBS; i++) {
3322       key.vs.instance_rate_divisors[i] = info->vi.instance_rate_divisors[i];
3323       key.vs.vertex_attribute_formats[i] = info->vi.vertex_attribute_formats[i];
3324       key.vs.vertex_attribute_bindings[i] = info->vi.vertex_attribute_bindings[i];
3325       key.vs.vertex_attribute_offsets[i] = info->vi.vertex_attribute_offsets[i];
3326       key.vs.vertex_attribute_strides[i] = info->vi.vertex_attribute_strides[i];
3327       key.vs.vertex_alpha_adjust[i] = info->vi.vertex_alpha_adjust[i];
3328    }
3329 
3330    for (uint32_t i = 0; i < MAX_VBS; i++) {
3331       key.vs.vertex_binding_align[i] = info->vi.vertex_binding_align[i];
3332    }
3333 
3334    key.tcs.tess_input_vertices = info->ts.patch_control_points;
3335 
3336    if (info->ms.raster_samples > 1) {
3337       uint32_t ps_iter_samples = radv_pipeline_get_ps_iter_samples(info);
3338       key.ps.num_samples = info->ms.raster_samples;
3339       key.ps.log2_ps_iter_samples = util_logbase2(ps_iter_samples);
3340    }
3341 
3342    key.ps.col_format = blend->spi_shader_col_format;
3343    key.ps.cb_target_mask = blend->cb_target_mask;
3344    key.ps.mrt0_is_dual_src = blend->mrt0_is_dual_src;
3345    if (device->physical_device->rad_info.gfx_level < GFX8) {
3346       key.ps.is_int8 = blend->col_format_is_int8;
3347       key.ps.is_int10 = blend->col_format_is_int10;
3348    }
3349    if (device->physical_device->rad_info.gfx_level >= GFX11) {
3350       key.ps.alpha_to_coverage_via_mrtz = info->ms.alpha_to_coverage_enable;
3351    }
3352 
3353    key.vs.topology = info->ia.primitive_topology;
3354 
3355    if (device->physical_device->rad_info.gfx_level >= GFX10) {
3356       key.vs.provoking_vtx_last = info->rs.provoking_vtx_last;
3357    }
3358 
3359    if (device->instance->debug_flags & RADV_DEBUG_DISCARD_TO_DEMOTE)
3360       key.ps.lower_discard_to_demote = true;
3361 
3362    if (device->instance->enable_mrt_output_nan_fixup)
3363       key.ps.enable_mrt_output_nan_fixup = blend->col_format_is_float32;
3364 
3365 
3366    key.ps.force_vrs_enabled = device->force_vrs_enabled;
3367 
3368    if (device->instance->debug_flags & RADV_DEBUG_INVARIANT_GEOM)
3369       key.invariant_geom = true;
3370 
3371    key.use_ngg = device->physical_device->use_ngg;
3372 
3373    if ((radv_is_vrs_enabled(pipeline, info) || device->force_vrs_enabled) &&
3374        (device->physical_device->rad_info.family == CHIP_NAVI21 ||
3375         device->physical_device->rad_info.family == CHIP_NAVI22 ||
3376         device->physical_device->rad_info.family == CHIP_VANGOGH))
3377       key.adjust_frag_coord_z = true;
3378 
3379    if (device->instance->disable_sinking_load_input_fs)
3380       key.disable_sinking_load_input_fs = true;
3381 
3382    if (device->primitives_generated_query)
3383       key.primitives_generated_query = true;
3384 
3385    key.ps.has_epilog = false; /* TODO: hook up PS epilogs */
3386 
3387    return key;
3388 }
3389 
3390 static uint8_t
3391 radv_get_wave_size(struct radv_device *device, gl_shader_stage stage,
3392                    const struct radv_shader_info *info)
3393 {
3394    if (stage == MESA_SHADER_GEOMETRY && !info->is_ngg)
3395       return 64;
3396    else if (stage == MESA_SHADER_COMPUTE) {
3397       return info->cs.subgroup_size;
3398    } else if (stage == MESA_SHADER_FRAGMENT)
3399       return device->physical_device->ps_wave_size;
3400    else if (stage == MESA_SHADER_TASK)
3401       return device->physical_device->cs_wave_size;
3402    else
3403       return device->physical_device->ge_wave_size;
3404 }
3405 
3406 static uint8_t
3407 radv_get_ballot_bit_size(struct radv_device *device, gl_shader_stage stage,
3408                          const struct radv_shader_info *info)
3409 {
3410    if (stage == MESA_SHADER_COMPUTE && info->cs.subgroup_size)
3411       return info->cs.subgroup_size;
3412    return 64;
3413 }
3414 
3415 static void
3416 radv_determine_ngg_settings(struct radv_pipeline *pipeline,
3417                             const struct radv_pipeline_key *pipeline_key,
3418                             struct radv_pipeline_stage *stages,
3419                             gl_shader_stage last_vgt_api_stage)
3420 {
3421    const struct radv_physical_device *pdevice = pipeline->device->physical_device;
3422 
3423    /* Shader settings for VS or TES without GS. */
3424    if (last_vgt_api_stage == MESA_SHADER_VERTEX ||
3425        last_vgt_api_stage == MESA_SHADER_TESS_EVAL) {
3426       uint64_t ps_inputs_read =
3427          stages[MESA_SHADER_FRAGMENT].nir ? stages[MESA_SHADER_FRAGMENT].nir->info.inputs_read : 0;
3428       gl_shader_stage es_stage = last_vgt_api_stage;
3429 
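      /* si_conv_prim_to_gs_out() maps the topology to the VGT GS output primitive type
       * (point/line/triangle), and the +1 converts that enum value into the vertex count per
       * primitive.
       */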
3430       unsigned num_vertices_per_prim = si_conv_prim_to_gs_out(pipeline_key->vs.topology) + 1;
3431       if (es_stage == MESA_SHADER_TESS_EVAL)
3432          num_vertices_per_prim = stages[es_stage].nir->info.tess.point_mode ? 1
3433                                  : stages[es_stage].nir->info.tess._primitive_mode == TESS_PRIMITIVE_ISOLINES ? 2
3434                                  : 3;
3435       /* TODO: Enable culling for LLVM. */
3436       stages[es_stage].info.has_ngg_culling = radv_consider_culling(
3437          pdevice, stages[es_stage].nir, ps_inputs_read, num_vertices_per_prim, &stages[es_stage].info) &&
3438          !radv_use_llvm_for_stage(pipeline->device, es_stage);
3439 
3440       nir_function_impl *impl = nir_shader_get_entrypoint(stages[es_stage].nir);
3441       stages[es_stage].info.has_ngg_early_prim_export = exec_list_is_singular(&impl->body);
3442 
3443       /* Invocations that process an input vertex */
3444       const struct gfx10_ngg_info *ngg_info = &stages[es_stage].info.ngg_info;
3445       unsigned max_vtx_in = MIN2(256, ngg_info->enable_vertex_grouping ? ngg_info->hw_max_esverts : num_vertices_per_prim * ngg_info->max_gsprims);
3446 
3447       unsigned lds_bytes_if_culling_off = 0;
3448       /* We need LDS space when VS needs to export the primitive ID. */
3449       if (es_stage == MESA_SHADER_VERTEX && stages[es_stage].info.vs.outinfo.export_prim_id)
3450          lds_bytes_if_culling_off = max_vtx_in * 4u;
3451       stages[es_stage].info.num_lds_blocks_when_not_culling =
3452          DIV_ROUND_UP(lds_bytes_if_culling_off, pdevice->rad_info.lds_encode_granularity);
3453 
3454       /* NGG passthrough mode should be disabled when culling and when the vertex shader exports the
3455        * primitive ID.
3456        */
3457       stages[es_stage].info.is_ngg_passthrough = stages[es_stage].info.is_ngg_passthrough &&
3458                                                 !stages[es_stage].info.has_ngg_culling &&
3459                                                  !(es_stage == MESA_SHADER_VERTEX &&
3460                                                    stages[es_stage].info.vs.outinfo.export_prim_id);
3461    }
3462 }
3463 
3464 static void
3465 radv_fill_shader_info_ngg(struct radv_pipeline *pipeline,
3466                           const struct radv_pipeline_key *pipeline_key,
3467                           struct radv_pipeline_stage *stages)
3468 {
3469    struct radv_device *device = pipeline->device;
3470 
3471    if (pipeline_key->use_ngg) {
3472       if (stages[MESA_SHADER_TESS_CTRL].nir) {
3473          stages[MESA_SHADER_TESS_EVAL].info.is_ngg = true;
3474       } else if (stages[MESA_SHADER_VERTEX].nir) {
3475          stages[MESA_SHADER_VERTEX].info.is_ngg = true;
3476       } else if (stages[MESA_SHADER_MESH].nir) {
3477          stages[MESA_SHADER_MESH].info.is_ngg = true;
3478       }
3479 
3480       if (stages[MESA_SHADER_TESS_CTRL].nir && stages[MESA_SHADER_GEOMETRY].nir &&
3481           stages[MESA_SHADER_GEOMETRY].nir->info.gs.invocations *
3482                 stages[MESA_SHADER_GEOMETRY].nir->info.gs.vertices_out >
3483              256) {
3484          /* Fallback to the legacy path if tessellation is
3485           * enabled with extreme geometry because
3486           * EN_MAX_VERT_OUT_PER_GS_INSTANCE doesn't work and it
3487           * might hang.
3488           */
3489          stages[MESA_SHADER_TESS_EVAL].info.is_ngg = false;
3490 
3491          /* GFX11+ requires NGG. */
3492          assert(device->physical_device->rad_info.gfx_level < GFX11);
3493       }
3494 
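      /* Find the last geometry-capable stage present; its xfb_info decides whether legacy
       * (non-NGG) streamout is needed.
       */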
3495       gl_shader_stage last_xfb_stage = MESA_SHADER_VERTEX;
3496 
3497       for (int i = MESA_SHADER_VERTEX; i <= MESA_SHADER_GEOMETRY; i++) {
3498          if (stages[i].nir)
3499             last_xfb_stage = i;
3500       }
3501 
3502       bool uses_xfb = stages[last_xfb_stage].nir &&
3503                       stages[last_xfb_stage].nir->xfb_info;
3504 
3505       if (!device->physical_device->use_ngg_streamout && uses_xfb) {
3506          /* GFX11+ requires NGG. */
3507          assert(device->physical_device->rad_info.gfx_level < GFX11);
3508 
3509          if (stages[MESA_SHADER_TESS_CTRL].nir)
3510            stages[MESA_SHADER_TESS_EVAL].info.is_ngg = false;
3511          else
3512            stages[MESA_SHADER_VERTEX].info.is_ngg = false;
3513       }
3514 
3515       /* Determine if the pipeline is eligible for the NGG passthrough
3516        * mode. It can't be enabled for geometry shaders, for NGG
3517        * streamout or for vertex shaders that export the primitive ID
3518        * (this is checked later because we don't have the info here.)
3519        */
3520       if (!stages[MESA_SHADER_GEOMETRY].nir && !uses_xfb) {
3521          if (stages[MESA_SHADER_TESS_CTRL].nir && stages[MESA_SHADER_TESS_EVAL].info.is_ngg) {
3522             stages[MESA_SHADER_TESS_EVAL].info.is_ngg_passthrough = true;
3523          } else if (stages[MESA_SHADER_VERTEX].nir && stages[MESA_SHADER_VERTEX].info.is_ngg) {
3524             stages[MESA_SHADER_VERTEX].info.is_ngg_passthrough = true;
3525          }
3526       }
3527    }
3528 }
3529 
3530 static void
3531 radv_fill_shader_info(struct radv_pipeline *pipeline,
3532                       struct radv_pipeline_layout *pipeline_layout,
3533                       const struct radv_pipeline_key *pipeline_key,
3534                       struct radv_pipeline_stage *stages,
3535                       gl_shader_stage last_vgt_api_stage)
3536 {
3537    struct radv_device *device = pipeline->device;
3538    unsigned active_stages = 0;
3539    unsigned filled_stages = 0;
3540 
3541    for (int i = 0; i < MESA_VULKAN_SHADER_STAGES; i++) {
3542       if (stages[i].nir)
3543          active_stages |= (1 << i);
3544    }
3545 
3546    if (stages[MESA_SHADER_TESS_CTRL].nir) {
3547       stages[MESA_SHADER_VERTEX].info.vs.as_ls = true;
3548    }
3549 
3550    if (stages[MESA_SHADER_GEOMETRY].nir) {
3551       if (stages[MESA_SHADER_TESS_CTRL].nir)
3552          stages[MESA_SHADER_TESS_EVAL].info.tes.as_es = true;
3553       else
3554          stages[MESA_SHADER_VERTEX].info.vs.as_es = true;
3555    }
3556 
3557    if (stages[MESA_SHADER_FRAGMENT].nir) {
3558       radv_nir_shader_info_init(&stages[MESA_SHADER_FRAGMENT].info);
3559       radv_nir_shader_info_pass(device, stages[MESA_SHADER_FRAGMENT].nir, pipeline_layout,
3560                                 pipeline_key, &stages[MESA_SHADER_FRAGMENT].info);
3561 
3562       assert(last_vgt_api_stage != MESA_SHADER_NONE);
3563       struct radv_shader_info *pre_ps_info = &stages[last_vgt_api_stage].info;
3564       struct radv_vs_output_info *outinfo = NULL;
3565       if (last_vgt_api_stage == MESA_SHADER_VERTEX ||
3566           last_vgt_api_stage == MESA_SHADER_GEOMETRY) {
3567          outinfo = &pre_ps_info->vs.outinfo;
3568       } else if (last_vgt_api_stage == MESA_SHADER_TESS_EVAL) {
3569          outinfo = &pre_ps_info->tes.outinfo;
3570       } else if (last_vgt_api_stage == MESA_SHADER_MESH) {
3571          outinfo = &pre_ps_info->ms.outinfo;
3572       }
3573 
3574       /* Add PS input requirements to the output of the pre-PS stage. */
3575       bool ps_prim_id_in = stages[MESA_SHADER_FRAGMENT].info.ps.prim_id_input;
3576       bool ps_clip_dists_in = !!stages[MESA_SHADER_FRAGMENT].info.ps.num_input_clips_culls;
3577 
3578       assert(outinfo);
3579       outinfo->export_clip_dists |= ps_clip_dists_in;
3580       if (last_vgt_api_stage == MESA_SHADER_VERTEX ||
3581           last_vgt_api_stage == MESA_SHADER_TESS_EVAL) {
3582          outinfo->export_prim_id |= ps_prim_id_in;
3583       }
3584 
3585       filled_stages |= (1 << MESA_SHADER_FRAGMENT);
3586    }
3587 
3588    if (device->physical_device->rad_info.gfx_level >= GFX9 &&
3589        stages[MESA_SHADER_TESS_CTRL].nir) {
3590       struct nir_shader *combined_nir[] = {stages[MESA_SHADER_VERTEX].nir, stages[MESA_SHADER_TESS_CTRL].nir};
3591 
3592       radv_nir_shader_info_init(&stages[MESA_SHADER_TESS_CTRL].info);
3593 
3594       /* Copy data to merged stage. */
3595       stages[MESA_SHADER_TESS_CTRL].info.vs.as_ls = true;
3596 
3597       for (int i = 0; i < 2; i++) {
3598          radv_nir_shader_info_pass(device, combined_nir[i], pipeline_layout, pipeline_key,
3599                                    &stages[MESA_SHADER_TESS_CTRL].info);
3600       }
3601 
3602       filled_stages |= (1 << MESA_SHADER_VERTEX);
3603       filled_stages |= (1 << MESA_SHADER_TESS_CTRL);
3604    }
3605 
3606    if (device->physical_device->rad_info.gfx_level >= GFX9 &&
3607        stages[MESA_SHADER_GEOMETRY].nir) {
3608       gl_shader_stage pre_stage =
3609          stages[MESA_SHADER_TESS_EVAL].nir ? MESA_SHADER_TESS_EVAL : MESA_SHADER_VERTEX;
3610       struct nir_shader *combined_nir[] = {stages[pre_stage].nir, stages[MESA_SHADER_GEOMETRY].nir};
3611 
3612       radv_nir_shader_info_init(&stages[MESA_SHADER_GEOMETRY].info);
3613 
3614       /* Copy data to merged stage. */
3615       if (pre_stage == MESA_SHADER_VERTEX) {
3616          stages[MESA_SHADER_GEOMETRY].info.vs.as_es = stages[MESA_SHADER_VERTEX].info.vs.as_es;
3617       } else {
3618          stages[MESA_SHADER_GEOMETRY].info.tes.as_es = stages[MESA_SHADER_TESS_EVAL].info.tes.as_es;
3619       }
3620       stages[MESA_SHADER_GEOMETRY].info.is_ngg = stages[pre_stage].info.is_ngg;
3621       stages[MESA_SHADER_GEOMETRY].info.gs.es_type = pre_stage;
3622 
3623       for (int i = 0; i < 2; i++) {
3624          radv_nir_shader_info_pass(device, combined_nir[i], pipeline_layout, pipeline_key,
3625                                    &stages[MESA_SHADER_GEOMETRY].info);
3626       }
3627 
3628       filled_stages |= (1 << pre_stage);
3629       filled_stages |= (1 << MESA_SHADER_GEOMETRY);
3630    }
3631 
3632    active_stages ^= filled_stages;
3633    while (active_stages) {
3634       int i = u_bit_scan(&active_stages);
3635       radv_nir_shader_info_init(&stages[i].info);
3636       radv_nir_shader_info_pass(device, stages[i].nir, pipeline_layout, pipeline_key,
3637                                 &stages[i].info);
3638    }
3639 
3640    if (stages[MESA_SHADER_COMPUTE].nir) {
3641       unsigned subgroup_size = pipeline_key->cs.compute_subgroup_size;
3642       unsigned req_subgroup_size = subgroup_size;
3643       bool require_full_subgroups = pipeline_key->cs.require_full_subgroups;
3644 
3645       if (!subgroup_size)
3646          subgroup_size = device->physical_device->cs_wave_size;
3647 
3648       unsigned local_size = stages[MESA_SHADER_COMPUTE].nir->info.workgroup_size[0] *
3649                             stages[MESA_SHADER_COMPUTE].nir->info.workgroup_size[1] *
3650                             stages[MESA_SHADER_COMPUTE].nir->info.workgroup_size[2];
3651 
3652       /* Games don't always request full subgroups when they should,
3653        * which can cause bugs if cswave32 is enabled.
3654        */
3655       if (device->physical_device->cs_wave_size == 32 &&
3656           stages[MESA_SHADER_COMPUTE].nir->info.cs.uses_wide_subgroup_intrinsics && !req_subgroup_size &&
3657           local_size % RADV_SUBGROUP_SIZE == 0)
3658          require_full_subgroups = true;
3659 
3660       if (require_full_subgroups && !req_subgroup_size) {
3661          /* Don't use wave32 pretending to be wave64. */
3662          subgroup_size = RADV_SUBGROUP_SIZE;
3663       }
3664 
3665       stages[MESA_SHADER_COMPUTE].info.cs.subgroup_size = subgroup_size;
3666    }
3667 
3668    for (int i = 0; i < MESA_VULKAN_SHADER_STAGES; i++) {
3669       if (stages[i].nir) {
3670          stages[i].info.wave_size = radv_get_wave_size(device, i, &stages[i].info);
3671          stages[i].info.ballot_bit_size = radv_get_ballot_bit_size(device, i, &stages[i].info);
3672       }
3673    }
3674 
3675    /* PS always operates without workgroups. */
3676    if (stages[MESA_SHADER_FRAGMENT].nir)
3677       stages[MESA_SHADER_FRAGMENT].info.workgroup_size = stages[MESA_SHADER_FRAGMENT].info.wave_size;
3678 
3679    if (stages[MESA_SHADER_COMPUTE].nir) {
3680       /* Variable workgroup size is not supported by Vulkan. */
3681       assert(!stages[MESA_SHADER_COMPUTE].nir->info.workgroup_size_variable);
3682 
3683       stages[MESA_SHADER_COMPUTE].info.workgroup_size =
3684          ac_compute_cs_workgroup_size(
3685             stages[MESA_SHADER_COMPUTE].nir->info.workgroup_size, false, UINT32_MAX);
3686    }
3687 
3688    if (stages[MESA_SHADER_TASK].nir) {
3689       /* Task/mesh I/O uses the task ring buffers. */
3690       stages[MESA_SHADER_TASK].info.cs.uses_task_rings = true;
3691       stages[MESA_SHADER_MESH].info.cs.uses_task_rings = true;
3692 
3693       stages[MESA_SHADER_TASK].info.workgroup_size =
3694          ac_compute_cs_workgroup_size(
3695             stages[MESA_SHADER_TASK].nir->info.workgroup_size, false, UINT32_MAX);
3696    }
3697 }
3698 
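/* Declare the SGPR/VGPR arguments of each active shader stage. On GFX9+ the
 * merged VS+TCS and (VS|TES)+GS stages share one argument layout, which is
 * declared on the later stage and then copied back to the earlier one.
 */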
3699 static void
3700 radv_declare_pipeline_args(struct radv_device *device, struct radv_pipeline_stage *stages,
3701                            const struct radv_pipeline_key *pipeline_key)
3702 {
3703    enum amd_gfx_level gfx_level = device->physical_device->rad_info.gfx_level;
3704    unsigned active_stages = 0;
3705 
3706    for (int i = 0; i < MESA_VULKAN_SHADER_STAGES; i++) {
3707       if (stages[i].nir)
3708          active_stages |= (1 << i);
3709    }
3710 
3711    for (int i = 0; i < MESA_VULKAN_SHADER_STAGES; ++i) {
3712       stages[i].args.is_gs_copy_shader = false;
3713       stages[i].args.explicit_scratch_args = !radv_use_llvm_for_stage(device, i);
3714       stages[i].args.remap_spi_ps_input = !radv_use_llvm_for_stage(device, i);
3715       stages[i].args.load_grid_size_from_user_sgpr = device->load_grid_size_from_user_sgpr;
3716    }
3717 
3718    if (gfx_level >= GFX9 && stages[MESA_SHADER_TESS_CTRL].nir) {
3719       radv_declare_shader_args(gfx_level, pipeline_key, &stages[MESA_SHADER_TESS_CTRL].info,
3720                                MESA_SHADER_TESS_CTRL, true, MESA_SHADER_VERTEX,
3721                                &stages[MESA_SHADER_TESS_CTRL].args);
3722       stages[MESA_SHADER_TESS_CTRL].info.user_sgprs_locs = stages[MESA_SHADER_TESS_CTRL].args.user_sgprs_locs;
3723       stages[MESA_SHADER_TESS_CTRL].info.inline_push_constant_mask =
3724          stages[MESA_SHADER_TESS_CTRL].args.ac.inline_push_const_mask;
3725 
3726       stages[MESA_SHADER_VERTEX].args = stages[MESA_SHADER_TESS_CTRL].args;
3727       active_stages &= ~(1 << MESA_SHADER_VERTEX);
3728       active_stages &= ~(1 << MESA_SHADER_TESS_CTRL);
3729    }
3730 
3731    if (gfx_level >= GFX9 && stages[MESA_SHADER_GEOMETRY].nir) {
3732       gl_shader_stage pre_stage =
3733          stages[MESA_SHADER_TESS_EVAL].nir ? MESA_SHADER_TESS_EVAL : MESA_SHADER_VERTEX;
3734       radv_declare_shader_args(gfx_level, pipeline_key, &stages[MESA_SHADER_GEOMETRY].info,
3735                                MESA_SHADER_GEOMETRY, true, pre_stage,
3736                                &stages[MESA_SHADER_GEOMETRY].args);
3737       stages[MESA_SHADER_GEOMETRY].info.user_sgprs_locs = stages[MESA_SHADER_GEOMETRY].args.user_sgprs_locs;
3738       stages[MESA_SHADER_GEOMETRY].info.inline_push_constant_mask =
3739          stages[MESA_SHADER_GEOMETRY].args.ac.inline_push_const_mask;
3740 
3741       stages[pre_stage].args = stages[MESA_SHADER_GEOMETRY].args;
3742       active_stages &= ~(1 << pre_stage);
3743       active_stages &= ~(1 << MESA_SHADER_GEOMETRY);
3744    }
3745 
3746    u_foreach_bit(i, active_stages) {
3747       radv_declare_shader_args(gfx_level, pipeline_key, &stages[i].info, i, false,
3748                                MESA_SHADER_VERTEX, &stages[i].args);
3749       stages[i].info.user_sgprs_locs = stages[i].args.user_sgprs_locs;
3750       stages[i].info.inline_push_constant_mask = stages[i].args.ac.inline_push_const_mask;
3751    }
3752 }
3753 
3754 static void
3755 merge_tess_info(struct shader_info *tes_info, struct shader_info *tcs_info)
3756 {
3757    /* The Vulkan 1.0.38 spec, section 21.1 Tessellator says:
3758     *
3759     *    "PointMode. Controls generation of points rather than triangles
3760     *     or lines. This functionality defaults to disabled, and is
3761     *     enabled if either shader stage includes the execution mode."
3762     *
3763     * and about Triangles, Quads, IsoLines, VertexOrderCw, VertexOrderCcw,
3764     * PointMode, SpacingEqual, SpacingFractionalEven, SpacingFractionalOdd,
3765     * and OutputVertices, it says:
3766     *
3767     *    "One mode must be set in at least one of the tessellation
3768     *     shader stages."
3769     *
3770     * So, the fields can be set in either the TCS or TES, but they must
3771     * agree if set in both.  Our backend looks at TES, so bitwise-or in
3772     * the values from the TCS.
3773     */
3774    assert(tcs_info->tess.tcs_vertices_out == 0 || tes_info->tess.tcs_vertices_out == 0 ||
3775           tcs_info->tess.tcs_vertices_out == tes_info->tess.tcs_vertices_out);
3776    tes_info->tess.tcs_vertices_out |= tcs_info->tess.tcs_vertices_out;
3777 
3778    assert(tcs_info->tess.spacing == TESS_SPACING_UNSPECIFIED ||
3779           tes_info->tess.spacing == TESS_SPACING_UNSPECIFIED ||
3780           tcs_info->tess.spacing == tes_info->tess.spacing);
3781    tes_info->tess.spacing |= tcs_info->tess.spacing;
3782 
3783    assert(tcs_info->tess._primitive_mode == TESS_PRIMITIVE_UNSPECIFIED ||
3784           tes_info->tess._primitive_mode == TESS_PRIMITIVE_UNSPECIFIED ||
3785           tcs_info->tess._primitive_mode == tes_info->tess._primitive_mode);
3786    tes_info->tess._primitive_mode |= tcs_info->tess._primitive_mode;
3787    tes_info->tess.ccw |= tcs_info->tess.ccw;
3788    tes_info->tess.point_mode |= tcs_info->tess.point_mode;
3789 
3790    /* Copy the merged info back to the TCS */
3791    tcs_info->tess.tcs_vertices_out = tes_info->tess.tcs_vertices_out;
3792    tcs_info->tess.spacing = tes_info->tess.spacing;
3793    tcs_info->tess._primitive_mode = tes_info->tess._primitive_mode;
3794    tcs_info->tess.ccw = tes_info->tess.ccw;
3795    tcs_info->tess.point_mode = tes_info->tess.point_mode;
3796 }
3797 
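/* Compute the tessellation parameters shared by the whole pipeline: merge the
 * TCS/TES execution modes, derive the number of patches processed per
 * workgroup and the LDS space used by the merged VS+TCS, and propagate these
 * values to the stage infos that need them.
 */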
3798 static void
3799 gather_tess_info(struct radv_device *device, struct radv_pipeline_stage *stages,
3800                  const struct radv_pipeline_key *pipeline_key)
3801 {
3802    merge_tess_info(&stages[MESA_SHADER_TESS_EVAL].nir->info,
3803                    &stages[MESA_SHADER_TESS_CTRL].nir->info);
3804 
3805    unsigned tess_in_patch_size = pipeline_key->tcs.tess_input_vertices;
3806    unsigned tess_out_patch_size = stages[MESA_SHADER_TESS_CTRL].nir->info.tess.tcs_vertices_out;
3807 
3808    /* Number of tessellation patches per workgroup processed by the current pipeline. */
3809    unsigned num_patches = get_tcs_num_patches(
3810       tess_in_patch_size, tess_out_patch_size,
3811       stages[MESA_SHADER_TESS_CTRL].info.tcs.num_linked_inputs,
3812       stages[MESA_SHADER_TESS_CTRL].info.tcs.num_linked_outputs,
3813       stages[MESA_SHADER_TESS_CTRL].info.tcs.num_linked_patch_outputs,
3814       device->physical_device->hs.tess_offchip_block_dw_size, device->physical_device->rad_info.gfx_level,
3815       device->physical_device->rad_info.family);
3816 
3817    /* LDS size used by VS+TCS for storing TCS inputs and outputs. */
3818    unsigned tcs_lds_size = calculate_tess_lds_size(
3819       device->physical_device->rad_info.gfx_level, tess_in_patch_size, tess_out_patch_size,
3820       stages[MESA_SHADER_TESS_CTRL].info.tcs.num_linked_inputs, num_patches,
3821       stages[MESA_SHADER_TESS_CTRL].info.tcs.num_linked_outputs,
3822       stages[MESA_SHADER_TESS_CTRL].info.tcs.num_linked_patch_outputs);
3823 
3824    stages[MESA_SHADER_TESS_CTRL].info.num_tess_patches = num_patches;
3825    stages[MESA_SHADER_TESS_CTRL].info.tcs.num_lds_blocks = tcs_lds_size;
3826    stages[MESA_SHADER_TESS_CTRL].info.tcs.tes_reads_tess_factors =
3827       !!(stages[MESA_SHADER_TESS_EVAL].nir->info.inputs_read &
3828          (VARYING_BIT_TESS_LEVEL_INNER | VARYING_BIT_TESS_LEVEL_OUTER));
3829    stages[MESA_SHADER_TESS_CTRL].info.tcs.tes_inputs_read = stages[MESA_SHADER_TESS_EVAL].nir->info.inputs_read;
3830    stages[MESA_SHADER_TESS_CTRL].info.tcs.tes_patch_inputs_read =
3831       stages[MESA_SHADER_TESS_EVAL].nir->info.patch_inputs_read;
3832 
3833    stages[MESA_SHADER_TESS_EVAL].info.num_tess_patches = num_patches;
3834    stages[MESA_SHADER_GEOMETRY].info.num_tess_patches = num_patches;
3835    stages[MESA_SHADER_VERTEX].info.num_tess_patches = num_patches;
3836    stages[MESA_SHADER_TESS_CTRL].info.tcs.tcs_vertices_out = tess_out_patch_size;
3837    stages[MESA_SHADER_VERTEX].info.tcs.tcs_vertices_out = tess_out_patch_size;
3838 
3839    if (!radv_use_llvm_for_stage(device, MESA_SHADER_VERTEX)) {
3840       /* When the number of TCS input and output vertices is the same (typically 3):
3841        * - There is an equal amount of LS and HS invocations
3842        * - In case of merged LSHS shaders, the LS and HS halves of the shader
3843        *   always process the exact same vertex. We can use this knowledge to optimize them.
3844        *
3845        * We don't set tcs_in_out_eq if the float controls differ because that might
3846        * involve different float modes for the same block and our optimizer
3847        * doesn't handle an instruction dominating another with a different mode.
3848        */
3849       stages[MESA_SHADER_VERTEX].info.vs.tcs_in_out_eq =
3850          device->physical_device->rad_info.gfx_level >= GFX9 &&
3851          tess_in_patch_size == tess_out_patch_size &&
3852          stages[MESA_SHADER_VERTEX].nir->info.float_controls_execution_mode ==
3853             stages[MESA_SHADER_TESS_CTRL].nir->info.float_controls_execution_mode;
3854 
3855       if (stages[MESA_SHADER_VERTEX].info.vs.tcs_in_out_eq)
3856          stages[MESA_SHADER_VERTEX].info.vs.tcs_temp_only_input_mask =
3857             stages[MESA_SHADER_TESS_CTRL].nir->info.inputs_read &
3858             stages[MESA_SHADER_VERTEX].nir->info.outputs_written &
3859             ~stages[MESA_SHADER_TESS_CTRL].nir->info.tess.tcs_cross_invocation_inputs_read &
3860             ~stages[MESA_SHADER_TESS_CTRL].nir->info.inputs_read_indirectly &
3861             ~stages[MESA_SHADER_VERTEX].nir->info.outputs_accessed_indirectly;
3862 
3863       /* Copy data to the TCS so the backend can access it when VS and TCS are merged. */
3864       stages[MESA_SHADER_TESS_CTRL].info.vs.tcs_in_out_eq = stages[MESA_SHADER_VERTEX].info.vs.tcs_in_out_eq;
3865       stages[MESA_SHADER_TESS_CTRL].info.vs.tcs_temp_only_input_mask =
3866          stages[MESA_SHADER_VERTEX].info.vs.tcs_temp_only_input_mask;
3867    }
3868 
3869    for (gl_shader_stage s = MESA_SHADER_VERTEX; s <= MESA_SHADER_TESS_CTRL; ++s)
3870       stages[s].info.workgroup_size =
3871          ac_compute_lshs_workgroup_size(device->physical_device->rad_info.gfx_level, s, num_patches,
3872                                         tess_in_patch_size, tess_out_patch_size);
3873 }
3874 
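/* Callback for nir_opt_load_store_vectorize: decides whether two adjacent
 * memory accesses may be combined, based on the combined size and the known
 * alignment. For example, two consecutive 32-bit SSBO loads with 8-byte
 * alignment may become one 64-bit load, while 3-component LDS accesses are
 * rejected unless they form a 16-byte aligned 96-bit access.
 */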
3875 static bool
3876 mem_vectorize_callback(unsigned align_mul, unsigned align_offset, unsigned bit_size,
3877                        unsigned num_components, nir_intrinsic_instr *low, nir_intrinsic_instr *high,
3878                        void *data)
3879 {
3880    if (num_components > 4)
3881       return false;
3882 
3883    /* >128 bit loads are split except with SMEM */
3884    if (bit_size * num_components > 128)
3885       return false;
3886 
3887    uint32_t align;
3888    if (align_offset)
3889       align = 1 << (ffs(align_offset) - 1);
3890    else
3891       align = align_mul;
3892 
3893    switch (low->intrinsic) {
3894    case nir_intrinsic_load_global:
3895    case nir_intrinsic_store_global:
3896    case nir_intrinsic_store_ssbo:
3897    case nir_intrinsic_load_ssbo:
3898    case nir_intrinsic_load_ubo:
3899    case nir_intrinsic_load_push_constant: {
3900       unsigned max_components;
3901       if (align % 4 == 0)
3902          max_components = NIR_MAX_VEC_COMPONENTS;
3903       else if (align % 2 == 0)
3904          max_components = 16u / bit_size;
3905       else
3906          max_components = 8u / bit_size;
3907       return (align % (bit_size / 8u)) == 0 && num_components <= max_components;
3908    }
3909    case nir_intrinsic_load_deref:
3910    case nir_intrinsic_store_deref:
3911       assert(nir_deref_mode_is(nir_src_as_deref(low->src[0]), nir_var_mem_shared));
3912       FALLTHROUGH;
3913    case nir_intrinsic_load_shared:
3914    case nir_intrinsic_store_shared:
3915       if (bit_size * num_components ==
3916           96) { /* 96 bit loads require 128 bit alignment and are split otherwise */
3917          return align % 16 == 0;
3918       } else if (bit_size == 16 && (align % 4)) {
3919          /* AMD hardware can't do 2-byte aligned f16vec2 loads, but they are useful for ALU
3920           * vectorization, because our vectorizer requires the scalar IR to already contain vectors.
3921           */
3922          return (align % 2 == 0) && num_components <= 2;
3923       } else {
3924          if (num_components == 3) {
3925             /* AMD hardware can't do 3-component loads except for 96-bit loads, handled above. */
3926             return false;
3927          }
3928          unsigned req = bit_size * num_components;
3929          if (req == 64 || req == 128) /* 64-bit and 128-bit loads can use ds_read2_b{32,64} */
3930             req /= 2u;
3931          return align % (req / 8u) == 0;
3932       }
3933    default:
3934       return false;
3935    }
3936    return false;
3937 }
3938 
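/* Callback for nir_lower_bit_size: returns the bit size a scalar 8/16-bit ALU
 * instruction should be widened to, or 0 to keep it as-is. 8-bit ALU and ops
 * without native 16-bit encodings are always widened to 32 bits; most other
 * 16-bit ops are only kept on GFX8+ when the result is divergent (presumably
 * because uniform results land on the 32-bit-only scalar unit).
 */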
3939 static unsigned
3940 lower_bit_size_callback(const nir_instr *instr, void *_)
3941 {
3942    struct radv_device *device = _;
3943    enum amd_gfx_level chip = device->physical_device->rad_info.gfx_level;
3944 
3945    if (instr->type != nir_instr_type_alu)
3946       return 0;
3947    nir_alu_instr *alu = nir_instr_as_alu(instr);
3948 
3949    /* If an instruction is not scalarized by this point,
3950     * it can be emitted as a packed instruction */
3951    if (alu->dest.dest.ssa.num_components > 1)
3952       return 0;
3953 
3954    if (alu->dest.dest.ssa.bit_size & (8 | 16)) {
3955       unsigned bit_size = alu->dest.dest.ssa.bit_size;
3956       switch (alu->op) {
3957       case nir_op_bitfield_select:
3958       case nir_op_imul_high:
3959       case nir_op_umul_high:
3960          return 32;
3961       case nir_op_iabs:
3962       case nir_op_imax:
3963       case nir_op_umax:
3964       case nir_op_imin:
3965       case nir_op_umin:
3966       case nir_op_ishr:
3967       case nir_op_ushr:
3968       case nir_op_ishl:
3969       case nir_op_isign:
3970       case nir_op_uadd_sat:
3971       case nir_op_usub_sat:
3972          return (bit_size == 8 || !(chip >= GFX8 && nir_dest_is_divergent(alu->dest.dest))) ? 32
3973                                                                                             : 0;
3974       case nir_op_iadd_sat:
3975       case nir_op_isub_sat:
3976          return bit_size == 8 || !nir_dest_is_divergent(alu->dest.dest) ? 32 : 0;
3977 
3978       default:
3979          return 0;
3980       }
3981    }
3982 
3983    if (nir_src_bit_size(alu->src[0].src) & (8 | 16)) {
3984       unsigned bit_size = nir_src_bit_size(alu->src[0].src);
3985       switch (alu->op) {
3986       case nir_op_bit_count:
3987       case nir_op_find_lsb:
3988       case nir_op_ufind_msb:
3989       case nir_op_i2b1:
3990          return 32;
3991       case nir_op_ilt:
3992       case nir_op_ige:
3993       case nir_op_ieq:
3994       case nir_op_ine:
3995       case nir_op_ult:
3996       case nir_op_uge:
3997          return (bit_size == 8 || !(chip >= GFX8 && nir_dest_is_divergent(alu->dest.dest))) ? 32
3998                                                                                             : 0;
3999       default:
4000          return 0;
4001       }
4002    }
4003 
4004    return 0;
4005 }
4006 
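/* Callback for nir_lower_alu_width: returns the maximum vector width to keep
 * for an ALU instruction. 16-bit arithmetic may stay 2-wide on GFX9+, where it
 * maps to packed 16-bit VALU instructions; everything else is scalarized.
 */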
4007 static uint8_t
4008 opt_vectorize_callback(const nir_instr *instr, const void *_)
4009 {
4010    if (instr->type != nir_instr_type_alu)
4011       return 0;
4012 
4013    const struct radv_device *device = _;
4014    enum amd_gfx_level chip = device->physical_device->rad_info.gfx_level;
4015    if (chip < GFX9)
4016       return 1;
4017 
4018    const nir_alu_instr *alu = nir_instr_as_alu(instr);
4019    const unsigned bit_size = alu->dest.dest.ssa.bit_size;
4020    if (bit_size != 16)
4021       return 1;
4022 
4023    switch (alu->op) {
4024    case nir_op_fadd:
4025    case nir_op_fsub:
4026    case nir_op_fmul:
4027    case nir_op_ffma:
4028    case nir_op_fdiv:
4029    case nir_op_flrp:
4030    case nir_op_fabs:
4031    case nir_op_fneg:
4032    case nir_op_fsat:
4033    case nir_op_fmin:
4034    case nir_op_fmax:
4035    case nir_op_iabs:
4036    case nir_op_iadd:
4037    case nir_op_iadd_sat:
4038    case nir_op_uadd_sat:
4039    case nir_op_isub:
4040    case nir_op_isub_sat:
4041    case nir_op_usub_sat:
4042    case nir_op_ineg:
4043    case nir_op_imul:
4044    case nir_op_imin:
4045    case nir_op_imax:
4046    case nir_op_umin:
4047    case nir_op_umax:
4048       return 2;
4049    case nir_op_ishl: /* TODO: in NIR, these have 32bit shift operands */
4050    case nir_op_ishr: /* while Radeon needs 16bit operands when vectorized */
4051    case nir_op_ushr:
4052    default:
4053       return 1;
4054    }
4055 }
4056 
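/* Callback for nir_lower_non_uniform_access: returns which components of a
 * potentially non-uniform resource source need to be handled. A scalar source
 * is handled whole; for vector handles, only the second component is flagged
 * when the binding can be chased, otherwise both components are.
 */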
4057 static nir_component_mask_t
4058 non_uniform_access_callback(const nir_src *src, void *_)
4059 {
4060    if (src->ssa->num_components == 1)
4061       return 0x1;
4062    return nir_chase_binding(*src).success ? 0x2 : 0x3;
4063 }
4064 
4065 
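/* Allocate one slab for all of the pipeline's shader binaries and upload each
 * binary into it, assigning the resulting GPU virtual addresses to the shaders
 * (including the GS copy shader, if any).
 */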
4066 VkResult
4067 radv_upload_shaders(struct radv_device *device, struct radv_pipeline *pipeline,
4068                     struct radv_shader_binary **binaries, struct radv_shader_binary *gs_copy_binary)
4069 {
4070    uint32_t code_size = 0;
4071 
4072    /* Compute the total code size. */
4073    for (int i = 0; i < MESA_VULKAN_SHADER_STAGES; i++) {
4074       struct radv_shader *shader = pipeline->shaders[i];
4075       if (!shader)
4076          continue;
4077 
4078       code_size += align(shader->code_size, RADV_SHADER_ALLOC_ALIGNMENT);
4079    }
4080 
4081    if (pipeline->gs_copy_shader) {
4082       code_size += align(pipeline->gs_copy_shader->code_size, RADV_SHADER_ALLOC_ALIGNMENT);
4083    }
4084 
4085    /* Allocate memory for all shader binaries. */
4086    pipeline->slab = radv_pipeline_slab_create(device, pipeline, code_size);
4087    if (!pipeline->slab)
4088       return VK_ERROR_OUT_OF_DEVICE_MEMORY;
4089 
4090    pipeline->slab_bo = pipeline->slab->alloc->arena->bo;
4091 
4092    /* Upload shader binaries. */
4093    uint64_t slab_va = radv_buffer_get_va(pipeline->slab_bo);
4094    uint32_t slab_offset = pipeline->slab->alloc->offset;
4095    char *slab_ptr = pipeline->slab->alloc->arena->ptr;
4096 
4097    for (int i = 0; i < MESA_VULKAN_SHADER_STAGES; ++i) {
4098       struct radv_shader *shader = pipeline->shaders[i];
4099       if (!shader)
4100          continue;
4101 
4102       shader->va = slab_va + slab_offset;
4103 
4104       void *dest_ptr = slab_ptr + slab_offset;
4105       if (!radv_shader_binary_upload(device, binaries[i], shader, dest_ptr))
4106          return VK_ERROR_OUT_OF_HOST_MEMORY;
4107 
4108       slab_offset += align(shader->code_size, RADV_SHADER_ALLOC_ALIGNMENT);
4109    }
4110 
4111    if (pipeline->gs_copy_shader) {
4112       pipeline->gs_copy_shader->va = slab_va + slab_offset;
4113 
4114       void *dest_ptr = slab_ptr + slab_offset;
4115       if (!radv_shader_binary_upload(device, gs_copy_binary, pipeline->gs_copy_shader, dest_ptr))
4116          return VK_ERROR_OUT_OF_HOST_MEMORY;
4117    }
4118 
4119    return VK_SUCCESS;
4120 }
4121 
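/* Decide whether per-vertex VRS should be forced for this pipeline (the
 * driver's force-VRS feature): only when the device enables it, the last
 * pre-rasterization stage is VS/TES/GS and does not already write the
 * primitive shading rate, a real fragment shader is present, and that
 * fragment shader does not read gl_FragCoord.
 */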
4122 static bool
4123 radv_consider_force_vrs(const struct radv_pipeline *pipeline, bool noop_fs,
4124                         const struct radv_pipeline_stage *stages,
4125                         gl_shader_stage last_vgt_api_stage)
4126 {
4127    struct radv_device *device = pipeline->device;
4128 
4129    if (!device->force_vrs_enabled)
4130       return false;
4131 
4132    if (last_vgt_api_stage != MESA_SHADER_VERTEX &&
4133        last_vgt_api_stage != MESA_SHADER_TESS_EVAL &&
4134        last_vgt_api_stage != MESA_SHADER_GEOMETRY)
4135       return false;
4136 
4137    nir_shader *last_vgt_shader = stages[last_vgt_api_stage].nir;
4138    if (last_vgt_shader->info.outputs_written & BITFIELD64_BIT(VARYING_SLOT_PRIMITIVE_SHADING_RATE))
4139       return false;
4140 
4141    /* VRS has no effect if there is no pixel shader. */
4142    if (noop_fs)
4143       return false;
4144 
4145    /* Do not enable if the PS uses gl_FragCoord because it breaks postprocessing in some games. */
4146    nir_shader *fs_shader = stages[MESA_SHADER_FRAGMENT].nir;
4147    if (fs_shader &&
4148        BITSET_TEST(fs_shader->info.system_values_read, SYSTEM_VALUE_FRAG_COORD)) {
4149       return false;
4150    }
4151 
4152    return true;
4153 }
4154 
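/* Fix up the alpha channel of a fetched vertex attribute when the format needs
 * an alpha adjust (formats with a 2-bit alpha component): sign-extend the two
 * relevant bits and convert the result back to the expected representation.
 */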
4155 static nir_ssa_def *
4156 radv_adjust_vertex_fetch_alpha(nir_builder *b,
4157                                enum radv_vs_input_alpha_adjust alpha_adjust,
4158                                nir_ssa_def *alpha)
4159 {
4160    if (alpha_adjust == ALPHA_ADJUST_SSCALED)
4161       alpha = nir_f2u32(b, alpha);
4162 
4163    /* For the integer-like cases, do a natural sign extension.
4164     *
4165     * For the SNORM case, the values are 0.0, 0.333, 0.666, 1.0 and happen to contain 0, 1, 2, 3 as
4166     * the two LSBs of the exponent.
4167     */
4168    unsigned offset = alpha_adjust == ALPHA_ADJUST_SNORM ? 23u : 0u;
4169 
4170    alpha = nir_ibfe_imm(b, alpha, offset, 2u);
4171 
4172    /* Convert back to the right type. */
4173    if (alpha_adjust == ALPHA_ADJUST_SNORM) {
4174       alpha = nir_i2f32(b, alpha);
4175       alpha = nir_fmax(b, alpha, nir_imm_float(b, -1.0f));
4176    } else if (alpha_adjust == ALPHA_ADJUST_SSCALED) {
4177       alpha = nir_i2f32(b, alpha);
4178    }
4179 
4180    return alpha;
4181 }
4182 
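/* NIR pass that rewrites vertex shader input loads according to the pipeline
 * key: applies the post-shuffle (BGRA) swizzle where needed, fills components
 * the vertex format does not provide with the 0/1 defaults, and applies the
 * alpha-adjust fixup. Skipped when dynamic vertex input state is used, since
 * the fixups are handled elsewhere in that case.
 */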
4183 static bool
4184 radv_lower_vs_input(nir_shader *nir, const struct radv_pipeline_key *pipeline_key)
4185 {
4186    nir_function_impl *impl = nir_shader_get_entrypoint(nir);
4187    bool progress = false;
4188 
4189    if (pipeline_key->vs.dynamic_input_state)
4190       return false;
4191 
4192    nir_builder b;
4193    nir_builder_init(&b, impl);
4194 
4195    nir_foreach_block(block, impl) {
4196       nir_foreach_instr(instr, block) {
4197          if (instr->type != nir_instr_type_intrinsic)
4198             continue;
4199 
4200          nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
4201          if (intrin->intrinsic != nir_intrinsic_load_input)
4202             continue;
4203 
4204          unsigned location = nir_intrinsic_base(intrin) - VERT_ATTRIB_GENERIC0;
4205          enum radv_vs_input_alpha_adjust alpha_adjust = pipeline_key->vs.vertex_alpha_adjust[location];
4206          bool post_shuffle = pipeline_key->vs.vertex_post_shuffle & (1 << location);
4207 
4208          unsigned component = nir_intrinsic_component(intrin);
4209          unsigned num_components = intrin->dest.ssa.num_components;
4210 
4211          unsigned attrib_format = pipeline_key->vs.vertex_attribute_formats[location];
4212          unsigned dfmt = attrib_format & 0xf;
4213          unsigned nfmt = (attrib_format >> 4) & 0x7;
4214          const struct ac_data_format_info *vtx_info = ac_get_data_format_info(dfmt);
4215          bool is_float =
4216             nfmt != V_008F0C_BUF_NUM_FORMAT_UINT && nfmt != V_008F0C_BUF_NUM_FORMAT_SINT;
4217 
4218          unsigned mask = nir_ssa_def_components_read(&intrin->dest.ssa) << component;
4219          unsigned num_channels = MIN2(util_last_bit(mask), vtx_info->num_channels);
4220 
4221          static const unsigned swizzle_normal[4] = {0, 1, 2, 3};
4222          static const unsigned swizzle_post_shuffle[4] = {2, 1, 0, 3};
4223          const unsigned *swizzle = post_shuffle ? swizzle_post_shuffle : swizzle_normal;
4224 
4225          b.cursor = nir_after_instr(instr);
4226          nir_ssa_def *channels[4];
4227 
4228          if (post_shuffle) {
4229             /* Expand to load 3 components because it's shuffled like X<->Z. */
4230             intrin->num_components = MAX2(component + num_components, 3);
4231             intrin->dest.ssa.num_components = intrin->num_components;
4232 
4233             nir_intrinsic_set_component(intrin, 0);
4234 
4235             num_channels = MAX2(num_channels, 3);
4236          }
4237 
4238          for (uint32_t i = 0; i < num_components; i++) {
4239             unsigned idx = i + (post_shuffle ? component : 0);
4240 
4241             if (swizzle[i + component] < num_channels) {
4242                channels[i] = nir_channel(&b, &intrin->dest.ssa, swizzle[idx]);
4243             } else if (i + component == 3) {
4244                channels[i] = is_float ? nir_imm_floatN_t(&b, 1.0f, intrin->dest.ssa.bit_size)
4245                                       : nir_imm_intN_t(&b, 1u, intrin->dest.ssa.bit_size);
4246             } else {
4247                channels[i] = nir_imm_zero(&b, 1, intrin->dest.ssa.bit_size);
4248             }
4249          }
4250 
4251          if (alpha_adjust != ALPHA_ADJUST_NONE && component + num_components == 4) {
4252             unsigned idx = num_components - 1;
4253             channels[idx] = radv_adjust_vertex_fetch_alpha(&b, alpha_adjust, channels[idx]);
4254          }
4255 
4256          nir_ssa_def *new_dest = nir_vec(&b, channels, num_components);
4257 
4258          nir_ssa_def_rewrite_uses_after(&intrin->dest.ssa, new_dest,
4259                                         new_dest->parent_instr);
4260 
4261          progress = true;
4262       }
4263    }
4264 
4265    if (progress)
4266       nir_metadata_preserve(impl, nir_metadata_block_index | nir_metadata_dominance);
4267    else
4268       nir_metadata_preserve(impl, nir_metadata_all);
4269 
4270    return progress;
4271 }
4272 
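/* NIR pass that converts fragment shader color output stores into the packed
 * layout expected by the selected export format (SPI_SHADER_COL_FORMAT):
 * clamps int8/int10 values, packs pairs of components for the 16-bit formats,
 * and optionally replaces NaN with zero for games that need it.
 */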
4273 static bool
4274 radv_lower_fs_output(nir_shader *nir, const struct radv_pipeline_key *pipeline_key)
4275 {
4276    nir_function_impl *impl = nir_shader_get_entrypoint(nir);
4277    bool progress = false;
4278 
4279    nir_builder b;
4280    nir_builder_init(&b, impl);
4281 
4282    nir_foreach_block(block, impl) {
4283       nir_foreach_instr(instr, block) {
4284          if (instr->type != nir_instr_type_intrinsic)
4285             continue;
4286 
4287          nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
4288          if (intrin->intrinsic != nir_intrinsic_store_output)
4289             continue;
4290 
4291          int slot = nir_intrinsic_base(intrin) - FRAG_RESULT_DATA0;
4292          if (slot < 0)
4293             continue;
4294 
4295          unsigned write_mask = nir_intrinsic_write_mask(intrin);
4296          unsigned col_format = (pipeline_key->ps.col_format >> (4 * slot)) & 0xf;
4297          bool is_int8 = (pipeline_key->ps.is_int8 >> slot) & 1;
4298          bool is_int10 = (pipeline_key->ps.is_int10 >> slot) & 1;
4299          bool enable_mrt_output_nan_fixup = (pipeline_key->ps.enable_mrt_output_nan_fixup >> slot) & 1;
4300          bool is_16bit = intrin->src[0].ssa->bit_size == 16;
4301 
4302          if (col_format == V_028714_SPI_SHADER_ZERO)
4303             continue;
4304 
4305          b.cursor = nir_before_instr(instr);
4306          nir_ssa_def *values[4];
4307 
4308          /* Extract the export values. */
4309          for (unsigned i = 0; i < 4; i++) {
4310             if (write_mask & (1 << i)) {
4311                values[i] = nir_channel(&b, intrin->src[0].ssa, i);
4312             } else {
4313                values[i] = nir_ssa_undef(&b, 1, 32);
4314             }
4315          }
4316 
4317          /* Replace NaN by zero (for 32-bit float formats) to fix game bugs if requested. */
4318          if (enable_mrt_output_nan_fixup && !nir->info.internal && !is_16bit) {
4319             u_foreach_bit(i, write_mask) {
4320                const bool save_exact = b.exact;
4321 
4322                b.exact = true;
4323                nir_ssa_def *isnan = nir_fneu(&b, values[i], values[i]);
4324                b.exact = save_exact;
4325 
4326                values[i] = nir_bcsel(&b, isnan, nir_imm_zero(&b, 1, 32), values[i]);
4327             }
4328          }
4329 
4330          if (col_format == V_028714_SPI_SHADER_FP16_ABGR ||
4331              col_format == V_028714_SPI_SHADER_UNORM16_ABGR ||
4332              col_format == V_028714_SPI_SHADER_SNORM16_ABGR ||
4333              col_format == V_028714_SPI_SHADER_UINT16_ABGR ||
4334              col_format == V_028714_SPI_SHADER_SINT16_ABGR) {
4335             /* Convert and/or clamp the export values. */
4336             switch (col_format) {
4337             case V_028714_SPI_SHADER_UINT16_ABGR: {
4338                unsigned max_rgb = is_int8 ? 255 : is_int10 ? 1023 : 0;
4339                u_foreach_bit(i, write_mask) {
4340                   if (is_int8 || is_int10) {
4341                      values[i] = nir_umin(&b, values[i], i == 3 && is_int10 ? nir_imm_int(&b, 3u)
4342                                                                             : nir_imm_int(&b, max_rgb));
4343                   } else if (is_16bit) {
4344                      values[i] = nir_u2u32(&b, values[i]);
4345                   }
4346                }
4347                break;
4348             }
4349             case V_028714_SPI_SHADER_SINT16_ABGR: {
4350                unsigned max_rgb = is_int8 ? 127 : is_int10 ? 511 : 0;
4351                unsigned min_rgb = is_int8 ? -128 : is_int10 ? -512 : 0;
4352                u_foreach_bit(i, write_mask) {
4353                   if (is_int8 || is_int10) {
4354                      values[i] = nir_imin(&b, values[i], i == 3 && is_int10 ? nir_imm_int(&b, 1u)
4355                                                                             : nir_imm_int(&b, max_rgb));
4356                      values[i] = nir_imax(&b, values[i], i == 3 && is_int10 ? nir_imm_int(&b, -2u)
4357                                                                             : nir_imm_int(&b, min_rgb));
4358                   } else if (is_16bit) {
4359                      values[i] = nir_i2i32(&b, values[i]);
4360                   }
4361                }
4362                break;
4363             }
4364             case V_028714_SPI_SHADER_UNORM16_ABGR:
4365             case V_028714_SPI_SHADER_SNORM16_ABGR:
4366                u_foreach_bit(i, write_mask) {
4367                   if (is_16bit) {
4368                      values[i] = nir_f2f32(&b, values[i]);
4369                   }
4370                }
4371                break;
4372             default:
4373                break;
4374             }
4375 
4376             /* Only nir_pack_32_2x16_split needs 16-bit inputs. */
4377             bool input_16_bit = col_format == V_028714_SPI_SHADER_FP16_ABGR && is_16bit;
4378             unsigned new_write_mask = 0;
4379 
4380             /* Pack the export values. */
4381             for (unsigned i = 0; i < 2; i++) {
4382                bool enabled = (write_mask >> (i * 2)) & 0x3;
4383 
4384                if (!enabled) {
4385                   values[i] = nir_ssa_undef(&b, 1, 32);
4386                   continue;
4387                }
4388 
4389                nir_ssa_def *src0 = values[i * 2];
4390                nir_ssa_def *src1 = values[i * 2 + 1];
4391 
4392                if (!(write_mask & (1 << (i * 2))))
4393                   src0 = nir_imm_zero(&b, 1, input_16_bit ? 16 : 32);
4394                if (!(write_mask & (1 << (i * 2 + 1))))
4395                   src1 = nir_imm_zero(&b, 1, input_16_bit ? 16 : 32);
4396 
4397                if (col_format == V_028714_SPI_SHADER_FP16_ABGR) {
4398                   if (is_16bit) {
4399                      values[i] = nir_pack_32_2x16_split(&b, src0, src1);
4400                   } else {
4401                      values[i] = nir_pack_half_2x16_split(&b, src0, src1);
4402                   }
4403                } else if (col_format == V_028714_SPI_SHADER_UNORM16_ABGR) {
4404                   values[i] = nir_pack_unorm_2x16(&b, nir_vec2(&b, src0, src1));
4405                } else if (col_format == V_028714_SPI_SHADER_SNORM16_ABGR) {
4406                   values[i] = nir_pack_snorm_2x16(&b, nir_vec2(&b, src0, src1));
4407                } else if (col_format == V_028714_SPI_SHADER_UINT16_ABGR) {
4408                   values[i] = nir_pack_uint_2x16(&b, nir_vec2(&b, src0, src1));
4409                } else if (col_format == V_028714_SPI_SHADER_SINT16_ABGR) {
4410                   values[i] = nir_pack_sint_2x16(&b, nir_vec2(&b, src0, src1));
4411                }
4412 
4413                new_write_mask |= 1 << i;
4414             }
4415 
4416             /* Update the write mask for compressed outputs. */
4417             nir_intrinsic_set_write_mask(intrin, new_write_mask);
4418             intrin->num_components = util_last_bit(new_write_mask);
4419          }
4420 
4421          nir_ssa_def *new_src = nir_vec(&b, values, intrin->num_components);
4422 
4423          nir_instr_rewrite_src(&intrin->instr, &intrin->src[0], nir_src_for_ssa(new_src));
4424 
4425          progress = true;
4426       }
4427    }
4428 
4429    if (progress)
4430       nir_metadata_preserve(impl, nir_metadata_block_index | nir_metadata_dominance);
4431    else
4432       nir_metadata_preserve(impl, nir_metadata_all);
4433 
4434    return progress;
4435 }
4436 
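/* Initialize a radv_pipeline_stage from the API create info: record the
 * entrypoint and specialization info, take the SPIR-V either from the shader
 * module or from an inlined VkShaderModuleCreateInfo, and hash the stage for
 * caching.
 */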
4437 void
4438 radv_pipeline_stage_init(const VkPipelineShaderStageCreateInfo *sinfo,
4439                          struct radv_pipeline_stage *out_stage, gl_shader_stage stage)
4440 {
4441    const VkShaderModuleCreateInfo *minfo =
4442       vk_find_struct_const(sinfo->pNext, SHADER_MODULE_CREATE_INFO);
4443    const VkPipelineShaderStageModuleIdentifierCreateInfoEXT *iinfo =
4444       vk_find_struct_const(sinfo->pNext, PIPELINE_SHADER_STAGE_MODULE_IDENTIFIER_CREATE_INFO_EXT);
4445 
4446    if (sinfo->module == VK_NULL_HANDLE && !minfo && !iinfo)
4447       return;
4448 
4449    memset(out_stage, 0, sizeof(*out_stage));
4450 
4451    out_stage->stage = stage;
4452    out_stage->entrypoint = sinfo->pName;
4453    out_stage->spec_info = sinfo->pSpecializationInfo;
4454    out_stage->feedback.flags = VK_PIPELINE_CREATION_FEEDBACK_VALID_BIT;
4455 
4456    if (sinfo->module != VK_NULL_HANDLE) {
4457       struct vk_shader_module *module = vk_shader_module_from_handle(sinfo->module);
4458       STATIC_ASSERT(sizeof(out_stage->spirv.sha1) == sizeof(module->sha1));
4459 
4460       out_stage->spirv.data = module->data;
4461       out_stage->spirv.size = module->size;
4462       out_stage->spirv.object = &module->base;
4463 
4464       if (module->nir)
4465          out_stage->internal_nir = module->nir;
4466    } else if (minfo) {
4467       out_stage->spirv.data = (const char *) minfo->pCode;
4468       out_stage->spirv.size = minfo->codeSize;
4469    }
4470 
4471    vk_pipeline_hash_shader_stage(sinfo, out_stage->shader_sha1);
4472 }
4473 
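/* Create the GS copy shader used by the legacy (non-NGG) geometry shader path:
 * a small hardware VS that reads the GS output ring and performs the final
 * vertex exports on behalf of the geometry shader.
 */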
4474 static struct radv_shader *
4475 radv_pipeline_create_gs_copy_shader(struct radv_pipeline *pipeline,
4476                                     struct radv_pipeline_stage *stages,
4477                                     const struct radv_pipeline_key *pipeline_key,
4478                                     const struct radv_pipeline_layout *pipeline_layout,
4479                                     bool keep_executable_info, bool keep_statistic_info,
4480                                     struct radv_shader_binary **gs_copy_binary)
4481 {
4482    struct radv_device *device = pipeline->device;
4483    struct radv_shader_info info = {0};
4484 
4485    if (stages[MESA_SHADER_GEOMETRY].info.vs.outinfo.export_clip_dists)
4486       info.vs.outinfo.export_clip_dists = true;
4487 
4488    radv_nir_shader_info_pass(device, stages[MESA_SHADER_GEOMETRY].nir, pipeline_layout, pipeline_key,
4489                              &info);
4490    info.wave_size = 64; /* Wave32 not supported. */
4491    info.workgroup_size = 64; /* HW VS: separate waves, no workgroups */
4492    info.ballot_bit_size = 64;
4493 
4494    struct radv_shader_args gs_copy_args = {0};
4495    gs_copy_args.is_gs_copy_shader = true;
4496    gs_copy_args.explicit_scratch_args = !radv_use_llvm_for_stage(device, MESA_SHADER_VERTEX);
4497    radv_declare_shader_args(device->physical_device->rad_info.gfx_level, pipeline_key, &info,
4498                             MESA_SHADER_VERTEX, false, MESA_SHADER_VERTEX, &gs_copy_args);
4499    info.user_sgprs_locs = gs_copy_args.user_sgprs_locs;
4500    info.inline_push_constant_mask = gs_copy_args.ac.inline_push_const_mask;
4501 
4502    return radv_create_gs_copy_shader(device, stages[MESA_SHADER_GEOMETRY].nir, &info, &gs_copy_args,
4503                                      gs_copy_binary, keep_executable_info, keep_statistic_info,
4504                                      pipeline_key->optimisations_disabled);
4505 }
4506 
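/* Compile every remaining NIR stage into a shader binary. On GFX9+ the VS+TCS
 * and (VS|TES)+GS pairs are compiled together as one merged shader, and the
 * legacy GS path additionally gets its GS copy shader.
 */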
4507 static void
4508 radv_pipeline_nir_to_asm(struct radv_pipeline *pipeline, struct radv_pipeline_stage *stages,
4509                          const struct radv_pipeline_key *pipeline_key,
4510                          const struct radv_pipeline_layout *pipeline_layout,
4511                          bool keep_executable_info, bool keep_statistic_info,
4512                          gl_shader_stage last_vgt_api_stage,
4513                          struct radv_shader_binary **binaries,
4514                          struct radv_shader_binary **gs_copy_binary)
4515 {
4516    struct radv_device *device = pipeline->device;
4517    unsigned active_stages = 0;
4518 
4519    for (int i = 0; i < MESA_VULKAN_SHADER_STAGES; i++) {
4520       if (stages[i].nir)
4521          active_stages |= (1 << i);
4522    }
4523 
4524    bool pipeline_has_ngg = last_vgt_api_stage != MESA_SHADER_NONE &&
4525                            stages[last_vgt_api_stage].info.is_ngg;
4526 
4527    if (stages[MESA_SHADER_GEOMETRY].nir && !pipeline_has_ngg) {
4528       pipeline->gs_copy_shader =
4529          radv_pipeline_create_gs_copy_shader(pipeline, stages, pipeline_key, pipeline_layout,
4530                                              keep_executable_info, keep_statistic_info,
4531                                              gs_copy_binary);
4532    }
4533 
4534    for (int s = MESA_VULKAN_SHADER_STAGES - 1; s >= 0; s--) {
4535       if (!(active_stages & (1 << s)) || pipeline->shaders[s])
4536          continue;
4537 
4538       nir_shader *shaders[2] = { stages[s].nir, NULL };
4539       unsigned shader_count = 1;
4540 
4541       /* On GFX9+, TES is merged with GS and VS is merged with TCS or GS. */
4542       if (device->physical_device->rad_info.gfx_level >= GFX9 &&
4543           (s == MESA_SHADER_TESS_CTRL || s == MESA_SHADER_GEOMETRY)) {
4544          gl_shader_stage pre_stage;
4545 
4546          if (s == MESA_SHADER_GEOMETRY && stages[MESA_SHADER_TESS_EVAL].nir) {
4547             pre_stage = MESA_SHADER_TESS_EVAL;
4548          } else {
4549             pre_stage = MESA_SHADER_VERTEX;
4550          }
4551 
4552          shaders[0] = stages[pre_stage].nir;
4553          shaders[1] = stages[s].nir;
4554          shader_count = 2;
4555       }
4556 
4557       int64_t stage_start = os_time_get_nano();
4558 
4559       pipeline->shaders[s] = radv_shader_nir_to_asm(device, &stages[s], shaders, shader_count,
4560                                                     pipeline_key, keep_executable_info,
4561                                                     keep_statistic_info, &binaries[s]);
4562 
4563       stages[s].feedback.duration += os_time_get_nano() - stage_start;
4564 
4565       active_stages &= ~(1 << shaders[0]->info.stage);
4566       if (shaders[1])
4567          active_stages &= ~(1 << shaders[1]->info.stage);
4568    }
4569 }
4570 
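/* Main shader compilation entry point for a pipeline: hash the stages and
 * check the pipeline cache, add a no-op fragment shader to graphics pipelines
 * that lack one, translate SPIR-V to NIR, then link, gather shader info and
 * lower the NIR before the stages are compiled to machine code and cached.
 */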
4571 VkResult
4572 radv_create_shaders(struct radv_pipeline *pipeline, struct radv_pipeline_layout *pipeline_layout,
4573                     struct radv_device *device, struct radv_pipeline_cache *cache,
4574                     const struct radv_pipeline_key *pipeline_key,
4575                     const VkPipelineShaderStageCreateInfo *pStages,
4576                     uint32_t stageCount,
4577                     const VkPipelineCreateFlags flags, const uint8_t *custom_hash,
4578                     const VkPipelineCreationFeedbackCreateInfo *creation_feedback,
4579                     struct radv_pipeline_shader_stack_size **stack_sizes,
4580                     uint32_t *num_stack_sizes,
4581                     gl_shader_stage *last_vgt_api_stage)
4582 {
4583    const char *noop_fs_entrypoint = "noop_fs";
4584    struct radv_shader_binary *binaries[MESA_VULKAN_SHADER_STAGES] = {NULL};
4585    struct radv_shader_binary *gs_copy_binary = NULL;
4586    unsigned char hash[20];
4587    bool keep_executable_info =
4588       (flags & VK_PIPELINE_CREATE_CAPTURE_INTERNAL_REPRESENTATIONS_BIT_KHR) ||
4589       device->keep_shader_info;
4590    bool keep_statistic_info = (flags & VK_PIPELINE_CREATE_CAPTURE_STATISTICS_BIT_KHR) ||
4591                               (device->instance->debug_flags & RADV_DEBUG_DUMP_SHADER_STATS) ||
4592                               device->keep_shader_info;
4593    struct radv_pipeline_stage stages[MESA_VULKAN_SHADER_STAGES] = {0};
4594    VkPipelineCreationFeedback pipeline_feedback = {
4595       .flags = VK_PIPELINE_CREATION_FEEDBACK_VALID_BIT,
4596    };
4597    bool noop_fs = false;
4598    VkResult result = VK_SUCCESS;
4599 
4600    int64_t pipeline_start = os_time_get_nano();
4601 
4602    for (uint32_t i = 0; i < stageCount; i++) {
4603       const VkPipelineShaderStageCreateInfo *sinfo = &pStages[i];
4604       gl_shader_stage stage = vk_to_mesa_shader_stage(sinfo->stage);
4605 
4606       radv_pipeline_stage_init(sinfo, &stages[stage], stage);
4607    }
4608 
4609    for (unsigned s = 0; s < MESA_VULKAN_SHADER_STAGES; s++) {
4610       if (!stages[s].entrypoint)
4611          continue;
4612 
4613       if (stages[s].stage < MESA_SHADER_FRAGMENT || stages[s].stage == MESA_SHADER_MESH)
4614          *last_vgt_api_stage = stages[s].stage;
4615    }
4616 
4617    ASSERTED bool primitive_shading =
4618       stages[MESA_SHADER_VERTEX].entrypoint || stages[MESA_SHADER_TESS_CTRL].entrypoint ||
4619       stages[MESA_SHADER_TESS_EVAL].entrypoint || stages[MESA_SHADER_GEOMETRY].entrypoint;
4620    ASSERTED bool mesh_shading =
4621       stages[MESA_SHADER_MESH].entrypoint;
4622 
4623    /* Primitive and mesh shading must not be mixed in the same pipeline. */
4624    assert(!primitive_shading || !mesh_shading);
4625    /* Mesh shaders are mandatory in mesh shading pipelines. */
4626    assert(mesh_shading == !!stages[MESA_SHADER_MESH].entrypoint);
4627    /* Mesh shaders always need NGG. */
4628    assert(!mesh_shading || pipeline_key->use_ngg);
4629 
4630    if (custom_hash)
4631       memcpy(hash, custom_hash, 20);
4632    else {
4633       radv_hash_shaders(hash, stages, pipeline_layout, pipeline_key,
4634                         radv_get_hash_flags(device, keep_statistic_info));
4635    }
4636 
4637    pipeline->pipeline_hash = *(uint64_t *)hash;
4638 
4639    bool found_in_application_cache = true;
4640    if (!keep_executable_info &&
4641        radv_create_shaders_from_pipeline_cache(device, cache, hash, pipeline,
4642                                                stack_sizes, num_stack_sizes,
4643                                                &found_in_application_cache)) {
4644       if (found_in_application_cache)
4645          pipeline_feedback.flags |= VK_PIPELINE_CREATION_FEEDBACK_APPLICATION_PIPELINE_CACHE_HIT_BIT;
4646       result = VK_SUCCESS;
4647       goto done;
4648    }
4649 
4650    if (flags & VK_PIPELINE_CREATE_FAIL_ON_PIPELINE_COMPILE_REQUIRED_BIT) {
4651       if (found_in_application_cache)
4652          pipeline_feedback.flags |= VK_PIPELINE_CREATION_FEEDBACK_APPLICATION_PIPELINE_CACHE_HIT_BIT;
4653       result = VK_PIPELINE_COMPILE_REQUIRED;
4654       goto done;
4655    }
4656 
4657    if (pipeline->type == RADV_PIPELINE_GRAPHICS && !stages[MESA_SHADER_FRAGMENT].entrypoint) {
4658       nir_builder fs_b = radv_meta_init_shader(device, MESA_SHADER_FRAGMENT, "noop_fs");
4659 
4660       stages[MESA_SHADER_FRAGMENT] = (struct radv_pipeline_stage) {
4661          .stage = MESA_SHADER_FRAGMENT,
4662          .internal_nir = fs_b.shader,
4663          .entrypoint = noop_fs_entrypoint,
4664          .feedback = {
4665             .flags = VK_PIPELINE_CREATION_FEEDBACK_VALID_BIT,
4666          },
4667       };
4668 
4669       noop_fs = true;
4670    }
4671 
4672    for (unsigned s = 0; s < MESA_VULKAN_SHADER_STAGES; s++) {
4673       if (!stages[s].entrypoint)
4674          continue;
4675 
4676       int64_t stage_start = os_time_get_nano();
4677 
4678       stages[s].nir = radv_shader_spirv_to_nir(device, &stages[s], pipeline_key);
4679 
4680       stages[s].feedback.duration += os_time_get_nano() - stage_start;
4681    }
4682 
4683    /* Force per-vertex VRS. */
4684    if (radv_consider_force_vrs(pipeline, noop_fs, stages, *last_vgt_api_stage)) {
4685       assert(*last_vgt_api_stage == MESA_SHADER_VERTEX ||
4686              *last_vgt_api_stage == MESA_SHADER_TESS_EVAL ||
4687              *last_vgt_api_stage == MESA_SHADER_GEOMETRY);
4688       nir_shader *last_vgt_shader = stages[*last_vgt_api_stage].nir;
4689       NIR_PASS(_, last_vgt_shader, radv_force_primitive_shading_rate, device);
4690    }
4691 
4692    bool optimize_conservatively = pipeline_key->optimisations_disabled;
4693 
4694    /* Determine whether the shaders use NGG before linking, because some NIR passes need to know. */
4695    radv_fill_shader_info_ngg(pipeline, pipeline_key, stages);
4696 
4697    bool pipeline_has_ngg = (stages[MESA_SHADER_VERTEX].nir && stages[MESA_SHADER_VERTEX].info.is_ngg) ||
4698                            (stages[MESA_SHADER_TESS_EVAL].nir && stages[MESA_SHADER_TESS_EVAL].info.is_ngg) ||
4699                            (stages[MESA_SHADER_MESH].nir && stages[MESA_SHADER_MESH].info.is_ngg);
4700 
4701    if (stages[MESA_SHADER_GEOMETRY].nir) {
4702       unsigned nir_gs_flags = nir_lower_gs_intrinsics_per_stream;
4703 
4704       if (pipeline_has_ngg) {
4705          nir_gs_flags |= nir_lower_gs_intrinsics_count_primitives |
4706                          nir_lower_gs_intrinsics_count_vertices_per_primitive |
4707                          nir_lower_gs_intrinsics_overwrite_incomplete;
4708       }
4709 
4710       NIR_PASS(_, stages[MESA_SHADER_GEOMETRY].nir, nir_lower_gs_intrinsics, nir_gs_flags);
4711    }
4712 
4713    radv_link_shaders(pipeline, pipeline_key, stages, optimize_conservatively, *last_vgt_api_stage);
4714    radv_set_driver_locations(pipeline, stages, *last_vgt_api_stage);
4715 
4716    for (int i = 0; i < MESA_VULKAN_SHADER_STAGES; ++i) {
4717       if (stages[i].nir) {
4718          int64_t stage_start = os_time_get_nano();
4719 
4720          radv_optimize_nir(stages[i].nir, optimize_conservatively, false);
4721 
4722          /* Gather info again; information such as outputs_read can be out of date. */
4723          nir_shader_gather_info(stages[i].nir, nir_shader_get_entrypoint(stages[i].nir));
4724          radv_lower_io(device, stages[i].nir, stages[MESA_SHADER_MESH].nir);
4725 
4726          stages[i].feedback.duration += os_time_get_nano() - stage_start;
4727       }
4728    }
4729 
4730    if (stages[MESA_SHADER_TESS_CTRL].nir) {
4731       nir_lower_patch_vertices(stages[MESA_SHADER_TESS_EVAL].nir,
4732                                stages[MESA_SHADER_TESS_CTRL].nir->info.tess.tcs_vertices_out, NULL);
4733       gather_tess_info(device, stages, pipeline_key);
4734    }
4735 
4736    if (stages[MESA_SHADER_VERTEX].nir) {
4737       NIR_PASS(_, stages[MESA_SHADER_VERTEX].nir, radv_lower_vs_input, pipeline_key);
4738    }
4739 
4740    if (stages[MESA_SHADER_FRAGMENT].nir && !radv_use_llvm_for_stage(device, MESA_SHADER_FRAGMENT)) {
4741       /* TODO: Convert the LLVM backend. */
4742       NIR_PASS(_, stages[MESA_SHADER_FRAGMENT].nir, radv_lower_fs_output, pipeline_key);
4743    }
4744 
4745    radv_fill_shader_info(pipeline, pipeline_layout, pipeline_key, stages, *last_vgt_api_stage);
4746 
4747    if (pipeline_has_ngg) {
4748       struct gfx10_ngg_info *ngg_info;
4749 
4750       if (stages[MESA_SHADER_GEOMETRY].nir)
4751          ngg_info = &stages[MESA_SHADER_GEOMETRY].info.ngg_info;
4752       else if (stages[MESA_SHADER_TESS_CTRL].nir)
4753          ngg_info = &stages[MESA_SHADER_TESS_EVAL].info.ngg_info;
4754       else if (stages[MESA_SHADER_VERTEX].nir)
4755          ngg_info = &stages[MESA_SHADER_VERTEX].info.ngg_info;
4756       else if (stages[MESA_SHADER_MESH].nir)
4757          ngg_info = &stages[MESA_SHADER_MESH].info.ngg_info;
4758       else
4759          unreachable("Missing NGG shader stage.");
4760 
4761       if (*last_vgt_api_stage == MESA_SHADER_MESH)
4762          gfx10_get_ngg_ms_info(&stages[MESA_SHADER_MESH], ngg_info);
4763       else
4764          gfx10_get_ngg_info(pipeline_key, pipeline, stages, ngg_info);
4765    } else if (stages[MESA_SHADER_GEOMETRY].nir) {
4766       struct gfx9_gs_info *gs_info = &stages[MESA_SHADER_GEOMETRY].info.gs_ring_info;
4767 
4768       gfx9_get_gs_info(pipeline_key, pipeline, stages, gs_info);
4769    } else {
4770       gl_shader_stage hw_vs_api_stage =
4771          stages[MESA_SHADER_TESS_EVAL].nir ? MESA_SHADER_TESS_EVAL : MESA_SHADER_VERTEX;
4772       stages[hw_vs_api_stage].info.workgroup_size = stages[hw_vs_api_stage].info.wave_size;
4773    }
4774 
4775    radv_determine_ngg_settings(pipeline, pipeline_key, stages, *last_vgt_api_stage);
4776 
4777    radv_declare_pipeline_args(device, stages, pipeline_key);
4778 
4779    if (stages[MESA_SHADER_FRAGMENT].nir) {
4780       NIR_PASS(_, stages[MESA_SHADER_FRAGMENT].nir, radv_lower_fs_intrinsics,
4781                &stages[MESA_SHADER_FRAGMENT], pipeline_key);
4782    }
4783 
4784    for (int i = 0; i < MESA_VULKAN_SHADER_STAGES; ++i) {
4785       if (stages[i].nir) {
4786          int64_t stage_start = os_time_get_nano();
4787 
4788          /* Wave and workgroup size should already be filled. */
4789          assert(stages[i].info.wave_size && stages[i].info.workgroup_size);
4790 
4791          if (!radv_use_llvm_for_stage(device, i)) {
4792             nir_lower_non_uniform_access_options options = {
4793                .types = nir_lower_non_uniform_ubo_access | nir_lower_non_uniform_ssbo_access |
4794                         nir_lower_non_uniform_texture_access | nir_lower_non_uniform_image_access,
4795                .callback = &non_uniform_access_callback,
4796                .callback_data = NULL,
4797             };
4798             NIR_PASS(_, stages[i].nir, nir_lower_non_uniform_access, &options);
4799          }
4800          NIR_PASS(_, stages[i].nir, nir_lower_memory_model);
4801 
4802          nir_load_store_vectorize_options vectorize_opts = {
4803             .modes = nir_var_mem_ssbo | nir_var_mem_ubo | nir_var_mem_push_const |
4804                      nir_var_mem_shared | nir_var_mem_global,
4805             .callback = mem_vectorize_callback,
4806             .robust_modes = 0,
4807             /* On GFX6, read2/write2 is out-of-bounds if the offset register is negative, even if
4808              * the final offset is not.
4809              */
4810             .has_shared2_amd = device->physical_device->rad_info.gfx_level >= GFX7,
4811          };
4812 
4813          if (device->robust_buffer_access2) {
4814             vectorize_opts.robust_modes =
4815                nir_var_mem_ubo | nir_var_mem_ssbo | nir_var_mem_push_const;
4816          }
4817 
4818          bool progress = false;
4819          NIR_PASS(progress, stages[i].nir, nir_opt_load_store_vectorize, &vectorize_opts);
4820          if (progress) {
4821             NIR_PASS(_, stages[i].nir, nir_copy_prop);
4822             NIR_PASS(_, stages[i].nir, nir_opt_shrink_stores,
4823                      !device->instance->disable_shrink_image_store);
4824 
4825             /* Gather info again, to update whether 8/16-bit are used. */
4826             nir_shader_gather_info(stages[i].nir, nir_shader_get_entrypoint(stages[i].nir));
4827          }
4828 
4829          struct radv_shader_info *info = &stages[i].info;
4830          if (pipeline->device->physical_device->rad_info.gfx_level >= GFX9) {
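            /* On GFX9+, VS is merged into TCS (LS+HS) or GS (ES+GS) and TES is merged
             * into GS, so apply the pipeline layout with the merged stage's info. */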
4831             if (i == MESA_SHADER_VERTEX && stages[MESA_SHADER_TESS_CTRL].nir)
4832                info = &stages[MESA_SHADER_TESS_CTRL].info;
4833             else if (i == MESA_SHADER_VERTEX && stages[MESA_SHADER_GEOMETRY].nir)
4834                info = &stages[MESA_SHADER_GEOMETRY].info;
4835             else if (i == MESA_SHADER_TESS_EVAL && stages[MESA_SHADER_GEOMETRY].nir)
4836                info = &stages[MESA_SHADER_GEOMETRY].info;
4837          }
4838          NIR_PASS(_, stages[i].nir, radv_nir_lower_ycbcr_textures, pipeline_layout);
4839          NIR_PASS_V(stages[i].nir, radv_nir_apply_pipeline_layout, device, pipeline_layout, info,
4840                     &stages[i].args);
4841 
4842          NIR_PASS(_, stages[i].nir, nir_opt_shrink_vectors);
4843 
4844          NIR_PASS(_, stages[i].nir, nir_lower_alu_width, opt_vectorize_callback, device);
4845 
4846          /* lower ALU operations */
4847          NIR_PASS(_, stages[i].nir, nir_lower_int64);
4848 
4849          NIR_PASS(_, stages[i].nir, nir_opt_idiv_const, 8);
4850 
4851          NIR_PASS(_, stages[i].nir, nir_lower_idiv,
4852                   &(nir_lower_idiv_options){
4853                      .imprecise_32bit_lowering = false,
4854                      .allow_fp16 = device->physical_device->rad_info.gfx_level >= GFX9,
4855                   });
4856 
4857          nir_move_options sink_opts = nir_move_const_undef | nir_move_copies;
4858          if (i != MESA_SHADER_FRAGMENT || !pipeline_key->disable_sinking_load_input_fs)
4859             sink_opts |= nir_move_load_input;
4860 
4861          NIR_PASS(_, stages[i].nir, nir_opt_sink, sink_opts);
4862          NIR_PASS(_, stages[i].nir, nir_opt_move,
4863                   nir_move_load_input | nir_move_const_undef | nir_move_copies);
4864 
4865          /* Lower I/O intrinsics to memory instructions. */
4866          bool io_to_mem = radv_lower_io_to_mem(device, &stages[i], pipeline_key);
4867          bool lowered_ngg = pipeline_has_ngg && i == *last_vgt_api_stage;
4868          if (lowered_ngg)
4869             radv_lower_ngg(device, &stages[i], pipeline_key);
4870 
4871          NIR_PASS(_, stages[i].nir, ac_nir_lower_global_access);
4872          NIR_PASS_V(stages[i].nir, radv_nir_lower_abi, device->physical_device->rad_info.gfx_level,
4873                     &stages[i].info, &stages[i].args, pipeline_key,
4874                     radv_use_llvm_for_stage(device, i));
4875          radv_optimize_nir_algebraic(
4876             stages[i].nir, io_to_mem || lowered_ngg || i == MESA_SHADER_COMPUTE || i == MESA_SHADER_TASK);
4877 
4878          if (stages[i].nir->info.bit_sizes_int & (8 | 16)) {
4879             if (device->physical_device->rad_info.gfx_level >= GFX8) {
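               /* The bit-size lowering callback consults divergence information, which
                * needs LCSSA form to stay valid. */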
4880                NIR_PASS(_, stages[i].nir, nir_convert_to_lcssa, true, true);
4881                nir_divergence_analysis(stages[i].nir);
4882             }
4883 
4884             if (nir_lower_bit_size(stages[i].nir, lower_bit_size_callback, device)) {
4885                NIR_PASS(_, stages[i].nir, nir_opt_constant_folding);
4886             }
4887 
4888             if (device->physical_device->rad_info.gfx_level >= GFX8)
4889                NIR_PASS(_, stages[i].nir, nir_opt_remove_phis); /* cleanup LCSSA phis */
4890          }
4891          if (((stages[i].nir->info.bit_sizes_int | stages[i].nir->info.bit_sizes_float) & 16) &&
4892              device->physical_device->rad_info.gfx_level >= GFX9) {
4893             bool separate_g16 = device->physical_device->rad_info.gfx_level >= GFX10;
4894             struct nir_fold_tex_srcs_options fold_srcs_options[] = {
4895                {
4896                   .sampler_dims =
4897                      ~(BITFIELD_BIT(GLSL_SAMPLER_DIM_CUBE) | BITFIELD_BIT(GLSL_SAMPLER_DIM_BUF)),
4898                   .src_types = (1 << nir_tex_src_coord) | (1 << nir_tex_src_lod) |
4899                                (1 << nir_tex_src_bias) | (1 << nir_tex_src_min_lod) |
4900                                (1 << nir_tex_src_ms_index) |
4901                                (separate_g16 ? 0 : (1 << nir_tex_src_ddx) | (1 << nir_tex_src_ddy)),
4902                },
4903                {
4904                   .sampler_dims = ~BITFIELD_BIT(GLSL_SAMPLER_DIM_CUBE),
4905                   .src_types = (1 << nir_tex_src_ddx) | (1 << nir_tex_src_ddy),
4906                },
4907             };
4908             struct nir_fold_16bit_tex_image_options fold_16bit_options = {
4909                .rounding_mode = nir_rounding_mode_rtne,
4910                .fold_tex_dest = true,
4911                .fold_image_load_store_data = true,
4912                .fold_srcs_options_count = separate_g16 ? 2 : 1,
4913                .fold_srcs_options = fold_srcs_options,
4914             };
4915             NIR_PASS(_, stages[i].nir, nir_fold_16bit_tex_image, &fold_16bit_options);
4916 
4917             NIR_PASS(_, stages[i].nir, nir_opt_vectorize, opt_vectorize_callback, device);
4918          }
4919 
4920          /* cleanup passes */
4921          NIR_PASS(_, stages[i].nir, nir_lower_alu_width, opt_vectorize_callback, device);
4922          NIR_PASS(_, stages[i].nir, nir_lower_load_const_to_scalar);
4923          NIR_PASS(_, stages[i].nir, nir_copy_prop);
4924          NIR_PASS(_, stages[i].nir, nir_opt_dce);
4925 
4926          sink_opts |= nir_move_comparisons | nir_move_load_ubo | nir_move_load_ssbo;
4927          NIR_PASS(_, stages[i].nir, nir_opt_sink, sink_opts);
4928 
4929          nir_move_options move_opts = nir_move_const_undef | nir_move_load_ubo |
4930                                       nir_move_load_input | nir_move_comparisons | nir_move_copies;
4931          NIR_PASS(_, stages[i].nir, nir_opt_move, move_opts);
4932 
4933          stages[i].feedback.duration += os_time_get_nano() - stage_start;
4934       }
4935    }
4936 
4937    for (int i = 0; i < MESA_VULKAN_SHADER_STAGES; ++i) {
4938       if (stages[i].nir) {
4939          if (radv_can_dump_shader(device, stages[i].nir, false))
4940             nir_print_shader(stages[i].nir, stderr);
4941       }
4942    }
4943 
4944    /* Compile NIR shaders to AMD assembly. */
4945    radv_pipeline_nir_to_asm(pipeline, stages, pipeline_key, pipeline_layout, keep_executable_info,
4946                             keep_statistic_info, *last_vgt_api_stage, binaries, &gs_copy_binary);
4947 
4948    if (keep_executable_info) {
4949       for (int i = 0; i < MESA_VULKAN_SHADER_STAGES; ++i) {
4950          struct radv_shader *shader = pipeline->shaders[i];
4951          if (!shader)
4952             continue;
4953 
4954          if (!stages[i].spirv.size)
4955             continue;
4956 
4957          shader->spirv = malloc(stages[i].spirv.size);
4958          memcpy(shader->spirv, stages[i].spirv.data, stages[i].spirv.size);
4959          shader->spirv_size = stages[i].spirv.size;
4960       }
4961    }
4962 
4963    /* Upload shader binaries. */
4964    radv_upload_shaders(device, pipeline, binaries, gs_copy_binary);
4965 
4966    if (!keep_executable_info) {
4967       if (pipeline->gs_copy_shader) {
4968          assert(!binaries[MESA_SHADER_COMPUTE] && !pipeline->shaders[MESA_SHADER_COMPUTE]);
4969          binaries[MESA_SHADER_COMPUTE] = gs_copy_binary;
4970          pipeline->shaders[MESA_SHADER_COMPUTE] = pipeline->gs_copy_shader;
4971       }
4972 
4973       radv_pipeline_cache_insert_shaders(device, cache, hash, pipeline, binaries,
4974                                          stack_sizes ? *stack_sizes : NULL,
4975                                          num_stack_sizes ? *num_stack_sizes : 0);
4976 
4977       if (pipeline->gs_copy_shader) {
4978          pipeline->gs_copy_shader = pipeline->shaders[MESA_SHADER_COMPUTE];
4979          pipeline->shaders[MESA_SHADER_COMPUTE] = NULL;
4980          binaries[MESA_SHADER_COMPUTE] = NULL;
4981       }
4982    }
4983 
4984    free(gs_copy_binary);
4985    for (int i = 0; i < MESA_VULKAN_SHADER_STAGES; ++i) {
4986       free(binaries[i]);
4987       if (stages[i].nir) {
4988          if (radv_can_dump_shader_stats(device, stages[i].nir) && pipeline->shaders[i]) {
4989             radv_dump_shader_stats(device, pipeline, i, stderr);
4990          }
4991 
4992          ralloc_free(stages[i].nir);
4993       }
4994    }
4995 
4996 done:
4997    pipeline_feedback.duration = os_time_get_nano() - pipeline_start;
4998 
4999    if (creation_feedback) {
5000       *creation_feedback->pPipelineCreationFeedback = pipeline_feedback;
5001 
5002       uint32_t stage_count = creation_feedback->pipelineStageCreationFeedbackCount;
5003       assert(stage_count == 0 || stageCount == stage_count);
5004       for (uint32_t i = 0; i < stage_count; i++) {
5005          gl_shader_stage s = vk_to_mesa_shader_stage(pStages[i].stage);
5006          creation_feedback->pPipelineStageCreationFeedbacks[i] = stages[s].feedback;
5007       }
5008    }
5009 
5010    return result;
5011 }
5012 
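/* Return the USER_DATA SGPR base register (R_00B*_SPI_SHADER_USER_DATA_*_0) of the
 * hardware stage an API stage is mapped to, accounting for merged stages and NGG. */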
5013 static uint32_t
5014 radv_pipeline_stage_to_user_data_0(struct radv_graphics_pipeline *pipeline, gl_shader_stage stage,
5015                                    enum amd_gfx_level gfx_level)
5016 {
5017    bool has_gs = radv_pipeline_has_stage(pipeline, MESA_SHADER_GEOMETRY);
5018    bool has_tess = radv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_CTRL);
5019    bool has_ngg = radv_pipeline_has_ngg(pipeline);
5020 
5021    switch (stage) {
5022    case MESA_SHADER_FRAGMENT:
5023       return R_00B030_SPI_SHADER_USER_DATA_PS_0;
5024    case MESA_SHADER_VERTEX:
5025       if (has_tess) {
5026          if (gfx_level >= GFX10) {
5027             return R_00B430_SPI_SHADER_USER_DATA_HS_0;
5028          } else if (gfx_level == GFX9) {
5029             return R_00B430_SPI_SHADER_USER_DATA_LS_0;
5030          } else {
5031             return R_00B530_SPI_SHADER_USER_DATA_LS_0;
5032          }
5033       }
5034 
5035       if (has_gs) {
5036          if (gfx_level >= GFX10) {
5037             return R_00B230_SPI_SHADER_USER_DATA_GS_0;
5038          } else {
5039             return R_00B330_SPI_SHADER_USER_DATA_ES_0;
5040          }
5041       }
5042 
5043       if (has_ngg)
5044          return R_00B230_SPI_SHADER_USER_DATA_GS_0;
5045 
5046       return R_00B130_SPI_SHADER_USER_DATA_VS_0;
5047    case MESA_SHADER_GEOMETRY:
5048       return gfx_level == GFX9 ? R_00B330_SPI_SHADER_USER_DATA_ES_0
5049                                : R_00B230_SPI_SHADER_USER_DATA_GS_0;
5050    case MESA_SHADER_COMPUTE:
5051    case MESA_SHADER_TASK:
5052       return R_00B900_COMPUTE_USER_DATA_0;
5053    case MESA_SHADER_TESS_CTRL:
5054       return gfx_level == GFX9 ? R_00B430_SPI_SHADER_USER_DATA_LS_0
5055                                : R_00B430_SPI_SHADER_USER_DATA_HS_0;
5056    case MESA_SHADER_TESS_EVAL:
5057       if (has_gs) {
5058          return gfx_level >= GFX10 ? R_00B230_SPI_SHADER_USER_DATA_GS_0
5059                                    : R_00B330_SPI_SHADER_USER_DATA_ES_0;
5060       } else if (has_ngg) {
5061          return R_00B230_SPI_SHADER_USER_DATA_GS_0;
5062       } else {
5063          return R_00B130_SPI_SHADER_USER_DATA_VS_0;
5064       }
5065    case MESA_SHADER_MESH:
5066       assert(has_ngg);
5067       return R_00B230_SPI_SHADER_USER_DATA_GS_0;
5068    default:
5069       unreachable("unknown shader");
5070    }
5071 }
5072 
5073 struct radv_bin_size_entry {
5074    unsigned bpp;
5075    VkExtent2D extent;
5076 };
5077 
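/* GFX9 binning: bin sizes are looked up by [log2(RBs per SE)][log2(SEs)], scanning each
 * list until the next entry's bpp threshold exceeds the computed bytes per pixel. */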
5078 static VkExtent2D
5079 radv_gfx9_compute_bin_size(const struct radv_graphics_pipeline *pipeline,
5080                            const struct radv_graphics_pipeline_info *info)
5081 {
5082    const struct radv_physical_device *pdevice = pipeline->base.device->physical_device;
5083    static const struct radv_bin_size_entry color_size_table[][3][9] = {
5084       {
5085          /* One RB / SE */
5086          {
5087             /* One shader engine */
5088             {0, {128, 128}},
5089             {1, {64, 128}},
5090             {2, {32, 128}},
5091             {3, {16, 128}},
5092             {17, {0, 0}},
5093             {UINT_MAX, {0, 0}},
5094          },
5095          {
5096             /* Two shader engines */
5097             {0, {128, 128}},
5098             {2, {64, 128}},
5099             {3, {32, 128}},
5100             {5, {16, 128}},
5101             {17, {0, 0}},
5102             {UINT_MAX, {0, 0}},
5103          },
5104          {
5105             /* Four shader engines */
5106             {0, {128, 128}},
5107             {3, {64, 128}},
5108             {5, {16, 128}},
5109             {17, {0, 0}},
5110             {UINT_MAX, {0, 0}},
5111          },
5112       },
5113       {
5114          /* Two RB / SE */
5115          {
5116             /* One shader engine */
5117             {0, {128, 128}},
5118             {2, {64, 128}},
5119             {3, {32, 128}},
5120             {5, {16, 128}},
5121             {33, {0, 0}},
5122             {UINT_MAX, {0, 0}},
5123          },
5124          {
5125             /* Two shader engines */
5126             {0, {128, 128}},
5127             {3, {64, 128}},
5128             {5, {32, 128}},
5129             {9, {16, 128}},
5130             {33, {0, 0}},
5131             {UINT_MAX, {0, 0}},
5132          },
5133          {
5134             /* Four shader engines */
5135             {0, {256, 256}},
5136             {2, {128, 256}},
5137             {3, {128, 128}},
5138             {5, {64, 128}},
5139             {9, {16, 128}},
5140             {33, {0, 0}},
5141             {UINT_MAX, {0, 0}},
5142          },
5143       },
5144       {
5145          /* Four RB / SE */
5146          {
5147             /* One shader engine */
5148             {0, {128, 256}},
5149             {2, {128, 128}},
5150             {3, {64, 128}},
5151             {5, {32, 128}},
5152             {9, {16, 128}},
5153             {33, {0, 0}},
5154             {UINT_MAX, {0, 0}},
5155          },
5156          {
5157             /* Two shader engines */
5158             {0, {256, 256}},
5159             {2, {128, 256}},
5160             {3, {128, 128}},
5161             {5, {64, 128}},
5162             {9, {32, 128}},
5163             {17, {16, 128}},
5164             {33, {0, 0}},
5165             {UINT_MAX, {0, 0}},
5166          },
5167          {
5168             /* Four shader engines */
5169             {0, {256, 512}},
5170             {2, {256, 256}},
5171             {3, {128, 256}},
5172             {5, {128, 128}},
5173             {9, {64, 128}},
5174             {17, {16, 128}},
5175             {33, {0, 0}},
5176             {UINT_MAX, {0, 0}},
5177          },
5178       },
5179    };
5180    static const struct radv_bin_size_entry ds_size_table[][3][9] = {
5181       {
5182          // One RB / SE
5183          {
5184             // One shader engine
5185             {0, {128, 256}},
5186             {2, {128, 128}},
5187             {4, {64, 128}},
5188             {7, {32, 128}},
5189             {13, {16, 128}},
5190             {49, {0, 0}},
5191             {UINT_MAX, {0, 0}},
5192          },
5193          {
5194             // Two shader engines
5195             {0, {256, 256}},
5196             {2, {128, 256}},
5197             {4, {128, 128}},
5198             {7, {64, 128}},
5199             {13, {32, 128}},
5200             {25, {16, 128}},
5201             {49, {0, 0}},
5202             {UINT_MAX, {0, 0}},
5203          },
5204          {
5205             // Four shader engines
5206             {0, {256, 512}},
5207             {2, {256, 256}},
5208             {4, {128, 256}},
5209             {7, {128, 128}},
5210             {13, {64, 128}},
5211             {25, {16, 128}},
5212             {49, {0, 0}},
5213             {UINT_MAX, {0, 0}},
5214          },
5215       },
5216       {
5217          // Two RB / SE
5218          {
5219             // One shader engine
5220             {0, {256, 256}},
5221             {2, {128, 256}},
5222             {4, {128, 128}},
5223             {7, {64, 128}},
5224             {13, {32, 128}},
5225             {25, {16, 128}},
5226             {97, {0, 0}},
5227             {UINT_MAX, {0, 0}},
5228          },
5229          {
5230             // Two shader engines
5231             {0, {256, 512}},
5232             {2, {256, 256}},
5233             {4, {128, 256}},
5234             {7, {128, 128}},
5235             {13, {64, 128}},
5236             {25, {32, 128}},
5237             {49, {16, 128}},
5238             {97, {0, 0}},
5239             {UINT_MAX, {0, 0}},
5240          },
5241          {
5242             // Four shader engines
5243             {0, {512, 512}},
5244             {2, {256, 512}},
5245             {4, {256, 256}},
5246             {7, {128, 256}},
5247             {13, {128, 128}},
5248             {25, {64, 128}},
5249             {49, {16, 128}},
5250             {97, {0, 0}},
5251             {UINT_MAX, {0, 0}},
5252          },
5253       },
5254       {
5255          // Four RB / SE
5256          {
5257             // One shader engine
5258             {0, {256, 512}},
5259             {2, {256, 256}},
5260             {4, {128, 256}},
5261             {7, {128, 128}},
5262             {13, {64, 128}},
5263             {25, {32, 128}},
5264             {49, {16, 128}},
5265             {UINT_MAX, {0, 0}},
5266          },
5267          {
5268             // Two shader engines
5269             {0, {512, 512}},
5270             {2, {256, 512}},
5271             {4, {256, 256}},
5272             {7, {128, 256}},
5273             {13, {128, 128}},
5274             {25, {64, 128}},
5275             {49, {32, 128}},
5276             {97, {16, 128}},
5277             {UINT_MAX, {0, 0}},
5278          },
5279          {
5280             // Four shader engines
5281             {0, {512, 512}},
5282             {4, {256, 512}},
5283             {7, {256, 256}},
5284             {13, {128, 256}},
5285             {25, {128, 128}},
5286             {49, {64, 128}},
5287             {97, {16, 128}},
5288             {UINT_MAX, {0, 0}},
5289          },
5290       },
5291    };
5292 
5293    VkExtent2D extent = {512, 512};
5294 
5295    unsigned log_num_rb_per_se =
5296       util_logbase2_ceil(pdevice->rad_info.max_render_backends / pdevice->rad_info.max_se);
5297    unsigned log_num_se = util_logbase2_ceil(pdevice->rad_info.max_se);
5298 
5299    unsigned total_samples = 1u << G_028BE0_MSAA_NUM_SAMPLES(pipeline->ms.pa_sc_aa_config);
5300    unsigned ps_iter_samples = 1u << G_028804_PS_ITER_SAMPLES(pipeline->ms.db_eqaa);
5301    unsigned effective_samples = total_samples;
5302    unsigned color_bytes_per_pixel = 0;
5303 
5304    for (unsigned i = 0; i < info->ri.color_att_count; i++) {
5305       if (!info->cb.att[i].color_write_mask)
5306          continue;
5307 
5308       if (info->ri.color_att_formats[i] == VK_FORMAT_UNDEFINED)
5309          continue;
5310 
5311       color_bytes_per_pixel += vk_format_get_blocksize(info->ri.color_att_formats[i]);
5312    }
5313 
5314    /* MSAA images typically don't use all samples all the time. */
5315    if (effective_samples >= 2 && ps_iter_samples <= 1)
5316       effective_samples = 2;
5317    color_bytes_per_pixel *= effective_samples;
5318 
5319    const struct radv_bin_size_entry *color_entry = color_size_table[log_num_rb_per_se][log_num_se];
5320    while (color_entry[1].bpp <= color_bytes_per_pixel)
5321       ++color_entry;
5322 
5323    extent = color_entry->extent;
5324 
5325    if (radv_pipeline_has_ds_attachments(&info->ri)) {
5326       /* Coefficients taken from AMDVLK */
5327       unsigned depth_coeff = info->ri.depth_att_format != VK_FORMAT_UNDEFINED ? 5 : 0;
5328       unsigned stencil_coeff = info->ri.stencil_att_format != VK_FORMAT_UNDEFINED ? 1 : 0;
5329       unsigned ds_bytes_per_pixel = 4 * (depth_coeff + stencil_coeff) * total_samples;
5330 
5331       const struct radv_bin_size_entry *ds_entry = ds_size_table[log_num_rb_per_se][log_num_se];
5332       while (ds_entry[1].bpp <= ds_bytes_per_pixel)
5333          ++ds_entry;
5334 
5335       if (ds_entry->extent.width * ds_entry->extent.height < extent.width * extent.height)
5336          extent = ds_entry->extent;
5337    }
5338 
5339    return extent;
5340 }
5341 
5342 static VkExtent2D
5343 radv_gfx10_compute_bin_size(const struct radv_graphics_pipeline *pipeline,
5344                             const struct radv_graphics_pipeline_info *info)
5345 {
5346    const struct radv_physical_device *pdevice = pipeline->base.device->physical_device;
5347    VkExtent2D extent = {512, 512};
5348 
5349    const unsigned db_tag_size = 64;
5350    const unsigned db_tag_count = 312;
5351    const unsigned color_tag_size = 1024;
5352    const unsigned color_tag_count = 31;
5353    const unsigned fmask_tag_size = 256;
5354    const unsigned fmask_tag_count = 44;
5355 
5356    const unsigned rb_count = pdevice->rad_info.max_render_backends;
5357    const unsigned pipe_count = MAX2(rb_count, pdevice->rad_info.num_tcc_blocks);
5358 
5359    const unsigned db_tag_part = (db_tag_count * rb_count / pipe_count) * db_tag_size * pipe_count;
5360    const unsigned color_tag_part =
5361       (color_tag_count * rb_count / pipe_count) * color_tag_size * pipe_count;
5362    const unsigned fmask_tag_part =
5363       (fmask_tag_count * rb_count / pipe_count) * fmask_tag_size * pipe_count;
5364 
5365    const unsigned total_samples =
5366       1u << G_028BE0_MSAA_NUM_SAMPLES(pipeline->ms.pa_sc_aa_config);
5367    const unsigned samples_log = util_logbase2_ceil(total_samples);
5368 
5369    unsigned color_bytes_per_pixel = 0;
5370    unsigned fmask_bytes_per_pixel = 0;
5371 
5372    for (unsigned i = 0; i < info->ri.color_att_count; i++) {
5373       if (!info->cb.att[i].color_write_mask)
5374          continue;
5375 
5376       if (info->ri.color_att_formats[i] == VK_FORMAT_UNDEFINED)
5377          continue;
5378 
5379       color_bytes_per_pixel += vk_format_get_blocksize(info->ri.color_att_formats[i]);
5380 
5381       if (total_samples > 1) {
5382          assert(samples_log <= 3);
5383          const unsigned fmask_array[] = {0, 1, 1, 4};
5384          fmask_bytes_per_pixel += fmask_array[samples_log];
5385       }
5386    }
5387 
5388    color_bytes_per_pixel *= total_samples;
5389    color_bytes_per_pixel = MAX2(color_bytes_per_pixel, 1);
5390 
5391    const unsigned color_pixel_count_log = util_logbase2(color_tag_part / color_bytes_per_pixel);
5392    extent.width = 1ull << ((color_pixel_count_log + 1) / 2);
5393    extent.height = 1ull << (color_pixel_count_log / 2);
5394 
5395    if (fmask_bytes_per_pixel) {
5396       const unsigned fmask_pixel_count_log = util_logbase2(fmask_tag_part / fmask_bytes_per_pixel);
5397 
5398       const VkExtent2D fmask_extent =
5399          (VkExtent2D){.width = 1ull << ((fmask_pixel_count_log + 1) / 2),
5400                       .height = 1ull << (fmask_pixel_count_log / 2)};
5401 
5402       if (fmask_extent.width * fmask_extent.height < extent.width * extent.height)
5403          extent = fmask_extent;
5404    }
5405 
5406    if (radv_pipeline_has_ds_attachments(&info->ri)) {
5407       /* Coefficients taken from AMDVLK */
5408       unsigned depth_coeff = info->ri.depth_att_format != VK_FORMAT_UNDEFINED ? 5 : 0;
5409       unsigned stencil_coeff = info->ri.stencil_att_format != VK_FORMAT_UNDEFINED ? 1 : 0;
5410       unsigned db_bytes_per_pixel = (depth_coeff + stencil_coeff) * total_samples;
5411 
5412       const unsigned db_pixel_count_log = util_logbase2(db_tag_part / db_bytes_per_pixel);
5413 
5414       const VkExtent2D db_extent = (VkExtent2D){.width = 1ull << ((db_pixel_count_log + 1) / 2),
5415                                                 .height = 1ull << (db_pixel_count_log / 2)};
5416 
5417       if (db_extent.width * db_extent.height < extent.width * extent.height)
5418          extent = db_extent;
5419    }
5420 
5421    extent.width = MAX2(extent.width, 128);
5422    extent.height = MAX2(extent.height, 64);
5423 
5424    return extent;
5425 }
5426 
5427 static void
5428 radv_pipeline_init_disabled_binning_state(struct radv_graphics_pipeline *pipeline,
5429                                           const struct radv_graphics_pipeline_info *info)
5430 {
5431    const struct radv_physical_device *pdevice = pipeline->base.device->physical_device;
5432    uint32_t pa_sc_binner_cntl_0 = S_028C44_BINNING_MODE(V_028C44_DISABLE_BINNING_USE_LEGACY_SC) |
5433                                   S_028C44_DISABLE_START_OF_PRIM(1);
5434 
5435    if (pdevice->rad_info.gfx_level >= GFX10) {
5436       unsigned min_bytes_per_pixel = 0;
5437 
5438       for (unsigned i = 0; i < info->ri.color_att_count; i++) {
5439          if (!info->cb.att[i].color_write_mask)
5440             continue;
5441 
5442          if (info->ri.color_att_formats[i] == VK_FORMAT_UNDEFINED)
5443             continue;
5444 
5445          unsigned bytes = vk_format_get_blocksize(info->ri.color_att_formats[i]);
5446          if (!min_bytes_per_pixel || bytes < min_bytes_per_pixel)
5447             min_bytes_per_pixel = bytes;
5448       }
5449 
5450       pa_sc_binner_cntl_0 =
5451          S_028C44_BINNING_MODE(V_028C44_DISABLE_BINNING_USE_NEW_SC) | S_028C44_BIN_SIZE_X(0) |
5452          S_028C44_BIN_SIZE_Y(0) | S_028C44_BIN_SIZE_X_EXTEND(2) |       /* 128 */
5453          S_028C44_BIN_SIZE_Y_EXTEND(min_bytes_per_pixel <= 4 ? 2 : 1) | /* 128 or 64 */
5454          S_028C44_DISABLE_START_OF_PRIM(1);
5455    }
5456 
5457    pipeline->binning.pa_sc_binner_cntl_0 = pa_sc_binner_cntl_0;
5458 }
5459 
5460 struct radv_binning_settings
5461 radv_get_binning_settings(const struct radv_physical_device *pdev)
5462 {
5463    struct radv_binning_settings settings;
5464    if (pdev->rad_info.has_dedicated_vram) {
5465       if (pdev->rad_info.max_render_backends > 4) {
5466          settings.context_states_per_bin = 1;
5467          settings.persistent_states_per_bin = 1;
5468       } else {
5469          settings.context_states_per_bin = 3;
5470          settings.persistent_states_per_bin = 8;
5471       }
5472       settings.fpovs_per_batch = 63;
5473    } else {
5474       /* The context states are affected by the scissor bug. */
5475       settings.context_states_per_bin = 6;
5476       /* 32 causes hangs for RAVEN. */
5477       settings.persistent_states_per_bin = 16;
5478       settings.fpovs_per_batch = 63;
5479    }
5480 
5481    if (pdev->rad_info.has_gfx9_scissor_bug)
5482       settings.context_states_per_bin = 1;
5483 
5484    return settings;
5485 }
5486 
5487 static void
5488 radv_pipeline_init_binning_state(struct radv_graphics_pipeline *pipeline,
5489                                  const struct radv_blend_state *blend,
5490                                  const struct radv_graphics_pipeline_info *info)
5491 {
5492    const struct radv_device *device = pipeline->base.device;
5493 
5494    if (device->physical_device->rad_info.gfx_level < GFX9)
5495       return;
5496 
5497    VkExtent2D bin_size;
5498    if (device->physical_device->rad_info.gfx_level >= GFX10) {
5499       bin_size = radv_gfx10_compute_bin_size(pipeline, info);
5500    } else if (device->physical_device->rad_info.gfx_level == GFX9) {
5501       bin_size = radv_gfx9_compute_bin_size(pipeline, info);
5502    } else
5503       unreachable("Unhandled generation for binning bin size calculation");
5504 
5505    if (device->pbb_allowed && bin_size.width && bin_size.height) {
5506       struct radv_binning_settings settings = radv_get_binning_settings(device->physical_device);
5507 
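      /* A 16-pixel bin is selected with BIN_SIZE_X/Y; larger sizes use the *_EXTEND
       * fields encoded as log2(size) - 5 (32 -> 0, 64 -> 1, ...). */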
5508       const uint32_t pa_sc_binner_cntl_0 =
5509          S_028C44_BINNING_MODE(V_028C44_BINNING_ALLOWED) |
5510          S_028C44_BIN_SIZE_X(bin_size.width == 16) | S_028C44_BIN_SIZE_Y(bin_size.height == 16) |
5511          S_028C44_BIN_SIZE_X_EXTEND(util_logbase2(MAX2(bin_size.width, 32)) - 5) |
5512          S_028C44_BIN_SIZE_Y_EXTEND(util_logbase2(MAX2(bin_size.height, 32)) - 5) |
5513          S_028C44_CONTEXT_STATES_PER_BIN(settings.context_states_per_bin - 1) |
5514          S_028C44_PERSISTENT_STATES_PER_BIN(settings.persistent_states_per_bin - 1) |
5515          S_028C44_DISABLE_START_OF_PRIM(1) |
5516          S_028C44_FPOVS_PER_BATCH(settings.fpovs_per_batch) | S_028C44_OPTIMAL_BIN_SELECTION(1);
5517 
5518       pipeline->binning.pa_sc_binner_cntl_0 = pa_sc_binner_cntl_0;
5519    } else
5520       radv_pipeline_init_disabled_binning_state(pipeline, info);
5521 }
5522 
5523 static void
5524 radv_pipeline_emit_depth_stencil_state(struct radeon_cmdbuf *ctx_cs,
5525                                        const struct radv_depth_stencil_state *ds_state)
5526 {
5527    radeon_set_context_reg(ctx_cs, R_028000_DB_RENDER_CONTROL, ds_state->db_render_control);
5528 
5529    radeon_set_context_reg_seq(ctx_cs, R_02800C_DB_RENDER_OVERRIDE, 2);
5530    radeon_emit(ctx_cs, ds_state->db_render_override);
5531    radeon_emit(ctx_cs, ds_state->db_render_override2);
5532 }
5533 
5534 static void
5535 radv_pipeline_emit_blend_state(struct radeon_cmdbuf *ctx_cs,
5536                                const struct radv_graphics_pipeline *pipeline,
5537                                const struct radv_blend_state *blend)
5538 {
5539    const struct radv_physical_device *pdevice = pipeline->base.device->physical_device;
5540 
5541    radeon_set_context_reg_seq(ctx_cs, R_028780_CB_BLEND0_CONTROL, 8);
5542    radeon_emit_array(ctx_cs, blend->cb_blend_control, 8);
5543    radeon_set_context_reg(ctx_cs, R_028B70_DB_ALPHA_TO_MASK, blend->db_alpha_to_mask);
5544 
5545    if (pdevice->rad_info.has_rbplus) {
5546 
5547       radeon_set_context_reg_seq(ctx_cs, R_028760_SX_MRT0_BLEND_OPT, 8);
5548       radeon_emit_array(ctx_cs, blend->sx_mrt_blend_opt, 8);
5549    }
5550 
5551    radeon_set_context_reg(ctx_cs, R_028714_SPI_SHADER_COL_FORMAT, blend->spi_shader_col_format);
5552 
5553    radeon_set_context_reg(ctx_cs, R_02823C_CB_SHADER_MASK, blend->cb_shader_mask);
5554 }
5555 
5556 static void
5557 radv_pipeline_emit_raster_state(struct radeon_cmdbuf *ctx_cs,
5558                                 const struct radv_graphics_pipeline *pipeline,
5559                                 const struct radv_graphics_pipeline_info *info)
5560 {
5561    const struct radv_physical_device *pdevice = pipeline->base.device->physical_device;
5562    const VkConservativeRasterizationModeEXT mode = info->rs.conservative_mode;
5563    uint32_t pa_sc_conservative_rast = S_028C4C_NULL_SQUAD_AA_MASK_ENABLE(1);
5564 
5565    if (pdevice->rad_info.gfx_level >= GFX9) {
5566       /* Conservative rasterization. */
5567       if (mode != VK_CONSERVATIVE_RASTERIZATION_MODE_DISABLED_EXT) {
5568          pa_sc_conservative_rast = S_028C4C_PREZ_AA_MASK_ENABLE(1) | S_028C4C_POSTZ_AA_MASK_ENABLE(1) |
5569                                    S_028C4C_CENTROID_SAMPLE_OVERRIDE(1);
5570 
5571          if (mode == VK_CONSERVATIVE_RASTERIZATION_MODE_OVERESTIMATE_EXT) {
5572             pa_sc_conservative_rast |=
5573                S_028C4C_OVER_RAST_ENABLE(1) | S_028C4C_OVER_RAST_SAMPLE_SELECT(0) |
5574                S_028C4C_UNDER_RAST_ENABLE(0) | S_028C4C_UNDER_RAST_SAMPLE_SELECT(1) |
5575                S_028C4C_PBB_UNCERTAINTY_REGION_ENABLE(1);
5576          } else {
5577             assert(mode == VK_CONSERVATIVE_RASTERIZATION_MODE_UNDERESTIMATE_EXT);
5578             pa_sc_conservative_rast |=
5579                S_028C4C_OVER_RAST_ENABLE(0) | S_028C4C_OVER_RAST_SAMPLE_SELECT(1) |
5580                S_028C4C_UNDER_RAST_ENABLE(1) | S_028C4C_UNDER_RAST_SAMPLE_SELECT(0) |
5581                S_028C4C_PBB_UNCERTAINTY_REGION_ENABLE(0);
5582          }
5583       }
5584 
5585       radeon_set_context_reg(ctx_cs, R_028C4C_PA_SC_CONSERVATIVE_RASTERIZATION_CNTL,
5586                              pa_sc_conservative_rast);
5587    }
5588 }
5589 
5590 static void
5591 radv_pipeline_emit_multisample_state(struct radeon_cmdbuf *ctx_cs,
5592                                      const struct radv_graphics_pipeline *pipeline)
5593 {
5594    const struct radv_physical_device *pdevice = pipeline->base.device->physical_device;
5595    const struct radv_multisample_state *ms = &pipeline->ms;
5596 
5597    radeon_set_context_reg_seq(ctx_cs, R_028C38_PA_SC_AA_MASK_X0Y0_X1Y0, 2);
5598    radeon_emit(ctx_cs, ms->pa_sc_aa_mask[0]);
5599    radeon_emit(ctx_cs, ms->pa_sc_aa_mask[1]);
5600 
5601    radeon_set_context_reg(ctx_cs, R_028804_DB_EQAA, ms->db_eqaa);
5602    radeon_set_context_reg(ctx_cs, R_028BE0_PA_SC_AA_CONFIG, ms->pa_sc_aa_config);
5603 
5604    radeon_set_context_reg_seq(ctx_cs, R_028A48_PA_SC_MODE_CNTL_0, 2);
5605    radeon_emit(ctx_cs, ms->pa_sc_mode_cntl_0);
5606    radeon_emit(ctx_cs, ms->pa_sc_mode_cntl_1);
5607 
5608    /* The exclusion bits can be set to improve rasterization efficiency
5609     * if no sample lies on the pixel boundary (-8 sample offset). It's
5610     * currently always TRUE because the driver doesn't support 16 samples.
5611     */
5612    bool exclusion = pdevice->rad_info.gfx_level >= GFX7;
5613    radeon_set_context_reg(
5614       ctx_cs, R_02882C_PA_SU_PRIM_FILTER_CNTL,
5615       S_02882C_XMAX_RIGHT_EXCLUSION(exclusion) | S_02882C_YMAX_BOTTOM_EXCLUSION(exclusion));
5616 }
5617 
5618 static void
5619 radv_pipeline_emit_vgt_gs_mode(struct radeon_cmdbuf *ctx_cs,
5620                                const struct radv_graphics_pipeline *pipeline)
5621 {
5622    const struct radv_physical_device *pdevice = pipeline->base.device->physical_device;
5623    const struct radv_vs_output_info *outinfo = get_vs_output_info(pipeline);
5624    const struct radv_shader *vs = pipeline->base.shaders[MESA_SHADER_TESS_EVAL]
5625                                   ? pipeline->base.shaders[MESA_SHADER_TESS_EVAL]
5626                                   : pipeline->base.shaders[MESA_SHADER_VERTEX];
5627    unsigned vgt_primitiveid_en = 0;
5628    uint32_t vgt_gs_mode = 0;
5629 
5630    if (radv_pipeline_has_ngg(pipeline))
5631       return;
5632 
5633    if (radv_pipeline_has_stage(pipeline, MESA_SHADER_GEOMETRY)) {
5634       const struct radv_shader *gs = pipeline->base.shaders[MESA_SHADER_GEOMETRY];
5635 
5636       vgt_gs_mode = ac_vgt_gs_mode(gs->info.gs.vertices_out, pdevice->rad_info.gfx_level);
5637    } else if (outinfo->export_prim_id || vs->info.uses_prim_id) {
5638       vgt_gs_mode = S_028A40_MODE(V_028A40_GS_SCENARIO_A);
5639       vgt_primitiveid_en |= S_028A84_PRIMITIVEID_EN(1);
5640    }
5641 
5642    radeon_set_context_reg(ctx_cs, R_028A84_VGT_PRIMITIVEID_EN, vgt_primitiveid_en);
5643    radeon_set_context_reg(ctx_cs, R_028A40_VGT_GS_MODE, vgt_gs_mode);
5644 }
5645 
5646 static void
5647 radv_pipeline_emit_hw_vs(struct radeon_cmdbuf *ctx_cs, struct radeon_cmdbuf *cs,
5648                          const struct radv_graphics_pipeline *pipeline, const struct radv_shader *shader)
5649 {
5650    const struct radv_physical_device *pdevice = pipeline->base.device->physical_device;
5651    uint64_t va = radv_shader_get_va(shader);
5652 
5653    radeon_set_sh_reg_seq(cs, R_00B120_SPI_SHADER_PGM_LO_VS, 4);
5654    radeon_emit(cs, va >> 8);
5655    radeon_emit(cs, S_00B124_MEM_BASE(va >> 40));
5656    radeon_emit(cs, shader->config.rsrc1);
5657    radeon_emit(cs, shader->config.rsrc2);
5658 
5659    const struct radv_vs_output_info *outinfo = get_vs_output_info(pipeline);
5660    unsigned clip_dist_mask, cull_dist_mask, total_mask;
5661    clip_dist_mask = outinfo->clip_dist_mask;
5662    cull_dist_mask = outinfo->cull_dist_mask;
5663    total_mask = clip_dist_mask | cull_dist_mask;
5664 
5665    bool misc_vec_ena = outinfo->writes_pointsize || outinfo->writes_layer ||
5666                        outinfo->writes_viewport_index || outinfo->writes_primitive_shading_rate;
5667    unsigned spi_vs_out_config, nparams;
5668 
5669    /* VS is required to export at least one param. */
5670    nparams = MAX2(outinfo->param_exports, 1);
5671    spi_vs_out_config = S_0286C4_VS_EXPORT_COUNT(nparams - 1);
5672 
5673    if (pdevice->rad_info.gfx_level >= GFX10) {
5674       spi_vs_out_config |= S_0286C4_NO_PC_EXPORT(outinfo->param_exports == 0);
5675    }
5676 
5677    radeon_set_context_reg(ctx_cs, R_0286C4_SPI_VS_OUT_CONFIG, spi_vs_out_config);
5678 
5679    radeon_set_context_reg(
5680       ctx_cs, R_02870C_SPI_SHADER_POS_FORMAT,
5681       S_02870C_POS0_EXPORT_FORMAT(V_02870C_SPI_SHADER_4COMP) |
5682          S_02870C_POS1_EXPORT_FORMAT(outinfo->pos_exports > 1 ? V_02870C_SPI_SHADER_4COMP
5683                                                               : V_02870C_SPI_SHADER_NONE) |
5684          S_02870C_POS2_EXPORT_FORMAT(outinfo->pos_exports > 2 ? V_02870C_SPI_SHADER_4COMP
5685                                                               : V_02870C_SPI_SHADER_NONE) |
5686          S_02870C_POS3_EXPORT_FORMAT(outinfo->pos_exports > 3 ? V_02870C_SPI_SHADER_4COMP
5687                                                               : V_02870C_SPI_SHADER_NONE));
5688 
5689    radeon_set_context_reg(ctx_cs, R_02881C_PA_CL_VS_OUT_CNTL,
5690                           S_02881C_USE_VTX_POINT_SIZE(outinfo->writes_pointsize) |
5691                              S_02881C_USE_VTX_RENDER_TARGET_INDX(outinfo->writes_layer) |
5692                              S_02881C_USE_VTX_VIEWPORT_INDX(outinfo->writes_viewport_index) |
5693                              S_02881C_USE_VTX_VRS_RATE(outinfo->writes_primitive_shading_rate) |
5694                              S_02881C_VS_OUT_MISC_VEC_ENA(misc_vec_ena) |
5695                              S_02881C_VS_OUT_MISC_SIDE_BUS_ENA(misc_vec_ena) |
5696                              S_02881C_VS_OUT_CCDIST0_VEC_ENA((total_mask & 0x0f) != 0) |
5697                              S_02881C_VS_OUT_CCDIST1_VEC_ENA((total_mask & 0xf0) != 0) |
5698                              total_mask << 8 | clip_dist_mask);
5699 
5700    if (pdevice->rad_info.gfx_level <= GFX8)
5701       radeon_set_context_reg(ctx_cs, R_028AB4_VGT_REUSE_OFF, outinfo->writes_viewport_index);
5702 
5703    unsigned late_alloc_wave64, cu_mask;
5704    ac_compute_late_alloc(&pdevice->rad_info, false, false, shader->config.scratch_bytes_per_wave > 0,
5705                          &late_alloc_wave64, &cu_mask);
5706 
5707    if (pdevice->rad_info.gfx_level >= GFX7) {
5708       if (pdevice->rad_info.gfx_level >= GFX10) {
5709          ac_set_reg_cu_en(cs, R_00B118_SPI_SHADER_PGM_RSRC3_VS,
5710                           S_00B118_CU_EN(cu_mask) | S_00B118_WAVE_LIMIT(0x3F),
5711                           C_00B118_CU_EN, 0, &pdevice->rad_info,
5712                           (void*)gfx10_set_sh_reg_idx3);
5713       } else {
5714          radeon_set_sh_reg_idx(pdevice, cs, R_00B118_SPI_SHADER_PGM_RSRC3_VS, 3,
5715                                S_00B118_CU_EN(cu_mask) | S_00B118_WAVE_LIMIT(0x3F));
5716       }
5717       radeon_set_sh_reg(cs, R_00B11C_SPI_SHADER_LATE_ALLOC_VS, S_00B11C_LIMIT(late_alloc_wave64));
5718    }
5719    if (pdevice->rad_info.gfx_level >= GFX10) {
5720       uint32_t oversub_pc_lines = late_alloc_wave64 ? pdevice->rad_info.pc_lines / 4 : 0;
5721       gfx10_emit_ge_pc_alloc(cs, pdevice->rad_info.gfx_level, oversub_pc_lines);
5722    }
5723 }
5724 
5725 static void
5726 radv_pipeline_emit_hw_es(struct radeon_cmdbuf *cs, const struct radv_graphics_pipeline *pipeline,
5727                          const struct radv_shader *shader)
5728 {
5729    uint64_t va = radv_shader_get_va(shader);
5730 
5731    radeon_set_sh_reg_seq(cs, R_00B320_SPI_SHADER_PGM_LO_ES, 4);
5732    radeon_emit(cs, va >> 8);
5733    radeon_emit(cs, S_00B324_MEM_BASE(va >> 40));
5734    radeon_emit(cs, shader->config.rsrc1);
5735    radeon_emit(cs, shader->config.rsrc2);
5736 }
5737 
5738 static void
5739 radv_pipeline_emit_hw_ls(struct radeon_cmdbuf *cs, const struct radv_graphics_pipeline *pipeline,
5740                          const struct radv_shader *shader)
5741 {
5742    const struct radv_physical_device *pdevice = pipeline->base.device->physical_device;
5743    unsigned num_lds_blocks = pipeline->base.shaders[MESA_SHADER_TESS_CTRL]->info.tcs.num_lds_blocks;
5744    uint64_t va = radv_shader_get_va(shader);
5745    uint32_t rsrc2 = shader->config.rsrc2;
5746 
5747    radeon_set_sh_reg(cs, R_00B520_SPI_SHADER_PGM_LO_LS, va >> 8);
5748 
5749    rsrc2 |= S_00B52C_LDS_SIZE(num_lds_blocks);
5750    if (pdevice->rad_info.gfx_level == GFX7 && pdevice->rad_info.family != CHIP_HAWAII)
5751       radeon_set_sh_reg(cs, R_00B52C_SPI_SHADER_PGM_RSRC2_LS, rsrc2);
5752 
5753    radeon_set_sh_reg_seq(cs, R_00B528_SPI_SHADER_PGM_RSRC1_LS, 2);
5754    radeon_emit(cs, shader->config.rsrc1);
5755    radeon_emit(cs, rsrc2);
5756 }
5757 
5758 static void
5759 radv_pipeline_emit_hw_ngg(struct radeon_cmdbuf *ctx_cs, struct radeon_cmdbuf *cs,
5760                           const struct radv_graphics_pipeline *pipeline,
5761                           const struct radv_shader *shader)
5762 {
5763    const struct radv_physical_device *pdevice = pipeline->base.device->physical_device;
5764    uint64_t va = radv_shader_get_va(shader);
5765    gl_shader_stage es_type =
5766       radv_pipeline_has_stage(pipeline, MESA_SHADER_MESH) ? MESA_SHADER_MESH :
5767       radv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_CTRL) ? MESA_SHADER_TESS_EVAL : MESA_SHADER_VERTEX;
5768    struct radv_shader *es = pipeline->base.shaders[es_type];
5769    const struct gfx10_ngg_info *ngg_state = &shader->info.ngg_info;
5770 
5771    radeon_set_sh_reg(cs, R_00B320_SPI_SHADER_PGM_LO_ES, va >> 8);
5772 
5773    radeon_set_sh_reg_seq(cs, R_00B228_SPI_SHADER_PGM_RSRC1_GS, 2);
5774    radeon_emit(cs, shader->config.rsrc1);
5775    radeon_emit(cs, shader->config.rsrc2);
5776 
5777    const struct radv_vs_output_info *outinfo = get_vs_output_info(pipeline);
5778    unsigned clip_dist_mask, cull_dist_mask, total_mask;
5779    clip_dist_mask = outinfo->clip_dist_mask;
5780    cull_dist_mask = outinfo->cull_dist_mask;
5781    total_mask = clip_dist_mask | cull_dist_mask;
5782 
5783    bool misc_vec_ena = outinfo->writes_pointsize || outinfo->writes_layer ||
5784                        outinfo->writes_viewport_index || outinfo->writes_primitive_shading_rate;
5785    bool es_enable_prim_id = outinfo->export_prim_id || (es && es->info.uses_prim_id);
5786    bool break_wave_at_eoi = false;
5787    unsigned ge_cntl;
5788 
5789    if (es_type == MESA_SHADER_TESS_EVAL) {
5790       struct radv_shader *gs = pipeline->base.shaders[MESA_SHADER_GEOMETRY];
5791 
5792       if (es_enable_prim_id || (gs && gs->info.uses_prim_id))
5793          break_wave_at_eoi = true;
5794    }
5795 
5796    bool no_pc_export = outinfo->param_exports == 0 && outinfo->prim_param_exports == 0;
5797    unsigned num_params = MAX2(outinfo->param_exports, 1);
5798    unsigned num_prim_params = outinfo->prim_param_exports;
5799    radeon_set_context_reg(
5800       ctx_cs, R_0286C4_SPI_VS_OUT_CONFIG,
5801       S_0286C4_VS_EXPORT_COUNT(num_params - 1) |
5802       S_0286C4_PRIM_EXPORT_COUNT(num_prim_params) |
5803       S_0286C4_NO_PC_EXPORT(no_pc_export));
5804 
5805    unsigned idx_format = V_028708_SPI_SHADER_1COMP;
5806    if (outinfo->writes_layer_per_primitive ||
5807        outinfo->writes_viewport_index_per_primitive ||
5808        outinfo->writes_primitive_shading_rate_per_primitive)
5809       idx_format = V_028708_SPI_SHADER_2COMP;
5810 
5811    radeon_set_context_reg(ctx_cs, R_028708_SPI_SHADER_IDX_FORMAT,
5812                           S_028708_IDX0_EXPORT_FORMAT(idx_format));
5813    radeon_set_context_reg(
5814       ctx_cs, R_02870C_SPI_SHADER_POS_FORMAT,
5815       S_02870C_POS0_EXPORT_FORMAT(V_02870C_SPI_SHADER_4COMP) |
5816          S_02870C_POS1_EXPORT_FORMAT(outinfo->pos_exports > 1 ? V_02870C_SPI_SHADER_4COMP
5817                                                               : V_02870C_SPI_SHADER_NONE) |
5818          S_02870C_POS2_EXPORT_FORMAT(outinfo->pos_exports > 2 ? V_02870C_SPI_SHADER_4COMP
5819                                                               : V_02870C_SPI_SHADER_NONE) |
5820          S_02870C_POS3_EXPORT_FORMAT(outinfo->pos_exports > 3 ? V_02870C_SPI_SHADER_4COMP
5821                                                               : V_02870C_SPI_SHADER_NONE));
5822 
5823    radeon_set_context_reg(ctx_cs, R_02881C_PA_CL_VS_OUT_CNTL,
5824                           S_02881C_USE_VTX_POINT_SIZE(outinfo->writes_pointsize) |
5825                              S_02881C_USE_VTX_RENDER_TARGET_INDX(outinfo->writes_layer) |
5826                              S_02881C_USE_VTX_VIEWPORT_INDX(outinfo->writes_viewport_index) |
5827                              S_02881C_USE_VTX_VRS_RATE(outinfo->writes_primitive_shading_rate) |
5828                              S_02881C_VS_OUT_MISC_VEC_ENA(misc_vec_ena) |
5829                              S_02881C_VS_OUT_MISC_SIDE_BUS_ENA(misc_vec_ena) |
5830                              S_02881C_VS_OUT_CCDIST0_VEC_ENA((total_mask & 0x0f) != 0) |
5831                              S_02881C_VS_OUT_CCDIST1_VEC_ENA((total_mask & 0xf0) != 0) |
5832                              total_mask << 8 | clip_dist_mask);
5833 
5834    radeon_set_context_reg(ctx_cs, R_028A84_VGT_PRIMITIVEID_EN,
5835                           S_028A84_PRIMITIVEID_EN(es_enable_prim_id) |
5836                              S_028A84_NGG_DISABLE_PROVOK_REUSE(outinfo->export_prim_id));
5837 
5838    radeon_set_context_reg(ctx_cs, R_028AAC_VGT_ESGS_RING_ITEMSIZE,
5839                           ngg_state->vgt_esgs_ring_itemsize);
5840 
5841    /* NGG specific registers. */
5842    struct radv_shader *gs = pipeline->base.shaders[MESA_SHADER_GEOMETRY];
5843    uint32_t gs_num_invocations = gs ? gs->info.gs.invocations : 1;
5844 
5845    if (pdevice->rad_info.gfx_level < GFX11) {
5846       radeon_set_context_reg(
5847          ctx_cs, R_028A44_VGT_GS_ONCHIP_CNTL,
5848          S_028A44_ES_VERTS_PER_SUBGRP(ngg_state->hw_max_esverts) |
5849             S_028A44_GS_PRIMS_PER_SUBGRP(ngg_state->max_gsprims) |
5850             S_028A44_GS_INST_PRIMS_IN_SUBGRP(ngg_state->max_gsprims * gs_num_invocations));
5851    }
5852 
5853    radeon_set_context_reg(ctx_cs, R_0287FC_GE_MAX_OUTPUT_PER_SUBGROUP,
5854                           S_0287FC_MAX_VERTS_PER_SUBGROUP(ngg_state->max_out_verts));
5855    radeon_set_context_reg(ctx_cs, R_028B4C_GE_NGG_SUBGRP_CNTL,
5856                           S_028B4C_PRIM_AMP_FACTOR(ngg_state->prim_amp_factor) |
5857                              S_028B4C_THDS_PER_SUBGRP(0)); /* for fast launch */
5858    radeon_set_context_reg(
5859       ctx_cs, R_028B90_VGT_GS_INSTANCE_CNT,
5860       S_028B90_CNT(gs_num_invocations) | S_028B90_ENABLE(gs_num_invocations > 1) |
5861          S_028B90_EN_MAX_VERT_OUT_PER_GS_INSTANCE(ngg_state->max_vert_out_per_gs_instance));
5862 
5863    if (pdevice->rad_info.gfx_level >= GFX11) {
5864       ge_cntl = S_03096C_PRIMS_PER_SUBGRP(ngg_state->max_gsprims) |
5865                 S_03096C_VERTS_PER_SUBGRP(ngg_state->enable_vertex_grouping
5866                                           ? ngg_state->hw_max_esverts
5867                                           : 256) | /* 256 = disable vertex grouping */
5868                 S_03096C_BREAK_PRIMGRP_AT_EOI(break_wave_at_eoi) |
5869                 S_03096C_PRIM_GRP_SIZE_GFX11(256);
5870    } else {
5871       ge_cntl = S_03096C_PRIM_GRP_SIZE_GFX10(ngg_state->max_gsprims) |
5872                 S_03096C_VERT_GRP_SIZE(ngg_state->enable_vertex_grouping
5873                                           ? ngg_state->hw_max_esverts
5874                                           : 256) | /* 256 = disable vertex grouping */
5875                 S_03096C_BREAK_WAVE_AT_EOI(break_wave_at_eoi);
5876    }
5877 
5878    /* Bug workaround for a possible hang with non-tessellation cases.
5879     * Tessellation always sets GE_CNTL.VERT_GRP_SIZE = 0
5880     *
5881     * Requirement: GE_CNTL.VERT_GRP_SIZE = VGT_GS_ONCHIP_CNTL.ES_VERTS_PER_SUBGRP - 5
5882     */
5883    if (pdevice->rad_info.gfx_level == GFX10 &&
5884        !radv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_CTRL) && ngg_state->hw_max_esverts != 256) {
5885       ge_cntl &= C_03096C_VERT_GRP_SIZE;
5886 
5887       if (ngg_state->hw_max_esverts > 5) {
5888          ge_cntl |= S_03096C_VERT_GRP_SIZE(ngg_state->hw_max_esverts - 5);
5889       }
5890    }
5891 
5892    radeon_set_uconfig_reg(ctx_cs, R_03096C_GE_CNTL, ge_cntl);
5893 
5894    unsigned late_alloc_wave64, cu_mask;
5895    ac_compute_late_alloc(&pdevice->rad_info, true, shader->info.has_ngg_culling,
5896                          shader->config.scratch_bytes_per_wave > 0, &late_alloc_wave64, &cu_mask);
5897 
5898    if (pdevice->rad_info.gfx_level >= GFX11) {
5899       /* TODO: figure out how S_00B204_CU_EN_GFX11 interacts with ac_set_reg_cu_en */
5900       gfx10_set_sh_reg_idx3(cs, R_00B21C_SPI_SHADER_PGM_RSRC3_GS,
5901                             S_00B21C_CU_EN(cu_mask) | S_00B21C_WAVE_LIMIT(0x3F));
5902       gfx10_set_sh_reg_idx3(
5903          cs, R_00B204_SPI_SHADER_PGM_RSRC4_GS,
5904          S_00B204_CU_EN_GFX11(0x1) | S_00B204_SPI_SHADER_LATE_ALLOC_GS_GFX10(late_alloc_wave64));
5905    } else if (pdevice->rad_info.gfx_level >= GFX10) {
5906       ac_set_reg_cu_en(cs, R_00B21C_SPI_SHADER_PGM_RSRC3_GS,
5907                        S_00B21C_CU_EN(cu_mask) | S_00B21C_WAVE_LIMIT(0x3F),
5908                        C_00B21C_CU_EN, 0, &pdevice->rad_info, (void*)gfx10_set_sh_reg_idx3);
5909       ac_set_reg_cu_en(cs, R_00B204_SPI_SHADER_PGM_RSRC4_GS,
5910                        S_00B204_CU_EN_GFX10(0xffff) | S_00B204_SPI_SHADER_LATE_ALLOC_GS_GFX10(late_alloc_wave64),
5911                        C_00B204_CU_EN_GFX10, 16, &pdevice->rad_info,
5912                        (void*)gfx10_set_sh_reg_idx3);
5913    } else {
5914       radeon_set_sh_reg_idx(
5915          pdevice, cs, R_00B21C_SPI_SHADER_PGM_RSRC3_GS, 3,
5916          S_00B21C_CU_EN(cu_mask) | S_00B21C_WAVE_LIMIT(0x3F));
5917       radeon_set_sh_reg_idx(
5918          pdevice, cs, R_00B204_SPI_SHADER_PGM_RSRC4_GS, 3,
5919          S_00B204_CU_EN_GFX10(0xffff) | S_00B204_SPI_SHADER_LATE_ALLOC_GS_GFX10(late_alloc_wave64));
5920    }
5921 
5922    uint32_t oversub_pc_lines = late_alloc_wave64 ? pdevice->rad_info.pc_lines / 4 : 0;
5923    if (shader->info.has_ngg_culling) {
5924       unsigned oversub_factor = 2;
5925 
5926       if (outinfo->param_exports > 4)
5927          oversub_factor = 4;
5928       else if (outinfo->param_exports > 2)
5929          oversub_factor = 3;
5930 
5931       oversub_pc_lines *= oversub_factor;
5932    }
5933 
5934    gfx10_emit_ge_pc_alloc(cs, pdevice->rad_info.gfx_level, oversub_pc_lines);
5935 }
5936 
5937 static void
5938 radv_pipeline_emit_hw_hs(struct radeon_cmdbuf *cs, const struct radv_graphics_pipeline *pipeline,
5939                          const struct radv_shader *shader)
5940 {
5941    const struct radv_physical_device *pdevice = pipeline->base.device->physical_device;
5942    uint64_t va = radv_shader_get_va(shader);
5943 
5944    if (pdevice->rad_info.gfx_level >= GFX9) {
5945       if (pdevice->rad_info.gfx_level >= GFX10) {
5946          radeon_set_sh_reg(cs, R_00B520_SPI_SHADER_PGM_LO_LS, va >> 8);
5947       } else {
5948          radeon_set_sh_reg(cs, R_00B410_SPI_SHADER_PGM_LO_LS, va >> 8);
5949       }
5950 
5951       radeon_set_sh_reg_seq(cs, R_00B428_SPI_SHADER_PGM_RSRC1_HS, 2);
5952       radeon_emit(cs, shader->config.rsrc1);
5953       radeon_emit(cs, shader->config.rsrc2);
5954    } else {
5955       radeon_set_sh_reg_seq(cs, R_00B420_SPI_SHADER_PGM_LO_HS, 4);
5956       radeon_emit(cs, va >> 8);
5957       radeon_emit(cs, S_00B424_MEM_BASE(va >> 40));
5958       radeon_emit(cs, shader->config.rsrc1);
5959       radeon_emit(cs, shader->config.rsrc2);
5960    }
5961 }
5962 
5963 static void
5964 radv_pipeline_emit_vertex_shader(struct radeon_cmdbuf *ctx_cs, struct radeon_cmdbuf *cs,
5965                                  const struct radv_graphics_pipeline *pipeline)
5966 {
5967    struct radv_shader *vs;
5968 
5969    /* Skip shaders merged into HS/GS */
5970    vs = pipeline->base.shaders[MESA_SHADER_VERTEX];
5971    if (!vs)
5972       return;
5973 
5974    if (vs->info.vs.as_ls)
5975       radv_pipeline_emit_hw_ls(cs, pipeline, vs);
5976    else if (vs->info.vs.as_es)
5977       radv_pipeline_emit_hw_es(cs, pipeline, vs);
5978    else if (vs->info.is_ngg)
5979       radv_pipeline_emit_hw_ngg(ctx_cs, cs, pipeline, vs);
5980    else
5981       radv_pipeline_emit_hw_vs(ctx_cs, cs, pipeline, vs);
5982 }
5983 
5984 static void
5985 radv_pipeline_emit_tess_shaders(struct radeon_cmdbuf *ctx_cs, struct radeon_cmdbuf *cs,
5986                                 const struct radv_graphics_pipeline *pipeline)
5987 {
5988    const struct radv_physical_device *pdevice = pipeline->base.device->physical_device;
5989    struct radv_shader *tes, *tcs;
5990 
5991    tcs = pipeline->base.shaders[MESA_SHADER_TESS_CTRL];
5992    tes = pipeline->base.shaders[MESA_SHADER_TESS_EVAL];
5993 
5994    if (tes) {
5995       if (tes->info.is_ngg) {
5996          radv_pipeline_emit_hw_ngg(ctx_cs, cs, pipeline, tes);
5997       } else if (tes->info.tes.as_es)
5998          radv_pipeline_emit_hw_es(cs, pipeline, tes);
5999       else
6000          radv_pipeline_emit_hw_vs(ctx_cs, cs, pipeline, tes);
6001    }
6002 
6003    radv_pipeline_emit_hw_hs(cs, pipeline, tcs);
6004 
6005    if (pdevice->rad_info.gfx_level >= GFX10 &&
6006        !radv_pipeline_has_stage(pipeline, MESA_SHADER_GEOMETRY) && !radv_pipeline_has_ngg(pipeline)) {
6007       radeon_set_context_reg(ctx_cs, R_028A44_VGT_GS_ONCHIP_CNTL,
6008                              S_028A44_ES_VERTS_PER_SUBGRP(250) | S_028A44_GS_PRIMS_PER_SUBGRP(126) |
6009                                 S_028A44_GS_INST_PRIMS_IN_SUBGRP(126));
6010    }
6011 }
6012 
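/* Program the fixed-function tessellator state: VGT_LS_HS_CONFIG (number of
 * patches and input/output control points per patch) and VGT_TF_PARAM (domain
 * type, spacing, winding/topology, and the distribution mode used to spread
 * patches across shader engines).
 */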
6013 static void
6014 radv_pipeline_emit_tess_state(struct radeon_cmdbuf *ctx_cs,
6015                               const struct radv_graphics_pipeline *pipeline,
6016                               const struct radv_graphics_pipeline_info *info)
6017 {
6018    const struct radv_physical_device *pdevice = pipeline->base.device->physical_device;
6019    struct radv_shader *tes = radv_get_shader(&pipeline->base, MESA_SHADER_TESS_EVAL);
6020    unsigned type = 0, partitioning = 0, topology = 0, distribution_mode = 0;
6021    unsigned num_tcs_input_cp, num_tcs_output_cp, num_patches;
6022    unsigned ls_hs_config;
6023 
6024    num_tcs_input_cp = info->ts.patch_control_points;
6025    num_tcs_output_cp =
6026       pipeline->base.shaders[MESA_SHADER_TESS_CTRL]->info.tcs.tcs_vertices_out;
6027    num_patches = pipeline->base.shaders[MESA_SHADER_TESS_CTRL]->info.num_tess_patches;
6028 
6029    ls_hs_config = S_028B58_NUM_PATCHES(num_patches) | S_028B58_HS_NUM_INPUT_CP(num_tcs_input_cp) |
6030                   S_028B58_HS_NUM_OUTPUT_CP(num_tcs_output_cp);
6031 
6032    if (pdevice->rad_info.gfx_level >= GFX7) {
6033       radeon_set_context_reg_idx(ctx_cs, R_028B58_VGT_LS_HS_CONFIG, 2, ls_hs_config);
6034    } else {
6035       radeon_set_context_reg(ctx_cs, R_028B58_VGT_LS_HS_CONFIG, ls_hs_config);
6036    }
6037 
6038    switch (tes->info.tes._primitive_mode) {
6039    case TESS_PRIMITIVE_TRIANGLES:
6040       type = V_028B6C_TESS_TRIANGLE;
6041       break;
6042    case TESS_PRIMITIVE_QUADS:
6043       type = V_028B6C_TESS_QUAD;
6044       break;
6045    case TESS_PRIMITIVE_ISOLINES:
6046       type = V_028B6C_TESS_ISOLINE;
6047       break;
6048    default:
6049       break;
6050    }
6051 
6052    switch (tes->info.tes.spacing) {
6053    case TESS_SPACING_EQUAL:
6054       partitioning = V_028B6C_PART_INTEGER;
6055       break;
6056    case TESS_SPACING_FRACTIONAL_ODD:
6057       partitioning = V_028B6C_PART_FRAC_ODD;
6058       break;
6059    case TESS_SPACING_FRACTIONAL_EVEN:
6060       partitioning = V_028B6C_PART_FRAC_EVEN;
6061       break;
6062    default:
6063       break;
6064    }
6065 
6066    bool ccw = tes->info.tes.ccw;
6067    if (info->ts.domain_origin != VK_TESSELLATION_DOMAIN_ORIGIN_UPPER_LEFT)
6068       ccw = !ccw;
6069 
6070    if (tes->info.tes.point_mode)
6071       topology = V_028B6C_OUTPUT_POINT;
6072    else if (tes->info.tes._primitive_mode == TESS_PRIMITIVE_ISOLINES)
6073       topology = V_028B6C_OUTPUT_LINE;
6074    else if (ccw)
6075       topology = V_028B6C_OUTPUT_TRIANGLE_CCW;
6076    else
6077       topology = V_028B6C_OUTPUT_TRIANGLE_CW;
6078 
6079    if (pdevice->rad_info.has_distributed_tess) {
6080       if (pdevice->rad_info.family == CHIP_FIJI || pdevice->rad_info.family >= CHIP_POLARIS10)
6081          distribution_mode = V_028B6C_TRAPEZOIDS;
6082       else
6083          distribution_mode = V_028B6C_DONUTS;
6084    } else
6085       distribution_mode = V_028B6C_NO_DIST;
6086 
6087    radeon_set_context_reg(ctx_cs, R_028B6C_VGT_TF_PARAM,
6088                           S_028B6C_TYPE(type) | S_028B6C_PARTITIONING(partitioning) |
6089                              S_028B6C_TOPOLOGY(topology) |
6090                              S_028B6C_DISTRIBUTION_MODE(distribution_mode));
6091 }
6092 
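/* Emit a legacy (non-NGG) geometry shader: GSVS ring offsets and item sizes
 * per vertex stream, the GS instancing count, the ESGS ring item size, and the
 * GS program registers. The fixed-function VS stage still needs the GS copy
 * shader to read the GSVS ring, which is emitted at the end.
 */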
6093 static void
6094 radv_pipeline_emit_hw_gs(struct radeon_cmdbuf *ctx_cs, struct radeon_cmdbuf *cs,
6095                          const struct radv_graphics_pipeline *pipeline, const struct radv_shader *gs)
6096 {
6097    const struct radv_physical_device *pdevice = pipeline->base.device->physical_device;
6098    const struct gfx9_gs_info *gs_state = &gs->info.gs_ring_info;
6099    unsigned gs_max_out_vertices;
6100    const uint8_t *num_components;
6101    uint8_t max_stream;
6102    unsigned offset;
6103    uint64_t va;
6104 
6105    gs_max_out_vertices = gs->info.gs.vertices_out;
6106    max_stream = gs->info.gs.max_stream;
6107    num_components = gs->info.gs.num_stream_output_components;
6108 
6109    offset = num_components[0] * gs_max_out_vertices;
6110 
6111    radeon_set_context_reg_seq(ctx_cs, R_028A60_VGT_GSVS_RING_OFFSET_1, 3);
6112    radeon_emit(ctx_cs, offset);
6113    if (max_stream >= 1)
6114       offset += num_components[1] * gs_max_out_vertices;
6115    radeon_emit(ctx_cs, offset);
6116    if (max_stream >= 2)
6117       offset += num_components[2] * gs_max_out_vertices;
6118    radeon_emit(ctx_cs, offset);
6119    if (max_stream >= 3)
6120       offset += num_components[3] * gs_max_out_vertices;
6121    radeon_set_context_reg(ctx_cs, R_028AB0_VGT_GSVS_RING_ITEMSIZE, offset);
6122 
6123    radeon_set_context_reg_seq(ctx_cs, R_028B5C_VGT_GS_VERT_ITEMSIZE, 4);
6124    radeon_emit(ctx_cs, num_components[0]);
6125    radeon_emit(ctx_cs, (max_stream >= 1) ? num_components[1] : 0);
6126    radeon_emit(ctx_cs, (max_stream >= 2) ? num_components[2] : 0);
6127    radeon_emit(ctx_cs, (max_stream >= 3) ? num_components[3] : 0);
6128 
6129    uint32_t gs_num_invocations = gs->info.gs.invocations;
6130    radeon_set_context_reg(
6131       ctx_cs, R_028B90_VGT_GS_INSTANCE_CNT,
6132       S_028B90_CNT(MIN2(gs_num_invocations, 127)) | S_028B90_ENABLE(gs_num_invocations > 0));
6133 
6134    radeon_set_context_reg(ctx_cs, R_028AAC_VGT_ESGS_RING_ITEMSIZE,
6135                           gs_state->vgt_esgs_ring_itemsize);
6136 
6137    va = radv_shader_get_va(gs);
6138 
6139    if (pdevice->rad_info.gfx_level >= GFX9) {
6140       if (pdevice->rad_info.gfx_level >= GFX10) {
6141          radeon_set_sh_reg(cs, R_00B320_SPI_SHADER_PGM_LO_ES, va >> 8);
6142       } else {
6143          radeon_set_sh_reg(cs, R_00B210_SPI_SHADER_PGM_LO_ES, va >> 8);
6144       }
6145 
6146       radeon_set_sh_reg_seq(cs, R_00B228_SPI_SHADER_PGM_RSRC1_GS, 2);
6147       radeon_emit(cs, gs->config.rsrc1);
6148       radeon_emit(cs, gs->config.rsrc2 | S_00B22C_LDS_SIZE(gs_state->lds_size));
6149 
6150       radeon_set_context_reg(ctx_cs, R_028A44_VGT_GS_ONCHIP_CNTL, gs_state->vgt_gs_onchip_cntl);
6151       radeon_set_context_reg(ctx_cs, R_028A94_VGT_GS_MAX_PRIMS_PER_SUBGROUP,
6152                              gs_state->vgt_gs_max_prims_per_subgroup);
6153    } else {
6154       radeon_set_sh_reg_seq(cs, R_00B220_SPI_SHADER_PGM_LO_GS, 4);
6155       radeon_emit(cs, va >> 8);
6156       radeon_emit(cs, S_00B224_MEM_BASE(va >> 40));
6157       radeon_emit(cs, gs->config.rsrc1);
6158       radeon_emit(cs, gs->config.rsrc2);
6159    }
6160 
6161    if (pdevice->rad_info.gfx_level >= GFX11) {
6162       ac_set_reg_cu_en(cs, R_00B21C_SPI_SHADER_PGM_RSRC3_GS,
6163                        S_00B21C_CU_EN(0xffff) | S_00B21C_WAVE_LIMIT(0x3F),
6164                        C_00B21C_CU_EN, 0, &pdevice->rad_info,
6165                        (void*)gfx10_set_sh_reg_idx3);
6166       ac_set_reg_cu_en(cs, R_00B204_SPI_SHADER_PGM_RSRC4_GS,
6167                        S_00B204_CU_EN_GFX10(0xffff) | S_00B204_SPI_SHADER_LATE_ALLOC_GS_GFX10(0),
6168                        C_00B204_CU_EN_GFX10, 16, &pdevice->rad_info,
6169                        (void*)gfx10_set_sh_reg_idx3);
6170    } else if (pdevice->rad_info.gfx_level >= GFX7) {
6171       radeon_set_sh_reg_idx(
6172          pdevice, cs, R_00B21C_SPI_SHADER_PGM_RSRC3_GS, 3,
6173          S_00B21C_CU_EN(0xffff) | S_00B21C_WAVE_LIMIT(0x3F));
6174 
6175       if (pdevice->rad_info.gfx_level >= GFX10) {
6176          radeon_set_sh_reg_idx(
6177             pdevice, cs, R_00B204_SPI_SHADER_PGM_RSRC4_GS, 3,
6178             S_00B204_CU_EN_GFX10(0xffff) | S_00B204_SPI_SHADER_LATE_ALLOC_GS_GFX10(0));
6179       }
6180    }
6181 
6182    radv_pipeline_emit_hw_vs(ctx_cs, cs, pipeline, pipeline->base.gs_copy_shader);
6183 }
6184 
6185 static void
6186 radv_pipeline_emit_geometry_shader(struct radeon_cmdbuf *ctx_cs, struct radeon_cmdbuf *cs,
6187                                    const struct radv_graphics_pipeline *pipeline)
6188 {
6189    struct radv_shader *gs;
6190 
6191    gs = pipeline->base.shaders[MESA_SHADER_GEOMETRY];
6192    if (!gs)
6193       return;
6194 
6195    if (gs->info.is_ngg)
6196       radv_pipeline_emit_hw_ngg(ctx_cs, cs, pipeline, gs);
6197    else
6198       radv_pipeline_emit_hw_gs(ctx_cs, cs, pipeline, gs);
6199 
6200    radeon_set_context_reg(ctx_cs, R_028B38_VGT_GS_MAX_VERT_OUT, gs->info.gs.vertices_out);
6201 }
6202 
6203 static void
6204 radv_pipeline_emit_mesh_shader(struct radeon_cmdbuf *ctx_cs, struct radeon_cmdbuf *cs,
6205                                const struct radv_graphics_pipeline *pipeline)
6206 {
6207    const struct radv_physical_device *pdevice = pipeline->base.device->physical_device;
6208    struct radv_shader *ms = pipeline->base.shaders[MESA_SHADER_MESH];
6209    if (!ms)
6210       return;
6211 
6212    radv_pipeline_emit_hw_ngg(ctx_cs, cs, pipeline, ms);
6213    radeon_set_context_reg(ctx_cs, R_028B38_VGT_GS_MAX_VERT_OUT, ms->info.workgroup_size);
6214    radeon_set_uconfig_reg_idx(pdevice, ctx_cs,
6215                               R_030908_VGT_PRIMITIVE_TYPE, 1, V_008958_DI_PT_POINTLIST);
6216 }
6217 
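/* Build one SPI_PS_INPUT_CNTL dword: either a parameter-cache offset together
 * with the flat-shade/explicit/FP16 attributes, or a DEFAULT_VAL constant
 * (offset 0x20) when the producer does not export the parameter at all.
 */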
6218 static uint32_t
6219 offset_to_ps_input(uint32_t offset, bool flat_shade, bool explicit, bool float16)
6220 {
6221    uint32_t ps_input_cntl;
6222    if (offset <= AC_EXP_PARAM_OFFSET_31) {
6223       ps_input_cntl = S_028644_OFFSET(offset);
6224       if (flat_shade || explicit)
6225          ps_input_cntl |= S_028644_FLAT_SHADE(1);
6226       if (explicit) {
6227          /* Force parameter cache to be read in passthrough
6228           * mode.
6229           */
6230          ps_input_cntl |= S_028644_OFFSET(1 << 5);
6231       }
6232       if (float16) {
6233          ps_input_cntl |= S_028644_FP16_INTERP_MODE(1) | S_028644_ATTR0_VALID(1);
6234       }
6235    } else {
6236       /* The input is a DEFAULT_VAL constant. */
6237       assert(offset >= AC_EXP_PARAM_DEFAULT_VAL_0000 && offset <= AC_EXP_PARAM_DEFAULT_VAL_1111);
6238       offset -= AC_EXP_PARAM_DEFAULT_VAL_0000;
6239       ps_input_cntl = S_028644_OFFSET(0x20) | S_028644_DEFAULT_VAL(offset);
6240    }
6241    return ps_input_cntl;
6242 }
6243 
6244 static void
6245 single_slot_to_ps_input(const struct radv_vs_output_info *outinfo,
6246                         unsigned slot, uint32_t *ps_input_cntl, unsigned *ps_offset,
6247                         bool skip_undef, bool use_default_0, bool flat_shade)
6248 {
6249    unsigned vs_offset = outinfo->vs_output_param_offset[slot];
6250 
6251    if (vs_offset == AC_EXP_PARAM_UNDEFINED) {
6252       if (skip_undef)
6253          return;
6254       else if (use_default_0)
6255          vs_offset = AC_EXP_PARAM_DEFAULT_VAL_0000;
6256       else
6257          unreachable("vs_offset should not be AC_EXP_PARAM_UNDEFINED.");
6258    }
6259 
6260    ps_input_cntl[*ps_offset] = offset_to_ps_input(vs_offset, flat_shade, false, false);
6261    ++(*ps_offset);
6262 }
6263 
6264 static void
6265 input_mask_to_ps_inputs(const struct radv_vs_output_info *outinfo, const struct radv_shader *ps,
6266                         uint32_t input_mask, uint32_t *ps_input_cntl, unsigned *ps_offset)
6267 {
6268    u_foreach_bit(i, input_mask) {
6269       unsigned vs_offset = outinfo->vs_output_param_offset[VARYING_SLOT_VAR0 + i];
6270       if (vs_offset == AC_EXP_PARAM_UNDEFINED) {
6271          ps_input_cntl[*ps_offset] = S_028644_OFFSET(0x20);
6272          ++(*ps_offset);
6273          continue;
6274       }
6275 
6276       bool flat_shade = !!(ps->info.ps.flat_shaded_mask & (1u << *ps_offset));
6277       bool explicit = !!(ps->info.ps.explicit_shaded_mask & (1u << *ps_offset));
6278       bool float16 = !!(ps->info.ps.float16_shaded_mask & (1u << *ps_offset));
6279 
6280       ps_input_cntl[*ps_offset] = offset_to_ps_input(vs_offset, flat_shade, explicit, float16);
6281       ++(*ps_offset);
6282    }
6283 }
6284 
6285 static void
6286 radv_pipeline_emit_ps_inputs(struct radeon_cmdbuf *ctx_cs,
6287                              const struct radv_graphics_pipeline *pipeline)
6288 {
6289    struct radv_shader *ps = pipeline->base.shaders[MESA_SHADER_FRAGMENT];
6290    const struct radv_vs_output_info *outinfo = get_vs_output_info(pipeline);
6291    bool mesh = radv_pipeline_has_stage(pipeline, MESA_SHADER_MESH);
6292    uint32_t ps_input_cntl[32];
6293 
6294    unsigned ps_offset = 0;
6295 
6296    if (ps->info.ps.prim_id_input && !mesh)
6297       single_slot_to_ps_input(outinfo, VARYING_SLOT_PRIMITIVE_ID, ps_input_cntl, &ps_offset,
6298                               true, false, true);
6299 
6300    if (ps->info.ps.layer_input && !mesh)
6301       single_slot_to_ps_input(outinfo, VARYING_SLOT_LAYER, ps_input_cntl, &ps_offset,
6302                               false, true, true);
6303 
6304    if (ps->info.ps.viewport_index_input && !mesh)
6305       single_slot_to_ps_input(outinfo, VARYING_SLOT_VIEWPORT, ps_input_cntl, &ps_offset,
6306                               false, true, true);
6307 
6308    if (ps->info.ps.has_pcoord)
6309       ps_input_cntl[ps_offset++] = S_028644_PT_SPRITE_TEX(1) | S_028644_OFFSET(0x20);
6310 
6311    if (ps->info.ps.num_input_clips_culls) {
6312       single_slot_to_ps_input(outinfo, VARYING_SLOT_CLIP_DIST0, ps_input_cntl, &ps_offset,
6313                               true, false, false);
6314 
6315       if (ps->info.ps.num_input_clips_culls > 4)
6316          single_slot_to_ps_input(outinfo, VARYING_SLOT_CLIP_DIST1, ps_input_cntl, &ps_offset,
6317                                  true, false, false);
6318    }
6319 
6320    input_mask_to_ps_inputs(outinfo, ps, ps->info.ps.input_mask,
6321                            ps_input_cntl, &ps_offset);
6322 
6323    /* Per-primitive PS inputs: the HW needs these to be last. */
6324 
6325    if (ps->info.ps.prim_id_input && mesh)
6326       single_slot_to_ps_input(outinfo, VARYING_SLOT_PRIMITIVE_ID, ps_input_cntl, &ps_offset,
6327                               true, false, false);
6328 
6329    if (ps->info.ps.layer_input && mesh)
6330       single_slot_to_ps_input(outinfo, VARYING_SLOT_LAYER, ps_input_cntl, &ps_offset,
6331                               false, true, false);
6332 
6333    if (ps->info.ps.viewport_index_input && mesh)
6334       single_slot_to_ps_input(outinfo, VARYING_SLOT_VIEWPORT, ps_input_cntl, &ps_offset,
6335                               false, true, false);
6336 
6337    input_mask_to_ps_inputs(outinfo, ps, ps->info.ps.input_per_primitive_mask,
6338                            ps_input_cntl, &ps_offset);
6339 
6340    if (ps_offset) {
6341       radeon_set_context_reg_seq(ctx_cs, R_028644_SPI_PS_INPUT_CNTL_0, ps_offset);
6342       for (unsigned i = 0; i < ps_offset; i++) {
6343          radeon_emit(ctx_cs, ps_input_cntl[i]);
6344       }
6345    }
6346 }
6347 
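/* Compute DB_SHADER_CONTROL from the fragment shader info: Z/stencil/sample
 * mask export enables, early-vs-late Z ordering, conservative depth export
 * hints, and the RB+ dual-quad disable when RB+ exists but is not allowed.
 */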
6348 static uint32_t
6349 radv_compute_db_shader_control(const struct radv_physical_device *pdevice,
6350                                const struct radv_graphics_pipeline *pipeline,
6351                                const struct radv_shader *ps)
6352 {
6353    unsigned conservative_z_export = V_02880C_EXPORT_ANY_Z;
6354    unsigned z_order;
6355    if (ps->info.ps.early_fragment_test || !ps->info.ps.writes_memory)
6356       z_order = V_02880C_EARLY_Z_THEN_LATE_Z;
6357    else
6358       z_order = V_02880C_LATE_Z;
6359 
6360    if (ps->info.ps.depth_layout == FRAG_DEPTH_LAYOUT_GREATER)
6361       conservative_z_export = V_02880C_EXPORT_GREATER_THAN_Z;
6362    else if (ps->info.ps.depth_layout == FRAG_DEPTH_LAYOUT_LESS)
6363       conservative_z_export = V_02880C_EXPORT_LESS_THAN_Z;
6364 
6365    bool disable_rbplus = pdevice->rad_info.has_rbplus && !pdevice->rad_info.rbplus_allowed;
6366 
6367    /* Exporting gl_SampleMask shouldn't be needed when MSAA is disabled, but
6368     * skipping the export appears to break Project Cars (DXVK). See
6369     * https://bugs.freedesktop.org/show_bug.cgi?id=109401
6370     */
6371    bool mask_export_enable = ps->info.ps.writes_sample_mask;
6372 
6373    return S_02880C_Z_EXPORT_ENABLE(ps->info.ps.writes_z) |
6374           S_02880C_STENCIL_TEST_VAL_EXPORT_ENABLE(ps->info.ps.writes_stencil) |
6375           S_02880C_KILL_ENABLE(!!ps->info.ps.can_discard) |
6376           S_02880C_MASK_EXPORT_ENABLE(mask_export_enable) |
6377           S_02880C_CONSERVATIVE_Z_EXPORT(conservative_z_export) | S_02880C_Z_ORDER(z_order) |
6378           S_02880C_DEPTH_BEFORE_SHADER(ps->info.ps.early_fragment_test) |
6379           S_02880C_PRE_SHADER_DEPTH_COVERAGE_ENABLE(ps->info.ps.post_depth_coverage) |
6380           S_02880C_EXEC_ON_HIER_FAIL(ps->info.ps.writes_memory) |
6381           S_02880C_EXEC_ON_NOOP(ps->info.ps.writes_memory) |
6382           S_02880C_DUAL_QUAD_DISABLE(disable_rbplus);
6383 }
6384 
6385 static void
6386 radv_pipeline_emit_fragment_shader(struct radeon_cmdbuf *ctx_cs, struct radeon_cmdbuf *cs,
6387                                    const struct radv_graphics_pipeline *pipeline)
6388 {
6389    const struct radv_physical_device *pdevice = pipeline->base.device->physical_device;
6390    struct radv_shader *ps;
6391    bool param_gen;
6392    uint64_t va;
6393    assert(pipeline->base.shaders[MESA_SHADER_FRAGMENT]);
6394 
6395    ps = pipeline->base.shaders[MESA_SHADER_FRAGMENT];
6396    va = radv_shader_get_va(ps);
6397 
6398    radeon_set_sh_reg_seq(cs, R_00B020_SPI_SHADER_PGM_LO_PS, 4);
6399    radeon_emit(cs, va >> 8);
6400    radeon_emit(cs, S_00B024_MEM_BASE(va >> 40));
6401    radeon_emit(cs, ps->config.rsrc1);
6402    radeon_emit(cs, ps->config.rsrc2);
6403 
6404    radeon_set_context_reg(ctx_cs, R_02880C_DB_SHADER_CONTROL,
6405                           radv_compute_db_shader_control(pdevice, pipeline, ps));
6406 
6407    radeon_set_context_reg_seq(ctx_cs, R_0286CC_SPI_PS_INPUT_ENA, 2);
6408    radeon_emit(ctx_cs, ps->config.spi_ps_input_ena);
6409    radeon_emit(ctx_cs, ps->config.spi_ps_input_addr);
6410 
6411    /* Workaround when there are no PS inputs but LDS is used. */
6412    param_gen = pdevice->rad_info.gfx_level >= GFX11 &&
6413                !ps->info.ps.num_interp && ps->config.lds_size;
6414 
6415    radeon_set_context_reg(
6416       ctx_cs, R_0286D8_SPI_PS_IN_CONTROL,
6417       S_0286D8_NUM_INTERP(ps->info.ps.num_interp) |
6418       S_0286D8_NUM_PRIM_INTERP(ps->info.ps.num_prim_interp) |
6419       S_0286D8_PS_W32_EN(ps->info.wave_size == 32) |
6420       S_0286D8_PARAM_GEN(param_gen));
6421 
6422    radeon_set_context_reg(ctx_cs, R_0286E0_SPI_BARYC_CNTL, pipeline->spi_baryc_cntl);
6423 
6424    radeon_set_context_reg(
6425       ctx_cs, R_028710_SPI_SHADER_Z_FORMAT,
6426       ac_get_spi_shader_z_format(ps->info.ps.writes_z, ps->info.ps.writes_stencil,
6427                                  ps->info.ps.writes_sample_mask, false));
6428 }
6429 
6430 static void
6431 radv_pipeline_emit_vgt_vertex_reuse(struct radeon_cmdbuf *ctx_cs,
6432                                     const struct radv_graphics_pipeline *pipeline)
6433 {
6434    const struct radv_physical_device *pdevice = pipeline->base.device->physical_device;
6435 
6436    if (pdevice->rad_info.family < CHIP_POLARIS10 || pdevice->rad_info.gfx_level >= GFX10)
6437       return;
6438 
6439    unsigned vtx_reuse_depth = 30;
6440    if (radv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_CTRL) &&
6441        radv_get_shader(&pipeline->base, MESA_SHADER_TESS_EVAL)->info.tes.spacing ==
6442           TESS_SPACING_FRACTIONAL_ODD) {
6443       vtx_reuse_depth = 14;
6444    }
6445    radeon_set_context_reg(ctx_cs, R_028C58_VGT_VERTEX_REUSE_BLOCK_CNTL,
6446                           S_028C58_VTX_REUSE_DEPTH(vtx_reuse_depth));
6447 }
6448 
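/* Program VGT_SHADER_STAGES_EN: which hardware stages (LS/HS/ES/GS/VS, NGG
 * primgen, mesh fast launch) are active for this pipeline, plus the Wave32
 * enables on GFX10+. A legacy GS keeps the hardware VS stage alive for the
 * copy shader.
 */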
6449 static void
6450 radv_pipeline_emit_vgt_shader_config(struct radeon_cmdbuf *ctx_cs,
6451                                      const struct radv_graphics_pipeline *pipeline)
6452 {
6453    const struct radv_physical_device *pdevice = pipeline->base.device->physical_device;
6454    uint32_t stages = 0;
6455    if (radv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_CTRL)) {
6456       stages |= S_028B54_LS_EN(V_028B54_LS_STAGE_ON) | S_028B54_HS_EN(1) | S_028B54_DYNAMIC_HS(1);
6457 
6458       if (radv_pipeline_has_stage(pipeline, MESA_SHADER_GEOMETRY))
6459          stages |= S_028B54_ES_EN(V_028B54_ES_STAGE_DS) | S_028B54_GS_EN(1);
6460       else if (radv_pipeline_has_ngg(pipeline))
6461          stages |= S_028B54_ES_EN(V_028B54_ES_STAGE_DS);
6462       else
6463          stages |= S_028B54_VS_EN(V_028B54_VS_STAGE_DS);
6464    } else if (radv_pipeline_has_stage(pipeline, MESA_SHADER_GEOMETRY)) {
6465       stages |= S_028B54_ES_EN(V_028B54_ES_STAGE_REAL) | S_028B54_GS_EN(1);
6466    } else if (radv_pipeline_has_stage(pipeline, MESA_SHADER_MESH)) {
6467       assert(!radv_pipeline_has_ngg_passthrough(pipeline));
6468       stages |= S_028B54_GS_EN(1) | S_028B54_GS_FAST_LAUNCH(1);
6469 
6470       if (pipeline->base.shaders[MESA_SHADER_MESH]->info.ms.needs_ms_scratch_ring)
6471          stages |= S_028B54_NGG_WAVE_ID_EN(1);
6472    } else if (radv_pipeline_has_ngg(pipeline)) {
6473       stages |= S_028B54_ES_EN(V_028B54_ES_STAGE_REAL);
6474    }
6475 
6476    if (radv_pipeline_has_ngg(pipeline)) {
6477       stages |= S_028B54_PRIMGEN_EN(1);
6478       if (pipeline->streamout_shader)
6479          stages |= S_028B54_NGG_WAVE_ID_EN(1);
6480       if (radv_pipeline_has_ngg_passthrough(pipeline)) {
6481          stages |= S_028B54_PRIMGEN_PASSTHRU_EN(1);
6482          if (pdevice->rad_info.family >= CHIP_NAVI23)
6483             stages |= S_028B54_PRIMGEN_PASSTHRU_NO_MSG(1);
6484       }
6485    } else if (radv_pipeline_has_stage(pipeline, MESA_SHADER_GEOMETRY)) {
6486       stages |= S_028B54_VS_EN(V_028B54_VS_STAGE_COPY_SHADER);
6487    }
6488 
6489    if (pdevice->rad_info.gfx_level >= GFX9)
6490       stages |= S_028B54_MAX_PRIMGRP_IN_WAVE(2);
6491 
6492    if (pdevice->rad_info.gfx_level >= GFX10) {
6493       uint8_t hs_size = 64, gs_size = 64, vs_size = 64;
6494 
6495       if (radv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_CTRL))
6496          hs_size = pipeline->base.shaders[MESA_SHADER_TESS_CTRL]->info.wave_size;
6497 
6498       if (pipeline->base.shaders[MESA_SHADER_GEOMETRY]) {
6499          vs_size = gs_size = pipeline->base.shaders[MESA_SHADER_GEOMETRY]->info.wave_size;
6500          if (radv_pipeline_has_gs_copy_shader(&pipeline->base))
6501             vs_size = pipeline->base.gs_copy_shader->info.wave_size;
6502       } else if (pipeline->base.shaders[MESA_SHADER_TESS_EVAL])
6503          vs_size = pipeline->base.shaders[MESA_SHADER_TESS_EVAL]->info.wave_size;
6504       else if (pipeline->base.shaders[MESA_SHADER_VERTEX])
6505          vs_size = pipeline->base.shaders[MESA_SHADER_VERTEX]->info.wave_size;
6506       else if (pipeline->base.shaders[MESA_SHADER_MESH])
6507          vs_size = gs_size = pipeline->base.shaders[MESA_SHADER_MESH]->info.wave_size;
6508 
6509       if (radv_pipeline_has_ngg(pipeline)) {
6510          assert(!radv_pipeline_has_gs_copy_shader(&pipeline->base));
6511          gs_size = vs_size;
6512       }
6513 
6514       /* legacy GS only supports Wave64 */
6515       stages |= S_028B54_HS_W32_EN(hs_size == 32 ? 1 : 0) |
6516                 S_028B54_GS_W32_EN(gs_size == 32 ? 1 : 0) |
6517                 S_028B54_VS_W32_EN(vs_size == 32 ? 1 : 0);
6518    }
6519 
6520    radeon_set_context_reg(ctx_cs, R_028B54_VGT_SHADER_STAGES_EN, stages);
6521 }
6522 
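/* PA_SC_CLIPRECT_RULE is a lookup table indexed by the bitmask of discard
 * rectangles that contain a pixel; a set bit means the pixel passes. Worked
 * example (assuming MAX_DISCARD_RECTANGLES == 4, i.e. a 16-entry table): one
 * INCLUSIVE rectangle keeps only the odd indices (rectangle 0 contains the
 * pixel), giving 0xAAAA, while one EXCLUSIVE rectangle keeps the even indices,
 * giving 0x5555.
 */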
6523 static void
6524 radv_pipeline_emit_cliprect_rule(struct radeon_cmdbuf *ctx_cs,
6525                                  const struct radv_graphics_pipeline_info *info)
6526 {
6527    uint32_t cliprect_rule = 0;
6528 
6529    if (!info->dr.count) {
6530       cliprect_rule = 0xffff;
6531    } else {
6532       for (unsigned i = 0; i < (1u << MAX_DISCARD_RECTANGLES); ++i) {
6533          /* Interpret i as a bitmask, and then set the bit in
6534           * the mask if that combination of rectangles in which
6535           * the pixel is contained should pass the cliprect
6536           * test.
6537           */
6538          unsigned relevant_subset = i & ((1u << info->dr.count) - 1);
6539 
6540          if (info->dr.mode == VK_DISCARD_RECTANGLE_MODE_INCLUSIVE_EXT && !relevant_subset)
6541             continue;
6542 
6543          if (info->dr.mode == VK_DISCARD_RECTANGLE_MODE_EXCLUSIVE_EXT && relevant_subset)
6544             continue;
6545 
6546          cliprect_rule |= 1u << i;
6547       }
6548    }
6549 
6550    radeon_set_context_reg(ctx_cs, R_02820C_PA_SC_CLIPRECT_RULE, cliprect_rule);
6551 }
6552 
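/* GE_CNTL replaces IA_MULTI_VGT_PARAM on GFX10+ for non-NGG pipelines: pick
 * the primitive group size from the tess patch count or the GS subgroup
 * layout, leave vertex grouping disabled, and break waves at EOI when the
 * tessellation stages use primitive IDs.
 */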
6553 static void
6554 gfx10_pipeline_emit_ge_cntl(struct radeon_cmdbuf *ctx_cs,
6555                             const struct radv_graphics_pipeline *pipeline)
6556 {
6557    bool break_wave_at_eoi = false;
6558    unsigned primgroup_size;
6559    unsigned vertgroup_size = 256; /* 256 = disable vertex grouping */
6560 
6561    if (radv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_CTRL)) {
6562       primgroup_size = pipeline->base.shaders[MESA_SHADER_TESS_CTRL]->info.num_tess_patches;
6563    } else if (radv_pipeline_has_stage(pipeline, MESA_SHADER_GEOMETRY)) {
6564       const struct gfx9_gs_info *gs_state =
6565          &pipeline->base.shaders[MESA_SHADER_GEOMETRY]->info.gs_ring_info;
6566       unsigned vgt_gs_onchip_cntl = gs_state->vgt_gs_onchip_cntl;
6567       primgroup_size = G_028A44_GS_PRIMS_PER_SUBGRP(vgt_gs_onchip_cntl);
6568    } else {
6569       primgroup_size = 128; /* recommended without a GS and tess */
6570    }
6571 
6572    if (radv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_CTRL)) {
6573       if (pipeline->base.shaders[MESA_SHADER_TESS_CTRL]->info.uses_prim_id ||
6574           radv_get_shader(&pipeline->base, MESA_SHADER_TESS_EVAL)->info.uses_prim_id)
6575          break_wave_at_eoi = true;
6576    }
6577 
6578    radeon_set_uconfig_reg(ctx_cs, R_03096C_GE_CNTL,
6579                           S_03096C_PRIM_GRP_SIZE_GFX10(primgroup_size) |
6580                              S_03096C_VERT_GRP_SIZE(vertgroup_size) |
6581                              S_03096C_PACKET_TO_ONE_PA(0) /* line stipple */ |
6582                              S_03096C_BREAK_WAVE_AT_EOI(break_wave_at_eoi));
6583 }
6584 
6585 static void
6586 radv_pipeline_emit_vgt_gs_out(struct radeon_cmdbuf *ctx_cs,
6587                               const struct radv_graphics_pipeline *pipeline,
6588                               uint32_t vgt_gs_out_prim_type)
6589 {
6590    const struct radv_physical_device *pdevice = pipeline->base.device->physical_device;
6591 
6592    if (pdevice->rad_info.gfx_level >= GFX11) {
6593       radeon_set_uconfig_reg(ctx_cs, R_030998_VGT_GS_OUT_PRIM_TYPE, vgt_gs_out_prim_type);
6594    } else {
6595       radeon_set_context_reg(ctx_cs, R_028A6C_VGT_GS_OUT_PRIM_TYPE, vgt_gs_out_prim_type);
6596    }
6597 }
6598 
6599 static void
6600 gfx103_pipeline_emit_vgt_draw_payload_cntl(struct radeon_cmdbuf *ctx_cs,
6601                                            const struct radv_graphics_pipeline *pipeline,
6602                                            const struct radv_graphics_pipeline_info *info)
6603 {
6604    const struct radv_vs_output_info *outinfo = get_vs_output_info(pipeline);
6605 
6606    bool enable_vrs = radv_is_vrs_enabled(pipeline, info);
6607 
6608    /* Enables the second channel of the primitive export instruction.
6609     * This channel contains: VRS rate x, y, viewport and layer.
6610     */
6611    bool enable_prim_payload =
6612       outinfo &&
6613       (outinfo->writes_viewport_index_per_primitive ||
6614        outinfo->writes_layer_per_primitive ||
6615        outinfo->writes_primitive_shading_rate_per_primitive);
6616 
6617    radeon_set_context_reg(ctx_cs, R_028A98_VGT_DRAW_PAYLOAD_CNTL,
6618                           S_028A98_EN_VRS_RATE(enable_vrs) |
6619                           S_028A98_EN_PRIM_PAYLOAD(enable_prim_payload));
6620 }
6621 
6622 static bool
6623 gfx103_pipeline_vrs_coarse_shading(const struct radv_graphics_pipeline *pipeline)
6624 {
6625    struct radv_shader *ps = pipeline->base.shaders[MESA_SHADER_FRAGMENT];
6626    struct radv_device *device = pipeline->base.device;
6627 
6628    if (device->instance->debug_flags & RADV_DEBUG_NO_VRS_FLAT_SHADING)
6629       return false;
6630 
6631    if (!ps->info.ps.allow_flat_shading)
6632       return false;
6633 
6634    return true;
6635 }
6636 
6637 static void
6638 gfx103_pipeline_emit_vrs_state(struct radeon_cmdbuf *ctx_cs,
6639                                const struct radv_graphics_pipeline *pipeline,
6640                                const struct radv_graphics_pipeline_info *info)
6641 {
6642    const struct radv_physical_device *pdevice = pipeline->base.device->physical_device;
6643    uint32_t mode = V_028064_VRS_COMB_MODE_PASSTHRU;
6644    uint8_t rate_x = 0, rate_y = 0;
6645    bool enable_vrs = radv_is_vrs_enabled(pipeline, info);
6646 
6647    if (!enable_vrs && gfx103_pipeline_vrs_coarse_shading(pipeline)) {
6648       /* When per-draw VRS is not enabled at all, try enabling VRS coarse shading 2x2 if the driver
6649        * determined that it's safe to enable.
6650        */
6651       mode = V_028064_VRS_COMB_MODE_OVERRIDE;
6652       rate_x = rate_y = 1;
6653    } else if (!radv_is_static_vrs_enabled(pipeline, info) && pipeline->force_vrs_per_vertex &&
6654               get_vs_output_info(pipeline)->writes_primitive_shading_rate) {
6655       /* Otherwise, if per-draw VRS is not enabled statically, try forcing per-vertex VRS if
6656        * requested by the user. Note that vkd3d-proton always has to declare VRS as dynamic because
6657        * in DX12 it's fully dynamic.
6658        */
6659       radeon_set_context_reg(ctx_cs, R_028848_PA_CL_VRS_CNTL,
6660          S_028848_SAMPLE_ITER_COMBINER_MODE(V_028848_VRS_COMB_MODE_OVERRIDE) |
6661          S_028848_VERTEX_RATE_COMBINER_MODE(V_028848_VRS_COMB_MODE_OVERRIDE));
6662 
6663       /* If the shader is using discard, turn off coarse shading because discard at 2x2 pixel
6664        * granularity degrades quality too much. MIN allows sample shading but not coarse shading.
6665        */
6666       struct radv_shader *ps = pipeline->base.shaders[MESA_SHADER_FRAGMENT];
6667 
6668       mode = ps->info.ps.can_discard ? V_028064_VRS_COMB_MODE_MIN : V_028064_VRS_COMB_MODE_PASSTHRU;
6669    }
6670 
6671    if (pdevice->rad_info.gfx_level >= GFX11) {
6672       radeon_set_context_reg(ctx_cs, R_0283D0_PA_SC_VRS_OVERRIDE_CNTL,
6673                              S_0283D0_VRS_OVERRIDE_RATE_COMBINER_MODE(mode) |
6674                                 S_0283D0_VRS_RATE((rate_x << 2) | rate_y));
6675    } else {
6676       radeon_set_context_reg(ctx_cs, R_028064_DB_VRS_OVERRIDE_CNTL,
6677                              S_028064_VRS_OVERRIDE_RATE_COMBINER_MODE(mode) |
6678                                 S_028064_VRS_OVERRIDE_RATE_X(rate_x) |
6679                                 S_028064_VRS_OVERRIDE_RATE_Y(rate_y));
6680    }
6681 }
6682 
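/* Build the pipeline's PM4 preamble: context registers go into ctx_cs (hashed
 * afterwards so redundant context-state re-emission can be detected later) and
 * SH registers into cs. Both streams share a single allocation, sized by the
 * max_dw values below.
 */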
6683 static void
6684 radv_pipeline_emit_pm4(struct radv_graphics_pipeline *pipeline,
6685                        const struct radv_blend_state *blend,
6686                        const struct radv_depth_stencil_state *ds_state,
6687                        uint32_t vgt_gs_out_prim_type,
6688                        const struct radv_graphics_pipeline_info *info)
6689 {
6690    const struct radv_physical_device *pdevice = pipeline->base.device->physical_device;
6691    struct radeon_cmdbuf *ctx_cs = &pipeline->base.ctx_cs;
6692    struct radeon_cmdbuf *cs = &pipeline->base.cs;
6693 
6694    cs->max_dw = 64;
6695    ctx_cs->max_dw = 256;
6696    cs->buf = malloc(4 * (cs->max_dw + ctx_cs->max_dw));
6697    ctx_cs->buf = cs->buf + cs->max_dw;
6698 
6699    radv_pipeline_emit_depth_stencil_state(ctx_cs, ds_state);
6700    radv_pipeline_emit_blend_state(ctx_cs, pipeline, blend);
6701    radv_pipeline_emit_raster_state(ctx_cs, pipeline, info);
6702    radv_pipeline_emit_multisample_state(ctx_cs, pipeline);
6703    radv_pipeline_emit_vgt_gs_mode(ctx_cs, pipeline);
6704    radv_pipeline_emit_vertex_shader(ctx_cs, cs, pipeline);
6705    radv_pipeline_emit_mesh_shader(ctx_cs, cs, pipeline);
6706 
6707    if (radv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_CTRL)) {
6708       radv_pipeline_emit_tess_shaders(ctx_cs, cs, pipeline);
6709       radv_pipeline_emit_tess_state(ctx_cs, pipeline, info);
6710    }
6711 
6712    radv_pipeline_emit_geometry_shader(ctx_cs, cs, pipeline);
6713    radv_pipeline_emit_fragment_shader(ctx_cs, cs, pipeline);
6714    radv_pipeline_emit_ps_inputs(ctx_cs, pipeline);
6715    radv_pipeline_emit_vgt_vertex_reuse(ctx_cs, pipeline);
6716    radv_pipeline_emit_vgt_shader_config(ctx_cs, pipeline);
6717    radv_pipeline_emit_cliprect_rule(ctx_cs, info);
6718    radv_pipeline_emit_vgt_gs_out(ctx_cs, pipeline, vgt_gs_out_prim_type);
6719 
6720    if (pdevice->rad_info.gfx_level >= GFX10 && !radv_pipeline_has_ngg(pipeline))
6721       gfx10_pipeline_emit_ge_cntl(ctx_cs, pipeline);
6722 
6723    if (pdevice->rad_info.gfx_level >= GFX10_3) {
6724       gfx103_pipeline_emit_vgt_draw_payload_cntl(ctx_cs, pipeline, info);
6725       gfx103_pipeline_emit_vrs_state(ctx_cs, pipeline, info);
6726    }
6727 
6728    pipeline->base.ctx_cs_hash = _mesa_hash_data(ctx_cs->buf, ctx_cs->cdw * 4);
6729 
6730    assert(ctx_cs->cdw <= ctx_cs->max_dw);
6731    assert(cs->cdw <= cs->max_dw);
6732 }
6733 
6734 static void
6735 radv_pipeline_init_vertex_input_state(struct radv_graphics_pipeline *pipeline,
6736                                       const struct radv_graphics_pipeline_info *info)
6737 {
6738    const struct radv_physical_device *pdevice = pipeline->base.device->physical_device;
6739    const struct radv_shader_info *vs_info = &radv_get_shader(&pipeline->base, MESA_SHADER_VERTEX)->info;
6740 
6741    for (uint32_t i = 0; i < MAX_VERTEX_ATTRIBS; i++) {
6742       pipeline->attrib_ends[i] = info->vi.attrib_ends[i];
6743       pipeline->attrib_index_offset[i] = info->vi.attrib_index_offset[i];
6744       pipeline->attrib_bindings[i] = info->vi.attrib_bindings[i];
6745    }
6746 
6747    for (uint32_t i = 0; i < MAX_VBS; i++) {
6748       pipeline->binding_stride[i] = info->vi.binding_stride[i];
6749    }
6750 
6751    pipeline->use_per_attribute_vb_descs = vs_info->vs.use_per_attribute_vb_descs;
6752    pipeline->last_vertex_attrib_bit = util_last_bit(vs_info->vs.vb_desc_usage_mask);
6753    if (pipeline->base.shaders[MESA_SHADER_VERTEX])
6754       pipeline->next_vertex_stage = MESA_SHADER_VERTEX;
6755    else if (pipeline->base.shaders[MESA_SHADER_TESS_CTRL])
6756       pipeline->next_vertex_stage = MESA_SHADER_TESS_CTRL;
6757    else
6758       pipeline->next_vertex_stage = MESA_SHADER_GEOMETRY;
6759    if (pipeline->next_vertex_stage == MESA_SHADER_VERTEX) {
6760       const struct radv_shader *vs_shader = pipeline->base.shaders[MESA_SHADER_VERTEX];
6761       pipeline->can_use_simple_input = vs_shader->info.is_ngg == pdevice->use_ngg &&
6762                                        vs_shader->info.wave_size == pdevice->ge_wave_size;
6763    } else {
6764       pipeline->can_use_simple_input = false;
6765    }
6766    if (vs_info->vs.dynamic_inputs)
6767       pipeline->vb_desc_usage_mask = BITFIELD_MASK(pipeline->last_vertex_attrib_bit);
6768    else
6769       pipeline->vb_desc_usage_mask = vs_info->vs.vb_desc_usage_mask;
6770    pipeline->vb_desc_alloc_size = util_bitcount(pipeline->vb_desc_usage_mask) * 16;
6771 }
6772 
6773 static struct radv_shader *
6774 radv_pipeline_get_streamout_shader(struct radv_graphics_pipeline *pipeline)
6775 {
6776    int i;
6777 
6778    for (i = MESA_SHADER_GEOMETRY; i >= MESA_SHADER_VERTEX; i--) {
6779       struct radv_shader *shader = radv_get_shader(&pipeline->base, i);
6780 
6781       if (shader && shader->info.so.num_outputs > 0)
6782          return shader;
6783    }
6784 
6785    return NULL;
6786 }
6787 
6788 static bool
6789 radv_shader_need_indirect_descriptor_sets(struct radv_pipeline *pipeline, gl_shader_stage stage)
6790 {
6791    struct radv_userdata_info *loc =
6792       radv_lookup_user_sgpr(pipeline, stage, AC_UD_INDIRECT_DESCRIPTOR_SETS);
6793    return loc->sgpr_idx != -1;
6794 }
6795 
6796 static void
6797 radv_pipeline_init_shader_stages_state(struct radv_graphics_pipeline *pipeline)
6798 {
6799    struct radv_device *device = pipeline->base.device;
6800 
6801    for (unsigned i = 0; i < MESA_VULKAN_SHADER_STAGES; i++) {
6802       bool shader_exists = !!pipeline->base.shaders[i];
6803       if (shader_exists || i < MESA_SHADER_COMPUTE) {
6804          /* We need this info for some stages even when the shader doesn't exist. */
6805          pipeline->base.user_data_0[i] = radv_pipeline_stage_to_user_data_0(
6806             pipeline, i, device->physical_device->rad_info.gfx_level);
6807 
6808          if (shader_exists)
6809             pipeline->base.need_indirect_descriptor_sets |=
6810                radv_shader_need_indirect_descriptor_sets(&pipeline->base, i);
6811       }
6812    }
6813 
6814    gl_shader_stage first_stage =
6815       radv_pipeline_has_stage(pipeline, MESA_SHADER_MESH) ? MESA_SHADER_MESH : MESA_SHADER_VERTEX;
6816 
6817    struct radv_userdata_info *loc =
6818       radv_lookup_user_sgpr(&pipeline->base, first_stage, AC_UD_VS_BASE_VERTEX_START_INSTANCE);
6819    if (loc->sgpr_idx != -1) {
6820       pipeline->vtx_base_sgpr = pipeline->base.user_data_0[first_stage];
6821       pipeline->vtx_base_sgpr += loc->sgpr_idx * 4;
6822       pipeline->vtx_emit_num = loc->num_sgprs;
6823       pipeline->uses_drawid =
6824          radv_get_shader(&pipeline->base, first_stage)->info.vs.needs_draw_id;
6825       pipeline->uses_baseinstance =
6826          radv_get_shader(&pipeline->base, first_stage)->info.vs.needs_base_instance;
6827 
6828       assert(first_stage != MESA_SHADER_MESH || !pipeline->uses_baseinstance);
6829    }
6830 }
6831 
6832 static uint32_t
6833 radv_pipeline_init_vgt_gs_out(struct radv_graphics_pipeline *pipeline,
6834                               const struct radv_graphics_pipeline_info *info)
6835 {
6836    uint32_t gs_out;
6837 
6838    if (radv_pipeline_has_stage(pipeline, MESA_SHADER_GEOMETRY)) {
6839       gs_out =
6840          si_conv_gl_prim_to_gs_out(pipeline->base.shaders[MESA_SHADER_GEOMETRY]->info.gs.output_prim);
6841    } else if (radv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_CTRL)) {
6842       if (pipeline->base.shaders[MESA_SHADER_TESS_EVAL]->info.tes.point_mode) {
6843          gs_out = V_028A6C_POINTLIST;
6844       } else {
6845          gs_out = si_conv_tess_prim_to_gs_out(
6846             pipeline->base.shaders[MESA_SHADER_TESS_EVAL]->info.tes._primitive_mode);
6847       }
6848    } else if (radv_pipeline_has_stage(pipeline, MESA_SHADER_MESH)) {
6849       gs_out =
6850          si_conv_gl_prim_to_gs_out(pipeline->base.shaders[MESA_SHADER_MESH]->info.ms.output_prim);
6851    } else {
6852       gs_out = si_conv_prim_to_gs_out(info->ia.primitive_topology);
6853    }
6854 
6855    return gs_out;
6856 }
6857 
6858 static void
6859 radv_pipeline_init_extra(struct radv_graphics_pipeline *pipeline,
6860                          const struct radv_graphics_pipeline_create_info *extra,
6861                          struct radv_blend_state *blend_state,
6862                          struct radv_depth_stencil_state *ds_state,
6863                          const struct radv_graphics_pipeline_info *info,
6864                          uint32_t *vgt_gs_out_prim_type)
6865 {
6866    if (extra->custom_blend_mode == V_028808_CB_ELIMINATE_FAST_CLEAR ||
6867        extra->custom_blend_mode == V_028808_CB_FMASK_DECOMPRESS ||
6868        extra->custom_blend_mode == V_028808_CB_DCC_DECOMPRESS_GFX8 ||
6869        extra->custom_blend_mode == V_028808_CB_DCC_DECOMPRESS_GFX11 ||
6870        extra->custom_blend_mode == V_028808_CB_RESOLVE) {
6871       /* According to the CB spec, CB_SHADER_MASK should be set to enable writes to all four
6872        * channels of MRT0.
6873        */
6874       blend_state->cb_shader_mask = 0xf;
6875 
6876       if (extra->custom_blend_mode == V_028808_CB_RESOLVE)
6877          pipeline->cb_color_control |= S_028808_DISABLE_DUAL_QUAD(1);
6878 
6879       pipeline->cb_color_control &= C_028808_MODE;
6880       pipeline->cb_color_control |= S_028808_MODE(extra->custom_blend_mode);
6881    }
6882 
6883    if (extra->use_rectlist) {
6884       struct radv_dynamic_state *dynamic = &pipeline->dynamic_state;
6885       dynamic->primitive_topology = V_008958_DI_PT_RECTLIST;
6886 
6887       *vgt_gs_out_prim_type = V_028A6C_TRISTRIP;
6888       if (radv_pipeline_has_ngg(pipeline))
6889          *vgt_gs_out_prim_type = V_028A6C_RECTLIST;
6890 
6891       pipeline->rast_prim = *vgt_gs_out_prim_type;
6892    }
6893 
6894    if (radv_pipeline_has_ds_attachments(&info->ri)) {
6895       ds_state->db_render_control |= S_028000_DEPTH_CLEAR_ENABLE(extra->db_depth_clear);
6896       ds_state->db_render_control |= S_028000_STENCIL_CLEAR_ENABLE(extra->db_stencil_clear);
6897       ds_state->db_render_control |= S_028000_RESUMMARIZE_ENABLE(extra->resummarize_enable);
6898       ds_state->db_render_control |= S_028000_DEPTH_COMPRESS_DISABLE(extra->depth_compress_disable);
6899       ds_state->db_render_control |= S_028000_STENCIL_COMPRESS_DISABLE(extra->stencil_compress_disable);
6900    }
6901 }
6902 
6903 void
6904 radv_pipeline_init(struct radv_device *device, struct radv_pipeline *pipeline,
6905                     enum radv_pipeline_type type)
6906 {
6907    vk_object_base_init(&device->vk, &pipeline->base, VK_OBJECT_TYPE_PIPELINE);
6908 
6909    pipeline->device = device;
6910    pipeline->type = type;
6911 }
6912 
6913 static VkResult
6914 radv_graphics_pipeline_init(struct radv_graphics_pipeline *pipeline, struct radv_device *device,
6915                             struct radv_pipeline_cache *cache,
6916                             const VkGraphicsPipelineCreateInfo *pCreateInfo,
6917                             const struct radv_graphics_pipeline_create_info *extra)
6918 {
6919    RADV_FROM_HANDLE(radv_pipeline_layout, pipeline_layout, pCreateInfo->layout);
6920    VkResult result;
6921 
6922    pipeline->last_vgt_api_stage = MESA_SHADER_NONE;
6923 
6924    /* Mark all states declared dynamic at pipeline creation. */
6925    if (pCreateInfo->pDynamicState) {
6926       uint32_t count = pCreateInfo->pDynamicState->dynamicStateCount;
6927       for (uint32_t s = 0; s < count; s++) {
6928          pipeline->dynamic_states |=
6929             radv_dynamic_state_mask(pCreateInfo->pDynamicState->pDynamicStates[s]);
6930       }
6931    }
6932 
6933    /* Mark all active stages at pipeline creation. */
6934    for (uint32_t i = 0; i < pCreateInfo->stageCount; i++) {
6935       const VkPipelineShaderStageCreateInfo *sinfo = &pCreateInfo->pStages[i];
6936 
6937       pipeline->active_stages |= sinfo->stage;
6938    }
6939 
6940    struct radv_graphics_pipeline_info info = radv_pipeline_init_graphics_info(pipeline, pCreateInfo);
6941 
6942    struct radv_blend_state blend = radv_pipeline_init_blend_state(pipeline, pCreateInfo, &info);
6943 
6944    const VkPipelineCreationFeedbackCreateInfo *creation_feedback =
6945       vk_find_struct_const(pCreateInfo->pNext, PIPELINE_CREATION_FEEDBACK_CREATE_INFO);
6946 
6947    struct radv_pipeline_key key =
6948       radv_generate_graphics_pipeline_key(pipeline, pCreateInfo, &info, &blend);
6949 
6950    result = radv_create_shaders(&pipeline->base, pipeline_layout, device, cache, &key, pCreateInfo->pStages,
6951                                 pCreateInfo->stageCount, pCreateInfo->flags, NULL,
6952                                 creation_feedback, NULL, NULL, &pipeline->last_vgt_api_stage);
6953    if (result != VK_SUCCESS)
6954       return result;
6955 
6956    pipeline->spi_baryc_cntl = S_0286E0_FRONT_FACE_ALL_BITS(1);
6957 
6958    uint32_t vgt_gs_out_prim_type = radv_pipeline_init_vgt_gs_out(pipeline, &info);
6959 
6960    radv_pipeline_init_multisample_state(pipeline, &blend, &info, vgt_gs_out_prim_type);
6961 
6962    if (!radv_pipeline_has_stage(pipeline, MESA_SHADER_MESH))
6963       radv_pipeline_init_input_assembly_state(pipeline, &info);
6964    radv_pipeline_init_dynamic_state(pipeline, &info);
6965 
6966    pipeline->negative_one_to_one = info.vp.negative_one_to_one;
6967 
6968    radv_pipeline_init_raster_state(pipeline, &info);
6969 
6970    struct radv_depth_stencil_state ds_state =
6971       radv_pipeline_init_depth_stencil_state(pipeline, &info);
6972 
6973    if (device->physical_device->rad_info.gfx_level >= GFX10_3)
6974       gfx103_pipeline_init_vrs_state(pipeline, &info);
6975 
6976    /* Ensure that some export memory is always allocated, for two reasons:
6977     *
6978     * 1) Correctness: The hardware ignores the EXEC mask if no export
6979     *    memory is allocated, so KILL and alpha test do not work correctly
6980     *    without this.
6981     * 2) Performance: Every shader needs at least a NULL export, even when
6982     *    it writes no color/depth output. The NULL export instruction
6983     *    stalls without this setting.
6984     *
6985     * Don't add this to CB_SHADER_MASK.
6986     *
6987     * GFX10 supports pixel shaders without exports by setting both the
6988     * color and Z formats to SPI_SHADER_ZERO. The hw will skip export
6989     * instructions if any are present.
6990     */
6991    struct radv_shader *ps = pipeline->base.shaders[MESA_SHADER_FRAGMENT];
6992    if ((device->physical_device->rad_info.gfx_level <= GFX9 || ps->info.ps.can_discard) &&
6993        !blend.spi_shader_col_format) {
6994       if (!ps->info.ps.writes_z && !ps->info.ps.writes_stencil && !ps->info.ps.writes_sample_mask)
6995          blend.spi_shader_col_format = V_028714_SPI_SHADER_32_R;
6996    }
6997 
6998    pipeline->col_format = blend.spi_shader_col_format;
6999    pipeline->cb_target_mask = blend.cb_target_mask;
7000 
7001    if (radv_pipeline_has_stage(pipeline, MESA_SHADER_GEOMETRY) && !radv_pipeline_has_ngg(pipeline)) {
7002       struct radv_shader *gs = pipeline->base.shaders[MESA_SHADER_GEOMETRY];
7003 
7004       radv_pipeline_init_gs_ring_state(pipeline, &gs->info.gs_ring_info);
7005    }
7006 
7007    if (radv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_CTRL)) {
7008       pipeline->tess_patch_control_points = info.ts.patch_control_points;
7009    }
7010 
7011    if (!radv_pipeline_has_stage(pipeline, MESA_SHADER_MESH))
7012       radv_pipeline_init_vertex_input_state(pipeline, &info);
7013 
7014    radv_pipeline_init_binning_state(pipeline, &blend, &info);
7015    radv_pipeline_init_shader_stages_state(pipeline);
7016    radv_pipeline_init_scratch(device, &pipeline->base);
7017 
7018    /* Find the last vertex shader stage that eventually uses streamout. */
7019    pipeline->streamout_shader = radv_pipeline_get_streamout_shader(pipeline);
7020 
7021    pipeline->is_ngg = radv_pipeline_has_ngg(pipeline);
7022    pipeline->has_ngg_culling =
7023       pipeline->is_ngg &&
7024       pipeline->base.shaders[pipeline->last_vgt_api_stage]->info.has_ngg_culling;
7025    pipeline->force_vrs_per_vertex =
7026       pipeline->base.shaders[pipeline->last_vgt_api_stage]->info.force_vrs_per_vertex;
7027    pipeline->uses_user_sample_locations = info.ms.sample_locs_enable;
7028    pipeline->rast_prim = vgt_gs_out_prim_type;
7029 
7030    if (!(pipeline->dynamic_states & RADV_DYNAMIC_LINE_WIDTH)) {
7031       pipeline->line_width = info.rs.line_width;
7032    }
7033 
7034    pipeline->base.push_constant_size = pipeline_layout->push_constant_size;
7035    pipeline->base.dynamic_offset_count = pipeline_layout->dynamic_offset_count;
7036 
7037    if (extra) {
7038       radv_pipeline_init_extra(pipeline, extra, &blend, &ds_state, &info, &vgt_gs_out_prim_type);
7039    }
7040 
7041    radv_pipeline_emit_pm4(pipeline, &blend, &ds_state, vgt_gs_out_prim_type, &info);
7042 
7043    return result;
7044 }
7045 
7046 static VkResult
7047 radv_graphics_pipeline_create_nonlegacy(VkDevice _device, VkPipelineCache _cache,
7048                                         const VkGraphicsPipelineCreateInfo *pCreateInfo,
7049                                         const struct radv_graphics_pipeline_create_info *extra,
7050                                         const VkAllocationCallbacks *pAllocator,
7051                                         VkPipeline *pPipeline)
7052 {
7053    RADV_FROM_HANDLE(radv_device, device, _device);
7054    RADV_FROM_HANDLE(radv_pipeline_cache, cache, _cache);
7055    struct radv_graphics_pipeline *pipeline;
7056    VkResult result;
7057 
7058    pipeline = vk_zalloc2(&device->vk.alloc, pAllocator, sizeof(*pipeline), 8,
7059                          VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
7060    if (pipeline == NULL)
7061       return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
7062 
7063    radv_pipeline_init(device, &pipeline->base, RADV_PIPELINE_GRAPHICS);
7064 
7065    result = radv_graphics_pipeline_init(pipeline, device, cache, pCreateInfo, extra);
7066    if (result != VK_SUCCESS) {
7067       radv_pipeline_destroy(device, &pipeline->base, pAllocator);
7068       return result;
7069    }
7070 
7071    *pPipeline = radv_pipeline_to_handle(&pipeline->base);
7072 
7073    return VK_SUCCESS;
7074 }
7075 
7076 /* This is a wrapper for radv_graphics_pipeline_create_nonlegacy that does all legacy conversions
7077  * for the VkGraphicsPipelineCreateInfo data. */
7078 VkResult
7079 radv_graphics_pipeline_create(VkDevice _device, VkPipelineCache _cache,
7080                               const VkGraphicsPipelineCreateInfo *pCreateInfo,
7081                               const struct radv_graphics_pipeline_create_info *extra,
7082                               const VkAllocationCallbacks *pAllocator, VkPipeline *pPipeline)
7083 {
7084    VkGraphicsPipelineCreateInfo create_info = *pCreateInfo;
7085 
7086    VkPipelineRenderingCreateInfo rendering_create_info;
7087    VkFormat color_formats[MAX_RTS];
7088    VkAttachmentSampleCountInfoAMD sample_info;
7089    VkSampleCountFlagBits samples[MAX_RTS];
7090    if (pCreateInfo->renderPass != VK_NULL_HANDLE) {
7091       RADV_FROM_HANDLE(radv_render_pass, pass, pCreateInfo->renderPass);
7092       struct radv_subpass *subpass = pass->subpasses + pCreateInfo->subpass;
7093 
7094       rendering_create_info.sType = VK_STRUCTURE_TYPE_PIPELINE_RENDERING_CREATE_INFO;
7095       rendering_create_info.pNext = create_info.pNext;
7096       create_info.pNext = &rendering_create_info;
7097 
7098       rendering_create_info.viewMask = subpass->view_mask;
7099 
7100       VkFormat ds_format =
7101          subpass->depth_stencil_attachment
7102             ? pass->attachments[subpass->depth_stencil_attachment->attachment].format
7103             : VK_FORMAT_UNDEFINED;
7104 
7105       rendering_create_info.depthAttachmentFormat =
7106          vk_format_has_depth(ds_format) ? ds_format : VK_FORMAT_UNDEFINED;
7107       rendering_create_info.stencilAttachmentFormat =
7108          vk_format_has_stencil(ds_format) ? ds_format : VK_FORMAT_UNDEFINED;
7109 
7110       rendering_create_info.colorAttachmentCount = subpass->color_count;
7111       rendering_create_info.pColorAttachmentFormats = color_formats;
7112       for (unsigned i = 0; i < rendering_create_info.colorAttachmentCount; ++i) {
7113          if (subpass->color_attachments[i].attachment != VK_ATTACHMENT_UNUSED)
7114             color_formats[i] = pass->attachments[subpass->color_attachments[i].attachment].format;
7115          else
7116             color_formats[i] = VK_FORMAT_UNDEFINED;
7117       }
7118 
7119       create_info.renderPass = VK_NULL_HANDLE;
7120 
7121       sample_info.sType = VK_STRUCTURE_TYPE_ATTACHMENT_SAMPLE_COUNT_INFO_AMD;
7122       sample_info.pNext = create_info.pNext;
7123       create_info.pNext = &sample_info;
7124 
7125       sample_info.colorAttachmentCount = rendering_create_info.colorAttachmentCount;
7126       sample_info.pColorAttachmentSamples = samples;
7127       for (unsigned i = 0; i < sample_info.colorAttachmentCount; ++i) {
7128          if (subpass->color_attachments[i].attachment != VK_ATTACHMENT_UNUSED) {
7129             samples[i] = pass->attachments[subpass->color_attachments[i].attachment].samples;
7130          } else
7131             samples[i] = 1;
7132       }
7133       sample_info.depthStencilAttachmentSamples = subpass->depth_sample_count;
7134    }
7135 
7136    return radv_graphics_pipeline_create_nonlegacy(_device, _cache, &create_info, extra, pAllocator,
7137                                                   pPipeline);
7138 }
7139 
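/* vkCreateGraphicsPipelines entry point: each pipeline is created in order, the first
 * failure is recorded, and creation stops early only when
 * VK_PIPELINE_CREATE_EARLY_RETURN_ON_FAILURE_BIT is set; all remaining handles are
 * written as VK_NULL_HANDLE. */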
7140 VKAPI_ATTR VkResult VKAPI_CALL
7141 radv_CreateGraphicsPipelines(VkDevice _device, VkPipelineCache pipelineCache, uint32_t count,
7142                              const VkGraphicsPipelineCreateInfo *pCreateInfos,
7143                              const VkAllocationCallbacks *pAllocator, VkPipeline *pPipelines)
7144 {
7145    VkResult result = VK_SUCCESS;
7146    unsigned i = 0;
7147 
7148    for (; i < count; i++) {
7149       VkResult r;
7150       r = radv_graphics_pipeline_create(_device, pipelineCache, &pCreateInfos[i], NULL, pAllocator,
7151                                         &pPipelines[i]);
7152       if (r != VK_SUCCESS) {
7153          result = r;
7154          pPipelines[i] = VK_NULL_HANDLE;
7155 
7156          if (pCreateInfos[i].flags & VK_PIPELINE_CREATE_EARLY_RETURN_ON_FAILURE_BIT)
7157             break;
7158       }
7159    }
7160 
7161    for (; i < count; ++i)
7162       pPipelines[i] = VK_NULL_HANDLE;
7163 
7164    return result;
7165 }
7166 
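/* Program the compute shader's code address (COMPUTE_PGM_LO) and its resource registers
 * (COMPUTE_PGM_RSRC1/2, plus RSRC3 on GFX10+) into the given command stream. */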
7167 void
7168 radv_pipeline_emit_hw_cs(const struct radv_physical_device *pdevice, struct radeon_cmdbuf *cs,
7169                          const struct radv_shader *shader)
7170 {
7171    uint64_t va = radv_shader_get_va(shader);
7172 
7173    radeon_set_sh_reg(cs, R_00B830_COMPUTE_PGM_LO, va >> 8);
7174 
7175    radeon_set_sh_reg_seq(cs, R_00B848_COMPUTE_PGM_RSRC1, 2);
7176    radeon_emit(cs, shader->config.rsrc1);
7177    radeon_emit(cs, shader->config.rsrc2);
7178    if (pdevice->rad_info.gfx_level >= GFX10) {
7179       radeon_set_sh_reg(cs, R_00B8A0_COMPUTE_PGM_RSRC3, shader->config.rsrc3);
7180    }
7181 }
7182 
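/* Emit the remaining fixed compute state: resource limits derived from the workgroup and
 * wave sizes, and the NUM_THREAD_X/Y/Z workgroup dimensions. On GFX10+ a single-wave
 * workgroup is allowed to double up per CU, presumably to improve occupancy. */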
7183 void
7184 radv_pipeline_emit_compute_state(const struct radv_physical_device *pdevice,
7185                                  struct radeon_cmdbuf *cs, const struct radv_shader *shader)
7186 {
7187    unsigned threads_per_threadgroup;
7188    unsigned threadgroups_per_cu = 1;
7189    unsigned waves_per_threadgroup;
7190    unsigned max_waves_per_sh = 0;
7191 
7192    /* Calculate best compute resource limits. */
7193    threads_per_threadgroup =
7194       shader->info.cs.block_size[0] * shader->info.cs.block_size[1] * shader->info.cs.block_size[2];
7195    waves_per_threadgroup = DIV_ROUND_UP(threads_per_threadgroup, shader->info.wave_size);
7196 
7197    if (pdevice->rad_info.gfx_level >= GFX10 && waves_per_threadgroup == 1)
7198       threadgroups_per_cu = 2;
7199 
7200    radeon_set_sh_reg(
7201       cs, R_00B854_COMPUTE_RESOURCE_LIMITS,
7202       ac_get_compute_resource_limits(&pdevice->rad_info, waves_per_threadgroup,
7203                                      max_waves_per_sh, threadgroups_per_cu));
7204 
7205    radeon_set_sh_reg_seq(cs, R_00B81C_COMPUTE_NUM_THREAD_X, 3);
7206    radeon_emit(cs, S_00B81C_NUM_THREAD_FULL(shader->info.cs.block_size[0]));
7207    radeon_emit(cs, S_00B81C_NUM_THREAD_FULL(shader->info.cs.block_size[1]));
7208    radeon_emit(cs, S_00B81C_NUM_THREAD_FULL(shader->info.cs.block_size[2]));
7209 }
7210 
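/* Pre-bake the compute pipeline's register writes into a small PM4 stream so that binding
 * the pipeline later only needs to copy this buffer into the command buffer. The max_dw
 * value must cover every dword emitted by the two helpers above. */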
7211 static void
7212 radv_compute_generate_pm4(struct radv_compute_pipeline *pipeline)
7213 {
7214    struct radv_physical_device *pdevice = pipeline->base.device->physical_device;
7215    struct radv_shader *shader = pipeline->base.shaders[MESA_SHADER_COMPUTE];
7216    struct radeon_cmdbuf *cs = &pipeline->base.cs;
7217 
7218    cs->max_dw = pdevice->rad_info.gfx_level >= GFX10 ? 19 : 16;
7219    cs->buf = malloc(cs->max_dw * 4);
7220 
7221    radv_pipeline_emit_hw_cs(pdevice, cs, shader);
7222    radv_pipeline_emit_compute_state(pdevice, cs, shader);
7223 
7224    assert(pipeline->base.cs.cdw <= pipeline->base.cs.max_dw);
7225 }
7226 
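/* Build the shader key for a compute pipeline: start from the generic pipeline key and add
 * the required subgroup size (32 or 64) or the full-subgroups requirement when the
 * corresponding pNext structure or stage flag is present. */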
7227 static struct radv_pipeline_key
7228 radv_generate_compute_pipeline_key(struct radv_compute_pipeline *pipeline,
7229                                    const VkComputePipelineCreateInfo *pCreateInfo)
7230 {
7231    const VkPipelineShaderStageCreateInfo *stage = &pCreateInfo->stage;
7232    struct radv_pipeline_key key = radv_generate_pipeline_key(&pipeline->base, pCreateInfo->flags);
7233 
7234    const VkPipelineShaderStageRequiredSubgroupSizeCreateInfo *subgroup_size =
7235       vk_find_struct_const(stage->pNext,
7236                            PIPELINE_SHADER_STAGE_REQUIRED_SUBGROUP_SIZE_CREATE_INFO);
7237 
7238    if (subgroup_size) {
7239       assert(subgroup_size->requiredSubgroupSize == 32 ||
7240              subgroup_size->requiredSubgroupSize == 64);
7241       key.cs.compute_subgroup_size = subgroup_size->requiredSubgroupSize;
7242    } else if (stage->flags & VK_PIPELINE_SHADER_STAGE_CREATE_REQUIRE_FULL_SUBGROUPS_BIT) {
7243       key.cs.require_full_subgroups = true;
7244    }
7245 
7246    return key;
7247 }
7248 
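/* Shared compute pipeline creation path; the custom_hash, rt_stack_sizes and rt_group_count
 * parameters are only used when this is called internally for ray-tracing pipelines, and are
 * NULL/0 for regular vkCreateComputePipelines calls. */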
7249 VkResult
7250 radv_compute_pipeline_create(VkDevice _device, VkPipelineCache _cache,
7251                              const VkComputePipelineCreateInfo *pCreateInfo,
7252                              const VkAllocationCallbacks *pAllocator, const uint8_t *custom_hash,
7253                              struct radv_pipeline_shader_stack_size *rt_stack_sizes,
7254                              uint32_t rt_group_count, VkPipeline *pPipeline)
7255 {
7256    RADV_FROM_HANDLE(radv_device, device, _device);
7257    RADV_FROM_HANDLE(radv_pipeline_cache, cache, _cache);
7258    RADV_FROM_HANDLE(radv_pipeline_layout, pipeline_layout, pCreateInfo->layout);
7259    struct radv_compute_pipeline *pipeline;
7260    VkResult result;
7261 
7262    pipeline = vk_zalloc2(&device->vk.alloc, pAllocator, sizeof(*pipeline), 8,
7263                          VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
7264    if (pipeline == NULL) {
7265       free(rt_stack_sizes);
7266       return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
7267    }
7268 
7269    radv_pipeline_init(device, &pipeline->base, RADV_PIPELINE_COMPUTE);
7270 
7271    pipeline->rt_stack_sizes = rt_stack_sizes;
7272    pipeline->group_count = rt_group_count;
7273 
7274    const VkPipelineCreationFeedbackCreateInfo *creation_feedback =
7275       vk_find_struct_const(pCreateInfo->pNext, PIPELINE_CREATION_FEEDBACK_CREATE_INFO);
7276 
7277    struct radv_pipeline_key key = radv_generate_compute_pipeline_key(pipeline, pCreateInfo);
7278 
7279    UNUSED gl_shader_stage last_vgt_api_stage = MESA_SHADER_NONE;
7280    result = radv_create_shaders(&pipeline->base, pipeline_layout, device, cache, &key, &pCreateInfo->stage,
7281                                 1, pCreateInfo->flags, custom_hash, creation_feedback,
7282                                 &pipeline->rt_stack_sizes, &pipeline->group_count,
7283                                 &last_vgt_api_stage);
7284    if (result != VK_SUCCESS) {
7285       radv_pipeline_destroy(device, &pipeline->base, pAllocator);
7286       return result;
7287    }
7288 
7289    pipeline->base.user_data_0[MESA_SHADER_COMPUTE] = R_00B900_COMPUTE_USER_DATA_0;
7290    pipeline->base.need_indirect_descriptor_sets |=
7291       radv_shader_need_indirect_descriptor_sets(&pipeline->base, MESA_SHADER_COMPUTE);
7292    radv_pipeline_init_scratch(device, &pipeline->base);
7293 
7294    pipeline->base.push_constant_size = pipeline_layout->push_constant_size;
7295    pipeline->base.dynamic_offset_count = pipeline_layout->dynamic_offset_count;
7296 
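   /* On ASICs with the CS register allocation hang bug, remember whether this pipeline's
    * workgroup exceeds 256 threads so that the dispatch code can apply the workaround
    * when needed. */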
7297    if (device->physical_device->rad_info.has_cs_regalloc_hang_bug) {
7298       struct radv_shader *compute_shader = pipeline->base.shaders[MESA_SHADER_COMPUTE];
7299       unsigned *cs_block_size = compute_shader->info.cs.block_size;
7300 
7301       pipeline->cs_regalloc_hang_bug = cs_block_size[0] * cs_block_size[1] * cs_block_size[2] > 256;
7302    }
7303 
7304    radv_compute_generate_pm4(pipeline);
7305 
7306    *pPipeline = radv_pipeline_to_handle(&pipeline->base);
7307 
7308    return VK_SUCCESS;
7309 }
7310 
7311 VKAPI_ATTR VkResult VKAPI_CALL
7312 radv_CreateComputePipelines(VkDevice _device, VkPipelineCache pipelineCache, uint32_t count,
7313                             const VkComputePipelineCreateInfo *pCreateInfos,
7314                             const VkAllocationCallbacks *pAllocator, VkPipeline *pPipelines)
7315 {
7316    VkResult result = VK_SUCCESS;
7317 
7318    unsigned i = 0;
7319    for (; i < count; i++) {
7320       VkResult r;
7321       r = radv_compute_pipeline_create(_device, pipelineCache, &pCreateInfos[i], pAllocator, NULL,
7322                                        NULL, 0, &pPipelines[i]);
7323       if (r != VK_SUCCESS) {
7324          result = r;
7325          pPipelines[i] = VK_NULL_HANDLE;
7326 
7327          if (pCreateInfos[i].flags & VK_PIPELINE_CREATE_EARLY_RETURN_ON_FAILURE_BIT)
7328             break;
7329       }
7330    }
7331 
7332    for (; i < count; ++i)
7333       pPipelines[i] = VK_NULL_HANDLE;
7334 
7335    return result;
7336 }
7337 
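/* Number of VK_KHR_pipeline_executable_properties executables: one per compiled shader,
 * plus an extra one for the GS copy shader when legacy (non-NGG) geometry shading is used. */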
7338 static uint32_t
7339 radv_get_executable_count(struct radv_pipeline *pipeline)
7340 {
7341    uint32_t ret = 0;
7342    for (int i = 0; i < MESA_VULKAN_SHADER_STAGES; ++i) {
7343       if (!pipeline->shaders[i])
7344          continue;
7345 
7346       if (i == MESA_SHADER_GEOMETRY &&
7347           !radv_pipeline_has_ngg(radv_pipeline_to_graphics(pipeline))) {
7348          ret += 2u;
7349       } else {
7350          ret += 1u;
7351       }
7352    }
7353    return ret;
7354 }
7355 
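/* Map an executable index back to its radv_shader, walking the stages in the same order as
 * radv_get_executable_count (the GS copy shader immediately follows the geometry shader). */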
7356 static struct radv_shader *
7357 radv_get_shader_from_executable_index(struct radv_pipeline *pipeline, int index,
7358                                       gl_shader_stage *stage)
7359 {
7360    for (int i = 0; i < MESA_VULKAN_SHADER_STAGES; ++i) {
7361       if (!pipeline->shaders[i])
7362          continue;
7363       if (!index) {
7364          *stage = i;
7365          return pipeline->shaders[i];
7366       }
7367 
7368       --index;
7369 
7370       if (i == MESA_SHADER_GEOMETRY &&
7371           !radv_pipeline_has_ngg(radv_pipeline_to_graphics(pipeline))) {
7372          if (!index) {
7373             *stage = i;
7374             return pipeline->gs_copy_shader;
7375          }
7376          --index;
7377       }
7378    }
7379 
7380    *stage = -1;
7381    return NULL;
7382 }
7383 
7384 /* Basically strlcpy (which glibc does not provide), specialized for descriptions:
7385  * copies the string and zero-fills the rest of the VK_MAX_DESCRIPTION_SIZE buffer. */
7386 static void
7387 desc_copy(char *desc, const char *src)
7388 {
7389    int len = strlen(src);
7390    assert(len < VK_MAX_DESCRIPTION_SIZE);
7391    memcpy(desc, src, len);
7392    memset(desc + len, 0, VK_MAX_DESCRIPTION_SIZE - len);
7393 }
7394 
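/* Report executable properties. Stages that were merged at compile time (e.g. VS+TCS or
 * VS/TES+GS) are exposed as a single executable with multiple stage bits set. */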
7395 VKAPI_ATTR VkResult VKAPI_CALL
7396 radv_GetPipelineExecutablePropertiesKHR(VkDevice _device, const VkPipelineInfoKHR *pPipelineInfo,
7397                                         uint32_t *pExecutableCount,
7398                                         VkPipelineExecutablePropertiesKHR *pProperties)
7399 {
7400    RADV_FROM_HANDLE(radv_pipeline, pipeline, pPipelineInfo->pipeline);
7401    const uint32_t total_count = radv_get_executable_count(pipeline);
7402 
7403    if (!pProperties) {
7404       *pExecutableCount = total_count;
7405       return VK_SUCCESS;
7406    }
7407 
7408    const uint32_t count = MIN2(total_count, *pExecutableCount);
7409    for (unsigned i = 0, executable_idx = 0; i < MESA_VULKAN_SHADER_STAGES && executable_idx < count; ++i) {
7410       if (!pipeline->shaders[i])
7411          continue;
7412       pProperties[executable_idx].stages = mesa_to_vk_shader_stage(i);
7413       const char *name = NULL;
7414       const char *description = NULL;
7415       switch (i) {
7416       case MESA_SHADER_VERTEX:
7417          name = "Vertex Shader";
7418          description = "Vulkan Vertex Shader";
7419          break;
7420       case MESA_SHADER_TESS_CTRL:
7421          if (!pipeline->shaders[MESA_SHADER_VERTEX]) {
7422             pProperties[executable_idx].stages |= VK_SHADER_STAGE_VERTEX_BIT;
7423             name = "Vertex + Tessellation Control Shaders";
7424             description = "Combined Vulkan Vertex and Tessellation Control Shaders";
7425          } else {
7426             name = "Tessellation Control Shader";
7427             description = "Vulkan Tessellation Control Shader";
7428          }
7429          break;
7430       case MESA_SHADER_TESS_EVAL:
7431          name = "Tessellation Evaluation Shader";
7432          description = "Vulkan Tessellation Evaluation Shader";
7433          break;
7434       case MESA_SHADER_GEOMETRY:
7435          if (pipeline->shaders[MESA_SHADER_TESS_CTRL] && !pipeline->shaders[MESA_SHADER_TESS_EVAL]) {
7436             pProperties[executable_idx].stages |= VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT;
7437             name = "Tessellation Evaluation + Geometry Shaders";
7438             description = "Combined Vulkan Tessellation Evaluation and Geometry Shaders";
7439          } else if (!pipeline->shaders[MESA_SHADER_TESS_CTRL] && !pipeline->shaders[MESA_SHADER_VERTEX]) {
7440             pProperties[executable_idx].stages |= VK_SHADER_STAGE_VERTEX_BIT;
7441             name = "Vertex + Geometry Shader";
7442             description = "Combined Vulkan Vertex and Geometry Shaders";
7443          } else {
7444             name = "Geometry Shader";
7445             description = "Vulkan Geometry Shader";
7446          }
7447          break;
7448       case MESA_SHADER_FRAGMENT:
7449          name = "Fragment Shader";
7450          description = "Vulkan Fragment Shader";
7451          break;
7452       case MESA_SHADER_COMPUTE:
7453          name = "Compute Shader";
7454          description = "Vulkan Compute Shader";
7455          break;
7456       case MESA_SHADER_MESH:
7457          name = "Mesh Shader";
7458          description = "Vulkan Mesh Shader";
7459          break;
7460       case MESA_SHADER_TASK:
7461          name = "Task Shader";
7462          description = "Vulkan Task Shader";
7463          break;
7464       }
7465 
7466       pProperties[executable_idx].subgroupSize = pipeline->shaders[i]->info.wave_size;
7467       desc_copy(pProperties[executable_idx].name, name);
7468       desc_copy(pProperties[executable_idx].description, description);
7469 
7470       ++executable_idx;
7471       if (i == MESA_SHADER_GEOMETRY &&
7472           !radv_pipeline_has_ngg(radv_pipeline_to_graphics(pipeline))) {
7473          assert(pipeline->gs_copy_shader);
7474          if (executable_idx >= count)
7475             break;
7476 
7477          pProperties[executable_idx].stages = VK_SHADER_STAGE_GEOMETRY_BIT;
7478          pProperties[executable_idx].subgroupSize = 64;
7479          desc_copy(pProperties[executable_idx].name, "GS Copy Shader");
7480          desc_copy(pProperties[executable_idx].description,
7481                    "Extra shader stage that loads the GS output ringbuffer into the rasterizer");
7482 
7483          ++executable_idx;
7484       }
7485    }
7486 
7487    VkResult result = *pExecutableCount < total_count ? VK_INCOMPLETE : VK_SUCCESS;
7488    *pExecutableCount = count;
7489    return result;
7490 }
7491 
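/* Report per-executable statistics using the usual Vulkan two-call idiom: writes are guarded
 * by "s < end" but the cursor always advances, so a NULL pStatistics yields the total count
 * and a short buffer yields VK_INCOMPLETE. A caller would typically enumerate like this
 * (hypothetical application-side sketch):
 *
 *    uint32_t count = 0;
 *    vkGetPipelineExecutableStatisticsKHR(device, &exec_info, &count, NULL);
 *    VkPipelineExecutableStatisticKHR *stats = calloc(count, sizeof(*stats));
 *    for (uint32_t i = 0; i < count; i++)
 *       stats[i].sType = VK_STRUCTURE_TYPE_PIPELINE_EXECUTABLE_STATISTIC_KHR;
 *    vkGetPipelineExecutableStatisticsKHR(device, &exec_info, &count, stats);
 */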
7492 VKAPI_ATTR VkResult VKAPI_CALL
7493 radv_GetPipelineExecutableStatisticsKHR(VkDevice _device,
7494                                         const VkPipelineExecutableInfoKHR *pExecutableInfo,
7495                                         uint32_t *pStatisticCount,
7496                                         VkPipelineExecutableStatisticKHR *pStatistics)
7497 {
7498    RADV_FROM_HANDLE(radv_device, device, _device);
7499    RADV_FROM_HANDLE(radv_pipeline, pipeline, pExecutableInfo->pipeline);
7500    gl_shader_stage stage;
7501    struct radv_shader *shader =
7502       radv_get_shader_from_executable_index(pipeline, pExecutableInfo->executableIndex, &stage);
7503 
7504    const struct radv_physical_device *pdevice = device->physical_device;
7505 
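   /* Granularity used to convert the encoded LDS size to bytes: 1024 for GFX11 fragment
    * shaders, otherwise the device's LDS encode granularity. */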
7506    unsigned lds_increment = pdevice->rad_info.gfx_level >= GFX11 && stage == MESA_SHADER_FRAGMENT
7507       ? 1024 : pdevice->rad_info.lds_encode_granularity;
7508    unsigned max_waves = radv_get_max_waves(device, shader, stage);
7509 
7510    VkPipelineExecutableStatisticKHR *s = pStatistics;
7511    VkPipelineExecutableStatisticKHR *end = s + (pStatistics ? *pStatisticCount : 0);
7512    VkResult result = VK_SUCCESS;
7513 
7514    if (s < end) {
7515       desc_copy(s->name, "Driver pipeline hash");
7516       desc_copy(s->description, "Driver pipeline hash used by RGP");
7517       s->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
7518       s->value.u64 = pipeline->pipeline_hash;
7519    }
7520    ++s;
7521 
7522    if (s < end) {
7523       desc_copy(s->name, "SGPRs");
7524       desc_copy(s->description, "Number of SGPR registers allocated per subgroup");
7525       s->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
7526       s->value.u64 = shader->config.num_sgprs;
7527    }
7528    ++s;
7529 
7530    if (s < end) {
7531       desc_copy(s->name, "VGPRs");
7532       desc_copy(s->description, "Number of VGPR registers allocated per subgroup");
7533       s->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
7534       s->value.u64 = shader->config.num_vgprs;
7535    }
7536    ++s;
7537 
7538    if (s < end) {
7539       desc_copy(s->name, "Spilled SGPRs");
7540       desc_copy(s->description, "Number of SGPR registers spilled per subgroup");
7541       s->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
7542       s->value.u64 = shader->config.spilled_sgprs;
7543    }
7544    ++s;
7545 
7546    if (s < end) {
7547       desc_copy(s->name, "Spilled VGPRs");
7548       desc_copy(s->description, "Number of VGPR registers spilled per subgroup");
7549       s->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
7550       s->value.u64 = shader->config.spilled_vgprs;
7551    }
7552    ++s;
7553 
7554    if (s < end) {
7555       desc_copy(s->name, "Code size");
7556       desc_copy(s->description, "Code size in bytes");
7557       s->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
7558       s->value.u64 = shader->exec_size;
7559    }
7560    ++s;
7561 
7562    if (s < end) {
7563       desc_copy(s->name, "LDS size");
7564       desc_copy(s->description, "LDS size in bytes per workgroup");
7565       s->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
7566       s->value.u64 = shader->config.lds_size * lds_increment;
7567    }
7568    ++s;
7569 
7570    if (s < end) {
7571       desc_copy(s->name, "Scratch size");
7572       desc_copy(s->description, "Private memory in bytes per subgroup");
7573       s->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
7574       s->value.u64 = shader->config.scratch_bytes_per_wave;
7575    }
7576    ++s;
7577 
7578    if (s < end) {
7579       desc_copy(s->name, "Subgroups per SIMD");
7580       desc_copy(s->description, "The maximum number of subgroups in flight on a SIMD unit");
7581       s->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
7582       s->value.u64 = max_waves;
7583    }
7584    ++s;
7585 
7586    if (shader->statistics) {
7587       for (unsigned i = 0; i < aco_num_statistics; i++) {
7588          const struct aco_compiler_statistic_info *info = &aco_statistic_infos[i];
7589          if (s < end) {
7590             desc_copy(s->name, info->name);
7591             desc_copy(s->description, info->desc);
7592             s->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
7593             s->value.u64 = shader->statistics[i];
7594          }
7595          ++s;
7596       }
7597    }
7598 
7599    if (!pStatistics)
7600       *pStatisticCount = s - pStatistics;
7601    else if (s > end) {
7602       *pStatisticCount = end - pStatistics;
7603       result = VK_INCOMPLETE;
7604    } else {
7605       *pStatisticCount = s - pStatistics;
7606    }
7607 
7608    return result;
7609 }
7610 
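/* Copy a NUL-terminated representation string into the caller's buffer, truncating if needed;
 * returns the required size when data is NULL and VK_INCOMPLETE when the buffer is too small. */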
7611 static VkResult
7612 radv_copy_representation(void *data, size_t *data_size, const char *src)
7613 {
7614    size_t total_size = strlen(src) + 1;
7615 
7616    if (!data) {
7617       *data_size = total_size;
7618       return VK_SUCCESS;
7619    }
7620 
7621    size_t size = MIN2(total_size, *data_size);
7622 
7623    memcpy(data, src, size);
7624    if (size)
7625       *((char *)data + size - 1) = 0;
7626    return size < total_size ? VK_INCOMPLETE : VK_SUCCESS;
7627 }
7628 
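/* Expose up to three textual internal representations per executable: the optimized NIR, the
 * backend IR (ACO or LLVM, depending on which compiler handled the stage), and the final
 * disassembly when it is available. */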
7629 VKAPI_ATTR VkResult VKAPI_CALL
7630 radv_GetPipelineExecutableInternalRepresentationsKHR(
7631    VkDevice _device, const VkPipelineExecutableInfoKHR *pExecutableInfo,
7632    uint32_t *pInternalRepresentationCount,
7633    VkPipelineExecutableInternalRepresentationKHR *pInternalRepresentations)
7634 {
7635    RADV_FROM_HANDLE(radv_device, device, _device);
7636    RADV_FROM_HANDLE(radv_pipeline, pipeline, pExecutableInfo->pipeline);
7637    gl_shader_stage stage;
7638    struct radv_shader *shader =
7639       radv_get_shader_from_executable_index(pipeline, pExecutableInfo->executableIndex, &stage);
7640 
7641    VkPipelineExecutableInternalRepresentationKHR *p = pInternalRepresentations;
7642    VkPipelineExecutableInternalRepresentationKHR *end =
7643       p + (pInternalRepresentations ? *pInternalRepresentationCount : 0);
7644    VkResult result = VK_SUCCESS;
7645    /* optimized NIR */
7646    if (p < end) {
7647       p->isText = true;
7648       desc_copy(p->name, "NIR Shader(s)");
7649       desc_copy(p->description, "The optimized NIR shader(s)");
7650       if (radv_copy_representation(p->pData, &p->dataSize, shader->nir_string) != VK_SUCCESS)
7651          result = VK_INCOMPLETE;
7652    }
7653    ++p;
7654 
7655    /* backend IR */
7656    if (p < end) {
7657       p->isText = true;
7658       if (radv_use_llvm_for_stage(device, stage)) {
7659          desc_copy(p->name, "LLVM IR");
7660          desc_copy(p->description, "The LLVM IR after some optimizations");
7661       } else {
7662          desc_copy(p->name, "ACO IR");
7663          desc_copy(p->description, "The ACO IR after some optimizations");
7664       }
7665       if (radv_copy_representation(p->pData, &p->dataSize, shader->ir_string) != VK_SUCCESS)
7666          result = VK_INCOMPLETE;
7667    }
7668    ++p;
7669 
7670    /* Disassembly */
7671    if (p < end && shader->disasm_string) {
7672       p->isText = true;
7673       desc_copy(p->name, "Assembly");
7674       desc_copy(p->description, "Final Assembly");
7675       if (radv_copy_representation(p->pData, &p->dataSize, shader->disasm_string) != VK_SUCCESS)
7676          result = VK_INCOMPLETE;
7677    }
7678    ++p;
7679 
7680    if (!pInternalRepresentations)
7681       *pInternalRepresentationCount = p - pInternalRepresentations;
7682    else if (p > end) {
7683       result = VK_INCOMPLETE;
7684       *pInternalRepresentationCount = end - pInternalRepresentations;
7685    } else {
7686       *pInternalRepresentationCount = p - pInternalRepresentations;
7687    }
7688 
7689    return result;
7690 }
7691