1 /*
2  * Copyright © 2016 Red Hat.
3  * Copyright © 2016 Bas Nieuwenhuizen
4  *
5  * based in part on anv driver which is:
6  * Copyright © 2015 Intel Corporation
7  *
8  * Permission is hereby granted, free of charge, to any person obtaining a
9  * copy of this software and associated documentation files (the "Software"),
10  * to deal in the Software without restriction, including without limitation
11  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
12  * and/or sell copies of the Software, and to permit persons to whom the
13  * Software is furnished to do so, subject to the following conditions:
14  *
15  * The above copyright notice and this permission notice (including the next
16  * paragraph) shall be included in all copies or substantial portions of the
17  * Software.
18  *
19  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
20  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
22  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
24  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
25  * IN THE SOFTWARE.
26  */
27 
28 #include "meta/radv_meta.h"
29 #include "radv_cs.h"
30 #include "radv_debug.h"
31 #include "radv_private.h"
32 #include "radv_radeon_winsys.h"
33 #include "radv_shader.h"
34 #include "sid.h"
35 #include "vk_common_entrypoints.h"
36 #include "vk_enum_defines.h"
37 #include "vk_format.h"
38 #include "vk_framebuffer.h"
39 #include "vk_render_pass.h"
40 #include "vk_util.h"
41 
42 #include "ac_debug.h"
43 #include "ac_shader_args.h"
44 
45 #include "aco_interface.h"
46 
47 #include "util/fast_idiv_by_const.h"
48 
49 enum {
50    RADV_PREFETCH_VBO_DESCRIPTORS = (1 << 0),
51    RADV_PREFETCH_VS = (1 << 1),
52    RADV_PREFETCH_TCS = (1 << 2),
53    RADV_PREFETCH_TES = (1 << 3),
54    RADV_PREFETCH_GS = (1 << 4),
55    RADV_PREFETCH_PS = (1 << 5),
56    RADV_PREFETCH_MS = (1 << 6),
57    RADV_PREFETCH_SHADERS = (RADV_PREFETCH_VS | RADV_PREFETCH_TCS | RADV_PREFETCH_TES | RADV_PREFETCH_GS |
58                             RADV_PREFETCH_PS | RADV_PREFETCH_MS)
59 };
60 
61 static void radv_handle_image_transition(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image,
62                                          VkImageLayout src_layout, VkImageLayout dst_layout, uint32_t src_family_index,
63                                          uint32_t dst_family_index, const VkImageSubresourceRange *range,
64                                          struct radv_sample_locations_state *sample_locs);
65 
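/* Copy the dynamic state from src into the command buffer's state and build a
 * mask of the states that actually changed, so only those are marked dirty
 * and re-emitted. */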
66 static void
67 radv_bind_dynamic_state(struct radv_cmd_buffer *cmd_buffer, const struct radv_dynamic_state *src)
68 {
69    struct radv_dynamic_state *dest = &cmd_buffer->state.dynamic;
70    uint64_t copy_mask = src->mask;
71    uint64_t dest_mask = 0;
72 
73    dest->vk.dr.rectangle_count = src->vk.dr.rectangle_count;
74    dest->sample_location.count = src->sample_location.count;
75 
76    if (copy_mask & RADV_DYNAMIC_VIEWPORT) {
77       if (dest->vk.vp.viewport_count != src->vk.vp.viewport_count) {
78          dest->vk.vp.viewport_count = src->vk.vp.viewport_count;
79          dest_mask |= RADV_DYNAMIC_VIEWPORT;
80       }
81 
82       if (memcmp(&dest->vk.vp.viewports, &src->vk.vp.viewports, src->vk.vp.viewport_count * sizeof(VkViewport))) {
83          typed_memcpy(dest->vk.vp.viewports, src->vk.vp.viewports, src->vk.vp.viewport_count);
84          typed_memcpy(dest->hw_vp.xform, src->hw_vp.xform, src->vk.vp.viewport_count);
85          dest_mask |= RADV_DYNAMIC_VIEWPORT;
86       }
87    }
88 
89    if (copy_mask & RADV_DYNAMIC_SCISSOR) {
90       if (dest->vk.vp.scissor_count != src->vk.vp.scissor_count) {
91          dest->vk.vp.scissor_count = src->vk.vp.scissor_count;
92          dest_mask |= RADV_DYNAMIC_SCISSOR;
93       }
94 
95       if (memcmp(&dest->vk.vp.scissors, &src->vk.vp.scissors, src->vk.vp.scissor_count * sizeof(VkRect2D))) {
96          typed_memcpy(dest->vk.vp.scissors, src->vk.vp.scissors, src->vk.vp.scissor_count);
97          dest_mask |= RADV_DYNAMIC_SCISSOR;
98       }
99    }
100 
101    if (copy_mask & RADV_DYNAMIC_BLEND_CONSTANTS) {
102       if (memcmp(&dest->vk.cb.blend_constants, &src->vk.cb.blend_constants, sizeof(src->vk.cb.blend_constants))) {
103          typed_memcpy(dest->vk.cb.blend_constants, src->vk.cb.blend_constants, 4);
104          dest_mask |= RADV_DYNAMIC_BLEND_CONSTANTS;
105       }
106    }
107 
108    if (copy_mask & RADV_DYNAMIC_DISCARD_RECTANGLE) {
109       if (memcmp(&dest->vk.dr.rectangles, &src->vk.dr.rectangles, src->vk.dr.rectangle_count * sizeof(VkRect2D))) {
110          typed_memcpy(dest->vk.dr.rectangles, src->vk.dr.rectangles, src->vk.dr.rectangle_count);
111          dest_mask |= RADV_DYNAMIC_DISCARD_RECTANGLE;
112       }
113    }
114 
115    if (copy_mask & RADV_DYNAMIC_SAMPLE_LOCATIONS) {
116       if (dest->sample_location.per_pixel != src->sample_location.per_pixel ||
117           dest->sample_location.grid_size.width != src->sample_location.grid_size.width ||
118           dest->sample_location.grid_size.height != src->sample_location.grid_size.height ||
119           memcmp(&dest->sample_location.locations, &src->sample_location.locations,
120                  src->sample_location.count * sizeof(VkSampleLocationEXT))) {
121          dest->sample_location.per_pixel = src->sample_location.per_pixel;
122          dest->sample_location.grid_size = src->sample_location.grid_size;
123          typed_memcpy(dest->sample_location.locations, src->sample_location.locations, src->sample_location.count);
124          dest_mask |= RADV_DYNAMIC_SAMPLE_LOCATIONS;
125       }
126    }
127 
128    if (copy_mask & RADV_DYNAMIC_COLOR_WRITE_MASK) {
129       for (uint32_t i = 0; i < MAX_RTS; i++) {
130          if (dest->vk.cb.attachments[i].write_mask != src->vk.cb.attachments[i].write_mask) {
131             dest->vk.cb.attachments[i].write_mask = src->vk.cb.attachments[i].write_mask;
132             dest_mask |= RADV_DYNAMIC_COLOR_WRITE_MASK;
133          }
134       }
135    }
136 
137    if (copy_mask & RADV_DYNAMIC_COLOR_BLEND_ENABLE) {
138       for (uint32_t i = 0; i < MAX_RTS; i++) {
139          if (dest->vk.cb.attachments[i].blend_enable != src->vk.cb.attachments[i].blend_enable) {
140             dest->vk.cb.attachments[i].blend_enable = src->vk.cb.attachments[i].blend_enable;
141             dest_mask |= RADV_DYNAMIC_COLOR_BLEND_ENABLE;
142          }
143       }
144    }
145 
146    if (copy_mask & RADV_DYNAMIC_COLOR_BLEND_EQUATION) {
147       for (uint32_t i = 0; i < MAX_RTS; i++) {
148          if (dest->vk.cb.attachments[i].src_color_blend_factor != src->vk.cb.attachments[i].src_color_blend_factor ||
149              dest->vk.cb.attachments[i].dst_color_blend_factor != src->vk.cb.attachments[i].dst_color_blend_factor ||
150              dest->vk.cb.attachments[i].color_blend_op != src->vk.cb.attachments[i].color_blend_op ||
151              dest->vk.cb.attachments[i].src_alpha_blend_factor != src->vk.cb.attachments[i].src_alpha_blend_factor ||
152              dest->vk.cb.attachments[i].dst_alpha_blend_factor != src->vk.cb.attachments[i].dst_alpha_blend_factor ||
153              dest->vk.cb.attachments[i].alpha_blend_op != src->vk.cb.attachments[i].alpha_blend_op) {
154             dest->vk.cb.attachments[i].src_color_blend_factor = src->vk.cb.attachments[i].src_color_blend_factor;
155             dest->vk.cb.attachments[i].dst_color_blend_factor = src->vk.cb.attachments[i].dst_color_blend_factor;
156             dest->vk.cb.attachments[i].color_blend_op = src->vk.cb.attachments[i].color_blend_op;
157             dest->vk.cb.attachments[i].src_alpha_blend_factor = src->vk.cb.attachments[i].src_alpha_blend_factor;
158             dest->vk.cb.attachments[i].dst_alpha_blend_factor = src->vk.cb.attachments[i].dst_alpha_blend_factor;
159             dest->vk.cb.attachments[i].alpha_blend_op = src->vk.cb.attachments[i].alpha_blend_op;
160             dest_mask |= RADV_DYNAMIC_COLOR_BLEND_EQUATION;
161          }
162       }
163    }
164 
165 #define RADV_CMP_COPY(field, flag)                                                                                     \
166    if (copy_mask & flag) {                                                                                             \
167       if (dest->field != src->field) {                                                                                 \
168          dest->field = src->field;                                                                                     \
169          dest_mask |= flag;                                                                                            \
170       }                                                                                                                \
171    }
172 
173    RADV_CMP_COPY(vk.ia.primitive_topology, RADV_DYNAMIC_PRIMITIVE_TOPOLOGY);
174    RADV_CMP_COPY(vk.ia.primitive_restart_enable, RADV_DYNAMIC_PRIMITIVE_RESTART_ENABLE);
175 
176    RADV_CMP_COPY(vk.vp.depth_clip_negative_one_to_one, RADV_DYNAMIC_DEPTH_CLIP_NEGATIVE_ONE_TO_ONE);
177 
178    RADV_CMP_COPY(vk.ts.patch_control_points, RADV_DYNAMIC_PATCH_CONTROL_POINTS);
179    RADV_CMP_COPY(vk.ts.domain_origin, RADV_DYNAMIC_TESS_DOMAIN_ORIGIN);
180 
181    RADV_CMP_COPY(vk.rs.line.width, RADV_DYNAMIC_LINE_WIDTH);
182    RADV_CMP_COPY(vk.rs.depth_bias.constant, RADV_DYNAMIC_DEPTH_BIAS);
183    RADV_CMP_COPY(vk.rs.depth_bias.clamp, RADV_DYNAMIC_DEPTH_BIAS);
184    RADV_CMP_COPY(vk.rs.depth_bias.slope, RADV_DYNAMIC_DEPTH_BIAS);
185    RADV_CMP_COPY(vk.rs.depth_bias.representation, RADV_DYNAMIC_DEPTH_BIAS);
186    RADV_CMP_COPY(vk.rs.line.stipple.factor, RADV_DYNAMIC_LINE_STIPPLE);
187    RADV_CMP_COPY(vk.rs.line.stipple.pattern, RADV_DYNAMIC_LINE_STIPPLE);
188    RADV_CMP_COPY(vk.rs.cull_mode, RADV_DYNAMIC_CULL_MODE);
189    RADV_CMP_COPY(vk.rs.front_face, RADV_DYNAMIC_FRONT_FACE);
190    RADV_CMP_COPY(vk.rs.depth_bias.enable, RADV_DYNAMIC_DEPTH_BIAS_ENABLE);
191    RADV_CMP_COPY(vk.rs.rasterizer_discard_enable, RADV_DYNAMIC_RASTERIZER_DISCARD_ENABLE);
192    RADV_CMP_COPY(vk.rs.polygon_mode, RADV_DYNAMIC_POLYGON_MODE);
193    RADV_CMP_COPY(vk.rs.line.stipple.enable, RADV_DYNAMIC_LINE_STIPPLE_ENABLE);
194    RADV_CMP_COPY(vk.rs.depth_clip_enable, RADV_DYNAMIC_DEPTH_CLIP_ENABLE);
195    RADV_CMP_COPY(vk.rs.conservative_mode, RADV_DYNAMIC_CONSERVATIVE_RAST_MODE);
196    RADV_CMP_COPY(vk.rs.provoking_vertex, RADV_DYNAMIC_PROVOKING_VERTEX_MODE);
197    RADV_CMP_COPY(vk.rs.depth_clamp_enable, RADV_DYNAMIC_DEPTH_CLAMP_ENABLE);
198    RADV_CMP_COPY(vk.rs.line.mode, RADV_DYNAMIC_LINE_RASTERIZATION_MODE);
199 
200    RADV_CMP_COPY(vk.ms.alpha_to_coverage_enable, RADV_DYNAMIC_ALPHA_TO_COVERAGE_ENABLE);
201    RADV_CMP_COPY(vk.ms.sample_mask, RADV_DYNAMIC_SAMPLE_MASK);
202    RADV_CMP_COPY(vk.ms.rasterization_samples, RADV_DYNAMIC_RASTERIZATION_SAMPLES);
203    RADV_CMP_COPY(vk.ms.sample_locations_enable, RADV_DYNAMIC_SAMPLE_LOCATIONS_ENABLE);
204 
205    RADV_CMP_COPY(vk.ds.depth.bounds_test.min, RADV_DYNAMIC_DEPTH_BOUNDS);
206    RADV_CMP_COPY(vk.ds.depth.bounds_test.max, RADV_DYNAMIC_DEPTH_BOUNDS);
207    RADV_CMP_COPY(vk.ds.stencil.front.compare_mask, RADV_DYNAMIC_STENCIL_COMPARE_MASK);
208    RADV_CMP_COPY(vk.ds.stencil.back.compare_mask, RADV_DYNAMIC_STENCIL_COMPARE_MASK);
209    RADV_CMP_COPY(vk.ds.stencil.front.write_mask, RADV_DYNAMIC_STENCIL_WRITE_MASK);
210    RADV_CMP_COPY(vk.ds.stencil.back.write_mask, RADV_DYNAMIC_STENCIL_WRITE_MASK);
211    RADV_CMP_COPY(vk.ds.stencil.front.reference, RADV_DYNAMIC_STENCIL_REFERENCE);
212    RADV_CMP_COPY(vk.ds.stencil.back.reference, RADV_DYNAMIC_STENCIL_REFERENCE);
213    RADV_CMP_COPY(vk.ds.depth.test_enable, RADV_DYNAMIC_DEPTH_TEST_ENABLE);
214    RADV_CMP_COPY(vk.ds.depth.write_enable, RADV_DYNAMIC_DEPTH_WRITE_ENABLE);
215    RADV_CMP_COPY(vk.ds.depth.compare_op, RADV_DYNAMIC_DEPTH_COMPARE_OP);
216    RADV_CMP_COPY(vk.ds.depth.bounds_test.enable, RADV_DYNAMIC_DEPTH_BOUNDS_TEST_ENABLE);
217    RADV_CMP_COPY(vk.ds.stencil.test_enable, RADV_DYNAMIC_STENCIL_TEST_ENABLE);
218    RADV_CMP_COPY(vk.ds.stencil.front.op.fail, RADV_DYNAMIC_STENCIL_OP);
219    RADV_CMP_COPY(vk.ds.stencil.front.op.pass, RADV_DYNAMIC_STENCIL_OP);
220    RADV_CMP_COPY(vk.ds.stencil.front.op.depth_fail, RADV_DYNAMIC_STENCIL_OP);
221    RADV_CMP_COPY(vk.ds.stencil.front.op.compare, RADV_DYNAMIC_STENCIL_OP);
222    RADV_CMP_COPY(vk.ds.stencil.back.op.fail, RADV_DYNAMIC_STENCIL_OP);
223    RADV_CMP_COPY(vk.ds.stencil.back.op.pass, RADV_DYNAMIC_STENCIL_OP);
224    RADV_CMP_COPY(vk.ds.stencil.back.op.depth_fail, RADV_DYNAMIC_STENCIL_OP);
225    RADV_CMP_COPY(vk.ds.stencil.back.op.compare, RADV_DYNAMIC_STENCIL_OP);
226 
227    RADV_CMP_COPY(vk.cb.logic_op, RADV_DYNAMIC_LOGIC_OP);
228    RADV_CMP_COPY(vk.cb.color_write_enables, RADV_DYNAMIC_COLOR_WRITE_ENABLE);
229    RADV_CMP_COPY(vk.cb.logic_op_enable, RADV_DYNAMIC_LOGIC_OP_ENABLE);
230 
231    RADV_CMP_COPY(vk.fsr.fragment_size.width, RADV_DYNAMIC_FRAGMENT_SHADING_RATE);
232    RADV_CMP_COPY(vk.fsr.fragment_size.height, RADV_DYNAMIC_FRAGMENT_SHADING_RATE);
233    RADV_CMP_COPY(vk.fsr.combiner_ops[0], RADV_DYNAMIC_FRAGMENT_SHADING_RATE);
234    RADV_CMP_COPY(vk.fsr.combiner_ops[1], RADV_DYNAMIC_FRAGMENT_SHADING_RATE);
235 
236    RADV_CMP_COPY(vk.dr.enable, RADV_DYNAMIC_DISCARD_RECTANGLE_ENABLE);
237    RADV_CMP_COPY(vk.dr.mode, RADV_DYNAMIC_DISCARD_RECTANGLE_MODE);
238 
239    RADV_CMP_COPY(feedback_loop_aspects, RADV_DYNAMIC_ATTACHMENT_FEEDBACK_LOOP_ENABLE);
240 
241 #undef RADV_CMP_COPY
242 
243    cmd_buffer->state.dirty |= dest_mask;
244 
245    /* Handle driver-specific states that need to be re-emitted when PSOs are bound. */
246    if (dest_mask & (RADV_DYNAMIC_VIEWPORT | RADV_DYNAMIC_POLYGON_MODE | RADV_DYNAMIC_LINE_WIDTH |
247                     RADV_DYNAMIC_PRIMITIVE_TOPOLOGY)) {
248       cmd_buffer->state.dirty |= RADV_CMD_DIRTY_GUARDBAND;
249    }
250 
251    if (cmd_buffer->device->physical_device->rad_info.rbplus_allowed && (dest_mask & RADV_DYNAMIC_COLOR_WRITE_MASK)) {
252       cmd_buffer->state.dirty |= RADV_CMD_DIRTY_RBPLUS;
253    }
254 }
255 
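/* Compute command buffers are executed by the MEC (micro engine compute) on
 * GFX7 and newer. */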
256 bool
257 radv_cmd_buffer_uses_mec(struct radv_cmd_buffer *cmd_buffer)
258 {
259    return cmd_buffer->qf == RADV_QUEUE_COMPUTE && cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX7;
260 }
261 
262 enum amd_ip_type
263 radv_queue_family_to_ring(const struct radv_physical_device *physical_device, enum radv_queue_family f)
264 {
265    switch (f) {
266    case RADV_QUEUE_GENERAL:
267       return AMD_IP_GFX;
268    case RADV_QUEUE_COMPUTE:
269       return AMD_IP_COMPUTE;
270    case RADV_QUEUE_TRANSFER:
271       return AMD_IP_SDMA;
272    case RADV_QUEUE_VIDEO_DEC:
273       return physical_device->vid_decode_ip;
274    case RADV_QUEUE_VIDEO_ENC:
275       return AMD_IP_VCN_ENC;
276    default:
277       unreachable("Unknown queue family");
278    }
279 }
280 
281 static void
282 radv_write_data(struct radv_cmd_buffer *cmd_buffer, const unsigned engine_sel, const uint64_t va, const unsigned count,
283                 const uint32_t *data, const bool predicating)
284 {
285    radv_cs_write_data(cmd_buffer->device, cmd_buffer->cs, cmd_buffer->qf, engine_sel, va, count, data, predicating);
286 }
287 
288 static void
289 radv_emit_clear_data(struct radv_cmd_buffer *cmd_buffer, unsigned engine_sel, uint64_t va, unsigned size)
290 {
291    uint32_t *zeroes = alloca(size);
292    memset(zeroes, 0, size);
293    radv_write_data(cmd_buffer, engine_sel, va, size / 4, zeroes, false);
294 }
295 
296 static void
297 radv_cmd_buffer_finish_shader_part_cache(struct radv_cmd_buffer *cmd_buffer)
298 {
299    ralloc_free(cmd_buffer->vs_prologs.table);
300    ralloc_free(cmd_buffer->ps_epilogs.table);
301    ralloc_free(cmd_buffer->tcs_epilogs.table);
302 }
303 
304 static bool
305 radv_cmd_buffer_init_shader_part_cache(struct radv_device *device, struct radv_cmd_buffer *cmd_buffer)
306 {
307    if (device->vs_prologs.ops) {
308       if (!_mesa_set_init(&cmd_buffer->vs_prologs, NULL, device->vs_prologs.ops->hash, device->vs_prologs.ops->equals))
309          return false;
310    }
311    if (device->tcs_epilogs.ops) {
312       if (!_mesa_set_init(&cmd_buffer->tcs_epilogs, NULL, device->tcs_epilogs.ops->hash,
313                           device->tcs_epilogs.ops->equals))
314          return false;
315    }
316    if (device->ps_epilogs.ops) {
317       if (!_mesa_set_init(&cmd_buffer->ps_epilogs, NULL, device->ps_epilogs.ops->hash, device->ps_epilogs.ops->equals))
318          return false;
319    }
320    return true;
321 }
322 
323 static void
324 radv_destroy_cmd_buffer(struct vk_command_buffer *vk_cmd_buffer)
325 {
326    struct radv_cmd_buffer *cmd_buffer = container_of(vk_cmd_buffer, struct radv_cmd_buffer, vk);
327 
328    if (cmd_buffer->qf != RADV_QUEUE_SPARSE) {
329       util_dynarray_fini(&cmd_buffer->ray_history);
330 
331       list_for_each_entry_safe (struct radv_cmd_buffer_upload, up, &cmd_buffer->upload.list, list) {
332          radv_rmv_log_command_buffer_bo_destroy(cmd_buffer->device, up->upload_bo);
333          cmd_buffer->device->ws->buffer_destroy(cmd_buffer->device->ws, up->upload_bo);
334          list_del(&up->list);
335          free(up);
336       }
337 
338       if (cmd_buffer->upload.upload_bo) {
339          radv_rmv_log_command_buffer_bo_destroy(cmd_buffer->device, cmd_buffer->upload.upload_bo);
340          cmd_buffer->device->ws->buffer_destroy(cmd_buffer->device->ws, cmd_buffer->upload.upload_bo);
341       }
342 
343       if (cmd_buffer->cs)
344          cmd_buffer->device->ws->cs_destroy(cmd_buffer->cs);
345       if (cmd_buffer->gang.cs)
346          cmd_buffer->device->ws->cs_destroy(cmd_buffer->gang.cs);
347       if (cmd_buffer->transfer.copy_temp)
348          cmd_buffer->device->ws->buffer_destroy(cmd_buffer->device->ws, cmd_buffer->transfer.copy_temp);
349 
350       radv_cmd_buffer_finish_shader_part_cache(cmd_buffer);
351 
352       for (unsigned i = 0; i < MAX_BIND_POINTS; i++) {
353          struct radv_descriptor_set_header *set = &cmd_buffer->descriptors[i].push_set.set;
354          free(set->mapped_ptr);
355          if (set->layout)
356             vk_descriptor_set_layout_unref(&cmd_buffer->device->vk, &set->layout->vk);
357          vk_object_base_finish(&set->base);
358       }
359 
360       vk_object_base_finish(&cmd_buffer->meta_push_descriptors.base);
361    }
362 
363    vk_command_buffer_finish(&cmd_buffer->vk);
364    vk_free(&cmd_buffer->vk.pool->alloc, cmd_buffer);
365 }
366 
367 static VkResult
368 radv_create_cmd_buffer(struct vk_command_pool *pool, struct vk_command_buffer **cmd_buffer_out)
369 {
370    struct radv_device *device = container_of(pool->base.device, struct radv_device, vk);
371 
372    struct radv_cmd_buffer *cmd_buffer;
373    unsigned ring;
374    cmd_buffer = vk_zalloc(&pool->alloc, sizeof(*cmd_buffer), 8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
375    if (cmd_buffer == NULL)
376       return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
377 
378    VkResult result = vk_command_buffer_init(pool, &cmd_buffer->vk, &radv_cmd_buffer_ops, 0);
379    if (result != VK_SUCCESS) {
380       vk_free(&cmd_buffer->vk.pool->alloc, cmd_buffer);
381       return result;
382    }
383 
384    cmd_buffer->device = device;
385 
386    cmd_buffer->qf = vk_queue_to_radv(device->physical_device, pool->queue_family_index);
387 
388    if (cmd_buffer->qf != RADV_QUEUE_SPARSE) {
389       list_inithead(&cmd_buffer->upload.list);
390 
391       if (!radv_cmd_buffer_init_shader_part_cache(device, cmd_buffer)) {
392          radv_destroy_cmd_buffer(&cmd_buffer->vk);
393          return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
394       }
395 
396       ring = radv_queue_family_to_ring(device->physical_device, cmd_buffer->qf);
397 
398       cmd_buffer->cs =
399          device->ws->cs_create(device->ws, ring, cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY);
400       if (!cmd_buffer->cs) {
401          radv_destroy_cmd_buffer(&cmd_buffer->vk);
402          return vk_error(device, VK_ERROR_OUT_OF_DEVICE_MEMORY);
403       }
404 
405       vk_object_base_init(&device->vk, &cmd_buffer->meta_push_descriptors.base, VK_OBJECT_TYPE_DESCRIPTOR_SET);
406 
407       for (unsigned i = 0; i < MAX_BIND_POINTS; i++)
408          vk_object_base_init(&device->vk, &cmd_buffer->descriptors[i].push_set.set.base, VK_OBJECT_TYPE_DESCRIPTOR_SET);
409 
410       util_dynarray_init(&cmd_buffer->ray_history, NULL);
411    }
412 
413    *cmd_buffer_out = &cmd_buffer->vk;
414 
415    return VK_SUCCESS;
416 }
417 
418 void
419 radv_cmd_buffer_reset_rendering(struct radv_cmd_buffer *cmd_buffer)
420 {
421    memset(&cmd_buffer->state.render, 0, sizeof(cmd_buffer->state.render));
422 }
423 
424 static void
425 radv_reset_cmd_buffer(struct vk_command_buffer *vk_cmd_buffer, UNUSED VkCommandBufferResetFlags flags)
426 {
427    struct radv_cmd_buffer *cmd_buffer = container_of(vk_cmd_buffer, struct radv_cmd_buffer, vk);
428 
429    vk_command_buffer_reset(&cmd_buffer->vk);
430 
431    if (cmd_buffer->qf == RADV_QUEUE_SPARSE)
432       return;
433 
434    cmd_buffer->device->ws->cs_reset(cmd_buffer->cs);
435    if (cmd_buffer->gang.cs)
436       cmd_buffer->device->ws->cs_reset(cmd_buffer->gang.cs);
437 
438    list_for_each_entry_safe (struct radv_cmd_buffer_upload, up, &cmd_buffer->upload.list, list) {
439       radv_rmv_log_command_buffer_bo_destroy(cmd_buffer->device, up->upload_bo);
440       cmd_buffer->device->ws->buffer_destroy(cmd_buffer->device->ws, up->upload_bo);
441       list_del(&up->list);
442       free(up);
443    }
444 
445    util_dynarray_clear(&cmd_buffer->ray_history);
446 
447    cmd_buffer->push_constant_stages = 0;
448    cmd_buffer->scratch_size_per_wave_needed = 0;
449    cmd_buffer->scratch_waves_wanted = 0;
450    cmd_buffer->compute_scratch_size_per_wave_needed = 0;
451    cmd_buffer->compute_scratch_waves_wanted = 0;
452    cmd_buffer->esgs_ring_size_needed = 0;
453    cmd_buffer->gsvs_ring_size_needed = 0;
454    cmd_buffer->tess_rings_needed = false;
455    cmd_buffer->task_rings_needed = false;
456    cmd_buffer->mesh_scratch_ring_needed = false;
457    cmd_buffer->gds_needed = false;
458    cmd_buffer->gds_oa_needed = false;
459    cmd_buffer->sample_positions_needed = false;
460    cmd_buffer->gang.sem.leader_value = 0;
461    cmd_buffer->gang.sem.emitted_leader_value = 0;
462    cmd_buffer->gang.sem.va = 0;
463    cmd_buffer->shader_upload_seq = 0;
464 
465    if (cmd_buffer->upload.upload_bo)
466       radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, cmd_buffer->upload.upload_bo);
467    cmd_buffer->upload.offset = 0;
468 
469    memset(cmd_buffer->vertex_binding_buffers, 0, sizeof(struct radv_buffer *) * cmd_buffer->used_vertex_bindings);
470    cmd_buffer->used_vertex_bindings = 0;
471 
472    for (unsigned i = 0; i < MAX_BIND_POINTS; i++) {
473       cmd_buffer->descriptors[i].dirty = 0;
474       cmd_buffer->descriptors[i].valid = 0;
475    }
476 
477    radv_cmd_buffer_reset_rendering(cmd_buffer);
478 }
479 
480 const struct vk_command_buffer_ops radv_cmd_buffer_ops = {
481    .create = radv_create_cmd_buffer,
482    .reset = radv_reset_cmd_buffer,
483    .destroy = radv_destroy_cmd_buffer,
484 };
485 
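/* Grow the upload buffer to hold at least min_needed bytes (and at least
 * 16 KiB or double the current size). The previous BO is kept on upload.list
 * because the GPU may still read from it until the command buffer is reset
 * or destroyed. */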
486 static bool
487 radv_cmd_buffer_resize_upload_buf(struct radv_cmd_buffer *cmd_buffer, uint64_t min_needed)
488 {
489    uint64_t new_size;
490    struct radeon_winsys_bo *bo = NULL;
491    struct radv_cmd_buffer_upload *upload;
492    struct radv_device *device = cmd_buffer->device;
493 
494    new_size = MAX2(min_needed, 16 * 1024);
495    new_size = MAX2(new_size, 2 * cmd_buffer->upload.size);
496 
497    VkResult result = device->ws->buffer_create(
498       device->ws, new_size, 4096, device->ws->cs_domain(device->ws),
499       RADEON_FLAG_CPU_ACCESS | RADEON_FLAG_NO_INTERPROCESS_SHARING | RADEON_FLAG_32BIT | RADEON_FLAG_GTT_WC,
500       RADV_BO_PRIORITY_UPLOAD_BUFFER, 0, &bo);
501 
502    if (result != VK_SUCCESS) {
503       vk_command_buffer_set_error(&cmd_buffer->vk, result);
504       return false;
505    }
506 
507    radv_cs_add_buffer(device->ws, cmd_buffer->cs, bo);
508    if (cmd_buffer->upload.upload_bo) {
509       upload = malloc(sizeof(*upload));
510 
511       if (!upload) {
512          vk_command_buffer_set_error(&cmd_buffer->vk, VK_ERROR_OUT_OF_HOST_MEMORY);
513          device->ws->buffer_destroy(device->ws, bo);
514          return false;
515       }
516 
517       memcpy(upload, &cmd_buffer->upload, sizeof(*upload));
518       list_add(&upload->list, &cmd_buffer->upload.list);
519    }
520 
521    cmd_buffer->upload.upload_bo = bo;
522    cmd_buffer->upload.size = new_size;
523    cmd_buffer->upload.offset = 0;
524    cmd_buffer->upload.map = device->ws->buffer_map(cmd_buffer->upload.upload_bo);
525 
526    if (!cmd_buffer->upload.map) {
527       vk_command_buffer_set_error(&cmd_buffer->vk, VK_ERROR_OUT_OF_DEVICE_MEMORY);
528       return false;
529    }
530    radv_rmv_log_command_buffer_bo_create(device, cmd_buffer->upload.upload_bo, 0, cmd_buffer->upload.size, 0);
531 
532    return true;
533 }
534 
535 bool
536 radv_cmd_buffer_upload_alloc_aligned(struct radv_cmd_buffer *cmd_buffer, unsigned size, unsigned alignment,
537                                      unsigned *out_offset, void **ptr)
538 {
539    assert(size % 4 == 0);
540 
541    const struct radeon_info *rad_info = &cmd_buffer->device->physical_device->rad_info;
542 
543    /* Align to the scalar cache line size if it results in this allocation
544     * being placed in fewer of them.
545     */
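   /* Example (GFX10+, 64-byte lines): offset = 48 and size = 24 gives
    * gap = 16; since (size & 63) = 24 > 16, bump offset to 64 so the
    * allocation occupies one cache line instead of two. */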
546    unsigned offset = cmd_buffer->upload.offset;
547    unsigned line_size = rad_info->gfx_level >= GFX10 ? 64 : 32;
548    unsigned gap = align(offset, line_size) - offset;
549    if ((size & (line_size - 1)) > gap)
550       offset = align(offset, line_size);
551 
552    if (alignment)
553       offset = align(offset, alignment);
554    if (offset + size > cmd_buffer->upload.size) {
555       if (!radv_cmd_buffer_resize_upload_buf(cmd_buffer, size))
556          return false;
557       offset = 0;
558    }
559 
560    *out_offset = offset;
561    *ptr = cmd_buffer->upload.map + offset;
562 
563    cmd_buffer->upload.offset = offset + size;
564    return true;
565 }
566 
567 bool
568 radv_cmd_buffer_upload_alloc(struct radv_cmd_buffer *cmd_buffer, unsigned size, unsigned *out_offset, void **ptr)
569 {
570    return radv_cmd_buffer_upload_alloc_aligned(cmd_buffer, size, 0, out_offset, ptr);
571 }
572 
573 bool
574 radv_cmd_buffer_upload_data(struct radv_cmd_buffer *cmd_buffer, unsigned size, const void *data, unsigned *out_offset)
575 {
576    uint8_t *ptr;
577 
578    if (!radv_cmd_buffer_upload_alloc(cmd_buffer, size, out_offset, (void **)&ptr))
579       return false;
580    assert(ptr);
581 
582    memcpy(ptr, data, size);
583    return true;
584 }
585 
586 void
587 radv_cmd_buffer_trace_emit(struct radv_cmd_buffer *cmd_buffer)
588 {
589    struct radv_device *device = cmd_buffer->device;
590    struct radeon_cmdbuf *cs = cmd_buffer->cs;
591    uint64_t va;
592 
593    if (cmd_buffer->qf != RADV_QUEUE_GENERAL && cmd_buffer->qf != RADV_QUEUE_COMPUTE)
594       return;
595 
596    va = radv_buffer_get_va(device->trace_bo);
597    if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY)
598       va += 4;
599 
600    ++cmd_buffer->state.trace_id;
601    radv_write_data(cmd_buffer, V_370_ME, va, 1, &cmd_buffer->state.trace_id, false);
602 
603    radeon_check_space(cmd_buffer->device->ws, cs, 2);
604 
605    radeon_emit(cs, PKT3(PKT3_NOP, 0, 0));
606    radeon_emit(cs, AC_ENCODE_TRACE_POINT(cmd_buffer->state.trace_id));
607 }
608 
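/* Accumulate the cache flushes and stage waits that the ACE (compute)
 * follower cmdbuf needs for the given source/destination stage masks, and
 * bump the gang leader semaphore when task shaders must be blocked. */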
609 static void
610 radv_gang_barrier(struct radv_cmd_buffer *cmd_buffer, VkPipelineStageFlags2 src_stage_mask,
611                   VkPipelineStageFlags2 dst_stage_mask)
612 {
613    /* Update flush bits from the main cmdbuf, except the stage flush. */
614    cmd_buffer->gang.flush_bits |=
615       cmd_buffer->state.flush_bits & RADV_CMD_FLUSH_ALL_COMPUTE & ~RADV_CMD_FLAG_CS_PARTIAL_FLUSH;
616 
617    /* Add stage flush only when necessary. */
618    if (src_stage_mask & (VK_PIPELINE_STAGE_2_TASK_SHADER_BIT_EXT | VK_PIPELINE_STAGE_2_ALL_TRANSFER_BIT |
619                          VK_PIPELINE_STAGE_2_BOTTOM_OF_PIPE_BIT | VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT))
620       cmd_buffer->gang.flush_bits |= RADV_CMD_FLAG_CS_PARTIAL_FLUSH;
621 
622    /* Block task shaders when we have to wait for CP DMA on the GFX cmdbuf. */
623    if (src_stage_mask &
624        (VK_PIPELINE_STAGE_2_COPY_BIT | VK_PIPELINE_STAGE_2_CLEAR_BIT | VK_PIPELINE_STAGE_2_ALL_TRANSFER_BIT |
625         VK_PIPELINE_STAGE_2_BOTTOM_OF_PIPE_BIT | VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT))
626       dst_stage_mask |= cmd_buffer->state.dma_is_busy ? VK_PIPELINE_STAGE_2_TASK_SHADER_BIT_EXT : 0;
627 
628    /* Increment the GFX/ACE semaphore when task shaders are blocked. */
629    if (dst_stage_mask & (VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT | VK_PIPELINE_STAGE_2_DRAW_INDIRECT_BIT |
630                          VK_PIPELINE_STAGE_2_TASK_SHADER_BIT_EXT))
631       cmd_buffer->gang.sem.leader_value++;
632 }
633 
634 void
635 radv_gang_cache_flush(struct radv_cmd_buffer *cmd_buffer)
636 {
637    struct radeon_cmdbuf *ace_cs = cmd_buffer->gang.cs;
638    const uint32_t flush_bits = cmd_buffer->gang.flush_bits;
639    enum rgp_flush_bits sqtt_flush_bits = 0;
640 
641    radv_cs_emit_cache_flush(cmd_buffer->device->ws, ace_cs, cmd_buffer->device->physical_device->rad_info.gfx_level,
642                             NULL, 0, RADV_QUEUE_COMPUTE, flush_bits, &sqtt_flush_bits, 0);
643 
644    cmd_buffer->gang.flush_bits = 0;
645 }
646 
647 static bool
648 radv_gang_sem_init(struct radv_cmd_buffer *cmd_buffer)
649 {
650    if (cmd_buffer->gang.sem.va)
651       return true;
652 
653    /* DWORD 0: GFX->ACE semaphore (GFX blocks ACE, i.e. ACE waits for GFX)
654     * DWORD 1: ACE->GFX semaphore
655     */
656    uint64_t sem_init = 0;
657    uint32_t va_off = 0;
658    if (!radv_cmd_buffer_upload_data(cmd_buffer, sizeof(uint64_t), &sem_init, &va_off)) {
659       vk_command_buffer_set_error(&cmd_buffer->vk, VK_ERROR_OUT_OF_HOST_MEMORY);
660       return false;
661    }
662 
663    cmd_buffer->gang.sem.va = radv_buffer_get_va(cmd_buffer->upload.upload_bo) + va_off;
664    return true;
665 }
666 
667 static bool
668 radv_gang_leader_sem_dirty(const struct radv_cmd_buffer *cmd_buffer)
669 {
670    return cmd_buffer->gang.sem.leader_value != cmd_buffer->gang.sem.emitted_leader_value;
671 }
672 
673 static bool
674 radv_gang_follower_sem_dirty(const struct radv_cmd_buffer *cmd_buffer)
675 {
676    return cmd_buffer->gang.sem.follower_value != cmd_buffer->gang.sem.emitted_follower_value;
677 }
678 
679 ALWAYS_INLINE static bool
680 radv_flush_gang_semaphore(struct radv_cmd_buffer *cmd_buffer, struct radeon_cmdbuf *cs, const enum radv_queue_family qf,
681                           const uint32_t va_off, const uint32_t value)
682 {
683    if (!radv_gang_sem_init(cmd_buffer))
684       return false;
685 
686    ASSERTED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cs, 12);
687 
688    radv_cs_emit_write_event_eop(cs, cmd_buffer->device->physical_device->rad_info.gfx_level, qf,
689                                 V_028A90_BOTTOM_OF_PIPE_TS, 0, EOP_DST_SEL_MEM, EOP_DATA_SEL_VALUE_32BIT,
690                                 cmd_buffer->gang.sem.va + va_off, value, cmd_buffer->gfx9_eop_bug_va);
691 
692    assert(cmd_buffer->cs->cdw <= cdw_max);
693    return true;
694 }
695 
696 ALWAYS_INLINE static bool
697 radv_flush_gang_leader_semaphore(struct radv_cmd_buffer *cmd_buffer)
698 {
699    if (!radv_gang_leader_sem_dirty(cmd_buffer))
700       return false;
701 
702    /* Gang leader writes a value to the semaphore which the follower can wait for. */
703    cmd_buffer->gang.sem.emitted_leader_value = cmd_buffer->gang.sem.leader_value;
704    return radv_flush_gang_semaphore(cmd_buffer, cmd_buffer->cs, cmd_buffer->qf, 0, cmd_buffer->gang.sem.leader_value);
705 }
706 
707 ALWAYS_INLINE static bool
708 radv_flush_gang_follower_semaphore(struct radv_cmd_buffer *cmd_buffer)
709 {
710    if (!radv_gang_follower_sem_dirty(cmd_buffer))
711       return false;
712 
713    /* Follower writes a value to the semaphore which the gang leader can wait for. */
714    cmd_buffer->gang.sem.emitted_follower_value = cmd_buffer->gang.sem.follower_value;
715    return radv_flush_gang_semaphore(cmd_buffer, cmd_buffer->gang.cs, RADV_QUEUE_COMPUTE, 4,
716                                     cmd_buffer->gang.sem.follower_value);
717 }
718 
719 ALWAYS_INLINE static void
720 radv_wait_gang_semaphore(struct radv_cmd_buffer *cmd_buffer, struct radeon_cmdbuf *cs, const enum radv_queue_family qf,
721                          const uint32_t va_off, const uint32_t value)
722 {
723    assert(cmd_buffer->gang.sem.va);
724    radeon_check_space(cmd_buffer->device->ws, cs, 7);
725    radv_cp_wait_mem(cs, qf, WAIT_REG_MEM_GREATER_OR_EQUAL, cmd_buffer->gang.sem.va + va_off, value, 0xffffffff);
726 }
727 
728 ALWAYS_INLINE static void
729 radv_wait_gang_leader(struct radv_cmd_buffer *cmd_buffer)
730 {
731    /* Follower waits for the semaphore which the gang leader wrote. */
732    radv_wait_gang_semaphore(cmd_buffer, cmd_buffer->gang.cs, RADV_QUEUE_COMPUTE, 0, cmd_buffer->gang.sem.leader_value);
733 }
734 
735 ALWAYS_INLINE static void
736 radv_wait_gang_follower(struct radv_cmd_buffer *cmd_buffer)
737 {
738    /* Gang leader waits for the semaphore which the follower wrote. */
739    radv_wait_gang_semaphore(cmd_buffer, cmd_buffer->cs, cmd_buffer->qf, 4, cmd_buffer->gang.sem.follower_value);
740 }
741 
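/* Lazily create the ACE (compute) follower command buffer used for gang
 * submission, e.g. when task shaders are used on the graphics queue. */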
742 bool
743 radv_gang_init(struct radv_cmd_buffer *cmd_buffer)
744 {
745    if (cmd_buffer->gang.cs)
746       return true;
747 
748    struct radv_device *device = cmd_buffer->device;
749    struct radeon_cmdbuf *ace_cs =
750       device->ws->cs_create(device->ws, AMD_IP_COMPUTE, cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY);
751 
752    if (!ace_cs) {
753       vk_command_buffer_set_error(&cmd_buffer->vk, VK_ERROR_OUT_OF_DEVICE_MEMORY);
754       return false;
755    }
756 
757    cmd_buffer->gang.cs = ace_cs;
758    return true;
759 }
760 
761 static VkResult
762 radv_gang_finalize(struct radv_cmd_buffer *cmd_buffer)
763 {
764    assert(cmd_buffer->gang.cs);
765    struct radv_device *device = cmd_buffer->device;
766    struct radeon_cmdbuf *ace_cs = cmd_buffer->gang.cs;
767 
768    /* Emit pending cache flush. */
769    radv_gang_cache_flush(cmd_buffer);
770 
771    /* Clear the leader<->follower semaphores if they exist.
772     * This is necessary in case the same cmd buffer is submitted again in the future.
773     */
774    if (cmd_buffer->gang.sem.va) {
775       uint64_t leader2follower_va = cmd_buffer->gang.sem.va;
776       uint64_t follower2leader_va = cmd_buffer->gang.sem.va + 4;
777       const uint32_t zero = 0;
778 
779       /* Follower: write 0 to the leader->follower semaphore. */
780       radv_cs_write_data(device, ace_cs, RADV_QUEUE_COMPUTE, V_370_ME, leader2follower_va, 1, &zero, false);
781 
782       /* Leader: write 0 to the follower->leader semaphore. */
783       radv_write_data(cmd_buffer, V_370_ME, follower2leader_va, 1, &zero, false);
784    }
785 
786    return device->ws->cs_finalize(ace_cs);
787 }
788 
789 static void
790 radv_cmd_buffer_after_draw(struct radv_cmd_buffer *cmd_buffer, enum radv_cmd_flush_bits flags, bool dgc)
791 {
792    const struct radv_device *device = cmd_buffer->device;
793    if (unlikely(device->sqtt.bo) && !dgc) {
794       radeon_check_space(device->ws, cmd_buffer->cs, 2);
795 
796       radeon_emit(cmd_buffer->cs, PKT3(PKT3_EVENT_WRITE, 0, cmd_buffer->state.predicating));
797       radeon_emit(cmd_buffer->cs, EVENT_TYPE(V_028A90_THREAD_TRACE_MARKER) | EVENT_INDEX(0));
798    }
799 
800    if (device->instance->debug_flags & RADV_DEBUG_SYNC_SHADERS) {
801       enum rgp_flush_bits sqtt_flush_bits = 0;
802       assert(flags & (RADV_CMD_FLAG_PS_PARTIAL_FLUSH | RADV_CMD_FLAG_CS_PARTIAL_FLUSH));
803 
804       /* Force wait for graphics or compute engines to be idle. */
805       radv_cs_emit_cache_flush(device->ws, cmd_buffer->cs, device->physical_device->rad_info.gfx_level,
806                                &cmd_buffer->gfx9_fence_idx, cmd_buffer->gfx9_fence_va, cmd_buffer->qf, flags,
807                                &sqtt_flush_bits, cmd_buffer->gfx9_eop_bug_va);
808 
809       if ((flags & RADV_CMD_FLAG_PS_PARTIAL_FLUSH) && radv_cmdbuf_has_stage(cmd_buffer, MESA_SHADER_TASK)) {
810          /* Force wait for compute engines to be idle on the internal cmdbuf. */
811          radv_cs_emit_cache_flush(device->ws, cmd_buffer->gang.cs, device->physical_device->rad_info.gfx_level, NULL, 0,
812                                   RADV_QUEUE_COMPUTE, RADV_CMD_FLAG_CS_PARTIAL_FLUSH, &sqtt_flush_bits, 0);
813       }
814    }
815 
816    if (radv_device_fault_detection_enabled(device))
817       radv_cmd_buffer_trace_emit(cmd_buffer);
818 }
819 
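/* Record the bound pipeline pointer in the trace BO so it can be inspected
 * after a GPU hang. */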
820 static void
821 radv_save_pipeline(struct radv_cmd_buffer *cmd_buffer, struct radv_pipeline *pipeline)
822 {
823    struct radv_device *device = cmd_buffer->device;
824    enum amd_ip_type ring;
825    uint32_t data[2];
826    uint64_t va;
827 
828    va = radv_buffer_get_va(device->trace_bo);
829 
830    ring = radv_queue_family_to_ring(device->physical_device, cmd_buffer->qf);
831 
832    switch (ring) {
833    case AMD_IP_GFX:
834       va += 8;
835       break;
836    case AMD_IP_COMPUTE:
837       va += 16;
838       break;
839    default:
840       assert(!"invalid IP type");
841    }
842 
843    uint64_t pipeline_address = (uintptr_t)pipeline;
844    data[0] = pipeline_address;
845    data[1] = pipeline_address >> 32;
846 
847    radv_write_data(cmd_buffer, V_370_ME, va, 2, data, false);
848 }
849 
850 static void
851 radv_save_vertex_descriptors(struct radv_cmd_buffer *cmd_buffer, uint64_t vb_ptr)
852 {
853    struct radv_device *device = cmd_buffer->device;
854    uint32_t data[2];
855    uint64_t va;
856 
857    va = radv_buffer_get_va(device->trace_bo);
858    va += 24;
859 
860    data[0] = vb_ptr;
861    data[1] = vb_ptr >> 32;
862 
863    radv_write_data(cmd_buffer, V_370_ME, va, 2, data, false);
864 }
865 
866 static void
867 radv_save_vs_prolog(struct radv_cmd_buffer *cmd_buffer, const struct radv_shader_part *prolog)
868 {
869    struct radv_device *device = cmd_buffer->device;
870    uint32_t data[2];
871    uint64_t va;
872 
873    va = radv_buffer_get_va(device->trace_bo);
874    va += 32;
875 
876    uint64_t prolog_address = (uintptr_t)prolog;
877    data[0] = prolog_address;
878    data[1] = prolog_address >> 32;
879 
880    radv_write_data(cmd_buffer, V_370_ME, va, 2, data, false);
881 }
882 
883 void
884 radv_set_descriptor_set(struct radv_cmd_buffer *cmd_buffer, VkPipelineBindPoint bind_point,
885                         struct radv_descriptor_set *set, unsigned idx)
886 {
887    struct radv_descriptor_state *descriptors_state = radv_get_descriptors_state(cmd_buffer, bind_point);
888 
889    descriptors_state->sets[idx] = set;
890 
891    descriptors_state->valid |= (1u << idx); /* active descriptors */
892    descriptors_state->dirty |= (1u << idx);
893 }
894 
895 static void
896 radv_save_descriptors(struct radv_cmd_buffer *cmd_buffer, VkPipelineBindPoint bind_point)
897 {
898    struct radv_descriptor_state *descriptors_state = radv_get_descriptors_state(cmd_buffer, bind_point);
899    struct radv_device *device = cmd_buffer->device;
900    uint32_t data[MAX_SETS * 2] = {0};
901    uint64_t va;
902    va = radv_buffer_get_va(device->trace_bo) + 40;
903 
904    u_foreach_bit (i, descriptors_state->valid) {
905       struct radv_descriptor_set *set = descriptors_state->sets[i];
906       data[i * 2] = (uint64_t)(uintptr_t)set;
907       data[i * 2 + 1] = (uint64_t)(uintptr_t)set >> 32;
908    }
909 
910    radv_write_data(cmd_buffer, V_370_ME, va, MAX_SETS * 2, data, false);
911 }
912 
913 const struct radv_userdata_info *
914 radv_get_user_sgpr(const struct radv_shader *shader, int idx)
915 {
916    return &shader->info.user_sgprs_locs.shader_data[idx];
917 }
918 
919 static void
920 radv_emit_userdata_address(struct radv_device *device, struct radeon_cmdbuf *cs, struct radv_shader *shader,
921                            uint32_t base_reg, int idx, uint64_t va)
922 {
923    const struct radv_userdata_info *loc = &shader->info.user_sgprs_locs.shader_data[idx];
924 
925    if (loc->sgpr_idx == -1)
926       return;
927 
928    assert(loc->num_sgprs == 1);
929 
930    radv_emit_shader_pointer(device, cs, base_reg + loc->sgpr_idx * 4, va, false);
931 }
932 
933 static uint64_t
934 radv_descriptor_get_va(const struct radv_descriptor_state *descriptors_state, unsigned set_idx)
935 {
936    struct radv_descriptor_set *set = descriptors_state->sets[set_idx];
937    uint64_t va;
938 
939    if (set) {
940       va = set->header.va;
941    } else {
942       va = descriptors_state->descriptor_buffers[set_idx];
943    }
944 
945    return va;
946 }
947 
948 static void
949 radv_emit_descriptor_pointers(struct radv_device *device, struct radeon_cmdbuf *cs, struct radv_shader *shader,
950                               uint32_t sh_base, struct radv_descriptor_state *descriptors_state)
951 {
952    struct radv_userdata_locations *locs = &shader->info.user_sgprs_locs;
953    unsigned mask = locs->descriptor_sets_enabled;
954 
955    mask &= descriptors_state->dirty & descriptors_state->valid;
956 
957    while (mask) {
958       int start, count;
959 
960       u_bit_scan_consecutive_range(&mask, &start, &count);
961 
962       struct radv_userdata_info *loc = &locs->descriptor_sets[start];
963       unsigned sh_offset = sh_base + loc->sgpr_idx * 4;
964 
965       radv_emit_shader_pointer_head(cs, sh_offset, count, true);
966       for (int i = 0; i < count; i++) {
967          uint64_t va = radv_descriptor_get_va(descriptors_state, start + i);
968 
969          radv_emit_shader_pointer_body(device, cs, va, true);
970       }
971    }
972 }
973 
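/* Determine the primitive type that reaches the rasterizer: taken from the
 * last pre-rasterization shader for TES/GS/MS, otherwise derived from the
 * dynamic primitive topology. */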
974 static unsigned
975 radv_get_rasterization_prim(struct radv_cmd_buffer *cmd_buffer)
976 {
977    const struct radv_shader *last_vgt_shader = cmd_buffer->state.last_vgt_shader;
978    const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
979 
980    if (cmd_buffer->state.active_stages &
981        (VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT | VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT |
982         VK_SHADER_STAGE_GEOMETRY_BIT | VK_SHADER_STAGE_MESH_BIT_EXT)) {
983       /* Ignore dynamic primitive topology for TES/GS/MS stages. */
984       return cmd_buffer->state.rast_prim;
985    }
986 
987    return radv_conv_prim_to_gs_out(d->vk.ia.primitive_topology, last_vgt_shader->info.is_ngg);
988 }
989 
990 static ALWAYS_INLINE unsigned
991 radv_get_rasterization_samples(struct radv_cmd_buffer *cmd_buffer)
992 {
993    const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
994 
995    if (d->vk.rs.line.mode == VK_LINE_RASTERIZATION_MODE_BRESENHAM_KHR &&
996        radv_rast_prim_is_line(radv_get_rasterization_prim(cmd_buffer))) {
997       /* From the Vulkan spec 1.3.221:
998        *
999        * "When Bresenham lines are being rasterized, sample locations may all be treated as being at
1000        * the pixel center (this may affect attribute and depth interpolation)."
1001        *
1002        * "One consequence of this is that Bresenham lines cover the same pixels regardless of the
1003        * number of rasterization samples, and cover all samples in those pixels (unless masked out
1004        * or killed)."
1005        */
1006       return 1;
1007    }
1008 
1009    if (d->vk.rs.line.mode == VK_LINE_RASTERIZATION_MODE_RECTANGULAR_SMOOTH_KHR &&
1010        radv_rast_prim_is_line(radv_get_rasterization_prim(cmd_buffer))) {
1011       return RADV_NUM_SMOOTH_AA_SAMPLES;
1012    }
1013 
1014    return MAX2(1, d->vk.ms.rasterization_samples);
1015 }
1016 
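/* With sample shading enabled, the fragment shader runs per sample:
 * ceil(min_sample_shading * samples), rounded up to a power of two. */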
1017 static ALWAYS_INLINE unsigned
1018 radv_get_ps_iter_samples(struct radv_cmd_buffer *cmd_buffer)
1019 {
1020    const struct radv_rendering_state *render = &cmd_buffer->state.render;
1021    unsigned ps_iter_samples = 1;
1022 
1023    if (cmd_buffer->state.ms.sample_shading_enable) {
1024       unsigned rasterization_samples = radv_get_rasterization_samples(cmd_buffer);
1025       unsigned color_samples = MAX2(render->color_samples, rasterization_samples);
1026 
1027       ps_iter_samples = ceilf(cmd_buffer->state.ms.min_sample_shading * color_samples);
1028       ps_iter_samples = util_next_power_of_two(ps_iter_samples);
1029    }
1030 
1031    return ps_iter_samples;
1032 }
1033 
1034 /**
1035  * Convert the user sample locations to hardware sample locations (the values
1036  * that will be emitted by PA_SC_AA_SAMPLE_LOCS_PIXEL_*).
1037  */
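/* User locations are in [0,1) within the pixel; the hardware wants signed
 * 1/16th-pixel offsets from the pixel center clamped to [-8, 7], e.g.
 * (0.75, 0.25) maps to (4, -4). */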
1038 static void
1039 radv_convert_user_sample_locs(const struct radv_sample_locations_state *state, uint32_t x, uint32_t y,
1040                               VkOffset2D *sample_locs)
1041 {
1042    uint32_t x_offset = x % state->grid_size.width;
1043    uint32_t y_offset = y % state->grid_size.height;
1044    uint32_t num_samples = (uint32_t)state->per_pixel;
1045    uint32_t pixel_offset;
1046 
1047    pixel_offset = (x_offset + y_offset * state->grid_size.width) * num_samples;
1048 
1049    assert(pixel_offset <= MAX_SAMPLE_LOCATIONS);
1050    const VkSampleLocationEXT *user_locs = &state->locations[pixel_offset];
1051 
1052    for (uint32_t i = 0; i < num_samples; i++) {
1053       float shifted_pos_x = user_locs[i].x - 0.5;
1054       float shifted_pos_y = user_locs[i].y - 0.5;
1055 
1056       int32_t scaled_pos_x = floorf(shifted_pos_x * 16);
1057       int32_t scaled_pos_y = floorf(shifted_pos_y * 16);
1058 
1059       sample_locs[i].x = CLAMP(scaled_pos_x, -8, 7);
1060       sample_locs[i].y = CLAMP(scaled_pos_y, -8, 7);
1061    }
1062 }
1063 
1064 /**
1065  * Compute the PA_SC_AA_SAMPLE_LOCS_PIXEL_* mask based on hardware sample
1066  * locations.
1067  */
1068 static void
1069 radv_compute_sample_locs_pixel(uint32_t num_samples, VkOffset2D *sample_locs, uint32_t *sample_locs_pixel)
1070 {
1071    for (uint32_t i = 0; i < num_samples; i++) {
1072       uint32_t sample_reg_idx = i / 4;
1073       uint32_t sample_loc_idx = i % 4;
1074       int32_t pos_x = sample_locs[i].x;
1075       int32_t pos_y = sample_locs[i].y;
1076 
1077       uint32_t shift_x = 8 * sample_loc_idx;
1078       uint32_t shift_y = shift_x + 4;
1079 
1080       sample_locs_pixel[sample_reg_idx] |= (pos_x & 0xf) << shift_x;
1081       sample_locs_pixel[sample_reg_idx] |= (pos_y & 0xf) << shift_y;
1082    }
1083 }
1084 
1085 /**
1086  * Compute the PA_SC_CENTROID_PRIORITY_* mask based on the top left hardware
1087  * sample locations.
1088  */
1089 static uint64_t
1090 radv_compute_centroid_priority(struct radv_cmd_buffer *cmd_buffer, VkOffset2D *sample_locs, uint32_t num_samples)
1091 {
1092    uint32_t *centroid_priorities = alloca(num_samples * sizeof(*centroid_priorities));
1093    uint32_t sample_mask = num_samples - 1;
1094    uint32_t *distances = alloca(num_samples * sizeof(*distances));
1095    uint64_t centroid_priority = 0;
1096 
1097    /* Compute the distances from center for each sample. */
1098    for (int i = 0; i < num_samples; i++) {
1099       distances[i] = (sample_locs[i].x * sample_locs[i].x) + (sample_locs[i].y * sample_locs[i].y);
1100    }
1101 
1102    /* Compute the centroid priorities by looking at the distances array. */
1103    for (int i = 0; i < num_samples; i++) {
1104       uint32_t min_idx = 0;
1105 
1106       for (int j = 1; j < num_samples; j++) {
1107          if (distances[j] < distances[min_idx])
1108             min_idx = j;
1109       }
1110 
1111       centroid_priorities[i] = min_idx;
1112       distances[min_idx] = 0xffffffff;
1113    }
1114 
1115    /* Compute the final centroid priority. */
1116    for (int i = 0; i < 8; i++) {
1117       centroid_priority |= centroid_priorities[i & sample_mask] << (i * 4);
1118    }
1119 
1120    return centroid_priority << 32 | centroid_priority;
1121 }
1122 
1123 /**
1124  * Emit the sample locations that are specified with VK_EXT_sample_locations.
1125  */
1126 static void
1127 radv_emit_sample_locations(struct radv_cmd_buffer *cmd_buffer)
1128 {
1129    const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
1130    uint32_t num_samples = (uint32_t)d->sample_location.per_pixel;
1131    struct radeon_cmdbuf *cs = cmd_buffer->cs;
1132    uint32_t sample_locs_pixel[4][2] = {0};
1133    VkOffset2D sample_locs[4][8]; /* 8 is the max. sample count supported */
1134    uint64_t centroid_priority;
1135 
1136    if (!d->sample_location.count || !d->vk.ms.sample_locations_enable)
1137       return;
1138 
1139    /* Convert the user sample locations to hardware sample locations. */
1140    radv_convert_user_sample_locs(&d->sample_location, 0, 0, sample_locs[0]);
1141    radv_convert_user_sample_locs(&d->sample_location, 1, 0, sample_locs[1]);
1142    radv_convert_user_sample_locs(&d->sample_location, 0, 1, sample_locs[2]);
1143    radv_convert_user_sample_locs(&d->sample_location, 1, 1, sample_locs[3]);
1144 
1145    /* Compute the PA_SC_AA_SAMPLE_LOCS_PIXEL_* mask. */
1146    for (uint32_t i = 0; i < 4; i++) {
1147       radv_compute_sample_locs_pixel(num_samples, sample_locs[i], sample_locs_pixel[i]);
1148    }
1149 
1150    /* Compute the PA_SC_CENTROID_PRIORITY_* mask. */
1151    centroid_priority = radv_compute_centroid_priority(cmd_buffer, sample_locs[0], num_samples);
1152 
1153    /* Emit the specified user sample locations. */
1154    switch (num_samples) {
1155    case 2:
1156    case 4:
1157       radeon_set_context_reg(cs, R_028BF8_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_0, sample_locs_pixel[0][0]);
1158       radeon_set_context_reg(cs, R_028C08_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_0, sample_locs_pixel[1][0]);
1159       radeon_set_context_reg(cs, R_028C18_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_0, sample_locs_pixel[2][0]);
1160       radeon_set_context_reg(cs, R_028C28_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_0, sample_locs_pixel[3][0]);
1161       break;
1162    case 8:
1163       radeon_set_context_reg(cs, R_028BF8_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_0, sample_locs_pixel[0][0]);
1164       radeon_set_context_reg(cs, R_028C08_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_0, sample_locs_pixel[1][0]);
1165       radeon_set_context_reg(cs, R_028C18_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_0, sample_locs_pixel[2][0]);
1166       radeon_set_context_reg(cs, R_028C28_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_0, sample_locs_pixel[3][0]);
1167       radeon_set_context_reg(cs, R_028BFC_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_1, sample_locs_pixel[0][1]);
1168       radeon_set_context_reg(cs, R_028C0C_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_1, sample_locs_pixel[1][1]);
1169       radeon_set_context_reg(cs, R_028C1C_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_1, sample_locs_pixel[2][1]);
1170       radeon_set_context_reg(cs, R_028C2C_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_1, sample_locs_pixel[3][1]);
1171       break;
1172    default:
1173       unreachable("invalid number of samples");
1174    }
1175 
1176    radeon_set_context_reg_seq(cs, R_028BD4_PA_SC_CENTROID_PRIORITY_0, 2);
1177    radeon_emit(cs, centroid_priority);
1178    radeon_emit(cs, centroid_priority >> 32);
1179 }
1180 
1181 static void
1182 radv_emit_inline_push_consts(struct radv_device *device, struct radeon_cmdbuf *cs, const struct radv_shader *shader,
1183                              uint32_t base_reg, int idx, uint32_t *values)
1184 {
1185    const struct radv_userdata_info *loc = &shader->info.user_sgprs_locs.shader_data[idx];
1186 
1187    if (loc->sgpr_idx == -1)
1188       return;
1189 
1190    radeon_check_space(device->ws, cs, 2 + loc->num_sgprs);
1191 
1192    radeon_set_sh_reg_seq(cs, base_reg + loc->sgpr_idx * 4, loc->num_sgprs);
1193    radeon_emit_array(cs, values, loc->num_sgprs);
1194 }
1195 
1196 struct radv_bin_size_entry {
1197    unsigned bpp;
1198    VkExtent2D extent;
1199 };
1200 
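/* Estimate the primitive binning bin size on GFX10+ from the per-pipe
 * color/depth/FMASK cache tag budgets: the larger the per-pixel footprint
 * of the bound attachments, the smaller the bin. */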
1201 static VkExtent2D
1202 radv_gfx10_compute_bin_size(struct radv_cmd_buffer *cmd_buffer)
1203 {
1204    const struct radv_physical_device *pdevice = cmd_buffer->device->physical_device;
1205    const struct radv_rendering_state *render = &cmd_buffer->state.render;
1206    const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
1207    VkExtent2D extent = {512, 512};
1208 
1209    const unsigned db_tag_size = 64;
1210    const unsigned db_tag_count = 312;
1211    const unsigned color_tag_size = 1024;
1212    const unsigned color_tag_count = 31;
1213    const unsigned fmask_tag_size = 256;
1214    const unsigned fmask_tag_count = 44;
1215 
1216    const unsigned rb_count = pdevice->rad_info.max_render_backends;
1217    const unsigned pipe_count = MAX2(rb_count, pdevice->rad_info.num_tcc_blocks);
1218 
1219    const unsigned db_tag_part = (db_tag_count * rb_count / pipe_count) * db_tag_size * pipe_count;
1220    const unsigned color_tag_part = (color_tag_count * rb_count / pipe_count) * color_tag_size * pipe_count;
1221    const unsigned fmask_tag_part = (fmask_tag_count * rb_count / pipe_count) * fmask_tag_size * pipe_count;
1222 
1223    const unsigned total_samples = radv_get_rasterization_samples(cmd_buffer);
1224    const unsigned samples_log = util_logbase2_ceil(total_samples);
1225 
1226    unsigned color_bytes_per_pixel = 0;
1227    unsigned fmask_bytes_per_pixel = 0;
1228 
1229    for (unsigned i = 0; i < render->color_att_count; ++i) {
1230       struct radv_image_view *iview = render->color_att[i].iview;
1231 
1232       if (!iview)
1233          continue;
1234 
1235       if (!d->vk.cb.attachments[i].write_mask)
1236          continue;
1237 
1238       color_bytes_per_pixel += vk_format_get_blocksize(render->color_att[i].format);
1239 
1240       if (total_samples > 1) {
1241          assert(samples_log <= 3);
1242          const unsigned fmask_array[] = {0, 1, 1, 4};
1243          fmask_bytes_per_pixel += fmask_array[samples_log];
1244       }
1245    }
1246 
1247    color_bytes_per_pixel *= total_samples;
1248    color_bytes_per_pixel = MAX2(color_bytes_per_pixel, 1);
1249 
1250    const unsigned color_pixel_count_log = util_logbase2(color_tag_part / color_bytes_per_pixel);
1251    extent.width = 1ull << ((color_pixel_count_log + 1) / 2);
1252    extent.height = 1ull << (color_pixel_count_log / 2);
1253 
1254    if (fmask_bytes_per_pixel) {
1255       const unsigned fmask_pixel_count_log = util_logbase2(fmask_tag_part / fmask_bytes_per_pixel);
1256 
1257       const VkExtent2D fmask_extent = (VkExtent2D){.width = 1ull << ((fmask_pixel_count_log + 1) / 2),
1258                                                    .height = 1ull << (color_pixel_count_log / 2)};
1259 
1260       if (fmask_extent.width * fmask_extent.height < extent.width * extent.height)
1261          extent = fmask_extent;
1262    }
1263 
1264    if (render->ds_att.iview) {
1265       /* Coefficients taken from AMDVLK */
1266       unsigned depth_coeff = vk_format_has_depth(render->ds_att.format) ? 5 : 0;
1267       unsigned stencil_coeff = vk_format_has_stencil(render->ds_att.format) ? 1 : 0;
1268       unsigned db_bytes_per_pixel = (depth_coeff + stencil_coeff) * total_samples;
1269 
1270       const unsigned db_pixel_count_log = util_logbase2(db_tag_part / db_bytes_per_pixel);
1271 
1272       const VkExtent2D db_extent =
1273          (VkExtent2D){.width = 1ull << ((db_pixel_count_log + 1) / 2), .height = 1ull << (color_pixel_count_log / 2)};
1274 
1275       if (db_extent.width * db_extent.height < extent.width * extent.height)
1276          extent = db_extent;
1277    }
1278 
1279    extent.width = MAX2(extent.width, 128);
1280    extent.height = MAX2(extent.height, 64);
1281 
1282    return extent;
1283 }
1284 
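/* GFX9 uses fixed lookup tables instead of a formula: each table is indexed by
 * log2(RBs per SE) and log2(SE count), and the entry selected is the last one
 * whose bpp threshold does not exceed the computed bytes per pixel (the while
 * loops below advance as long as the *next* entry's threshold still fits).
 * Illustrative example: two RBs/SE, two SEs and 8 color bytes per pixel walks
 * past the thresholds 0, 3 and 5 and stops before 9, selecting a 32x128 bin.
 */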
1285 static VkExtent2D
1286 radv_gfx9_compute_bin_size(struct radv_cmd_buffer *cmd_buffer)
1287 {
1288    const struct radv_physical_device *pdevice = cmd_buffer->device->physical_device;
1289    const struct radv_rendering_state *render = &cmd_buffer->state.render;
1290    const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
1291    static const struct radv_bin_size_entry color_size_table[][3][9] = {
1292       {
1293          /* One RB / SE */
1294          {
1295             /* One shader engine */
1296             {0, {128, 128}},
1297             {1, {64, 128}},
1298             {2, {32, 128}},
1299             {3, {16, 128}},
1300             {17, {0, 0}},
1301             {UINT_MAX, {0, 0}},
1302          },
1303          {
1304             /* Two shader engines */
1305             {0, {128, 128}},
1306             {2, {64, 128}},
1307             {3, {32, 128}},
1308             {5, {16, 128}},
1309             {17, {0, 0}},
1310             {UINT_MAX, {0, 0}},
1311          },
1312          {
1313             /* Four shader engines */
1314             {0, {128, 128}},
1315             {3, {64, 128}},
1316             {5, {16, 128}},
1317             {17, {0, 0}},
1318             {UINT_MAX, {0, 0}},
1319          },
1320       },
1321       {
1322          /* Two RB / SE */
1323          {
1324             /* One shader engine */
1325             {0, {128, 128}},
1326             {2, {64, 128}},
1327             {3, {32, 128}},
1328             {5, {16, 128}},
1329             {33, {0, 0}},
1330             {UINT_MAX, {0, 0}},
1331          },
1332          {
1333             /* Two shader engines */
1334             {0, {128, 128}},
1335             {3, {64, 128}},
1336             {5, {32, 128}},
1337             {9, {16, 128}},
1338             {33, {0, 0}},
1339             {UINT_MAX, {0, 0}},
1340          },
1341          {
1342             /* Four shader engines */
1343             {0, {256, 256}},
1344             {2, {128, 256}},
1345             {3, {128, 128}},
1346             {5, {64, 128}},
1347             {9, {16, 128}},
1348             {33, {0, 0}},
1349             {UINT_MAX, {0, 0}},
1350          },
1351       },
1352       {
1353          /* Four RB / SE */
1354          {
1355             /* One shader engine */
1356             {0, {128, 256}},
1357             {2, {128, 128}},
1358             {3, {64, 128}},
1359             {5, {32, 128}},
1360             {9, {16, 128}},
1361             {33, {0, 0}},
1362             {UINT_MAX, {0, 0}},
1363          },
1364          {
1365             /* Two shader engines */
1366             {0, {256, 256}},
1367             {2, {128, 256}},
1368             {3, {128, 128}},
1369             {5, {64, 128}},
1370             {9, {32, 128}},
1371             {17, {16, 128}},
1372             {33, {0, 0}},
1373             {UINT_MAX, {0, 0}},
1374          },
1375          {
1376             /* Four shader engines */
1377             {0, {256, 512}},
1378             {2, {256, 256}},
1379             {3, {128, 256}},
1380             {5, {128, 128}},
1381             {9, {64, 128}},
1382             {17, {16, 128}},
1383             {33, {0, 0}},
1384             {UINT_MAX, {0, 0}},
1385          },
1386       },
1387    };
1388    static const struct radv_bin_size_entry ds_size_table[][3][9] = {
1389       {
1390          // One RB / SE
1391          {
1392             // One shader engine
1393             {0, {128, 256}},
1394             {2, {128, 128}},
1395             {4, {64, 128}},
1396             {7, {32, 128}},
1397             {13, {16, 128}},
1398             {49, {0, 0}},
1399             {UINT_MAX, {0, 0}},
1400          },
1401          {
1402             // Two shader engines
1403             {0, {256, 256}},
1404             {2, {128, 256}},
1405             {4, {128, 128}},
1406             {7, {64, 128}},
1407             {13, {32, 128}},
1408             {25, {16, 128}},
1409             {49, {0, 0}},
1410             {UINT_MAX, {0, 0}},
1411          },
1412          {
1413             // Four shader engines
1414             {0, {256, 512}},
1415             {2, {256, 256}},
1416             {4, {128, 256}},
1417             {7, {128, 128}},
1418             {13, {64, 128}},
1419             {25, {16, 128}},
1420             {49, {0, 0}},
1421             {UINT_MAX, {0, 0}},
1422          },
1423       },
1424       {
1425          // Two RB / SE
1426          {
1427             // One shader engine
1428             {0, {256, 256}},
1429             {2, {128, 256}},
1430             {4, {128, 128}},
1431             {7, {64, 128}},
1432             {13, {32, 128}},
1433             {25, {16, 128}},
1434             {97, {0, 0}},
1435             {UINT_MAX, {0, 0}},
1436          },
1437          {
1438             // Two shader engines
1439             {0, {256, 512}},
1440             {2, {256, 256}},
1441             {4, {128, 256}},
1442             {7, {128, 128}},
1443             {13, {64, 128}},
1444             {25, {32, 128}},
1445             {49, {16, 128}},
1446             {97, {0, 0}},
1447             {UINT_MAX, {0, 0}},
1448          },
1449          {
1450             // Four shader engines
1451             {0, {512, 512}},
1452             {2, {256, 512}},
1453             {4, {256, 256}},
1454             {7, {128, 256}},
1455             {13, {128, 128}},
1456             {25, {64, 128}},
1457             {49, {16, 128}},
1458             {97, {0, 0}},
1459             {UINT_MAX, {0, 0}},
1460          },
1461       },
1462       {
1463          // Four RB / SE
1464          {
1465             // One shader engine
1466             {0, {256, 512}},
1467             {2, {256, 256}},
1468             {4, {128, 256}},
1469             {7, {128, 128}},
1470             {13, {64, 128}},
1471             {25, {32, 128}},
1472             {49, {16, 128}},
1473             {UINT_MAX, {0, 0}},
1474          },
1475          {
1476             // Two shader engines
1477             {0, {512, 512}},
1478             {2, {256, 512}},
1479             {4, {256, 256}},
1480             {7, {128, 256}},
1481             {13, {128, 128}},
1482             {25, {64, 128}},
1483             {49, {32, 128}},
1484             {97, {16, 128}},
1485             {UINT_MAX, {0, 0}},
1486          },
1487          {
1488             // Four shader engines
1489             {0, {512, 512}},
1490             {4, {256, 512}},
1491             {7, {256, 256}},
1492             {13, {128, 256}},
1493             {25, {128, 128}},
1494             {49, {64, 128}},
1495             {97, {16, 128}},
1496             {UINT_MAX, {0, 0}},
1497          },
1498       },
1499    };
1500 
1501    VkExtent2D extent = {512, 512};
1502 
1503    unsigned log_num_rb_per_se = util_logbase2_ceil(pdevice->rad_info.max_render_backends / pdevice->rad_info.max_se);
1504    unsigned log_num_se = util_logbase2_ceil(pdevice->rad_info.max_se);
1505 
1506    unsigned total_samples = radv_get_rasterization_samples(cmd_buffer);
1507    unsigned ps_iter_samples = radv_get_ps_iter_samples(cmd_buffer);
1508    unsigned effective_samples = total_samples;
1509    unsigned color_bytes_per_pixel = 0;
1510 
1511    for (unsigned i = 0; i < render->color_att_count; ++i) {
1512       struct radv_image_view *iview = render->color_att[i].iview;
1513 
1514       if (!iview)
1515          continue;
1516 
1517       if (!d->vk.cb.attachments[i].write_mask)
1518          continue;
1519 
1520       color_bytes_per_pixel += vk_format_get_blocksize(render->color_att[i].format);
1521    }
1522 
1523    /* MSAA images typically don't use all samples all the time. */
1524    if (effective_samples >= 2 && ps_iter_samples <= 1)
1525       effective_samples = 2;
1526    color_bytes_per_pixel *= effective_samples;
1527 
1528    const struct radv_bin_size_entry *color_entry = color_size_table[log_num_rb_per_se][log_num_se];
1529    while (color_entry[1].bpp <= color_bytes_per_pixel)
1530       ++color_entry;
1531 
1532    extent = color_entry->extent;
1533 
1534    if (render->ds_att.iview) {
1535       /* Coefficients taken from AMDVLK */
1536       unsigned depth_coeff = vk_format_has_depth(render->ds_att.format) ? 5 : 0;
1537       unsigned stencil_coeff = vk_format_has_stencil(render->ds_att.format) ? 1 : 0;
1538       unsigned ds_bytes_per_pixel = 4 * (depth_coeff + stencil_coeff) * total_samples;
1539 
1540       const struct radv_bin_size_entry *ds_entry = ds_size_table[log_num_rb_per_se][log_num_se];
1541       while (ds_entry[1].bpp <= ds_bytes_per_pixel)
1542          ++ds_entry;
1543 
1544       if (ds_entry->extent.width * ds_entry->extent.height < extent.width * extent.height)
1545          extent = ds_entry->extent;
1546    }
1547 
1548    return extent;
1549 }
1550 
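/* Even with binning disabled, the bin size fields still have to be programmed
 * on GFX10+. BIN_SIZE_X/Y = 1 would mean a 16-pixel dimension; with them at 0
 * the *_EXTEND fields encode the size as 1 << (extend + 5), consistent with
 * how the enabled path programs these fields, so the values below select a
 * 128-pixel wide bin and a 128- or 64-pixel tall one depending on the smallest
 * color attachment format (see the inline comments).
 */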
1551 static unsigned
1552 radv_get_disabled_binning_state(struct radv_cmd_buffer *cmd_buffer)
1553 {
1554    const struct radv_physical_device *pdevice = cmd_buffer->device->physical_device;
1555    const struct radv_rendering_state *render = &cmd_buffer->state.render;
1556    const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
1557    uint32_t pa_sc_binner_cntl_0;
1558 
1559    if (pdevice->rad_info.gfx_level >= GFX10) {
1560       unsigned min_bytes_per_pixel = 0;
1561 
1562       for (unsigned i = 0; i < render->color_att_count; ++i) {
1563          struct radv_image_view *iview = render->color_att[i].iview;
1564 
1565          if (!iview)
1566             continue;
1567 
1568          if (!d->vk.cb.attachments[i].write_mask)
1569             continue;
1570 
1571          unsigned bytes = vk_format_get_blocksize(render->color_att[i].format);
1572          if (!min_bytes_per_pixel || bytes < min_bytes_per_pixel)
1573             min_bytes_per_pixel = bytes;
1574       }
1575 
1576       pa_sc_binner_cntl_0 = S_028C44_BINNING_MODE(V_028C44_DISABLE_BINNING_USE_NEW_SC) | S_028C44_BIN_SIZE_X(0) |
1577                             S_028C44_BIN_SIZE_Y(0) | S_028C44_BIN_SIZE_X_EXTEND(2) |       /* 128 */
1578                             S_028C44_BIN_SIZE_Y_EXTEND(min_bytes_per_pixel <= 4 ? 2 : 1) | /* 128 or 64 */
1579                             S_028C44_DISABLE_START_OF_PRIM(1) | S_028C44_FLUSH_ON_BINNING_TRANSITION(1);
1580    } else {
1581       pa_sc_binner_cntl_0 = S_028C44_BINNING_MODE(V_028C44_DISABLE_BINNING_USE_LEGACY_SC) |
1582                             S_028C44_DISABLE_START_OF_PRIM(1) |
1583                             S_028C44_FLUSH_ON_BINNING_TRANSITION(pdevice->rad_info.family == CHIP_VEGA12 ||
1584                                                                  pdevice->rad_info.family == CHIP_VEGA20 ||
1585                                                                  pdevice->rad_info.family >= CHIP_RAVEN2);
1586    }
1587 
1588    return pa_sc_binner_cntl_0;
1589 }
1590 
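/* Pick the bin size with the gfx-level specific helper above and translate it
 * into PA_SC_BINNER_CNTL_0. The size encoding has two forms: BIN_SIZE_X/Y = 1
 * means exactly 16 pixels, otherwise BIN_SIZE_X/Y_EXTEND = log2(size) - 5.
 * For example, a 64x128 bin is encoded as X_EXTEND = 1 and Y_EXTEND = 2 with
 * both BIN_SIZE bits cleared.
 */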
1591 static unsigned
1592 radv_get_binning_state(struct radv_cmd_buffer *cmd_buffer)
1593 {
1594    const struct radv_device *device = cmd_buffer->device;
1595    unsigned pa_sc_binner_cntl_0;
1596    VkExtent2D bin_size;
1597 
1598    if (device->physical_device->rad_info.gfx_level >= GFX10) {
1599       bin_size = radv_gfx10_compute_bin_size(cmd_buffer);
1600    } else {
1601       assert(device->physical_device->rad_info.gfx_level == GFX9);
1602       bin_size = radv_gfx9_compute_bin_size(cmd_buffer);
1603    }
1604 
1605    if (device->pbb_allowed && bin_size.width && bin_size.height) {
1606       struct radv_binning_settings *settings = &device->physical_device->binning_settings;
1607 
1608       pa_sc_binner_cntl_0 =
1609          S_028C44_BINNING_MODE(V_028C44_BINNING_ALLOWED) | S_028C44_BIN_SIZE_X(bin_size.width == 16) |
1610          S_028C44_BIN_SIZE_Y(bin_size.height == 16) |
1611          S_028C44_BIN_SIZE_X_EXTEND(util_logbase2(MAX2(bin_size.width, 32)) - 5) |
1612          S_028C44_BIN_SIZE_Y_EXTEND(util_logbase2(MAX2(bin_size.height, 32)) - 5) |
1613          S_028C44_CONTEXT_STATES_PER_BIN(settings->context_states_per_bin - 1) |
1614          S_028C44_PERSISTENT_STATES_PER_BIN(settings->persistent_states_per_bin - 1) |
1615          S_028C44_DISABLE_START_OF_PRIM(1) | S_028C44_FPOVS_PER_BATCH(settings->fpovs_per_batch) |
1616          S_028C44_OPTIMAL_BIN_SELECTION(1) |
1617          S_028C44_FLUSH_ON_BINNING_TRANSITION(device->physical_device->rad_info.family == CHIP_VEGA12 ||
1618                                               device->physical_device->rad_info.family == CHIP_VEGA20 ||
1619                                               device->physical_device->rad_info.family >= CHIP_RAVEN2);
1620    } else {
1621       pa_sc_binner_cntl_0 = radv_get_disabled_binning_state(cmd_buffer);
1622    }
1623 
1624    return pa_sc_binner_cntl_0;
1625 }
1626 
1627 static void
1628 radv_emit_binning_state(struct radv_cmd_buffer *cmd_buffer)
1629 {
1630    unsigned pa_sc_binner_cntl_0;
1631 
1632    if (cmd_buffer->device->physical_device->rad_info.gfx_level < GFX9)
1633       return;
1634 
1635    pa_sc_binner_cntl_0 = radv_get_binning_state(cmd_buffer);
1636 
1637    if (pa_sc_binner_cntl_0 == cmd_buffer->state.last_pa_sc_binner_cntl_0)
1638       return;
1639 
1640    radeon_set_context_reg(cmd_buffer->cs, R_028C44_PA_SC_BINNER_CNTL_0, pa_sc_binner_cntl_0);
1641 
1642    cmd_buffer->state.last_pa_sc_binner_cntl_0 = pa_sc_binner_cntl_0;
1643 }
1644 
1645 static void
1646 radv_emit_shader_prefetch(struct radv_cmd_buffer *cmd_buffer, struct radv_shader *shader)
1647 {
1648    uint64_t va;
1649 
1650    if (!shader)
1651       return;
1652 
1653    va = radv_shader_get_va(shader);
1654 
1655    radv_cp_dma_prefetch(cmd_buffer, va, shader->code_size);
1656 }
1657 
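/* Prefetch shader code and vertex buffer descriptors into L2 via CP DMA.
 * The mask in cmd_buffer->state tracks what still needs prefetching; with
 * first_stage_only, only the pieces needed to start the draw (VS or mesh
 * shader plus the VBO descriptors) are pulled in, and the remaining stages
 * are prefetched later so the draw is not delayed.
 */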
1658 ALWAYS_INLINE static void
1659 radv_emit_prefetch_L2(struct radv_cmd_buffer *cmd_buffer, bool first_stage_only)
1660 {
1661    struct radv_cmd_state *state = &cmd_buffer->state;
1662    uint32_t mask = state->prefetch_L2_mask;
1663 
1664    /* Fast prefetch path for starting draws as soon as possible. */
1665    if (first_stage_only)
1666       mask &= RADV_PREFETCH_VS | RADV_PREFETCH_VBO_DESCRIPTORS | RADV_PREFETCH_MS;
1667 
1668    if (mask & RADV_PREFETCH_VS)
1669       radv_emit_shader_prefetch(cmd_buffer, cmd_buffer->state.shaders[MESA_SHADER_VERTEX]);
1670 
1671    if (mask & RADV_PREFETCH_MS)
1672       radv_emit_shader_prefetch(cmd_buffer, cmd_buffer->state.shaders[MESA_SHADER_MESH]);
1673 
1674    if (mask & RADV_PREFETCH_VBO_DESCRIPTORS)
1675       radv_cp_dma_prefetch(cmd_buffer, state->vb_va, state->vb_size);
1676 
1677    if (mask & RADV_PREFETCH_TCS)
1678       radv_emit_shader_prefetch(cmd_buffer, cmd_buffer->state.shaders[MESA_SHADER_TESS_CTRL]);
1679 
1680    if (mask & RADV_PREFETCH_TES)
1681       radv_emit_shader_prefetch(cmd_buffer, cmd_buffer->state.shaders[MESA_SHADER_TESS_EVAL]);
1682 
1683    if (mask & RADV_PREFETCH_GS) {
1684       radv_emit_shader_prefetch(cmd_buffer, cmd_buffer->state.shaders[MESA_SHADER_GEOMETRY]);
1685       if (cmd_buffer->state.gs_copy_shader)
1686          radv_emit_shader_prefetch(cmd_buffer, cmd_buffer->state.gs_copy_shader);
1687    }
1688 
1689    if (mask & RADV_PREFETCH_PS) {
1690       radv_emit_shader_prefetch(cmd_buffer, cmd_buffer->state.shaders[MESA_SHADER_FRAGMENT]);
1691    }
1692 
1693    state->prefetch_L2_mask &= ~mask;
1694 }
1695 
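/* Program the RB+ (render backend plus) registers: SX_PS_DOWNCONVERT selects a
 * packed export format per MRT when the color buffer is 32bpp or smaller,
 * SX_BLEND_OPT_EPSILON sets a per-format epsilon for the blend optimizations
 * (half of the format's LSB, going by the *_FORMAT_0_5 values), and
 * SX_BLEND_OPT_CONTROL disables the per-channel value checking for channels
 * that are not written. The three values are cached so the registers are only
 * re-emitted when they actually change.
 */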
1696 static void
1697 radv_emit_rbplus_state(struct radv_cmd_buffer *cmd_buffer)
1698 {
1699    assert(cmd_buffer->device->physical_device->rad_info.rbplus_allowed);
1700 
1701    const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
1702    struct radv_rendering_state *render = &cmd_buffer->state.render;
1703 
1704    unsigned sx_ps_downconvert = 0;
1705    unsigned sx_blend_opt_epsilon = 0;
1706    unsigned sx_blend_opt_control = 0;
1707 
1708    for (unsigned i = 0; i < render->color_att_count; i++) {
1709       unsigned format, swap;
1710       bool has_alpha, has_rgb;
1711       if (render->color_att[i].iview == NULL) {
1712          /* We don't set the DISABLE bits, because the HW can't have holes,
1713           * so the SPI color format is set to 32-bit 1-component. */
1714          sx_ps_downconvert |= V_028754_SX_RT_EXPORT_32_R << (i * 4);
1715          continue;
1716       }
1717 
1718       struct radv_color_buffer_info *cb = &render->color_att[i].cb;
1719 
1720       format = cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX11
1721                   ? G_028C70_FORMAT_GFX11(cb->cb_color_info)
1722                   : G_028C70_FORMAT_GFX6(cb->cb_color_info);
1723       swap = G_028C70_COMP_SWAP(cb->cb_color_info);
1724       has_alpha = cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX11
1725                      ? !G_028C74_FORCE_DST_ALPHA_1_GFX11(cb->cb_color_attrib)
1726                      : !G_028C74_FORCE_DST_ALPHA_1_GFX6(cb->cb_color_attrib);
1727 
1728       uint32_t spi_format = (cmd_buffer->state.col_format_non_compacted >> (i * 4)) & 0xf;
1729       uint32_t colormask = d->vk.cb.attachments[i].write_mask;
1730 
1731       if (format == V_028C70_COLOR_8 || format == V_028C70_COLOR_16 || format == V_028C70_COLOR_32)
1732          has_rgb = !has_alpha;
1733       else
1734          has_rgb = true;
1735 
1736       /* Check the colormask and export format. */
1737       if (!(colormask & 0x7))
1738          has_rgb = false;
1739       if (!(colormask & 0x8))
1740          has_alpha = false;
1741 
1742       if (spi_format == V_028714_SPI_SHADER_ZERO) {
1743          has_rgb = false;
1744          has_alpha = false;
1745       }
1746 
1747       /* The HW doesn't quite blend correctly with rgb9e5 if we disable the alpha
1748        * optimization, even though it has no alpha. */
1749       if (has_rgb && format == V_028C70_COLOR_5_9_9_9)
1750          has_alpha = true;
1751 
1752       /* Disable value checking for disabled channels. */
1753       if (!has_rgb)
1754          sx_blend_opt_control |= S_02875C_MRT0_COLOR_OPT_DISABLE(1) << (i * 4);
1755       if (!has_alpha)
1756          sx_blend_opt_control |= S_02875C_MRT0_ALPHA_OPT_DISABLE(1) << (i * 4);
1757 
1758       /* Enable down-conversion for 32bpp and smaller formats. */
1759       switch (format) {
1760       case V_028C70_COLOR_8:
1761       case V_028C70_COLOR_8_8:
1762       case V_028C70_COLOR_8_8_8_8:
1763          /* For 1 and 2-channel formats, use the superset thereof. */
1764          if (spi_format == V_028714_SPI_SHADER_FP16_ABGR || spi_format == V_028714_SPI_SHADER_UINT16_ABGR ||
1765              spi_format == V_028714_SPI_SHADER_SINT16_ABGR) {
1766             sx_ps_downconvert |= V_028754_SX_RT_EXPORT_8_8_8_8 << (i * 4);
1767 
1768             if (G_028C70_NUMBER_TYPE(cb->cb_color_info) != V_028C70_NUMBER_SRGB)
1769                sx_blend_opt_epsilon |= V_028758_8BIT_FORMAT_0_5 << (i * 4);
1770          }
1771          break;
1772 
1773       case V_028C70_COLOR_5_6_5:
1774          if (spi_format == V_028714_SPI_SHADER_FP16_ABGR) {
1775             sx_ps_downconvert |= V_028754_SX_RT_EXPORT_5_6_5 << (i * 4);
1776             sx_blend_opt_epsilon |= V_028758_6BIT_FORMAT_0_5 << (i * 4);
1777          }
1778          break;
1779 
1780       case V_028C70_COLOR_1_5_5_5:
1781          if (spi_format == V_028714_SPI_SHADER_FP16_ABGR) {
1782             sx_ps_downconvert |= V_028754_SX_RT_EXPORT_1_5_5_5 << (i * 4);
1783             sx_blend_opt_epsilon |= V_028758_5BIT_FORMAT_0_5 << (i * 4);
1784          }
1785          break;
1786 
1787       case V_028C70_COLOR_4_4_4_4:
1788          if (spi_format == V_028714_SPI_SHADER_FP16_ABGR) {
1789             sx_ps_downconvert |= V_028754_SX_RT_EXPORT_4_4_4_4 << (i * 4);
1790             sx_blend_opt_epsilon |= V_028758_4BIT_FORMAT_0_5 << (i * 4);
1791          }
1792          break;
1793 
1794       case V_028C70_COLOR_32:
1795          if (swap == V_028C70_SWAP_STD && spi_format == V_028714_SPI_SHADER_32_R)
1796             sx_ps_downconvert |= V_028754_SX_RT_EXPORT_32_R << (i * 4);
1797          else if (swap == V_028C70_SWAP_ALT_REV && spi_format == V_028714_SPI_SHADER_32_AR)
1798             sx_ps_downconvert |= V_028754_SX_RT_EXPORT_32_A << (i * 4);
1799          break;
1800 
1801       case V_028C70_COLOR_16:
1802       case V_028C70_COLOR_16_16:
1803          /* For 1-channel formats, use the superset thereof. */
1804          if (spi_format == V_028714_SPI_SHADER_UNORM16_ABGR || spi_format == V_028714_SPI_SHADER_SNORM16_ABGR ||
1805              spi_format == V_028714_SPI_SHADER_UINT16_ABGR || spi_format == V_028714_SPI_SHADER_SINT16_ABGR) {
1806             if (swap == V_028C70_SWAP_STD || swap == V_028C70_SWAP_STD_REV)
1807                sx_ps_downconvert |= V_028754_SX_RT_EXPORT_16_16_GR << (i * 4);
1808             else
1809                sx_ps_downconvert |= V_028754_SX_RT_EXPORT_16_16_AR << (i * 4);
1810          }
1811          break;
1812 
1813       case V_028C70_COLOR_10_11_11:
1814          if (spi_format == V_028714_SPI_SHADER_FP16_ABGR)
1815             sx_ps_downconvert |= V_028754_SX_RT_EXPORT_10_11_11 << (i * 4);
1816          break;
1817 
1818       case V_028C70_COLOR_2_10_10_10:
1819          if (spi_format == V_028714_SPI_SHADER_FP16_ABGR) {
1820             sx_ps_downconvert |= V_028754_SX_RT_EXPORT_2_10_10_10 << (i * 4);
1821             sx_blend_opt_epsilon |= V_028758_10BIT_FORMAT_0_5 << (i * 4);
1822          }
1823          break;
1824       case V_028C70_COLOR_5_9_9_9:
1825          if (spi_format == V_028714_SPI_SHADER_FP16_ABGR)
1826             sx_ps_downconvert |= V_028754_SX_RT_EXPORT_9_9_9_E5 << (i * 4);
1827          break;
1828       }
1829    }
1830 
1831    /* Do not set the DISABLE bits for the unused attachments, as that
1832     * breaks dual source blending in SkQP and does not seem to improve
1833     * performance. */
1834 
1835    if (sx_ps_downconvert != cmd_buffer->state.last_sx_ps_downconvert ||
1836        sx_blend_opt_epsilon != cmd_buffer->state.last_sx_blend_opt_epsilon ||
1837        sx_blend_opt_control != cmd_buffer->state.last_sx_blend_opt_control) {
1838       radeon_set_context_reg_seq(cmd_buffer->cs, R_028754_SX_PS_DOWNCONVERT, 3);
1839       radeon_emit(cmd_buffer->cs, sx_ps_downconvert);
1840       radeon_emit(cmd_buffer->cs, sx_blend_opt_epsilon);
1841       radeon_emit(cmd_buffer->cs, sx_blend_opt_control);
1842 
1843       cmd_buffer->state.last_sx_ps_downconvert = sx_ps_downconvert;
1844       cmd_buffer->state.last_sx_blend_opt_epsilon = sx_blend_opt_epsilon;
1845       cmd_buffer->state.last_sx_blend_opt_control = sx_blend_opt_control;
1846    }
1847 
1848    cmd_buffer->state.dirty &= ~RADV_CMD_DIRTY_RBPLUS;
1849 }
1850 
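/* Bind a fragment shader epilog (used when the color export code is compiled
 * separately, typically because the color export state is not known at
 * pipeline compile time): program the compacted color export format and
 * CB_SHADER_MASK, raise the PS VGPR count if the epilog needs more registers
 * than the main shader, and pass the epilog's address to the shader through
 * the PS_EPILOG_PC user SGPR.
 */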
1851 static void
1852 radv_emit_ps_epilog_state(struct radv_cmd_buffer *cmd_buffer, struct radv_shader_part *ps_epilog)
1853 {
1854    struct radv_shader *ps_shader = cmd_buffer->state.shaders[MESA_SHADER_FRAGMENT];
1855    const struct radv_device *device = cmd_buffer->device;
1856 
1857    if (cmd_buffer->state.emitted_ps_epilog == ps_epilog)
1858       return;
1859 
1860    uint32_t col_format = radv_compact_spi_shader_col_format(ps_shader, ps_epilog->spi_shader_col_format);
1861 
1862    bool need_null_export_workaround =
1863       radv_needs_null_export_workaround(device, ps_shader, cmd_buffer->state.custom_blend_mode);
1864    if (need_null_export_workaround && !col_format)
1865       col_format = V_028714_SPI_SHADER_32_R;
1866    radeon_set_context_reg(cmd_buffer->cs, R_028714_SPI_SHADER_COL_FORMAT, col_format);
1867    radeon_set_context_reg(cmd_buffer->cs, R_02823C_CB_SHADER_MASK,
1868                           ac_get_cb_shader_mask(ps_epilog->spi_shader_col_format));
1869 
1870    if (ps_epilog->spi_shader_z_format)
1871       radeon_set_context_reg(cmd_buffer->cs, R_028710_SPI_SHADER_Z_FORMAT, ps_epilog->spi_shader_z_format);
1872 
1873    assert(ps_shader->config.num_shared_vgprs == 0);
1874    if (G_00B848_VGPRS(ps_epilog->rsrc1) > G_00B848_VGPRS(ps_shader->config.rsrc1)) {
1875       uint32_t rsrc1 = ps_shader->config.rsrc1;
1876       rsrc1 = (rsrc1 & C_00B848_VGPRS) | (ps_epilog->rsrc1 & ~C_00B848_VGPRS);
1877       radeon_set_sh_reg(cmd_buffer->cs, R_00B028_SPI_SHADER_PGM_RSRC1_PS, rsrc1);
1878    }
1879 
1880    radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, ps_epilog->bo);
1881 
1882    assert((ps_epilog->va >> 32) == cmd_buffer->device->physical_device->rad_info.address32_hi);
1883 
1884    struct radv_userdata_info *loc = &ps_shader->info.user_sgprs_locs.shader_data[AC_UD_PS_EPILOG_PC];
1885    uint32_t base_reg = ps_shader->info.user_data_0;
1886    assert(loc->sgpr_idx != -1);
1887    assert(loc->num_sgprs == 1);
1888    radv_emit_shader_pointer(cmd_buffer->device, cmd_buffer->cs, base_reg + loc->sgpr_idx * 4, ps_epilog->va, false);
1889 
1890    cmd_buffer->shader_upload_seq = MAX2(cmd_buffer->shader_upload_seq, ps_epilog->upload_seq);
1891 
1892    cmd_buffer->state.emitted_ps_epilog = ps_epilog;
1893 }
1894 
1895 static void
1896 radv_emit_tcs_epilog_state(struct radv_cmd_buffer *cmd_buffer, struct radv_shader_part *tcs_epilog)
1897 {
1898    const enum amd_gfx_level gfx_level = cmd_buffer->device->physical_device->rad_info.gfx_level;
1899    struct radv_shader *tcs = cmd_buffer->state.shaders[MESA_SHADER_TESS_CTRL];
1900    uint32_t rsrc1;
1901 
1902    if (cmd_buffer->state.emitted_tcs_epilog == tcs_epilog)
1903       return;
1904 
1905    if (tcs->info.merged_shader_compiled_separately) {
1906       radv_shader_combine_cfg_vs_tcs(cmd_buffer->state.shaders[MESA_SHADER_VERTEX], tcs, &rsrc1, NULL);
1907    } else {
1908       rsrc1 = tcs->config.rsrc1;
1909    }
1910 
1911    assert(tcs->config.num_shared_vgprs == 0);
1912    if (G_00B848_VGPRS(tcs_epilog->rsrc1) > G_00B848_VGPRS(rsrc1))
1913       rsrc1 = (rsrc1 & C_00B848_VGPRS) | (tcs_epilog->rsrc1 & ~C_00B848_VGPRS);
1914    if (gfx_level < GFX10 && G_00B228_SGPRS(tcs_epilog->rsrc1) > G_00B228_SGPRS(rsrc1))
1915       rsrc1 = (rsrc1 & C_00B228_SGPRS) | (tcs_epilog->rsrc1 & ~C_00B228_SGPRS);
1916 
1917    radeon_set_sh_reg(cmd_buffer->cs, R_00B428_SPI_SHADER_PGM_RSRC1_HS, rsrc1);
1918 
1919    radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, tcs_epilog->bo);
1920 
1921    assert((tcs_epilog->va >> 32) == cmd_buffer->device->physical_device->rad_info.address32_hi);
1922 
1923    struct radv_userdata_info *loc = &tcs->info.user_sgprs_locs.shader_data[AC_UD_TCS_EPILOG_PC];
1924    uint32_t base_reg = tcs->info.user_data_0;
1925    assert(loc->sgpr_idx != -1);
1926    assert(loc->num_sgprs == 1);
1927    radv_emit_shader_pointer(cmd_buffer->device, cmd_buffer->cs, base_reg + loc->sgpr_idx * 4, tcs_epilog->va, false);
1928 
1929    cmd_buffer->shader_upload_seq = MAX2(cmd_buffer->shader_upload_seq, tcs_epilog->upload_seq);
1930 
1931    cmd_buffer->state.emitted_tcs_epilog = tcs_epilog;
1932 }
1933 
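/* Emit the pre-generated command stream of the bound graphics pipeline. The
 * context-register portion is skipped when it is identical to what was last
 * emitted (compared by size, hash and contents). When binning is allowed and
 * more than one context or persistent state is kept per bin, a BREAK_BATCH
 * event is inserted on fragment shader changes, and all shader BOs are added
 * to the command buffer's residency list.
 */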
1934 static void
1935 radv_emit_graphics_pipeline(struct radv_cmd_buffer *cmd_buffer)
1936 {
1937    struct radv_graphics_pipeline *pipeline = cmd_buffer->state.graphics_pipeline;
1938    const struct radv_device *device = cmd_buffer->device;
1939 
1940    if (cmd_buffer->state.emitted_graphics_pipeline == pipeline)
1941       return;
1942 
1943    if (cmd_buffer->state.emitted_graphics_pipeline) {
1944       if (radv_rast_prim_is_points_or_lines(cmd_buffer->state.emitted_graphics_pipeline->rast_prim) !=
1945           radv_rast_prim_is_points_or_lines(pipeline->rast_prim))
1946          cmd_buffer->state.dirty |= RADV_CMD_DIRTY_GUARDBAND;
1947 
1948       if (cmd_buffer->state.emitted_graphics_pipeline->ms.min_sample_shading != pipeline->ms.min_sample_shading ||
1949           cmd_buffer->state.emitted_graphics_pipeline->uses_out_of_order_rast != pipeline->uses_out_of_order_rast ||
1950           cmd_buffer->state.emitted_graphics_pipeline->uses_vrs_attachment != pipeline->uses_vrs_attachment ||
1951           cmd_buffer->state.emitted_graphics_pipeline->rast_prim != pipeline->rast_prim)
1952 
1953          cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_RASTERIZATION_SAMPLES;
1954 
1955       if (cmd_buffer->state.emitted_graphics_pipeline->ms.sample_shading_enable != pipeline->ms.sample_shading_enable) {
1956          cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_RASTERIZATION_SAMPLES;
1957          if (device->physical_device->rad_info.gfx_level >= GFX10_3)
1958             cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_FRAGMENT_SHADING_RATE;
1959       }
1960 
1961       if (cmd_buffer->state.emitted_graphics_pipeline->db_render_control != pipeline->db_render_control)
1962          cmd_buffer->state.dirty |= RADV_CMD_DIRTY_FRAMEBUFFER;
1963    }
1964 
1965    radeon_emit_array(cmd_buffer->cs, pipeline->base.cs.buf, pipeline->base.cs.cdw);
1966 
1967    if (!cmd_buffer->state.emitted_graphics_pipeline ||
1968        cmd_buffer->state.emitted_graphics_pipeline->base.ctx_cs.cdw != pipeline->base.ctx_cs.cdw ||
1969        cmd_buffer->state.emitted_graphics_pipeline->base.ctx_cs_hash != pipeline->base.ctx_cs_hash ||
1970        memcmp(cmd_buffer->state.emitted_graphics_pipeline->base.ctx_cs.buf, pipeline->base.ctx_cs.buf,
1971               pipeline->base.ctx_cs.cdw * 4)) {
1972       radeon_emit_array(cmd_buffer->cs, pipeline->base.ctx_cs.buf, pipeline->base.ctx_cs.cdw);
1973    }
1974 
1975    if (device->pbb_allowed) {
1976       struct radv_binning_settings *settings = &device->physical_device->binning_settings;
1977 
1978       if ((!cmd_buffer->state.emitted_graphics_pipeline ||
1979            cmd_buffer->state.emitted_graphics_pipeline->base.shaders[MESA_SHADER_FRAGMENT] !=
1980               cmd_buffer->state.graphics_pipeline->base.shaders[MESA_SHADER_FRAGMENT]) &&
1981           (settings->context_states_per_bin > 1 || settings->persistent_states_per_bin > 1)) {
1982          /* Break the batch on PS changes. */
1983          radeon_emit(cmd_buffer->cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
1984          radeon_emit(cmd_buffer->cs, EVENT_TYPE(V_028A90_BREAK_BATCH) | EVENT_INDEX(0));
1985       }
1986    }
1987 
1988    if (pipeline->sqtt_shaders_reloc) {
1989       /* Emit shaders relocation because RGP requires them to be contiguous in memory. */
1990       radv_sqtt_emit_relocated_shaders(cmd_buffer, pipeline);
1991    }
1992 
1993    for (unsigned s = 0; s < MESA_VULKAN_SHADER_STAGES; s++) {
1994       struct radv_shader *shader = cmd_buffer->state.shaders[s];
1995 
1996       if (!shader)
1997          continue;
1998 
1999       radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, shader->bo);
2000    }
2001 
2002    if (cmd_buffer->state.gs_copy_shader) {
2003       radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, cmd_buffer->state.gs_copy_shader->bo);
2004    }
2005 
2006    struct radv_shader *task_shader = cmd_buffer->state.shaders[MESA_SHADER_TASK];
2007    if (task_shader) {
2008       radv_emit_compute_shader(device->physical_device, cmd_buffer->gang.cs, task_shader);
2009 
2010       /* Relocate the task shader because RGP requires shaders to be contiguous in memory. */
2011       if (pipeline->sqtt_shaders_reloc) {
2012          const struct radv_sqtt_shaders_reloc *reloc = pipeline->sqtt_shaders_reloc;
2013          const uint64_t va = reloc->va[MESA_SHADER_TASK];
2014 
2015          radeon_set_sh_reg(cmd_buffer->gang.cs, R_00B830_COMPUTE_PGM_LO, va >> 8);
2016       }
2017    }
2018 
2019    if (radv_device_fault_detection_enabled(cmd_buffer->device))
2020       radv_save_pipeline(cmd_buffer, &pipeline->base);
2021 
2022    cmd_buffer->state.emitted_graphics_pipeline = pipeline;
2023 
2024    cmd_buffer->state.dirty &= ~RADV_CMD_DIRTY_PIPELINE;
2025 }
2026 
2027 static bool
2028 radv_get_depth_clip_enable(struct radv_cmd_buffer *cmd_buffer)
2029 {
2030    const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
2031 
2032    return d->vk.rs.depth_clip_enable == VK_MESA_DEPTH_CLIP_ENABLE_TRUE ||
2033           (d->vk.rs.depth_clip_enable == VK_MESA_DEPTH_CLIP_ENABLE_NOT_CLAMP && !d->vk.rs.depth_clamp_enable);
2034 }
2035 
2036 enum radv_depth_clamp_mode {
2037    RADV_DEPTH_CLAMP_MODE_VIEWPORT = 0,    /* Clamp to the viewport min/max depth bounds */
2038    RADV_DEPTH_CLAMP_MODE_ZERO_TO_ONE = 1, /* Clamp between 0.0f and 1.0f */
2039    RADV_DEPTH_CLAMP_MODE_DISABLED = 2,    /* Disable depth clamping */
2040 };
2041 
2042 static enum radv_depth_clamp_mode
2043 radv_get_depth_clamp_mode(struct radv_cmd_buffer *cmd_buffer)
2044 {
2045    const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
2046    bool depth_clip_enable = radv_get_depth_clip_enable(cmd_buffer);
2047    const struct radv_device *device = cmd_buffer->device;
2048    enum radv_depth_clamp_mode mode;
2049 
2050    mode = RADV_DEPTH_CLAMP_MODE_VIEWPORT;
2051    if (!d->vk.rs.depth_clamp_enable) {
2052       /* For optimal performance, depth clamping should always be enabled except if the application
2053        * disables clamping explicitly or uses depth values outside of the [0.0, 1.0] range.
2054        */
2055       if (!depth_clip_enable || device->vk.enabled_extensions.EXT_depth_range_unrestricted) {
2056          mode = RADV_DEPTH_CLAMP_MODE_DISABLED;
2057       } else {
2058          mode = RADV_DEPTH_CLAMP_MODE_ZERO_TO_ONE;
2059       }
2060    }
2061 
2062    return mode;
2063 }
2064 
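/* Emit the viewport transforms (PA_CL_VPORT_*) and depth ranges
 * (PA_SC_VPORT_ZMIN/ZMAX). With a [-1, 1] clip space
 * (depth_clip_negative_one_to_one) the hardware transform
 * z' = z * scale + translate is re-derived: halving the scale and using
 * (translate + maxDepth) / 2 as the new translate maps -1 -> minDepth and
 * +1 -> maxDepth, assuming the base transform was scale = maxDepth - minDepth
 * and translate = minDepth for the default [0, 1] clip space.
 */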
2065 static void
2066 radv_emit_viewport(struct radv_cmd_buffer *cmd_buffer)
2067 {
2068    const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
2069    enum radv_depth_clamp_mode depth_clamp_mode = radv_get_depth_clamp_mode(cmd_buffer);
2070 
2071    assert(d->vk.vp.viewport_count);
2072    radeon_set_context_reg_seq(cmd_buffer->cs, R_02843C_PA_CL_VPORT_XSCALE, d->vk.vp.viewport_count * 6);
2073 
2074    for (unsigned i = 0; i < d->vk.vp.viewport_count; i++) {
2075       radeon_emit(cmd_buffer->cs, fui(d->hw_vp.xform[i].scale[0]));
2076       radeon_emit(cmd_buffer->cs, fui(d->hw_vp.xform[i].translate[0]));
2077       radeon_emit(cmd_buffer->cs, fui(d->hw_vp.xform[i].scale[1]));
2078       radeon_emit(cmd_buffer->cs, fui(d->hw_vp.xform[i].translate[1]));
2079 
2080       double scale_z, translate_z;
2081       if (d->vk.vp.depth_clip_negative_one_to_one) {
2082          scale_z = d->hw_vp.xform[i].scale[2] * 0.5f;
2083          translate_z = (d->hw_vp.xform[i].translate[2] + d->vk.vp.viewports[i].maxDepth) * 0.5f;
2084       } else {
2085          scale_z = d->hw_vp.xform[i].scale[2];
2086          translate_z = d->hw_vp.xform[i].translate[2];
2087       }
2088       radeon_emit(cmd_buffer->cs, fui(scale_z));
2089       radeon_emit(cmd_buffer->cs, fui(translate_z));
2090    }
2091 
2092    radeon_set_context_reg_seq(cmd_buffer->cs, R_0282D0_PA_SC_VPORT_ZMIN_0, d->vk.vp.viewport_count * 2);
2093    for (unsigned i = 0; i < d->vk.vp.viewport_count; i++) {
2094       float zmin, zmax;
2095 
2096       if (depth_clamp_mode == RADV_DEPTH_CLAMP_MODE_ZERO_TO_ONE) {
2097          zmin = 0.0f;
2098          zmax = 1.0f;
2099       } else {
2100          zmin = MIN2(d->vk.vp.viewports[i].minDepth, d->vk.vp.viewports[i].maxDepth);
2101          zmax = MAX2(d->vk.vp.viewports[i].minDepth, d->vk.vp.viewports[i].maxDepth);
2102       }
2103 
2104       radeon_emit(cmd_buffer->cs, fui(zmin));
2105       radeon_emit(cmd_buffer->cs, fui(zmax));
2106    }
2107 }
2108 
2109 static void
2110 radv_emit_scissor(struct radv_cmd_buffer *cmd_buffer)
2111 {
2112    const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
2113 
2114    radv_write_scissors(cmd_buffer->cs, d->vk.vp.scissor_count, d->vk.vp.scissors, d->vk.vp.viewports);
2115 }
2116 
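/* Program the discard rectangles. PA_SC_CLIPRECT_RULE is a 16-bit truth table
 * indexed by "which of the 4 cliprects contain this pixel": bit i is set if a
 * pixel covered by exactly that combination of rectangles should pass.
 * For example, with two rectangles in INCLUSIVE mode the indices whose low two
 * bits are zero (0, 4, 8 and 12) are left out and every other combination
 * passes; with discard rectangles disabled the rule is simply 0xffff.
 */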
2117 static void
2118 radv_emit_discard_rectangle(struct radv_cmd_buffer *cmd_buffer)
2119 {
2120    const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
2121    uint32_t cliprect_rule = 0;
2122 
2123    if (!d->vk.dr.enable) {
2124       cliprect_rule = 0xffff;
2125    } else {
2126       for (unsigned i = 0; i < (1u << MAX_DISCARD_RECTANGLES); ++i) {
2127          /* Interpret i as a bitmask, and then set the bit in
2128           * the mask if that combination of rectangles in which
2129           * the pixel is contained should pass the cliprect
2130           * test.
2131           */
2132          unsigned relevant_subset = i & ((1u << d->vk.dr.rectangle_count) - 1);
2133 
2134          if (d->vk.dr.mode == VK_DISCARD_RECTANGLE_MODE_INCLUSIVE_EXT && !relevant_subset)
2135             continue;
2136 
2137          if (d->vk.dr.mode == VK_DISCARD_RECTANGLE_MODE_EXCLUSIVE_EXT && relevant_subset)
2138             continue;
2139 
2140          cliprect_rule |= 1u << i;
2141       }
2142 
2143       radeon_set_context_reg_seq(cmd_buffer->cs, R_028210_PA_SC_CLIPRECT_0_TL, d->vk.dr.rectangle_count * 2);
2144       for (unsigned i = 0; i < d->vk.dr.rectangle_count; ++i) {
2145          VkRect2D rect = d->vk.dr.rectangles[i];
2146          radeon_emit(cmd_buffer->cs, S_028210_TL_X(rect.offset.x) | S_028210_TL_Y(rect.offset.y));
2147          radeon_emit(cmd_buffer->cs, S_028214_BR_X(rect.offset.x + rect.extent.width) |
2148                                         S_028214_BR_Y(rect.offset.y + rect.extent.height));
2149       }
2150    }
2151 
2152    radeon_set_context_reg(cmd_buffer->cs, R_02820C_PA_SC_CLIPRECT_RULE, cliprect_rule);
2153 }
2154 
2155 static void
2156 radv_emit_line_width(struct radv_cmd_buffer *cmd_buffer)
2157 {
2158    const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
2159 
2160    radeon_set_context_reg(cmd_buffer->cs, R_028A08_PA_SU_LINE_CNTL,
2161                           S_028A08_WIDTH(CLAMP(d->vk.rs.line.width * 8, 0, 0xFFFF)));
2162 }
2163 
2164 static void
2165 radv_emit_blend_constants(struct radv_cmd_buffer *cmd_buffer)
2166 {
2167    const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
2168 
2169    radeon_set_context_reg_seq(cmd_buffer->cs, R_028414_CB_BLEND_RED, 4);
2170    radeon_emit_array(cmd_buffer->cs, (uint32_t *)d->vk.cb.blend_constants, 4);
2171 }
2172 
2173 static void
2174 radv_emit_stencil(struct radv_cmd_buffer *cmd_buffer)
2175 {
2176    const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
2177 
2178    radeon_set_context_reg_seq(cmd_buffer->cs, R_028430_DB_STENCILREFMASK, 2);
2179    radeon_emit(cmd_buffer->cs, S_028430_STENCILTESTVAL(d->vk.ds.stencil.front.reference) |
2180                                   S_028430_STENCILMASK(d->vk.ds.stencil.front.compare_mask) |
2181                                   S_028430_STENCILWRITEMASK(d->vk.ds.stencil.front.write_mask) |
2182                                   S_028430_STENCILOPVAL(1));
2183    radeon_emit(cmd_buffer->cs, S_028434_STENCILTESTVAL_BF(d->vk.ds.stencil.back.reference) |
2184                                   S_028434_STENCILMASK_BF(d->vk.ds.stencil.back.compare_mask) |
2185                                   S_028434_STENCILWRITEMASK_BF(d->vk.ds.stencil.back.write_mask) |
2186                                   S_028434_STENCILOPVAL_BF(1));
2187 }
2188 
2189 static void
2190 radv_emit_depth_bounds(struct radv_cmd_buffer *cmd_buffer)
2191 {
2192    const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
2193 
2194    radeon_set_context_reg_seq(cmd_buffer->cs, R_028020_DB_DEPTH_BOUNDS_MIN, 2);
2195    radeon_emit(cmd_buffer->cs, fui(d->vk.ds.depth.bounds_test.min));
2196    radeon_emit(cmd_buffer->cs, fui(d->vk.ds.depth.bounds_test.max));
2197 }
2198 
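/* Emit the depth bias (polygon offset) state. The slope factor is multiplied
 * by 16, which matches the hardware scale registers apparently using 1/16th
 * units. PA_SU_POLY_OFFSET_DB_FMT_CNTL controls how the constant bias is
 * scaled to the depth format: NEG_NUM_DB_BITS = -16 for D16_UNORM, -24 when
 * the force-unorm representation is used on D32_SFLOAT, and -23 together with
 * POLY_OFFSET_DB_IS_FLOAT_FMT otherwise; the register stays 0 for the
 * FLOAT_EXT representation or when the attachment has no depth aspect.
 */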
2199 static void
2200 radv_emit_depth_bias(struct radv_cmd_buffer *cmd_buffer)
2201 {
2202    const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
2203    struct radv_rendering_state *render = &cmd_buffer->state.render;
2204    unsigned slope = fui(d->vk.rs.depth_bias.slope * 16.0f);
2205    unsigned pa_su_poly_offset_db_fmt_cntl = 0;
2206 
2207    if (vk_format_has_depth(render->ds_att.format) &&
2208        d->vk.rs.depth_bias.representation != VK_DEPTH_BIAS_REPRESENTATION_FLOAT_EXT) {
2209       VkFormat format = vk_format_depth_only(render->ds_att.format);
2210 
2211       if (format == VK_FORMAT_D16_UNORM) {
2212          pa_su_poly_offset_db_fmt_cntl = S_028B78_POLY_OFFSET_NEG_NUM_DB_BITS(-16);
2213       } else {
2214          assert(format == VK_FORMAT_D32_SFLOAT);
2215          if (d->vk.rs.depth_bias.representation ==
2216              VK_DEPTH_BIAS_REPRESENTATION_LEAST_REPRESENTABLE_VALUE_FORCE_UNORM_EXT) {
2217             pa_su_poly_offset_db_fmt_cntl = S_028B78_POLY_OFFSET_NEG_NUM_DB_BITS(-24);
2218          } else {
2219             pa_su_poly_offset_db_fmt_cntl =
2220                S_028B78_POLY_OFFSET_NEG_NUM_DB_BITS(-23) | S_028B78_POLY_OFFSET_DB_IS_FLOAT_FMT(1);
2221          }
2222       }
2223    }
2224 
2225    radeon_set_context_reg_seq(cmd_buffer->cs, R_028B7C_PA_SU_POLY_OFFSET_CLAMP, 5);
2226    radeon_emit(cmd_buffer->cs, fui(d->vk.rs.depth_bias.clamp));    /* CLAMP */
2227    radeon_emit(cmd_buffer->cs, slope);                             /* FRONT SCALE */
2228    radeon_emit(cmd_buffer->cs, fui(d->vk.rs.depth_bias.constant)); /* FRONT OFFSET */
2229    radeon_emit(cmd_buffer->cs, slope);                             /* BACK SCALE */
2230    radeon_emit(cmd_buffer->cs, fui(d->vk.rs.depth_bias.constant)); /* BACK OFFSET */
2231 
2232    radeon_set_context_reg(cmd_buffer->cs, R_028B78_PA_SU_POLY_OFFSET_DB_FMT_CNTL, pa_su_poly_offset_db_fmt_cntl);
2233 }
2234 
2235 static void
2236 radv_emit_line_stipple(struct radv_cmd_buffer *cmd_buffer)
2237 {
2238    const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
2239    enum amd_gfx_level gfx_level = cmd_buffer->device->physical_device->rad_info.gfx_level;
2240    /* GFX9 chips fail linestrip CTS tests unless this is set to 0 = no reset */
2241    uint32_t auto_reset_cntl = (gfx_level == GFX9) ? 0 : 2;
2242 
2243    if (radv_primitive_topology_is_line_list(d->vk.ia.primitive_topology))
2244       auto_reset_cntl = 1;
2245 
2246    radeon_set_context_reg(cmd_buffer->cs, R_028A0C_PA_SC_LINE_STIPPLE,
2247                           S_028A0C_LINE_PATTERN(d->vk.rs.line.stipple.pattern) |
2248                              S_028A0C_REPEAT_COUNT(d->vk.rs.line.stipple.factor - 1) |
2249                              S_028A0C_AUTO_RESET_CNTL(auto_reset_cntl));
2250 }
2251 
2252 static uint32_t
2253 radv_get_pa_su_sc_mode_cntl(const struct radv_cmd_buffer *cmd_buffer)
2254 {
2255    enum amd_gfx_level gfx_level = cmd_buffer->device->physical_device->rad_info.gfx_level;
2256    const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
2257    unsigned pa_su_sc_mode_cntl;
2258 
2259    pa_su_sc_mode_cntl =
2260       S_028814_CULL_FRONT(!!(d->vk.rs.cull_mode & VK_CULL_MODE_FRONT_BIT)) |
2261       S_028814_CULL_BACK(!!(d->vk.rs.cull_mode & VK_CULL_MODE_BACK_BIT)) | S_028814_FACE(d->vk.rs.front_face) |
2262       S_028814_POLY_OFFSET_FRONT_ENABLE(d->vk.rs.depth_bias.enable) |
2263       S_028814_POLY_OFFSET_BACK_ENABLE(d->vk.rs.depth_bias.enable) |
2264       S_028814_POLY_OFFSET_PARA_ENABLE(d->vk.rs.depth_bias.enable) |
2265       S_028814_POLY_MODE(d->vk.rs.polygon_mode != V_028814_X_DRAW_TRIANGLES) |
2266       S_028814_POLYMODE_FRONT_PTYPE(d->vk.rs.polygon_mode) | S_028814_POLYMODE_BACK_PTYPE(d->vk.rs.polygon_mode) |
2267       S_028814_PROVOKING_VTX_LAST(d->vk.rs.provoking_vertex == VK_PROVOKING_VERTEX_MODE_LAST_VERTEX_EXT);
2268 
2269    if (gfx_level >= GFX10) {
2270       /* Ensure that SC processes the primitive group in the same order as PA produced them.  Needed
2271        * when either POLY_MODE or PERPENDICULAR_ENDCAP_ENA is set.
2272        */
2273       pa_su_sc_mode_cntl |=
2274          S_028814_KEEP_TOGETHER_ENABLE(d->vk.rs.polygon_mode != V_028814_X_DRAW_TRIANGLES ||
2275                                        d->vk.rs.line.mode == VK_LINE_RASTERIZATION_MODE_RECTANGULAR_KHR);
2276    }
2277 
2278    return pa_su_sc_mode_cntl;
2279 }
2280 
2281 static void
2282 radv_emit_culling(struct radv_cmd_buffer *cmd_buffer)
2283 {
2284    unsigned pa_su_sc_mode_cntl = radv_get_pa_su_sc_mode_cntl(cmd_buffer);
2285 
2286    radeon_set_context_reg(cmd_buffer->cs, R_028814_PA_SU_SC_MODE_CNTL, pa_su_sc_mode_cntl);
2287 }
2288 
2289 static void
2290 radv_emit_provoking_vertex_mode(struct radv_cmd_buffer *cmd_buffer)
2291 {
2292    const struct radv_shader *last_vgt_shader = cmd_buffer->state.last_vgt_shader;
2293    const unsigned stage = last_vgt_shader->info.stage;
2294    const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
2295    const struct radv_userdata_info *loc = radv_get_user_sgpr(last_vgt_shader, AC_UD_NGG_PROVOKING_VTX);
2296    unsigned provoking_vtx = 0;
2297    uint32_t base_reg;
2298 
2299    if (loc->sgpr_idx == -1)
2300       return;
2301 
2302    if (d->vk.rs.provoking_vertex == VK_PROVOKING_VERTEX_MODE_LAST_VERTEX_EXT) {
2303       if (stage == MESA_SHADER_VERTEX) {
2304          provoking_vtx = radv_conv_prim_to_gs_out(d->vk.ia.primitive_topology, last_vgt_shader->info.is_ngg);
2305       } else {
2306          assert(stage == MESA_SHADER_GEOMETRY);
2307          provoking_vtx = last_vgt_shader->info.gs.vertices_in - 1;
2308       }
2309    }
2310 
2311    base_reg = last_vgt_shader->info.user_data_0;
2312    radeon_set_sh_reg(cmd_buffer->cs, base_reg + loc->sgpr_idx * 4, provoking_vtx);
2313 }
2314 
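/* Program the primitive topology: VGT_PRIMITIVE_TYPE lives in uconfig space on
 * GFX7+ (config space on GFX6). If the last pre-rasterization shader uses the
 * AC_UD_NUM_VERTS_PER_PRIM user SGPR, it also receives the vertex count per
 * primitive, derived from the topology's GS output primitive type plus one.
 */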
2315 static void
2316 radv_emit_primitive_topology(struct radv_cmd_buffer *cmd_buffer)
2317 {
2318    const struct radv_shader *last_vgt_shader = cmd_buffer->state.last_vgt_shader;
2319    const struct radv_userdata_info *loc = radv_get_user_sgpr(last_vgt_shader, AC_UD_NUM_VERTS_PER_PRIM);
2320    const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
2321    uint32_t base_reg;
2322 
2323    assert(!cmd_buffer->state.mesh_shading);
2324 
2325    if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX7) {
2326       radeon_set_uconfig_reg_idx(cmd_buffer->device->physical_device, cmd_buffer->cs, R_030908_VGT_PRIMITIVE_TYPE, 1,
2327                                  d->vk.ia.primitive_topology);
2328    } else {
2329       radeon_set_config_reg(cmd_buffer->cs, R_008958_VGT_PRIMITIVE_TYPE, d->vk.ia.primitive_topology);
2330    }
2331 
2332    if (loc->sgpr_idx == -1)
2333       return;
2334 
2335    base_reg = last_vgt_shader->info.user_data_0;
2336    radeon_set_sh_reg(cmd_buffer->cs, base_reg + loc->sgpr_idx * 4,
2337                      radv_conv_prim_to_gs_out(d->vk.ia.primitive_topology, last_vgt_shader->info.is_ngg) + 1);
2338 }
2339 
2340 static void
2341 radv_emit_depth_control(struct radv_cmd_buffer *cmd_buffer)
2342 {
2343    const struct radv_rendering_state *render = &cmd_buffer->state.render;
2344    const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
2345    const bool stencil_test_enable =
2346       d->vk.ds.stencil.test_enable && (render->ds_att_aspects & VK_IMAGE_ASPECT_STENCIL_BIT);
2347 
2348    radeon_set_context_reg(
2349       cmd_buffer->cs, R_028800_DB_DEPTH_CONTROL,
2350       S_028800_Z_ENABLE(d->vk.ds.depth.test_enable ? 1 : 0) |
2351          S_028800_Z_WRITE_ENABLE(d->vk.ds.depth.write_enable ? 1 : 0) | S_028800_ZFUNC(d->vk.ds.depth.compare_op) |
2352          S_028800_DEPTH_BOUNDS_ENABLE(d->vk.ds.depth.bounds_test.enable ? 1 : 0) |
2353          S_028800_STENCIL_ENABLE(stencil_test_enable) | S_028800_BACKFACE_ENABLE(stencil_test_enable) |
2354          S_028800_STENCILFUNC(d->vk.ds.stencil.front.op.compare) |
2355          S_028800_STENCILFUNC_BF(d->vk.ds.stencil.back.op.compare));
2356 }
2357 
2358 static void
2359 radv_emit_stencil_control(struct radv_cmd_buffer *cmd_buffer)
2360 {
2361    const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
2362 
2363    radeon_set_context_reg(cmd_buffer->cs, R_02842C_DB_STENCIL_CONTROL,
2364                           S_02842C_STENCILFAIL(radv_translate_stencil_op(d->vk.ds.stencil.front.op.fail)) |
2365                              S_02842C_STENCILZPASS(radv_translate_stencil_op(d->vk.ds.stencil.front.op.pass)) |
2366                              S_02842C_STENCILZFAIL(radv_translate_stencil_op(d->vk.ds.stencil.front.op.depth_fail)) |
2367                              S_02842C_STENCILFAIL_BF(radv_translate_stencil_op(d->vk.ds.stencil.back.op.fail)) |
2368                              S_02842C_STENCILZPASS_BF(radv_translate_stencil_op(d->vk.ds.stencil.back.op.pass)) |
2369                              S_02842C_STENCILZFAIL_BF(radv_translate_stencil_op(d->vk.ds.stencil.back.op.depth_fail)));
2370 }
2371 
2372 static bool
2373 radv_should_force_vrs1x1(struct radv_cmd_buffer *cmd_buffer)
2374 {
2375    const struct radv_physical_device *pdevice = cmd_buffer->device->physical_device;
2376    const struct radv_shader *ps = cmd_buffer->state.shaders[MESA_SHADER_FRAGMENT];
2377 
2378    return pdevice->rad_info.gfx_level >= GFX10_3 &&
2379           (cmd_buffer->state.ms.sample_shading_enable || (ps && ps->info.ps.force_sample_iter_shading_rate));
2380 }
2381 
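/* Emit the per-draw fragment shading rate and the combiner setup.
 * GE_VRS_RATE stores log2 of the fragment size clamped to 2x2, so a 1x1 rate
 * is encoded as (0, 0) and 2x2 as (1, 1). When no VRS attachment is bound the
 * attachment rate is implicitly 1x1, which lets the code fold the HTILE
 * combiner into the per-draw rate as described in the comments below.
 */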
2382 static void
2383 radv_emit_fragment_shading_rate(struct radv_cmd_buffer *cmd_buffer)
2384 {
2385    const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
2386 
2387    /* When per-vertex VRS is forced and the dynamic fragment shading rate is a no-op, ignore
2388     * it. This is needed for vkd3d-proton because it always declares per-draw VRS as dynamic.
2389     */
2390    if (cmd_buffer->device->force_vrs != RADV_FORCE_VRS_1x1 && d->vk.fsr.fragment_size.width == 1 &&
2391        d->vk.fsr.fragment_size.height == 1 &&
2392        d->vk.fsr.combiner_ops[0] == VK_FRAGMENT_SHADING_RATE_COMBINER_OP_KEEP_KHR &&
2393        d->vk.fsr.combiner_ops[1] == VK_FRAGMENT_SHADING_RATE_COMBINER_OP_KEEP_KHR)
2394       return;
2395 
2396    uint32_t rate_x = MIN2(2, d->vk.fsr.fragment_size.width) - 1;
2397    uint32_t rate_y = MIN2(2, d->vk.fsr.fragment_size.height) - 1;
2398    uint32_t pipeline_comb_mode = d->vk.fsr.combiner_ops[0];
2399    uint32_t htile_comb_mode = d->vk.fsr.combiner_ops[1];
2400    uint32_t pa_cl_vrs_cntl = 0;
2401 
2402    assert(cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX10_3);
2403 
2404    if (!cmd_buffer->state.render.vrs_att.iview) {
2405       /* When the current subpass has no VRS attachment, the VRS rates are expected to be 1x1, so we
2406        * can cheat by tweaking the different combiner modes.
2407        */
2408       switch (htile_comb_mode) {
2409       case VK_FRAGMENT_SHADING_RATE_COMBINER_OP_MIN_KHR:
2410          /* The result of min(A, 1x1) is always 1x1. */
2411          FALLTHROUGH;
2412       case VK_FRAGMENT_SHADING_RATE_COMBINER_OP_REPLACE_KHR:
2413          /* Force the per-draw VRS rate to 1x1. */
2414          rate_x = rate_y = 0;
2415 
2416          /* As the result of min(A, 1x1) or replace(A, 1x1) are always 1x1, set the vertex rate
2417           * combiner mode as passthrough.
2418           */
2419          pipeline_comb_mode = V_028848_SC_VRS_COMB_MODE_PASSTHRU;
2420          break;
2421       case VK_FRAGMENT_SHADING_RATE_COMBINER_OP_MAX_KHR:
2422          /* The result of max(A, 1x1) is always A. */
2423          FALLTHROUGH;
2424       case VK_FRAGMENT_SHADING_RATE_COMBINER_OP_KEEP_KHR:
2425          /* Nothing to do here because the SAMPLE_ITER combiner mode should already be passthrough. */
2426          break;
2427       default:
2428          break;
2429       }
2430    }
2431 
2432    /* Emit per-draw VRS rate which is the first combiner. */
2433    radeon_set_uconfig_reg(cmd_buffer->cs, R_03098C_GE_VRS_RATE, S_03098C_RATE_X(rate_x) | S_03098C_RATE_Y(rate_y));
2434 
2435    /* Disable VRS and use the rates from PS_ITER_SAMPLES if:
2436     *
2437     * 1) sample shading is enabled or per-sample interpolation is used by the fragment shader
2438     * 2) the fragment shader requires 1x1 shading rate for some other reason
2439     */
2440    if (radv_should_force_vrs1x1(cmd_buffer)) {
2441       pa_cl_vrs_cntl |= S_028848_SAMPLE_ITER_COMBINER_MODE(V_028848_SC_VRS_COMB_MODE_OVERRIDE);
2442    }
2443 
2444    /* VERTEX_RATE_COMBINER_MODE controls the combiner mode between the
2445     * draw rate and the vertex rate.
2446     */
2447    if (cmd_buffer->state.mesh_shading) {
2448       pa_cl_vrs_cntl |= S_028848_VERTEX_RATE_COMBINER_MODE(V_028848_SC_VRS_COMB_MODE_PASSTHRU) |
2449                         S_028848_PRIMITIVE_RATE_COMBINER_MODE(pipeline_comb_mode);
2450    } else {
2451       pa_cl_vrs_cntl |= S_028848_VERTEX_RATE_COMBINER_MODE(pipeline_comb_mode) |
2452                         S_028848_PRIMITIVE_RATE_COMBINER_MODE(V_028848_SC_VRS_COMB_MODE_PASSTHRU);
2453    }
2454 
2455    /* HTILE_RATE_COMBINER_MODE controls the combiner mode between the primitive rate and the HTILE
2456     * rate.
2457     */
2458    pa_cl_vrs_cntl |= S_028848_HTILE_RATE_COMBINER_MODE(htile_comb_mode);
2459 
2460    radeon_set_context_reg(cmd_buffer->cs, R_028848_PA_CL_VRS_CNTL, pa_cl_vrs_cntl);
2461 }
2462 
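/* Return the all-ones primitive restart index matching the currently bound index type. */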
2463 static uint32_t
2464 radv_get_primitive_reset_index(const struct radv_cmd_buffer *cmd_buffer)
2465 {
2466    const uint32_t index_type = G_028A7C_INDEX_TYPE(cmd_buffer->state.index_type);
2467    switch (index_type) {
2468    case V_028A7C_VGT_INDEX_8:
2469       return 0xffu;
2470    case V_028A7C_VGT_INDEX_16:
2471       return 0xffffu;
2472    case V_028A7C_VGT_INDEX_32:
2473       return 0xffffffffu;
2474    default:
2475       unreachable("invalid index type");
2476    }
2477 }
2478 
2479 static void
2480 radv_emit_primitive_restart_enable(struct radv_cmd_buffer *cmd_buffer)
2481 {
2482    const enum amd_gfx_level gfx_level = cmd_buffer->device->physical_device->rad_info.gfx_level;
2483    const struct radv_dynamic_state *const d = &cmd_buffer->state.dynamic;
2484    struct radeon_cmdbuf *cs = cmd_buffer->cs;
2485    const bool en = d->vk.ia.primitive_restart_enable;
2486 
2487    if (gfx_level >= GFX11) {
2488       radeon_set_uconfig_reg(cs, R_03092C_GE_MULTI_PRIM_IB_RESET_EN,
2489                              S_03092C_RESET_EN(en) |
2490                                 /* This disables primitive restart for non-indexed draws.
2491                                  * By keeping this set, we don't have to unset RESET_EN
2492                                  * for non-indexed draws. */
2493                                 S_03092C_DISABLE_FOR_AUTO_INDEX(1));
2494    } else if (gfx_level >= GFX9) {
2495       radeon_set_uconfig_reg(cs, R_03092C_VGT_MULTI_PRIM_IB_RESET_EN, en);
2496    } else {
2497       radeon_set_context_reg(cs, R_028A94_VGT_MULTI_PRIM_IB_RESET_EN, en);
2498    }
2499 
2500    /* GFX6-7: All 32 bits are compared.
2501     * GFX8: Only index type bits are compared.
2502     * GFX9+: Default is same as GFX8, MATCH_ALL_BITS=1 selects GFX6-7 behavior
2503     */
2504    if (en && gfx_level <= GFX7) {
2505       const uint32_t primitive_reset_index = radv_get_primitive_reset_index(cmd_buffer);
2506 
2507       if (primitive_reset_index != cmd_buffer->state.last_primitive_reset_index) {
2508          radeon_set_context_reg(cs, R_02840C_VGT_MULTI_PRIM_IB_RESET_INDX, primitive_reset_index);
2509          cmd_buffer->state.last_primitive_reset_index = primitive_reset_index;
2510       }
2511    }
2512 }
2513 
2514 static void
2515 radv_emit_clipping(struct radv_cmd_buffer *cmd_buffer)
2516 {
2517    const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
2518    bool depth_clip_enable = radv_get_depth_clip_enable(cmd_buffer);
2519 
2520    radeon_set_context_reg(
2521       cmd_buffer->cs, R_028810_PA_CL_CLIP_CNTL,
2522       S_028810_DX_RASTERIZATION_KILL(d->vk.rs.rasterizer_discard_enable) |
2523          S_028810_ZCLIP_NEAR_DISABLE(!depth_clip_enable) | S_028810_ZCLIP_FAR_DISABLE(!depth_clip_enable) |
2524          S_028810_DX_CLIP_SPACE_DEF(!d->vk.vp.depth_clip_negative_one_to_one) | S_028810_DX_LINEAR_ATTR_CLIP_ENA(1));
2525 }
2526 
2527 static bool
2528 radv_is_mrt0_dual_src(struct radv_cmd_buffer *cmd_buffer)
2529 {
2530    const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
2531 
2532    if (!d->vk.cb.attachments[0].write_mask || !d->vk.cb.attachments[0].blend_enable)
2533       return false;
2534 
2535    return radv_can_enable_dual_src(&d->vk.cb.attachments[0]);
2536 }
2537 
2538 static void
2539 radv_emit_logic_op(struct radv_cmd_buffer *cmd_buffer)
2540 {
2541    const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
2542    unsigned cb_color_control = 0;
2543 
2544    if (d->vk.cb.logic_op_enable) {
2545       cb_color_control |= S_028808_ROP3(d->vk.cb.logic_op);
2546    } else {
2547       cb_color_control |= S_028808_ROP3(V_028808_ROP3_COPY);
2548    }
2549 
2550    if (cmd_buffer->device->physical_device->rad_info.has_rbplus) {
2551       /* RB+ doesn't work with dual source blending, logic op and CB_RESOLVE. */
2552       bool mrt0_is_dual_src = radv_is_mrt0_dual_src(cmd_buffer);
2553 
2554       cb_color_control |= S_028808_DISABLE_DUAL_QUAD(mrt0_is_dual_src || d->vk.cb.logic_op_enable ||
2555                                                      cmd_buffer->state.custom_blend_mode == V_028808_CB_RESOLVE);
2556    }
2557 
2558    if (cmd_buffer->state.custom_blend_mode) {
2559       cb_color_control |= S_028808_MODE(cmd_buffer->state.custom_blend_mode);
2560    } else {
2561       bool color_write_enabled = false;
2562 
2563       for (unsigned i = 0; i < MAX_RTS; i++) {
2564          if (d->vk.cb.attachments[i].write_mask) {
2565             color_write_enabled = true;
2566             break;
2567          }
2568       }
2569 
2570       if (color_write_enabled) {
2571          cb_color_control |= S_028808_MODE(V_028808_CB_NORMAL);
2572       } else {
2573          cb_color_control |= S_028808_MODE(V_028808_CB_DISABLE);
2574       }
2575    }
2576 
2577    radeon_set_context_reg(cmd_buffer->cs, R_028808_CB_COLOR_CONTROL, cb_color_control);
2578 }
2579 
2580 static void
2581 radv_emit_color_write(struct radv_cmd_buffer *cmd_buffer)
2582 {
2583    const struct radv_device *device = cmd_buffer->device;
2584    const struct radv_binning_settings *settings = &device->physical_device->binning_settings;
2585    const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
2586    uint32_t color_write_enable = 0, color_write_mask = 0;
2587 
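   /* Expand each per-attachment color-write-enable bit to a 4-bit channel mask so it
    * can be ANDed with the per-attachment write masks for CB_TARGET_MASK below.
    */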
2588    u_foreach_bit (i, d->vk.cb.color_write_enables) {
2589       color_write_enable |= 0xfu << (i * 4);
2590    }
2591 
2592    for (unsigned i = 0; i < MAX_RTS; i++) {
2593       color_write_mask |= d->vk.cb.attachments[i].write_mask << (4 * i);
2594    }
2595 
2596    if (device->pbb_allowed && settings->context_states_per_bin > 1) {
2597       /* Flush DFSM on CB_TARGET_MASK changes. */
2598       radeon_emit(cmd_buffer->cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
2599       radeon_emit(cmd_buffer->cs, EVENT_TYPE(V_028A90_BREAK_BATCH) | EVENT_INDEX(0));
2600    }
2601 
2602    radeon_set_context_reg(cmd_buffer->cs, R_028238_CB_TARGET_MASK, color_write_mask & color_write_enable);
2603 }
2604 
2605 static void
2606 radv_emit_patch_control_points(struct radv_cmd_buffer *cmd_buffer)
2607 {
2608    const struct radv_physical_device *pdevice = cmd_buffer->device->physical_device;
2609    const struct radv_shader *vs = radv_get_shader(cmd_buffer->state.shaders, MESA_SHADER_VERTEX);
2610    const struct radv_shader *tcs = cmd_buffer->state.shaders[MESA_SHADER_TESS_CTRL];
2611    const struct radv_shader *tes = radv_get_shader(cmd_buffer->state.shaders, MESA_SHADER_TESS_EVAL);
2612    const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
2613    unsigned ls_hs_config, base_reg;
2614 
2615    /* Compute tessellation info that depends on the number of patch control points when this state
2616     * is dynamic.
2617     */
2618    if (cmd_buffer->state.uses_dynamic_patch_control_points) {
2619       /* Compute the number of patches. */
2620       cmd_buffer->state.tess_num_patches = get_tcs_num_patches(
2621          d->vk.ts.patch_control_points, tcs->info.tcs.tcs_vertices_out, vs->info.vs.num_linked_outputs,
2622          tcs->info.tcs.num_linked_outputs, tcs->info.tcs.num_linked_patch_outputs,
2623          pdevice->hs.tess_offchip_block_dw_size, pdevice->rad_info.gfx_level, pdevice->rad_info.family);
2624 
2625       /* Compute the LDS size. */
2626       cmd_buffer->state.tess_lds_size = calculate_tess_lds_size(
2627          pdevice->rad_info.gfx_level, d->vk.ts.patch_control_points, tcs->info.tcs.tcs_vertices_out,
2628          vs->info.vs.num_linked_outputs, cmd_buffer->state.tess_num_patches, tcs->info.tcs.num_linked_outputs,
2629          tcs->info.tcs.num_linked_patch_outputs);
2630    }
2631 
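   /* VGT_LS_HS_CONFIG describes the patch layout: the number of patches per HS
    * threadgroup and the input/output control points per patch.
    */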
2632    ls_hs_config = S_028B58_NUM_PATCHES(cmd_buffer->state.tess_num_patches) |
2633                   S_028B58_HS_NUM_INPUT_CP(d->vk.ts.patch_control_points) |
2634                   S_028B58_HS_NUM_OUTPUT_CP(tcs->info.tcs.tcs_vertices_out);
2635 
2636    if (pdevice->rad_info.gfx_level >= GFX7) {
2637       radeon_set_context_reg_idx(cmd_buffer->cs, R_028B58_VGT_LS_HS_CONFIG, 2, ls_hs_config);
2638    } else {
2639       radeon_set_context_reg(cmd_buffer->cs, R_028B58_VGT_LS_HS_CONFIG, ls_hs_config);
2640    }
2641 
2642    if (pdevice->rad_info.gfx_level >= GFX9) {
2643       unsigned hs_rsrc2;
2644 
2645       if (tcs->info.merged_shader_compiled_separately) {
2646          radv_shader_combine_cfg_vs_tcs(cmd_buffer->state.shaders[MESA_SHADER_VERTEX], tcs, NULL, &hs_rsrc2);
2647       } else {
2648          hs_rsrc2 = tcs->config.rsrc2;
2649       }
2650 
2651       if (pdevice->rad_info.gfx_level >= GFX10) {
2652          hs_rsrc2 |= S_00B42C_LDS_SIZE_GFX10(cmd_buffer->state.tess_lds_size);
2653       } else {
2654          hs_rsrc2 |= S_00B42C_LDS_SIZE_GFX9(cmd_buffer->state.tess_lds_size);
2655       }
2656 
2657       radeon_set_sh_reg(cmd_buffer->cs, R_00B42C_SPI_SHADER_PGM_RSRC2_HS, hs_rsrc2);
2658    } else {
2659       unsigned ls_rsrc2 = vs->config.rsrc2 | S_00B52C_LDS_SIZE(cmd_buffer->state.tess_lds_size);
2660 
2661       radeon_set_sh_reg(cmd_buffer->cs, R_00B52C_SPI_SHADER_PGM_RSRC2_LS, ls_rsrc2);
2662    }
2663 
2664    /* Emit user SGPRs for dynamic patch control points. */
2665    const struct radv_userdata_info *offchip = radv_get_user_sgpr(tcs, AC_UD_TCS_OFFCHIP_LAYOUT);
2666    if (offchip->sgpr_idx == -1)
2667       return;
2668    assert(offchip->num_sgprs == 1);
2669 
2670    unsigned tcs_offchip_layout =
2671       SET_SGPR_FIELD(TCS_OFFCHIP_LAYOUT_PATCH_CONTROL_POINTS, d->vk.ts.patch_control_points) |
2672       SET_SGPR_FIELD(TCS_OFFCHIP_LAYOUT_NUM_PATCHES, cmd_buffer->state.tess_num_patches) |
2673       SET_SGPR_FIELD(TCS_OFFCHIP_LAYOUT_LSHS_VERTEX_STRIDE,
2674                      get_tcs_input_vertex_stride(vs->info.vs.num_linked_outputs) / 4);
2675 
2676    base_reg = tcs->info.user_data_0;
2677    radeon_set_sh_reg(cmd_buffer->cs, base_reg + offchip->sgpr_idx * 4, tcs_offchip_layout);
2678 
2679    const struct radv_userdata_info *num_patches = radv_get_user_sgpr(tes, AC_UD_TES_STATE);
2680    assert(num_patches->sgpr_idx != -1 && num_patches->num_sgprs == 1);
2681 
2682    const unsigned tes_state = SET_SGPR_FIELD(TES_STATE_NUM_PATCHES, cmd_buffer->state.tess_num_patches) |
2683                               SET_SGPR_FIELD(TES_STATE_TCS_VERTICES_OUT, tcs->info.tcs.tcs_vertices_out) |
2684                               SET_SGPR_FIELD(TES_STATE_NUM_TCS_OUTPUTS, tcs->info.tcs.num_linked_outputs);
2685 
2686    base_reg = tes->info.user_data_0;
2687    radeon_set_sh_reg(cmd_buffer->cs, base_reg + num_patches->sgpr_idx * 4, tes_state);
2688 }
2689 
2690 static void
2691 radv_emit_conservative_rast_mode(struct radv_cmd_buffer *cmd_buffer)
2692 {
2693    const struct radv_physical_device *pdevice = cmd_buffer->device->physical_device;
2694    const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
2695 
2696    if (pdevice->rad_info.gfx_level >= GFX9) {
2697       uint32_t pa_sc_conservative_rast;
2698 
2699       if (d->vk.rs.conservative_mode != VK_CONSERVATIVE_RASTERIZATION_MODE_DISABLED_EXT) {
2700          const struct radv_shader *ps = cmd_buffer->state.shaders[MESA_SHADER_FRAGMENT];
2701          const bool uses_inner_coverage = ps && ps->info.ps.reads_fully_covered;
2702 
2703          pa_sc_conservative_rast =
2704             S_028C4C_PREZ_AA_MASK_ENABLE(1) | S_028C4C_POSTZ_AA_MASK_ENABLE(1) | S_028C4C_CENTROID_SAMPLE_OVERRIDE(1);
2705 
2706          /* Inner coverage requires underestimate conservative rasterization. */
2707          if (d->vk.rs.conservative_mode == VK_CONSERVATIVE_RASTERIZATION_MODE_OVERESTIMATE_EXT &&
2708              !uses_inner_coverage) {
2709             pa_sc_conservative_rast |= S_028C4C_OVER_RAST_ENABLE(1) | S_028C4C_UNDER_RAST_SAMPLE_SELECT(1) |
2710                                        S_028C4C_PBB_UNCERTAINTY_REGION_ENABLE(1);
2711          } else {
2712             pa_sc_conservative_rast |= S_028C4C_OVER_RAST_SAMPLE_SELECT(1) | S_028C4C_UNDER_RAST_ENABLE(1);
2713          }
2714       } else {
2715          pa_sc_conservative_rast = S_028C4C_NULL_SQUAD_AA_MASK_ENABLE(1);
2716       }
2717 
2718       radeon_set_context_reg(cmd_buffer->cs, R_028C4C_PA_SC_CONSERVATIVE_RASTERIZATION_CNTL, pa_sc_conservative_rast);
2719    }
2720 }
2721 
2722 static void
2723 radv_emit_depth_clamp_enable(struct radv_cmd_buffer *cmd_buffer)
2724 {
2725    enum radv_depth_clamp_mode mode = radv_get_depth_clamp_mode(cmd_buffer);
2726 
2727    radeon_set_context_reg(cmd_buffer->cs, R_02800C_DB_RENDER_OVERRIDE,
2728                           S_02800C_FORCE_HIS_ENABLE0(V_02800C_FORCE_DISABLE) |
2729                              S_02800C_FORCE_HIS_ENABLE1(V_02800C_FORCE_DISABLE) |
2730                              S_02800C_DISABLE_VIEWPORT_CLAMP(mode == RADV_DEPTH_CLAMP_MODE_DISABLED));
2731 }
2732 
2733 static void
2734 radv_emit_rasterization_samples(struct radv_cmd_buffer *cmd_buffer)
2735 {
2736    const struct radv_physical_device *pdevice = cmd_buffer->device->physical_device;
2737    unsigned rasterization_samples = radv_get_rasterization_samples(cmd_buffer);
2738    unsigned ps_iter_samples = radv_get_ps_iter_samples(cmd_buffer);
2739    const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
2740    unsigned spi_baryc_cntl = S_0286E0_FRONT_FACE_ALL_BITS(1);
2741    unsigned pa_sc_mode_cntl_1;
2742 
2743    pa_sc_mode_cntl_1 =
2744       S_028A4C_WALK_FENCE_ENABLE(1) | // TODO linear dst fixes
2745       S_028A4C_WALK_FENCE_SIZE(pdevice->rad_info.num_tile_pipes == 2 ? 2 : 3) |
2746       S_028A4C_OUT_OF_ORDER_PRIMITIVE_ENABLE(cmd_buffer->state.uses_out_of_order_rast) |
2747       S_028A4C_OUT_OF_ORDER_WATER_MARK(0x7) |
2748       /* always 1: */
2749       S_028A4C_SUPERTILE_WALK_ORDER_ENABLE(1) | S_028A4C_TILE_WALK_ORDER_ENABLE(1) |
2750       S_028A4C_MULTI_SHADER_ENGINE_PRIM_DISCARD_ENABLE(1) | S_028A4C_FORCE_EOV_CNTDWN_ENABLE(1) |
2751       S_028A4C_FORCE_EOV_REZ_ENABLE(1) |
2752       /* This should only be set when VRS surfaces aren't enabled on GFX11, otherwise the GPU might
2753        * hang.
2754        */
2755       S_028A4C_WALK_ALIGN8_PRIM_FITS_ST(pdevice->rad_info.gfx_level < GFX11 || !cmd_buffer->state.uses_vrs_attachment);
2756 
2757    if (!d->sample_location.count)
2758       radv_emit_default_sample_locations(cmd_buffer->cs, rasterization_samples);
2759 
2760    if (ps_iter_samples > 1) {
2761       spi_baryc_cntl |= S_0286E0_POS_FLOAT_LOCATION(2);
2762       pa_sc_mode_cntl_1 |= S_028A4C_PS_ITER_SAMPLE(1);
2763    }
2764 
2765    if (radv_should_force_vrs1x1(cmd_buffer)) {
2766       /* Make sure sample shading is enabled even if only MSAA1x is used because the SAMPLE_ITER
2767        * combiner is in passthrough mode if PS_ITER_SAMPLE is 0, and it uses the per-draw rate. The
2768        * default VRS rate when sample shading is enabled is 1x1.
2769        */
2770       if (!G_028A4C_PS_ITER_SAMPLE(pa_sc_mode_cntl_1))
2771          pa_sc_mode_cntl_1 |= S_028A4C_PS_ITER_SAMPLE(1);
2772    }
2773 
2774    radeon_set_context_reg(cmd_buffer->cs, R_0286E0_SPI_BARYC_CNTL, spi_baryc_cntl);
2775    radeon_set_context_reg(cmd_buffer->cs, R_028A4C_PA_SC_MODE_CNTL_1, pa_sc_mode_cntl_1);
2776 }
2777 
2778 static void
2779 radv_emit_fb_color_state(struct radv_cmd_buffer *cmd_buffer, int index, struct radv_color_buffer_info *cb,
2780                          struct radv_image_view *iview, VkImageLayout layout)
2781 {
2782    bool is_vi = cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX8;
2783    uint32_t cb_fdcc_control = cb->cb_dcc_control;
2784    uint32_t cb_color_info = cb->cb_color_info;
2785    struct radv_image *image = iview->image;
2786 
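   /* If the layout doesn't allow DCC compression, clear the DCC enable bit for this
    * emit (CB_COLOR*_FDCC_CONTROL on GFX11, CB_COLOR*_INFO on older chips).
    */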
2787    if (!radv_layout_dcc_compressed(cmd_buffer->device, image, iview->vk.base_mip_level, layout,
2788                                    radv_image_queue_family_mask(image, cmd_buffer->qf, cmd_buffer->qf))) {
2789       if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX11) {
2790          cb_fdcc_control &= C_028C78_FDCC_ENABLE;
2791       } else {
2792          cb_color_info &= C_028C70_DCC_ENABLE;
2793       }
2794    }
2795 
2796    const enum radv_fmask_compression fmask_comp = radv_layout_fmask_compression(
2797       cmd_buffer->device, image, layout, radv_image_queue_family_mask(image, cmd_buffer->qf, cmd_buffer->qf));
2798    if (fmask_comp == RADV_FMASK_COMPRESSION_NONE) {
2799       cb_color_info &= C_028C70_COMPRESSION;
2800    }
2801 
2802    if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX11) {
2803       radeon_set_context_reg_seq(cmd_buffer->cs, R_028C6C_CB_COLOR0_VIEW + index * 0x3c, 4);
2804       radeon_emit(cmd_buffer->cs, cb->cb_color_view);   /* CB_COLOR0_VIEW */
2805       radeon_emit(cmd_buffer->cs, cb->cb_color_info);   /* CB_COLOR0_INFO */
2806       radeon_emit(cmd_buffer->cs, cb->cb_color_attrib); /* CB_COLOR0_ATTRIB */
2807       radeon_emit(cmd_buffer->cs, cb_fdcc_control);     /* CB_COLOR0_FDCC_CONTROL */
2808 
2809       radeon_set_context_reg(cmd_buffer->cs, R_028C60_CB_COLOR0_BASE + index * 0x3c, cb->cb_color_base);
2810       radeon_set_context_reg(cmd_buffer->cs, R_028E40_CB_COLOR0_BASE_EXT + index * 4, cb->cb_color_base >> 32);
2811       radeon_set_context_reg(cmd_buffer->cs, R_028C94_CB_COLOR0_DCC_BASE + index * 0x3c, cb->cb_dcc_base);
2812       radeon_set_context_reg(cmd_buffer->cs, R_028EA0_CB_COLOR0_DCC_BASE_EXT + index * 4, cb->cb_dcc_base >> 32);
2813       radeon_set_context_reg(cmd_buffer->cs, R_028EC0_CB_COLOR0_ATTRIB2 + index * 4, cb->cb_color_attrib2);
2814       radeon_set_context_reg(cmd_buffer->cs, R_028EE0_CB_COLOR0_ATTRIB3 + index * 4, cb->cb_color_attrib3);
2815    } else if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX10) {
2816       radeon_set_context_reg_seq(cmd_buffer->cs, R_028C60_CB_COLOR0_BASE + index * 0x3c, 11);
2817       radeon_emit(cmd_buffer->cs, cb->cb_color_base);
2818       radeon_emit(cmd_buffer->cs, 0);
2819       radeon_emit(cmd_buffer->cs, 0);
2820       radeon_emit(cmd_buffer->cs, cb->cb_color_view);
2821       radeon_emit(cmd_buffer->cs, cb_color_info);
2822       radeon_emit(cmd_buffer->cs, cb->cb_color_attrib);
2823       radeon_emit(cmd_buffer->cs, cb->cb_dcc_control);
2824       radeon_emit(cmd_buffer->cs, cb->cb_color_cmask);
2825       radeon_emit(cmd_buffer->cs, 0);
2826       radeon_emit(cmd_buffer->cs, cb->cb_color_fmask);
2827       radeon_emit(cmd_buffer->cs, 0);
2828 
2829       radeon_set_context_reg(cmd_buffer->cs, R_028C94_CB_COLOR0_DCC_BASE + index * 0x3c, cb->cb_dcc_base);
2830 
2831       radeon_set_context_reg(cmd_buffer->cs, R_028E40_CB_COLOR0_BASE_EXT + index * 4, cb->cb_color_base >> 32);
2832       radeon_set_context_reg(cmd_buffer->cs, R_028E60_CB_COLOR0_CMASK_BASE_EXT + index * 4, cb->cb_color_cmask >> 32);
2833       radeon_set_context_reg(cmd_buffer->cs, R_028E80_CB_COLOR0_FMASK_BASE_EXT + index * 4, cb->cb_color_fmask >> 32);
2834       radeon_set_context_reg(cmd_buffer->cs, R_028EA0_CB_COLOR0_DCC_BASE_EXT + index * 4, cb->cb_dcc_base >> 32);
2835       radeon_set_context_reg(cmd_buffer->cs, R_028EC0_CB_COLOR0_ATTRIB2 + index * 4, cb->cb_color_attrib2);
2836       radeon_set_context_reg(cmd_buffer->cs, R_028EE0_CB_COLOR0_ATTRIB3 + index * 4, cb->cb_color_attrib3);
2837    } else if (cmd_buffer->device->physical_device->rad_info.gfx_level == GFX9) {
2838       radeon_set_context_reg_seq(cmd_buffer->cs, R_028C60_CB_COLOR0_BASE + index * 0x3c, 11);
2839       radeon_emit(cmd_buffer->cs, cb->cb_color_base);
2840       radeon_emit(cmd_buffer->cs, S_028C64_BASE_256B(cb->cb_color_base >> 32));
2841       radeon_emit(cmd_buffer->cs, cb->cb_color_attrib2);
2842       radeon_emit(cmd_buffer->cs, cb->cb_color_view);
2843       radeon_emit(cmd_buffer->cs, cb_color_info);
2844       radeon_emit(cmd_buffer->cs, cb->cb_color_attrib);
2845       radeon_emit(cmd_buffer->cs, cb->cb_dcc_control);
2846       radeon_emit(cmd_buffer->cs, cb->cb_color_cmask);
2847       radeon_emit(cmd_buffer->cs, S_028C80_BASE_256B(cb->cb_color_cmask >> 32));
2848       radeon_emit(cmd_buffer->cs, cb->cb_color_fmask);
2849       radeon_emit(cmd_buffer->cs, S_028C88_BASE_256B(cb->cb_color_fmask >> 32));
2850 
2851       radeon_set_context_reg_seq(cmd_buffer->cs, R_028C94_CB_COLOR0_DCC_BASE + index * 0x3c, 2);
2852       radeon_emit(cmd_buffer->cs, cb->cb_dcc_base);
2853       radeon_emit(cmd_buffer->cs, S_028C98_BASE_256B(cb->cb_dcc_base >> 32));
2854 
2855       radeon_set_context_reg(cmd_buffer->cs, R_0287A0_CB_MRT0_EPITCH + index * 4, cb->cb_mrt_epitch);
2856    } else {
2857       radeon_set_context_reg_seq(cmd_buffer->cs, R_028C60_CB_COLOR0_BASE + index * 0x3c, 11);
2858       radeon_emit(cmd_buffer->cs, cb->cb_color_base);
2859       radeon_emit(cmd_buffer->cs, cb->cb_color_pitch);
2860       radeon_emit(cmd_buffer->cs, cb->cb_color_slice);
2861       radeon_emit(cmd_buffer->cs, cb->cb_color_view);
2862       radeon_emit(cmd_buffer->cs, cb_color_info);
2863       radeon_emit(cmd_buffer->cs, cb->cb_color_attrib);
2864       radeon_emit(cmd_buffer->cs, cb->cb_dcc_control);
2865       radeon_emit(cmd_buffer->cs, cb->cb_color_cmask);
2866       radeon_emit(cmd_buffer->cs, cb->cb_color_cmask_slice);
2867       radeon_emit(cmd_buffer->cs, cb->cb_color_fmask);
2868       radeon_emit(cmd_buffer->cs, cb->cb_color_fmask_slice);
2869 
2870       if (is_vi) { /* DCC BASE */
2871          radeon_set_context_reg(cmd_buffer->cs, R_028C94_CB_COLOR0_DCC_BASE + index * 0x3c, cb->cb_dcc_base);
2872       }
2873    }
2874 
2875    if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX11 ? G_028C78_FDCC_ENABLE(cb_fdcc_control)
2876                                                                         : G_028C70_DCC_ENABLE(cb_color_info)) {
2877       /* Drawing with DCC enabled also compresses colorbuffers. */
2878       VkImageSubresourceRange range = {
2879          .aspectMask = iview->vk.aspects,
2880          .baseMipLevel = iview->vk.base_mip_level,
2881          .levelCount = iview->vk.level_count,
2882          .baseArrayLayer = iview->vk.base_array_layer,
2883          .layerCount = iview->vk.layer_count,
2884       };
2885 
2886       radv_update_dcc_metadata(cmd_buffer, image, &range, true);
2887    }
2888 }
2889 
2890 static void
2891 radv_update_zrange_precision(struct radv_cmd_buffer *cmd_buffer, struct radv_ds_buffer_info *ds,
2892                              const struct radv_image_view *iview, bool requires_cond_exec)
2893 {
2894    const struct radv_image *image = iview->image;
2895    uint32_t db_z_info = ds->db_z_info;
2896    uint32_t db_z_info_reg;
2897 
2898    if (!cmd_buffer->device->physical_device->rad_info.has_tc_compat_zrange_bug || !radv_image_is_tc_compat_htile(image))
2899       return;
2900 
2901    db_z_info &= C_028040_ZRANGE_PRECISION;
2902 
2903    if (cmd_buffer->device->physical_device->rad_info.gfx_level == GFX9) {
2904       db_z_info_reg = R_028038_DB_Z_INFO;
2905    } else {
2906       db_z_info_reg = R_028040_DB_Z_INFO;
2907    }
2908 
2909    /* When we don't know the last fast clear value we need to emit a
2910     * conditional packet that will eventually skip the following
2911     * SET_CONTEXT_REG packet.
2912     */
2913    if (requires_cond_exec) {
2914       uint64_t va = radv_get_tc_compat_zrange_va(image, iview->vk.base_mip_level);
2915 
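      /* COND_EXEC skips the following 3 dwords (the SET_CONTEXT_REG packet emitted
       * below) when the dword at 'va' is zero.
       */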
2916       radeon_emit(cmd_buffer->cs, PKT3(PKT3_COND_EXEC, 3, 0));
2917       radeon_emit(cmd_buffer->cs, va);
2918       radeon_emit(cmd_buffer->cs, va >> 32);
2919       radeon_emit(cmd_buffer->cs, 0);
2920       radeon_emit(cmd_buffer->cs, 3); /* SET_CONTEXT_REG size */
2921    }
2922 
2923    radeon_set_context_reg(cmd_buffer->cs, db_z_info_reg, db_z_info);
2924 }
2925 
2926 static struct radv_image *
2927 radv_cmd_buffer_get_vrs_image(struct radv_cmd_buffer *cmd_buffer)
2928 {
2929    struct radv_device *device = cmd_buffer->device;
2930 
2931    if (!device->vrs.image) {
2932       VkResult result;
2933 
2934       /* The global VRS state is initialized on-demand to avoid wasting VRAM. */
2935       result = radv_device_init_vrs_state(device);
2936       if (result != VK_SUCCESS) {
2937          vk_command_buffer_set_error(&cmd_buffer->vk, result);
2938          return NULL;
2939       }
2940    }
2941 
2942    return device->vrs.image;
2943 }
2944 
2945 static void
2946 radv_emit_fb_ds_state(struct radv_cmd_buffer *cmd_buffer, struct radv_ds_buffer_info *ds, struct radv_image_view *iview,
2947                       bool depth_compressed, bool stencil_compressed)
2948 {
2949    uint64_t db_htile_data_base = ds->db_htile_data_base;
2950    uint32_t db_htile_surface = ds->db_htile_surface;
2951    uint32_t db_render_control = ds->db_render_control | cmd_buffer->state.db_render_control;
2952    uint32_t db_z_info = ds->db_z_info;
2953 
2954    if (!depth_compressed)
2955       db_render_control |= S_028000_DEPTH_COMPRESS_DISABLE(1);
2956    if (!stencil_compressed)
2957       db_render_control |= S_028000_STENCIL_COMPRESS_DISABLE(1);
2958 
2959    if (cmd_buffer->device->physical_device->rad_info.gfx_level == GFX10_3) {
2960       if (!cmd_buffer->state.render.vrs_att.iview) {
2961          db_htile_surface &= C_028ABC_VRS_HTILE_ENCODING;
2962       } else {
2963          /* On GFX10.3, when a subpass uses VRS attachment but HTILE can't be enabled, we fallback to
2964           * our internal HTILE buffer.
2965           */
2966          if (!radv_htile_enabled(iview->image, iview->vk.base_mip_level) && radv_cmd_buffer_get_vrs_image(cmd_buffer)) {
2967             struct radv_buffer *htile_buffer = cmd_buffer->device->vrs.buffer;
2968 
2969             assert(!G_028038_TILE_SURFACE_ENABLE(db_z_info) && !db_htile_data_base && !db_htile_surface);
2970             db_z_info |= S_028038_TILE_SURFACE_ENABLE(1);
2971             db_htile_data_base = radv_buffer_get_va(htile_buffer->bo) >> 8;
2972             db_htile_surface = S_028ABC_FULL_CACHE(1) | S_028ABC_PIPE_ALIGNED(1) |
2973                                S_028ABC_VRS_HTILE_ENCODING(V_028ABC_VRS_HTILE_4BIT_ENCODING);
2974          }
2975       }
2976    }
2977 
2978    radeon_set_context_reg(cmd_buffer->cs, R_028000_DB_RENDER_CONTROL, db_render_control);
2979    radeon_set_context_reg(cmd_buffer->cs, R_028008_DB_DEPTH_VIEW, ds->db_depth_view);
2980    radeon_set_context_reg(cmd_buffer->cs, R_028010_DB_RENDER_OVERRIDE2, ds->db_render_override2);
2981    radeon_set_context_reg(cmd_buffer->cs, R_028ABC_DB_HTILE_SURFACE, db_htile_surface);
2982 
2983    if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX10) {
2984       radeon_set_context_reg(cmd_buffer->cs, R_028014_DB_HTILE_DATA_BASE, db_htile_data_base);
2985       radeon_set_context_reg(cmd_buffer->cs, R_02801C_DB_DEPTH_SIZE_XY, ds->db_depth_size);
2986 
2987       if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX11) {
2988          radeon_set_context_reg_seq(cmd_buffer->cs, R_028040_DB_Z_INFO, 6);
2989       } else {
2990          radeon_set_context_reg_seq(cmd_buffer->cs, R_02803C_DB_DEPTH_INFO, 7);
2991          radeon_emit(cmd_buffer->cs, S_02803C_RESOURCE_LEVEL(1));
2992       }
2993       radeon_emit(cmd_buffer->cs, db_z_info);
2994       radeon_emit(cmd_buffer->cs, ds->db_stencil_info);
2995       radeon_emit(cmd_buffer->cs, ds->db_z_read_base);
2996       radeon_emit(cmd_buffer->cs, ds->db_stencil_read_base);
2997       radeon_emit(cmd_buffer->cs, ds->db_z_read_base);
2998       radeon_emit(cmd_buffer->cs, ds->db_stencil_read_base);
2999 
3000       radeon_set_context_reg_seq(cmd_buffer->cs, R_028068_DB_Z_READ_BASE_HI, 5);
3001       radeon_emit(cmd_buffer->cs, ds->db_z_read_base >> 32);
3002       radeon_emit(cmd_buffer->cs, ds->db_stencil_read_base >> 32);
3003       radeon_emit(cmd_buffer->cs, ds->db_z_read_base >> 32);
3004       radeon_emit(cmd_buffer->cs, ds->db_stencil_read_base >> 32);
3005       radeon_emit(cmd_buffer->cs, db_htile_data_base >> 32);
3006    } else if (cmd_buffer->device->physical_device->rad_info.gfx_level == GFX9) {
3007       radeon_set_context_reg_seq(cmd_buffer->cs, R_028014_DB_HTILE_DATA_BASE, 3);
3008       radeon_emit(cmd_buffer->cs, db_htile_data_base);
3009       radeon_emit(cmd_buffer->cs, S_028018_BASE_HI(db_htile_data_base >> 32));
3010       radeon_emit(cmd_buffer->cs, ds->db_depth_size);
3011 
3012       radeon_set_context_reg_seq(cmd_buffer->cs, R_028038_DB_Z_INFO, 10);
3013       radeon_emit(cmd_buffer->cs, db_z_info);                                         /* DB_Z_INFO */
3014       radeon_emit(cmd_buffer->cs, ds->db_stencil_info);                               /* DB_STENCIL_INFO */
3015       radeon_emit(cmd_buffer->cs, ds->db_z_read_base);                                /* DB_Z_READ_BASE */
3016       radeon_emit(cmd_buffer->cs, S_028044_BASE_HI(ds->db_z_read_base >> 32));        /* DB_Z_READ_BASE_HI */
3017       radeon_emit(cmd_buffer->cs, ds->db_stencil_read_base);                          /* DB_STENCIL_READ_BASE */
3018       radeon_emit(cmd_buffer->cs, S_02804C_BASE_HI(ds->db_stencil_read_base >> 32));  /* DB_STENCIL_READ_BASE_HI */
3019       radeon_emit(cmd_buffer->cs, ds->db_z_write_base);                               /* DB_Z_WRITE_BASE */
3020       radeon_emit(cmd_buffer->cs, S_028054_BASE_HI(ds->db_z_write_base >> 32));       /* DB_Z_WRITE_BASE_HI */
3021       radeon_emit(cmd_buffer->cs, ds->db_stencil_write_base);                         /* DB_STENCIL_WRITE_BASE */
3022       radeon_emit(cmd_buffer->cs, S_02805C_BASE_HI(ds->db_stencil_write_base >> 32)); /* DB_STENCIL_WRITE_BASE_HI */
3023 
3024       radeon_set_context_reg_seq(cmd_buffer->cs, R_028068_DB_Z_INFO2, 2);
3025       radeon_emit(cmd_buffer->cs, ds->db_z_info2);
3026       radeon_emit(cmd_buffer->cs, ds->db_stencil_info2);
3027    } else {
3028       radeon_set_context_reg(cmd_buffer->cs, R_028014_DB_HTILE_DATA_BASE, db_htile_data_base);
3029 
3030       radeon_set_context_reg_seq(cmd_buffer->cs, R_02803C_DB_DEPTH_INFO, 9);
3031       radeon_emit(cmd_buffer->cs, ds->db_depth_info);         /* R_02803C_DB_DEPTH_INFO */
3032       radeon_emit(cmd_buffer->cs, db_z_info);                 /* R_028040_DB_Z_INFO */
3033       radeon_emit(cmd_buffer->cs, ds->db_stencil_info);       /* R_028044_DB_STENCIL_INFO */
3034       radeon_emit(cmd_buffer->cs, ds->db_z_read_base);        /* R_028048_DB_Z_READ_BASE */
3035       radeon_emit(cmd_buffer->cs, ds->db_stencil_read_base);  /* R_02804C_DB_STENCIL_READ_BASE */
3036       radeon_emit(cmd_buffer->cs, ds->db_z_write_base);       /* R_028050_DB_Z_WRITE_BASE */
3037       radeon_emit(cmd_buffer->cs, ds->db_stencil_write_base); /* R_028054_DB_STENCIL_WRITE_BASE */
3038       radeon_emit(cmd_buffer->cs, ds->db_depth_size);         /* R_028058_DB_DEPTH_SIZE */
3039       radeon_emit(cmd_buffer->cs, ds->db_depth_slice);        /* R_02805C_DB_DEPTH_SLICE */
3040    }
3041 
3042    /* Update the ZRANGE_PRECISION value for the TC-compat bug. */
3043    radv_update_zrange_precision(cmd_buffer, ds, iview, true);
3044 }
3045 
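/* Program the DB registers for rendering without a bound depth/stencil attachment. */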
3046 static void
3047 radv_emit_null_ds_state(struct radv_cmd_buffer *cmd_buffer)
3048 {
3049    const enum amd_gfx_level gfx_level = cmd_buffer->device->physical_device->rad_info.gfx_level;
3050    unsigned db_render_control = 0;
3051    unsigned num_samples = 0;
3052 
3053    /* On GFX11, DB_Z_INFO.NUM_SAMPLES should always match MSAA_EXPOSED_SAMPLES. It affects VRS,
3054     * occlusion queries and Primitive Ordered Pixel Shading if depth and stencil are not bound.
3055     */
3056    if (gfx_level == GFX11) {
3057       num_samples = util_logbase2(radv_get_rasterization_samples(cmd_buffer));
3058       radv_gfx11_set_db_render_control(cmd_buffer->device, 1, &db_render_control);
3059    }
3060 
3061    if (gfx_level == GFX9) {
3062       radeon_set_context_reg_seq(cmd_buffer->cs, R_028038_DB_Z_INFO, 2);
3063    } else {
3064       radeon_set_context_reg_seq(cmd_buffer->cs, R_028040_DB_Z_INFO, 2);
3065    }
3066 
3067    radeon_emit(cmd_buffer->cs, S_028040_FORMAT(V_028040_Z_INVALID) | S_028040_NUM_SAMPLES(num_samples));
3068    radeon_emit(cmd_buffer->cs, S_028044_FORMAT(V_028044_STENCIL_INVALID));
3069 
3070    radeon_set_context_reg(cmd_buffer->cs, R_028000_DB_RENDER_CONTROL, db_render_control);
3071    radeon_set_context_reg(cmd_buffer->cs, R_028010_DB_RENDER_OVERRIDE2,
3072                           S_028010_CENTROID_COMPUTATION_MODE(gfx_level >= GFX10_3));
3073 }
3074 /**
3075  * Update the fast clear depth/stencil values if the image is bound as a
3076  * depth/stencil buffer.
3077  */
3078 static void
3079 radv_update_bound_fast_clear_ds(struct radv_cmd_buffer *cmd_buffer, const struct radv_image_view *iview,
3080                                 VkClearDepthStencilValue ds_clear_value, VkImageAspectFlags aspects)
3081 {
3082    const struct radv_image *image = iview->image;
3083    struct radeon_cmdbuf *cs = cmd_buffer->cs;
3084 
3085    if (cmd_buffer->state.render.ds_att.iview == NULL || cmd_buffer->state.render.ds_att.iview->image != image)
3086       return;
3087 
3088    if (aspects == (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT)) {
3089       radeon_set_context_reg_seq(cs, R_028028_DB_STENCIL_CLEAR, 2);
3090       radeon_emit(cs, ds_clear_value.stencil);
3091       radeon_emit(cs, fui(ds_clear_value.depth));
3092    } else if (aspects == VK_IMAGE_ASPECT_DEPTH_BIT) {
3093       radeon_set_context_reg(cs, R_02802C_DB_DEPTH_CLEAR, fui(ds_clear_value.depth));
3094    } else {
3095       assert(aspects == VK_IMAGE_ASPECT_STENCIL_BIT);
3096       radeon_set_context_reg(cs, R_028028_DB_STENCIL_CLEAR, ds_clear_value.stencil);
3097    }
3098 
3099    /* Update the ZRANGE_PRECISION value for the TC-compat bug. This is
3100     * only needed when clearing Z to 0.0.
3101     */
3102    if ((aspects & VK_IMAGE_ASPECT_DEPTH_BIT) && ds_clear_value.depth == 0.0) {
3103       radv_update_zrange_precision(cmd_buffer, &cmd_buffer->state.render.ds_att.ds, iview, false);
3104    }
3105 
3106    cmd_buffer->state.context_roll_without_scissor_emitted = true;
3107 }
3108 
3109 /**
3110  * Set the clear depth/stencil values to the image's metadata.
3111  */
3112 static void
3113 radv_set_ds_clear_metadata(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image,
3114                            const VkImageSubresourceRange *range, VkClearDepthStencilValue ds_clear_value,
3115                            VkImageAspectFlags aspects)
3116 {
3117    struct radeon_cmdbuf *cs = cmd_buffer->cs;
3118    uint32_t level_count = vk_image_subresource_level_count(&image->vk, range);
3119 
3120    if (aspects == (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT)) {
3121       uint64_t va = radv_get_ds_clear_value_va(image, range->baseMipLevel);
3122 
3123       /* Use the fastest way when both aspects are used. */
3124       ASSERTED unsigned cdw_end = radv_cs_write_data_head(cmd_buffer->device, cmd_buffer->cs, cmd_buffer->qf, V_370_PFP,
3125                                                           va, 2 * level_count, cmd_buffer->state.predicating);
3126 
3127       for (uint32_t l = 0; l < level_count; l++) {
3128          radeon_emit(cs, ds_clear_value.stencil);
3129          radeon_emit(cs, fui(ds_clear_value.depth));
3130       }
3131 
3132       assert(cmd_buffer->cs->cdw == cdw_end);
3133    } else {
3134       /* Otherwise we need one WRITE_DATA packet per level. */
3135       for (uint32_t l = 0; l < level_count; l++) {
3136          uint64_t va = radv_get_ds_clear_value_va(image, range->baseMipLevel + l);
3137          unsigned value;
3138 
3139          if (aspects == VK_IMAGE_ASPECT_DEPTH_BIT) {
3140             value = fui(ds_clear_value.depth);
3141             va += 4;
3142          } else {
3143             assert(aspects == VK_IMAGE_ASPECT_STENCIL_BIT);
3144             value = ds_clear_value.stencil;
3145          }
3146 
3147          radv_write_data(cmd_buffer, V_370_PFP, va, 1, &value, cmd_buffer->state.predicating);
3148       }
3149    }
3150 }
3151 
3152 /**
3153  * Update the TC-compat metadata value for this image.
3154  */
3155 static void
3156 radv_set_tc_compat_zrange_metadata(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image,
3157                                    const VkImageSubresourceRange *range, uint32_t value)
3158 {
3159    struct radeon_cmdbuf *cs = cmd_buffer->cs;
3160 
3161    if (!cmd_buffer->device->physical_device->rad_info.has_tc_compat_zrange_bug)
3162       return;
3163 
3164    uint64_t va = radv_get_tc_compat_zrange_va(image, range->baseMipLevel);
3165    uint32_t level_count = vk_image_subresource_level_count(&image->vk, range);
3166 
3167    ASSERTED unsigned cdw_end = radv_cs_write_data_head(cmd_buffer->device, cmd_buffer->cs, cmd_buffer->qf, V_370_PFP,
3168                                                        va, level_count, cmd_buffer->state.predicating);
3169 
3170    for (uint32_t l = 0; l < level_count; l++)
3171       radeon_emit(cs, value);
3172 
3173    assert(cmd_buffer->cs->cdw == cdw_end);
3174 }
3175 
3176 static void
3177 radv_update_tc_compat_zrange_metadata(struct radv_cmd_buffer *cmd_buffer, const struct radv_image_view *iview,
3178                                       VkClearDepthStencilValue ds_clear_value)
3179 {
3180    VkImageSubresourceRange range = {
3181       .aspectMask = iview->vk.aspects,
3182       .baseMipLevel = iview->vk.base_mip_level,
3183       .levelCount = iview->vk.level_count,
3184       .baseArrayLayer = iview->vk.base_array_layer,
3185       .layerCount = iview->vk.layer_count,
3186    };
3187    uint32_t cond_val;
3188 
3189    /* Conditionally set DB_Z_INFO.ZRANGE_PRECISION to 0 when the last
3190     * depth clear value is 0.0f.
3191     */
3192    cond_val = ds_clear_value.depth == 0.0f ? UINT_MAX : 0;
3193 
3194    radv_set_tc_compat_zrange_metadata(cmd_buffer, iview->image, &range, cond_val);
3195 }
3196 
3197 /**
3198  * Update the clear depth/stencil values for this image.
3199  */
3200 void
3201 radv_update_ds_clear_metadata(struct radv_cmd_buffer *cmd_buffer, const struct radv_image_view *iview,
3202                               VkClearDepthStencilValue ds_clear_value, VkImageAspectFlags aspects)
3203 {
3204    VkImageSubresourceRange range = {
3205       .aspectMask = iview->vk.aspects,
3206       .baseMipLevel = iview->vk.base_mip_level,
3207       .levelCount = iview->vk.level_count,
3208       .baseArrayLayer = iview->vk.base_array_layer,
3209       .layerCount = iview->vk.layer_count,
3210    };
3211    struct radv_image *image = iview->image;
3212 
3213    assert(radv_htile_enabled(image, range.baseMipLevel));
3214 
3215    radv_set_ds_clear_metadata(cmd_buffer, iview->image, &range, ds_clear_value, aspects);
3216 
3217    if (radv_image_is_tc_compat_htile(image) && (aspects & VK_IMAGE_ASPECT_DEPTH_BIT)) {
3218       radv_update_tc_compat_zrange_metadata(cmd_buffer, iview, ds_clear_value);
3219    }
3220 
3221    radv_update_bound_fast_clear_ds(cmd_buffer, iview, ds_clear_value, aspects);
3222 }
3223 
3224 /**
3225  * Load the clear depth/stencil values from the image's metadata.
3226  */
3227 static void
3228 radv_load_ds_clear_metadata(struct radv_cmd_buffer *cmd_buffer, const struct radv_image_view *iview)
3229 {
3230    struct radeon_cmdbuf *cs = cmd_buffer->cs;
3231    const struct radv_image *image = iview->image;
3232    VkImageAspectFlags aspects = vk_format_aspects(image->vk.format);
3233    uint64_t va = radv_get_ds_clear_value_va(image, iview->vk.base_mip_level);
3234    unsigned reg_offset = 0, reg_count = 0;
3235 
3236    assert(radv_image_has_htile(image));
3237 
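   /* DB_STENCIL_CLEAR and DB_DEPTH_CLEAR are consecutive registers and the metadata
    * stores stencil then depth, so skip the stencil slot (and its dword in memory)
    * when the format has no stencil aspect.
    */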
3238    if (aspects & VK_IMAGE_ASPECT_STENCIL_BIT) {
3239       ++reg_count;
3240    } else {
3241       ++reg_offset;
3242       va += 4;
3243    }
3244    if (aspects & VK_IMAGE_ASPECT_DEPTH_BIT)
3245       ++reg_count;
3246 
3247    uint32_t reg = R_028028_DB_STENCIL_CLEAR + 4 * reg_offset;
3248 
3249    if (cmd_buffer->device->physical_device->rad_info.has_load_ctx_reg_pkt) {
3250       radeon_emit(cs, PKT3(PKT3_LOAD_CONTEXT_REG_INDEX, 3, 0));
3251       radeon_emit(cs, va);
3252       radeon_emit(cs, va >> 32);
3253       radeon_emit(cs, (reg - SI_CONTEXT_REG_OFFSET) >> 2);
3254       radeon_emit(cs, reg_count);
3255    } else {
3256       radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
3257       radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_SRC_MEM) | COPY_DATA_DST_SEL(COPY_DATA_REG) |
3258                          (reg_count == 2 ? COPY_DATA_COUNT_SEL : 0));
3259       radeon_emit(cs, va);
3260       radeon_emit(cs, va >> 32);
3261       radeon_emit(cs, reg >> 2);
3262       radeon_emit(cs, 0);
3263 
3264       radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0));
3265       radeon_emit(cs, 0);
3266    }
3267 }
3268 
3269 /*
3270  * With DCC some colors don't require CMASK elimination before being
3271  * used as a texture. This sets a predicate value to determine if the
3272  * cmask eliminate is required.
3273  */
3274 void
3275 radv_update_fce_metadata(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image,
3276                          const VkImageSubresourceRange *range, bool value)
3277 {
3278    if (!image->fce_pred_offset)
3279       return;
3280 
3281    uint64_t pred_val = value;
3282    uint64_t va = radv_image_get_fce_pred_va(image, range->baseMipLevel);
3283    uint32_t level_count = vk_image_subresource_level_count(&image->vk, range);
3284 
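   /* The predicate is written as a 64-bit value, i.e. two dwords per mip level. */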
3285    ASSERTED unsigned cdw_end = radv_cs_write_data_head(cmd_buffer->device, cmd_buffer->cs, cmd_buffer->qf, V_370_PFP,
3286                                                        va, 2 * level_count, false);
3287 
3288    for (uint32_t l = 0; l < level_count; l++) {
3289       radeon_emit(cmd_buffer->cs, pred_val);
3290       radeon_emit(cmd_buffer->cs, pred_val >> 32);
3291    }
3292 
3293    assert(cmd_buffer->cs->cdw == cdw_end);
3294 }
3295 
3296 /**
3297  * Update the DCC predicate to reflect the compression state.
3298  */
3299 void
3300 radv_update_dcc_metadata(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image,
3301                          const VkImageSubresourceRange *range, bool value)
3302 {
3303    if (image->dcc_pred_offset == 0)
3304       return;
3305 
3306    uint64_t pred_val = value;
3307    uint64_t va = radv_image_get_dcc_pred_va(image, range->baseMipLevel);
3308    uint32_t level_count = vk_image_subresource_level_count(&image->vk, range);
3309 
3310    assert(radv_dcc_enabled(image, range->baseMipLevel));
3311 
3312    ASSERTED unsigned cdw_end = radv_cs_write_data_head(cmd_buffer->device, cmd_buffer->cs, cmd_buffer->qf, V_370_PFP,
3313                                                        va, 2 * level_count, false);
3314 
3315    for (uint32_t l = 0; l < level_count; l++) {
3316       radeon_emit(cmd_buffer->cs, pred_val);
3317       radeon_emit(cmd_buffer->cs, pred_val >> 32);
3318    }
3319 
3320    assert(cmd_buffer->cs->cdw == cdw_end);
3321 }
3322 
3323 /**
3324  * Update the fast clear color values if the image is bound as a color buffer.
3325  */
3326 static void
3327 radv_update_bound_fast_clear_color(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image, int cb_idx,
3328                                    uint32_t color_values[2])
3329 {
3330    struct radeon_cmdbuf *cs = cmd_buffer->cs;
3331 
3332    if (cb_idx >= cmd_buffer->state.render.color_att_count || cmd_buffer->state.render.color_att[cb_idx].iview == NULL ||
3333        cmd_buffer->state.render.color_att[cb_idx].iview->image != image)
3334       return;
3335 
3336    ASSERTED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 4);
3337 
3338    radeon_set_context_reg_seq(cs, R_028C8C_CB_COLOR0_CLEAR_WORD0 + cb_idx * 0x3c, 2);
3339    radeon_emit(cs, color_values[0]);
3340    radeon_emit(cs, color_values[1]);
3341 
3342    assert(cmd_buffer->cs->cdw <= cdw_max);
3343 
3344    cmd_buffer->state.context_roll_without_scissor_emitted = true;
3345 }
3346 
3347 /**
3348  * Set the clear color values to the image's metadata.
3349  */
3350 static void
3351 radv_set_color_clear_metadata(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image,
3352                               const VkImageSubresourceRange *range, uint32_t color_values[2])
3353 {
3354    struct radeon_cmdbuf *cs = cmd_buffer->cs;
3355    uint32_t level_count = vk_image_subresource_level_count(&image->vk, range);
3356 
3357    assert(radv_image_has_cmask(image) || radv_dcc_enabled(image, range->baseMipLevel));
3358 
3359    if (radv_image_has_clear_value(image)) {
3360       uint64_t va = radv_image_get_fast_clear_va(image, range->baseMipLevel);
3361 
3362       ASSERTED unsigned cdw_end = radv_cs_write_data_head(cmd_buffer->device, cmd_buffer->cs, cmd_buffer->qf, V_370_PFP,
3363                                                           va, 2 * level_count, cmd_buffer->state.predicating);
3364 
3365       for (uint32_t l = 0; l < level_count; l++) {
3366          radeon_emit(cs, color_values[0]);
3367          radeon_emit(cs, color_values[1]);
3368       }
3369 
3370       assert(cmd_buffer->cs->cdw == cdw_end);
3371    } else {
3372       /* Images without clear-value metadata only ever use the default (zero) clear color, so there is nothing to write. */
3373       assert(color_values[0] == 0 && color_values[1] == 0);
3374    }
3375 }
3376 
3377 /**
3378  * Update the clear color values for this image.
3379  */
3380 void
3381 radv_update_color_clear_metadata(struct radv_cmd_buffer *cmd_buffer, const struct radv_image_view *iview, int cb_idx,
3382                                  uint32_t color_values[2])
3383 {
3384    struct radv_image *image = iview->image;
3385    VkImageSubresourceRange range = {
3386       .aspectMask = iview->vk.aspects,
3387       .baseMipLevel = iview->vk.base_mip_level,
3388       .levelCount = iview->vk.level_count,
3389       .baseArrayLayer = iview->vk.base_array_layer,
3390       .layerCount = iview->vk.layer_count,
3391    };
3392 
3393    assert(radv_image_has_cmask(image) || radv_dcc_enabled(image, iview->vk.base_mip_level));
3394 
3395    /* Do not need to update the clear value for images that are fast cleared with the comp-to-single
3396     * mode because the hardware gets the value from the image directly.
3397     */
3398    if (iview->image->support_comp_to_single)
3399       return;
3400 
3401    radv_set_color_clear_metadata(cmd_buffer, image, &range, color_values);
3402 
3403    radv_update_bound_fast_clear_color(cmd_buffer, image, cb_idx, color_values);
3404 }
3405 
3406 /**
3407  * Load the clear color values from the image's metadata.
3408  */
3409 static void
3410 radv_load_color_clear_metadata(struct radv_cmd_buffer *cmd_buffer, struct radv_image_view *iview, int cb_idx)
3411 {
3412    struct radeon_cmdbuf *cs = cmd_buffer->cs;
3413    struct radv_image *image = iview->image;
3414 
3415    if (!radv_image_has_cmask(image) && !radv_dcc_enabled(image, iview->vk.base_mip_level))
3416       return;
3417 
3418    if (iview->image->support_comp_to_single)
3419       return;
3420 
3421    if (!radv_image_has_clear_value(image)) {
3422       uint32_t color_values[2] = {0, 0};
3423       radv_update_bound_fast_clear_color(cmd_buffer, image, cb_idx, color_values);
3424       return;
3425    }
3426 
3427    uint64_t va = radv_image_get_fast_clear_va(image, iview->vk.base_mip_level);
3428    uint32_t reg = R_028C8C_CB_COLOR0_CLEAR_WORD0 + cb_idx * 0x3c;
3429 
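   /* Prefer LOAD_CONTEXT_REG_INDEX when the CP supports it; otherwise COPY_DATA into
    * the registers followed by PFP_SYNC_ME so the PFP doesn't run ahead of the ME
    * register write.
    */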
3430    if (cmd_buffer->device->physical_device->rad_info.has_load_ctx_reg_pkt) {
3431       radeon_emit(cs, PKT3(PKT3_LOAD_CONTEXT_REG_INDEX, 3, cmd_buffer->state.predicating));
3432       radeon_emit(cs, va);
3433       radeon_emit(cs, va >> 32);
3434       radeon_emit(cs, (reg - SI_CONTEXT_REG_OFFSET) >> 2);
3435       radeon_emit(cs, 2);
3436    } else {
3437       radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, cmd_buffer->state.predicating));
3438       radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_SRC_MEM) | COPY_DATA_DST_SEL(COPY_DATA_REG) | COPY_DATA_COUNT_SEL);
3439       radeon_emit(cs, va);
3440       radeon_emit(cs, va >> 32);
3441       radeon_emit(cs, reg >> 2);
3442       radeon_emit(cs, 0);
3443 
3444       radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, cmd_buffer->state.predicating));
3445       radeon_emit(cs, 0);
3446    }
3447 }
3448 
3449 /* GFX9+ metadata cache flushing workaround: metadata cache coherency is
3450  * broken if the CB caches data from multiple mips of the same image at the
3451  * same time.
3452  *
3453  * Insert some flushes to avoid this.
3454  */
3455 static void
3456 radv_emit_fb_mip_change_flush(struct radv_cmd_buffer *cmd_buffer)
3457 {
3458    struct radv_rendering_state *render = &cmd_buffer->state.render;
3459    bool color_mip_changed = false;
3460 
3461    /* Entire workaround is not applicable before GFX9 */
3462    if (cmd_buffer->device->physical_device->rad_info.gfx_level < GFX9)
3463       return;
3464 
3465    for (int i = 0; i < render->color_att_count; ++i) {
3466       struct radv_image_view *iview = render->color_att[i].iview;
3467       if (!iview)
3468          continue;
3469 
3470       if ((radv_image_has_cmask(iview->image) || radv_dcc_enabled(iview->image, iview->vk.base_mip_level) ||
3471            radv_dcc_enabled(iview->image, cmd_buffer->state.cb_mip[i])) &&
3472           cmd_buffer->state.cb_mip[i] != iview->vk.base_mip_level)
3473          color_mip_changed = true;
3474 
3475       cmd_buffer->state.cb_mip[i] = iview->vk.base_mip_level;
3476    }
3477 
3478    if (color_mip_changed) {
3479       cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB | RADV_CMD_FLAG_FLUSH_AND_INV_CB_META;
3480    }
3481 
3482    const struct radv_image_view *iview = render->ds_att.iview;
3483    if (iview) {
3484       if ((radv_htile_enabled(iview->image, iview->vk.base_mip_level) ||
3485            radv_htile_enabled(iview->image, cmd_buffer->state.ds_mip)) &&
3486           cmd_buffer->state.ds_mip != iview->vk.base_mip_level) {
3487          cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB | RADV_CMD_FLAG_FLUSH_AND_INV_DB_META;
3488       }
3489 
3490       cmd_buffer->state.ds_mip = iview->vk.base_mip_level;
3491    }
3492 }
3493 
3494 /* This function does the flushes for mip changes if the levels are not zero for
3495  * all render targets. This way we can assume at the start of the next cmd_buffer
3496  * that rendering to mip 0 doesn't need any flushes. Since that is the most
3497  * common case, this saves some flushes. */
3498 static void
3499 radv_emit_mip_change_flush_default(struct radv_cmd_buffer *cmd_buffer)
3500 {
3501    /* Entire workaround is not applicable before GFX9 */
3502    if (cmd_buffer->device->physical_device->rad_info.gfx_level < GFX9)
3503       return;
3504 
3505    bool need_color_mip_flush = false;
3506    for (unsigned i = 0; i < 8; ++i) {
3507       if (cmd_buffer->state.cb_mip[i]) {
3508          need_color_mip_flush = true;
3509          break;
3510       }
3511    }
3512 
3513    if (need_color_mip_flush) {
3514       cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB | RADV_CMD_FLAG_FLUSH_AND_INV_CB_META;
3515    }
3516 
3517    if (cmd_buffer->state.ds_mip) {
3518       cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB | RADV_CMD_FLAG_FLUSH_AND_INV_DB_META;
3519    }
3520 
3521    memset(cmd_buffer->state.cb_mip, 0, sizeof(cmd_buffer->state.cb_mip));
3522    cmd_buffer->state.ds_mip = 0;
3523 }
3524 
3593 static void
3594 radv_emit_framebuffer_state(struct radv_cmd_buffer *cmd_buffer)
3595 {
3596    struct radv_rendering_state *render = &cmd_buffer->state.render;
3597    int i;
3598    bool disable_constant_encode_ac01 = false;
3599    unsigned color_invalid = cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX11
3600                                ? S_028C70_FORMAT_GFX11(V_028C70_COLOR_INVALID)
3601                                : S_028C70_FORMAT_GFX6(V_028C70_COLOR_INVALID);
3602    VkExtent2D extent = {MAX_FRAMEBUFFER_WIDTH, MAX_FRAMEBUFFER_HEIGHT};
3603 
3604    ASSERTED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 51 + MAX_RTS * 70);
3605 
3606    for (i = 0; i < render->color_att_count; ++i) {
3607       struct radv_image_view *iview = render->color_att[i].iview;
3608       if (!iview) {
3609          radeon_set_context_reg(cmd_buffer->cs, R_028C70_CB_COLOR0_INFO + i * 0x3C, color_invalid);
3610          continue;
3611       }
3612 
3613       VkImageLayout layout = render->color_att[i].layout;
3614 
3615       radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, iview->image->bindings[0].bo);
3616 
3617       assert(iview->vk.aspects & (VK_IMAGE_ASPECT_COLOR_BIT | VK_IMAGE_ASPECT_PLANE_0_BIT |
3618                                   VK_IMAGE_ASPECT_PLANE_1_BIT | VK_IMAGE_ASPECT_PLANE_2_BIT));
3619 
3620       if (iview->image->disjoint && iview->vk.aspects == VK_IMAGE_ASPECT_COLOR_BIT) {
3621          for (uint32_t plane_id = 0; plane_id < iview->image->plane_count; plane_id++) {
3622             radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, iview->image->bindings[plane_id].bo);
3623          }
3624       } else {
3625          uint32_t plane_id = iview->image->disjoint ? iview->plane_id : 0;
3626          radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, iview->image->bindings[plane_id].bo);
3627       }
3628 
3629       radv_emit_fb_color_state(cmd_buffer, i, &render->color_att[i].cb, iview, layout);
3630 
3631       radv_load_color_clear_metadata(cmd_buffer, iview, i);
3632 
3633       if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX9 && iview->image->dcc_sign_reinterpret) {
3634          /* Disable constant encoding for the clear value of "1" when the DCC signedness
3635           * differs, because the hardware would fill "1" instead of the clear value.
3636           */
3637          disable_constant_encode_ac01 = true;
3638       }
3639 
3640       extent.width = MIN2(extent.width, iview->vk.extent.width);
3641       extent.height = MIN2(extent.height, iview->vk.extent.height);
3642    }
3643    for (; i < cmd_buffer->state.last_subpass_color_count; i++) {
3644       radeon_set_context_reg(cmd_buffer->cs, R_028C70_CB_COLOR0_INFO + i * 0x3C, color_invalid);
3645    }
3646    cmd_buffer->state.last_subpass_color_count = render->color_att_count;
3647 
3648    if (render->ds_att.iview) {
3649       struct radv_image_view *iview = render->ds_att.iview;
3650       const struct radv_image *image = iview->image;
3651       radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, image->bindings[0].bo);
3652 
3653       uint32_t qf_mask = radv_image_queue_family_mask(image, cmd_buffer->qf, cmd_buffer->qf);
3654       bool depth_compressed =
3655          radv_layout_is_htile_compressed(cmd_buffer->device, image, render->ds_att.layout, qf_mask);
3656       bool stencil_compressed =
3657          radv_layout_is_htile_compressed(cmd_buffer->device, image, render->ds_att.stencil_layout, qf_mask);
3658 
3659       radv_emit_fb_ds_state(cmd_buffer, &render->ds_att.ds, iview, depth_compressed, stencil_compressed);
3660 
3661       if (depth_compressed || stencil_compressed) {
3662          /* Only load the depth/stencil fast clear values when
3663           * compressed rendering is enabled.
3664           */
3665          radv_load_ds_clear_metadata(cmd_buffer, iview);
3666       }
3667 
3668       extent.width = MIN2(extent.width, iview->vk.extent.width);
3669       extent.height = MIN2(extent.height, iview->vk.extent.height);
3670    } else if (cmd_buffer->device->physical_device->rad_info.gfx_level == GFX10_3 && render->vrs_att.iview &&
3671               radv_cmd_buffer_get_vrs_image(cmd_buffer)) {
3672       /* When a subpass uses a VRS attachment without binding a depth/stencil attachment, we have to
3673        * bind our internal depth buffer that contains the VRS data as part of HTILE.
3674        */
3675       VkImageLayout layout = VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL;
3676       struct radv_buffer *htile_buffer = cmd_buffer->device->vrs.buffer;
3677       struct radv_image *image = cmd_buffer->device->vrs.image;
3678       struct radv_ds_buffer_info ds;
3679       struct radv_image_view iview;
3680 
3681       radv_image_view_init(&iview, cmd_buffer->device,
3682                            &(VkImageViewCreateInfo){
3683                               .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO,
3684                               .image = radv_image_to_handle(image),
3685                               .viewType = radv_meta_get_view_type(image),
3686                               .format = image->vk.format,
3687                               .subresourceRange =
3688                                  {
3689                                     .aspectMask = VK_IMAGE_ASPECT_DEPTH_BIT,
3690                                     .baseMipLevel = 0,
3691                                     .levelCount = 1,
3692                                     .baseArrayLayer = 0,
3693                                     .layerCount = 1,
3694                                  },
3695                            },
3696                            0, NULL);
3697 
3698       radv_initialise_vrs_surface(image, htile_buffer, &ds);
3699 
3700       radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, htile_buffer->bo);
3701 
3702       bool depth_compressed = radv_layout_is_htile_compressed(
3703          cmd_buffer->device, image, layout, radv_image_queue_family_mask(image, cmd_buffer->qf, cmd_buffer->qf));
3704       radv_emit_fb_ds_state(cmd_buffer, &ds, &iview, depth_compressed, false);
3705 
3706       radv_image_view_finish(&iview);
3707    } else {
3708       radv_emit_null_ds_state(cmd_buffer);
3709    }
3710 
3711    if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX11) {
3712       bool vrs_surface_enable = render->vrs_att.iview != NULL;
3713       unsigned xmax = 0, ymax = 0;
3714       uint64_t va = 0;
3715 
3716       if (vrs_surface_enable) {
3717          struct radv_image *vrs_image = render->vrs_att.iview->image;
3718 
3719          va = radv_buffer_get_va(vrs_image->bindings[0].bo) + vrs_image->bindings[0].offset;
3720          va |= vrs_image->planes[0].surface.tile_swizzle << 8;
3721 
3722          xmax = vrs_image->vk.extent.width - 1;
3723          ymax = vrs_image->vk.extent.height - 1;
3724       }
3725 
3726       radeon_set_context_reg_seq(cmd_buffer->cs, R_0283F0_PA_SC_VRS_RATE_BASE, 3);
3727       radeon_emit(cmd_buffer->cs, va >> 8);
3728       radeon_emit(cmd_buffer->cs, S_0283F4_BASE_256B(va >> 40));
3729       radeon_emit(cmd_buffer->cs, S_0283F8_X_MAX(xmax) | S_0283F8_Y_MAX(ymax));
3730 
3731       radeon_set_context_reg(cmd_buffer->cs, R_0283D0_PA_SC_VRS_OVERRIDE_CNTL,
3732                              S_0283D0_VRS_SURFACE_ENABLE(vrs_surface_enable));
3733    }
3734 
3735    if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX8) {
3736       bool disable_constant_encode = cmd_buffer->device->physical_device->rad_info.has_dcc_constant_encode;
3737       enum amd_gfx_level gfx_level = cmd_buffer->device->physical_device->rad_info.gfx_level;
3738 
3739       if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX11) {
3740          radeon_set_context_reg(cmd_buffer->cs, R_028424_CB_FDCC_CONTROL, S_028424_SAMPLE_MASK_TRACKER_WATERMARK(0));
3741       } else {
3742          uint8_t watermark = gfx_level >= GFX10 ? 6 : 4;
3743 
3744          radeon_set_context_reg(cmd_buffer->cs, R_028424_CB_DCC_CONTROL,
3745                                 S_028424_OVERWRITE_COMBINER_MRT_SHARING_DISABLE(gfx_level <= GFX9) |
3746                                    S_028424_OVERWRITE_COMBINER_WATERMARK(watermark) |
3747                                    S_028424_DISABLE_CONSTANT_ENCODE_AC01(disable_constant_encode_ac01) |
3748                                    S_028424_DISABLE_CONSTANT_ENCODE_REG(disable_constant_encode));
3749       }
3750    }
3751 
3752    radeon_set_context_reg(cmd_buffer->cs, R_028034_PA_SC_SCREEN_SCISSOR_BR,
3753                           S_028034_BR_X(extent.width) | S_028034_BR_Y(extent.height));
3754 
3755    assert(cmd_buffer->cs->cdw <= cdw_max);
3756 
3757    cmd_buffer->state.dirty &= ~RADV_CMD_DIRTY_FRAMEBUFFER;
3758 }
3759 
3760 static void
3761 radv_emit_guardband_state(struct radv_cmd_buffer *cmd_buffer)
3762 {
3763    const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
3764    unsigned rast_prim = radv_get_rasterization_prim(cmd_buffer);
3765 
3766    radv_write_guardband(cmd_buffer->cs, d->vk.vp.viewport_count, d->vk.vp.viewports, rast_prim, d->vk.rs.polygon_mode,
3767                         d->vk.rs.line.width);
3768 
3769    cmd_buffer->state.dirty &= ~RADV_CMD_DIRTY_GUARDBAND;
3770 }
3771 
3772 /* Bind an internal index buffer for GPUs that hang with 0-sized index buffers, to handle
3773  * robustness2 which requires out-of-bounds accesses to return 0.
3774  */
3775 static void
3776 radv_handle_zero_index_buffer_bug(struct radv_cmd_buffer *cmd_buffer, uint64_t *index_va, uint32_t *remaining_indexes)
3777 {
3778    const uint32_t zero = 0;
3779    uint32_t offset;
3780 
3781    if (!radv_cmd_buffer_upload_data(cmd_buffer, sizeof(uint32_t), &zero, &offset)) {
3782       vk_command_buffer_set_error(&cmd_buffer->vk, VK_ERROR_OUT_OF_HOST_MEMORY);
3783       return;
3784    }
3785 
3786    *index_va = radv_buffer_get_va(cmd_buffer->upload.upload_bo) + offset;
3787    *remaining_indexes = 1;
3788 }
3789 
3790 static void
3791 radv_emit_index_buffer(struct radv_cmd_buffer *cmd_buffer)
3792 {
3793    struct radeon_cmdbuf *cs = cmd_buffer->cs;
3794    struct radv_cmd_state *state = &cmd_buffer->state;
3795    uint32_t max_index_count = state->max_index_count;
3796    uint64_t index_va = state->index_va;
3797 
3798    /* With indirectly generated commands, the index buffer bind may be part of the
3799     * indirect command buffer, in which case the app may not have bound one yet. */
3800    if (state->index_type < 0)
3801       return;
3802 
3803    /* Handle indirect draw calls with NULL index buffer if the GPU doesn't support them. */
3804    if (!max_index_count && cmd_buffer->device->physical_device->rad_info.has_zero_index_buffer_bug) {
3805       radv_handle_zero_index_buffer_bug(cmd_buffer, &index_va, &max_index_count);
3806    }
3807 
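   /* INDEX_BASE carries the 64-bit GPU VA of the index buffer split across two dwords, and
    * INDEX_BUFFER_SIZE carries the number of indices that can be fetched. As far as I know the
    * CP/VGT uses the latter to clamp index fetches for indirect draws, which is why the
    * zero-size workaround above substitutes a 1-entry buffer.
    */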
3808    radeon_emit(cs, PKT3(PKT3_INDEX_BASE, 1, 0));
3809    radeon_emit(cs, index_va);
3810    radeon_emit(cs, index_va >> 32);
3811 
3812    radeon_emit(cs, PKT3(PKT3_INDEX_BUFFER_SIZE, 0, 0));
3813    radeon_emit(cs, max_index_count);
3814 
3815    cmd_buffer->state.dirty &= ~RADV_CMD_DIRTY_INDEX_BUFFER;
3816 }
3817 
3818 static void
3819 radv_flush_occlusion_query_state(struct radv_cmd_buffer *cmd_buffer)
3820 {
3821    const enum amd_gfx_level gfx_level = cmd_buffer->device->physical_device->rad_info.gfx_level;
3822    const bool enable_occlusion_queries =
3823       cmd_buffer->state.active_occlusion_queries || cmd_buffer->state.inherited_occlusion_queries;
3824    uint32_t db_count_control;
3825 
3826    if (!enable_occlusion_queries) {
3827       db_count_control = S_028004_ZPASS_INCREMENT_DISABLE(gfx_level < GFX11);
3828    } else {
3829       uint32_t sample_rate = util_logbase2(cmd_buffer->state.render.max_samples);
3830       bool gfx10_perfect =
3831          gfx_level >= GFX10 && (cmd_buffer->state.perfect_occlusion_queries_enabled ||
3832                                 cmd_buffer->state.inherited_query_control_flags & VK_QUERY_CONTROL_PRECISE_BIT);
3833 
3834       if (gfx_level >= GFX7) {
3835          /* Always enable PERFECT_ZPASS_COUNTS due to issues with partially
3836           * covered tiles, discards, and early depth testing. For more details,
3837           * see https://gitlab.freedesktop.org/mesa/mesa/-/issues/3218 */
3838          db_count_control = S_028004_PERFECT_ZPASS_COUNTS(1) |
3839                             S_028004_DISABLE_CONSERVATIVE_ZPASS_COUNTS(gfx10_perfect) |
3840                             S_028004_SAMPLE_RATE(sample_rate) | S_028004_ZPASS_ENABLE(1) |
3841                             S_028004_SLICE_EVEN_ENABLE(1) | S_028004_SLICE_ODD_ENABLE(1);
3842       } else {
3843          db_count_control = S_028004_PERFECT_ZPASS_COUNTS(1) | S_028004_SAMPLE_RATE(sample_rate);
3844       }
3845    }
3846 
3847    if (db_count_control != cmd_buffer->state.last_db_count_control) {
3848       radeon_set_context_reg(cmd_buffer->cs, R_028004_DB_COUNT_CONTROL, db_count_control);
3849 
3850       cmd_buffer->state.context_roll_without_scissor_emitted = true;
3851 
3852       cmd_buffer->state.last_db_count_control = db_count_control;
3853    }
3854 
3855    cmd_buffer->state.dirty &= ~RADV_CMD_DIRTY_OCCLUSION_QUERY;
3856 }
3857 
3858 unsigned
3859 radv_instance_rate_prolog_index(unsigned num_attributes, uint32_t instance_rate_inputs)
3860 {
3861    /* instance_rate_vs_prologs is a flattened array of arrays of arrays of different sizes;
3862     * in other words, a single array sorted in ascending order by:
3863     * - total number of attributes
3864     * - number of instanced attributes
3865     * - index of first instanced attribute
3866     */
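   /* Worked example, derived from the tables and formula below: for num_attributes = 3 and
    * instance_rate_inputs = 0b110, start_index = total_to_offset[2] = 4, count = 2,
    * offset_from_start_index = count_to_offset_total16[1] - (16 - 3) * (2 - 1) = 3 and
    * first = 1, giving prolog index 4 + 3 + 1 = 8.
    */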
3867 
3868    /* From total number of attributes to offset. */
3869    static const uint16_t total_to_offset[16] = {0, 1, 4, 10, 20, 35, 56, 84, 120, 165, 220, 286, 364, 455, 560, 680};
3870    unsigned start_index = total_to_offset[num_attributes - 1];
3871 
3872    /* From number of instanced attributes to offset. This would require a different LUT depending on
3873     * the total number of attributes, but we can exploit a pattern to use just the LUT for 16 total
3874     * attributes.
3875     */
3876    static const uint8_t count_to_offset_total16[16] = {0,   16,  31,  45,  58,  70,  81,  91,
3877                                                        100, 108, 115, 121, 126, 130, 133, 135};
3878    unsigned count = util_bitcount(instance_rate_inputs);
3879    unsigned offset_from_start_index = count_to_offset_total16[count - 1] - ((16 - num_attributes) * (count - 1));
3880 
3881    unsigned first = ffs(instance_rate_inputs) - 1;
3882    return start_index + offset_from_start_index + first;
3883 }
3884 
3885 static struct radv_shader_part *
3886 lookup_vs_prolog(struct radv_cmd_buffer *cmd_buffer, const struct radv_shader *vs_shader, uint32_t *nontrivial_divisors)
3887 {
3888    assert(vs_shader->info.vs.dynamic_inputs);
3889 
3890    const struct radv_vs_input_state *state = &cmd_buffer->state.dynamic_vs_input;
3891    struct radv_device *device = cmd_buffer->device;
3892 
3893    unsigned num_attributes = util_last_bit(vs_shader->info.vs.vb_desc_usage_mask);
3894    uint32_t attribute_mask = BITFIELD_MASK(num_attributes);
3895 
3896    uint32_t instance_rate_inputs = state->instance_rate_inputs & attribute_mask;
3897    uint32_t zero_divisors = state->zero_divisors & attribute_mask;
3898    *nontrivial_divisors = state->nontrivial_divisors & attribute_mask;
3899    uint32_t misaligned_mask = cmd_buffer->state.vbo_misaligned_mask;
3900    if (cmd_buffer->state.vbo_misaligned_mask_invalid) {
3901       assert(device->physical_device->rad_info.gfx_level == GFX6 ||
3902              device->physical_device->rad_info.gfx_level >= GFX10);
3903 
3904       u_foreach_bit (index, cmd_buffer->state.vbo_misaligned_mask_invalid & attribute_mask) {
3905          uint8_t binding = state->bindings[index];
3906          if (!(cmd_buffer->state.vbo_bound_mask & BITFIELD_BIT(binding)))
3907             continue;
3908 
3909          uint8_t req = state->format_align_req_minus_1[index];
3910          uint64_t vb_offset = cmd_buffer->vertex_bindings[binding].offset;
3911          uint64_t vb_stride;
3912 
3913          if (cmd_buffer->state.uses_dynamic_vertex_binding_stride) {
3914             vb_stride = cmd_buffer->vertex_bindings[binding].stride;
3915          } else {
3916             vb_stride = cmd_buffer->state.graphics_pipeline->binding_stride[binding];
3917          }
3918 
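         /* format_align_req_minus_1 is the format's alignment requirement minus one, so ANDing
          * the offset and stride with it is a power-of-two modulo test: a non-zero result means
          * the fetch would be misaligned and the attribute has to be handled by the prolog
          * (interpretation inferred from the field name and the mask usage below).
          */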
3919          VkDeviceSize offset = vb_offset + state->offsets[index];
3920          if ((offset & req) || (vb_stride & req))
3921             misaligned_mask |= BITFIELD_BIT(index);
3922       }
3923       cmd_buffer->state.vbo_misaligned_mask = misaligned_mask;
3924       cmd_buffer->state.vbo_misaligned_mask_invalid &= ~attribute_mask;
3925    }
3926    misaligned_mask |= state->nontrivial_formats;
3927    misaligned_mask &= attribute_mask;
3928 
3929    const bool can_use_simple_input =
3930       cmd_buffer->state.shaders[MESA_SHADER_VERTEX] &&
3931       !cmd_buffer->state.shaders[MESA_SHADER_VERTEX]->info.merged_shader_compiled_separately &&
3932       cmd_buffer->state.shaders[MESA_SHADER_VERTEX]->info.is_ngg == device->physical_device->use_ngg &&
3933       cmd_buffer->state.shaders[MESA_SHADER_VERTEX]->info.wave_size == device->physical_device->ge_wave_size;
3934 
3935    /* The instance ID input VGPR is placed differently when as_ls=true. as_ls is also needed
3936     * to work around the LS VGPR initialization bug.
3937     */
3938    bool as_ls =
3939       vs_shader->info.vs.as_ls && (instance_rate_inputs || device->physical_device->rad_info.has_ls_vgpr_init_bug);
3940 
3941    /* try to use a pre-compiled prolog first */
3942    struct radv_shader_part *prolog = NULL;
3943    if (can_use_simple_input && !as_ls && !misaligned_mask && !state->alpha_adjust_lo && !state->alpha_adjust_hi) {
3944       if (!instance_rate_inputs) {
3945          prolog = device->simple_vs_prologs[num_attributes - 1];
3946       } else if (num_attributes <= 16 && !*nontrivial_divisors && !zero_divisors &&
3947                  util_bitcount(instance_rate_inputs) ==
3948                     (util_last_bit(instance_rate_inputs) - ffs(instance_rate_inputs) + 1)) {
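         /* The bitcount check above means the instanced attributes form a single contiguous
          * range, which is the only layout the precompiled prologs indexed by
          * radv_instance_rate_prolog_index() cover.
          */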
3949          unsigned index = radv_instance_rate_prolog_index(num_attributes, instance_rate_inputs);
3950          prolog = device->instance_rate_vs_prologs[index];
3951       }
3952    }
3953    if (prolog)
3954       return prolog;
3955 
3956    struct radv_vs_prolog_key key;
3957    memset(&key, 0, sizeof(key));
3958    key.instance_rate_inputs = instance_rate_inputs;
3959    key.nontrivial_divisors = *nontrivial_divisors;
3960    key.zero_divisors = zero_divisors;
3961    /* If the attribute is aligned, post shuffle is implemented using DST_SEL instead. */
3962    key.post_shuffle = state->post_shuffle & misaligned_mask;
3963    key.alpha_adjust_hi = state->alpha_adjust_hi & attribute_mask;
3964    key.alpha_adjust_lo = state->alpha_adjust_lo & attribute_mask;
3965    u_foreach_bit (index, misaligned_mask)
3966       key.formats[index] = state->formats[index];
3967    key.num_attributes = num_attributes;
3968    key.misaligned_mask = misaligned_mask;
3969    key.as_ls = as_ls;
3970    key.is_ngg = vs_shader->info.is_ngg;
3971    key.wave32 = vs_shader->info.wave_size == 32;
3972 
3973    if (vs_shader->info.merged_shader_compiled_separately) {
3974       assert(vs_shader->info.next_stage == MESA_SHADER_TESS_CTRL || vs_shader->info.next_stage == MESA_SHADER_GEOMETRY);
3975       key.next_stage = vs_shader->info.next_stage;
3976    } else {
3977       key.next_stage = vs_shader->info.stage;
3978    }
3979 
3980    return radv_shader_part_cache_get(device, &device->vs_prologs, &cmd_buffer->vs_prologs, &key);
3981 }
3982 
3983 static void
3984 emit_prolog_regs(struct radv_cmd_buffer *cmd_buffer, const struct radv_shader *vs_shader,
3985                  const struct radv_shader_part *prolog)
3986 {
3987    uint32_t rsrc1, rsrc2;
3988 
3989    /* no need to re-emit anything in this case */
3990    if (cmd_buffer->state.emitted_vs_prolog == prolog)
3991       return;
3992 
3993    enum amd_gfx_level chip = cmd_buffer->device->physical_device->rad_info.gfx_level;
3994 
3995    assert(cmd_buffer->state.emitted_graphics_pipeline == cmd_buffer->state.graphics_pipeline);
3996 
3997    if (vs_shader->info.merged_shader_compiled_separately) {
3998       if (vs_shader->info.next_stage == MESA_SHADER_GEOMETRY) {
3999          radv_shader_combine_cfg_vs_gs(vs_shader, cmd_buffer->state.shaders[MESA_SHADER_GEOMETRY], &rsrc1, &rsrc2);
4000       } else {
4001          assert(vs_shader->info.next_stage == MESA_SHADER_TESS_CTRL);
4002 
4003          radv_shader_combine_cfg_vs_tcs(vs_shader, cmd_buffer->state.shaders[MESA_SHADER_TESS_CTRL], &rsrc1, &rsrc2);
4004       }
4005    } else {
4006       rsrc1 = vs_shader->config.rsrc1;
4007    }
4008 
4009    if (chip < GFX10 && G_00B228_SGPRS(prolog->rsrc1) > G_00B228_SGPRS(rsrc1))
4010       rsrc1 = (rsrc1 & C_00B228_SGPRS) | (prolog->rsrc1 & ~C_00B228_SGPRS);
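   /* i.e. take the SGPR count field from the prolog when it needs more SGPRs than the main
    * shader; this is skipped on GFX10+, presumably because the SGPR count is no longer
    * programmed per shader there.
    */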
4011 
4012    /* The main shader must not use fewer VGPRs than the prolog, otherwise shared VGPRs might
4013     * not work.
4014     */
4015    assert(G_00B848_VGPRS(vs_shader->config.rsrc1) >= G_00B848_VGPRS(prolog->rsrc1));
4016 
4017    unsigned pgm_lo_reg = R_00B120_SPI_SHADER_PGM_LO_VS;
4018    unsigned rsrc1_reg = R_00B128_SPI_SHADER_PGM_RSRC1_VS;
4019    if (vs_shader->info.is_ngg || cmd_buffer->state.shaders[MESA_SHADER_GEOMETRY] == vs_shader ||
4020        (vs_shader->info.merged_shader_compiled_separately && vs_shader->info.next_stage == MESA_SHADER_GEOMETRY)) {
4021       pgm_lo_reg = chip >= GFX10 ? R_00B320_SPI_SHADER_PGM_LO_ES : R_00B210_SPI_SHADER_PGM_LO_ES;
4022       rsrc1_reg = R_00B228_SPI_SHADER_PGM_RSRC1_GS;
4023    } else if (cmd_buffer->state.shaders[MESA_SHADER_TESS_CTRL] == vs_shader ||
4024               (vs_shader->info.merged_shader_compiled_separately &&
4025                vs_shader->info.next_stage == MESA_SHADER_TESS_CTRL)) {
4026       pgm_lo_reg = chip >= GFX10 ? R_00B520_SPI_SHADER_PGM_LO_LS : R_00B410_SPI_SHADER_PGM_LO_LS;
4027       rsrc1_reg = R_00B428_SPI_SHADER_PGM_RSRC1_HS;
4028    } else if (vs_shader->info.vs.as_ls) {
4029       pgm_lo_reg = R_00B520_SPI_SHADER_PGM_LO_LS;
4030       rsrc1_reg = R_00B528_SPI_SHADER_PGM_RSRC1_LS;
4031    } else if (vs_shader->info.vs.as_es) {
4032       pgm_lo_reg = R_00B320_SPI_SHADER_PGM_LO_ES;
4033       rsrc1_reg = R_00B328_SPI_SHADER_PGM_RSRC1_ES;
4034    }
4035 
4036    radeon_set_sh_reg(cmd_buffer->cs, pgm_lo_reg, prolog->va >> 8);
4037 
4038    if (chip < GFX10 || vs_shader->info.merged_shader_compiled_separately) {
4039       radeon_set_sh_reg(cmd_buffer->cs, rsrc1_reg, rsrc1);
4040 
4041       if (vs_shader->info.merged_shader_compiled_separately) {
4042          if (vs_shader->info.next_stage == MESA_SHADER_GEOMETRY) {
4043             const struct radv_shader *gs = cmd_buffer->state.shaders[MESA_SHADER_GEOMETRY];
4044 
4045             radeon_set_sh_reg(cmd_buffer->cs, rsrc1_reg + 4, rsrc2 | S_00B22C_LDS_SIZE(gs->info.gs_ring_info.lds_size));
4046          } else {
4047             radeon_set_sh_reg(cmd_buffer->cs, rsrc1_reg + 4, rsrc2);
4048          }
4049       }
4050    } else {
4051       assert(rsrc1 == vs_shader->config.rsrc1);
4052    }
4053 
4054    radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, prolog->bo);
4055 }
4056 
4057 static void
4058 emit_prolog_inputs(struct radv_cmd_buffer *cmd_buffer, const struct radv_shader *vs_shader,
4059                    uint32_t nontrivial_divisors)
4060 {
4061    /* no need to re-emit anything in this case */
4062    if (!nontrivial_divisors && cmd_buffer->state.emitted_vs_prolog &&
4063        !cmd_buffer->state.emitted_vs_prolog->nontrivial_divisors)
4064       return;
4065 
4066    const struct radv_vs_input_state *state = &cmd_buffer->state.dynamic_vs_input;
4067    uint64_t input_va = radv_shader_get_va(vs_shader);
4068 
4069    if (nontrivial_divisors) {
4070       unsigned inputs_offset;
4071       uint32_t *inputs;
4072       unsigned size = 8 + util_bitcount(nontrivial_divisors) * 8;
4073       if (!radv_cmd_buffer_upload_alloc(cmd_buffer, size, &inputs_offset, (void **)&inputs))
4074          return;
4075 
4076       *(inputs++) = input_va;
4077       *(inputs++) = input_va >> 32;
4078 
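      /* After the 64-bit shader VA, each nontrivial divisor gets an 8-byte record: the first
       * dword packs the fast-division parameters (pre_shift, increment << 8, post_shift << 16,
       * with special encodings for zero and power-of-two divisors) and the second dword is the
       * multiplier. Presumably the prolog uses these to divide the instance index without an
       * integer division instruction.
       */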
4079       u_foreach_bit (index, nontrivial_divisors) {
4080          uint32_t div = state->divisors[index];
4081          if (div == 0) {
4082             *(inputs++) = 0;
4083             *(inputs++) = 1;
4084          } else if (util_is_power_of_two_or_zero(div)) {
4085             *(inputs++) = util_logbase2(div) | (1 << 8);
4086             *(inputs++) = 0xffffffffu;
4087          } else {
4088             struct util_fast_udiv_info info = util_compute_fast_udiv_info(div, 32, 32);
4089             *(inputs++) = info.pre_shift | (info.increment << 8) | (info.post_shift << 16);
4090             *(inputs++) = info.multiplier;
4091          }
4092       }
4093 
4094       input_va = radv_buffer_get_va(cmd_buffer->upload.upload_bo) + inputs_offset;
4095    }
4096 
4097    const struct radv_userdata_info *loc = &vs_shader->info.user_sgprs_locs.shader_data[AC_UD_VS_PROLOG_INPUTS];
4098    uint32_t base_reg = vs_shader->info.user_data_0;
4099    assert(loc->sgpr_idx != -1);
4100    assert(loc->num_sgprs == 2);
4101    radv_emit_shader_pointer(cmd_buffer->device, cmd_buffer->cs, base_reg + loc->sgpr_idx * 4, input_va, true);
4102 }
4103 
4104 static void
4105 radv_emit_vertex_input(struct radv_cmd_buffer *cmd_buffer)
4106 {
4107    const struct radv_shader *vs_shader = radv_get_shader(cmd_buffer->state.shaders, MESA_SHADER_VERTEX);
4108 
4109    assert(!cmd_buffer->state.mesh_shading);
4110 
4111    if (!vs_shader->info.vs.has_prolog)
4112       return;
4113 
4114    uint32_t nontrivial_divisors;
4115    struct radv_shader_part *prolog = lookup_vs_prolog(cmd_buffer, vs_shader, &nontrivial_divisors);
4116    if (!prolog) {
4117       vk_command_buffer_set_error(&cmd_buffer->vk, VK_ERROR_OUT_OF_HOST_MEMORY);
4118       return;
4119    }
4120    emit_prolog_regs(cmd_buffer, vs_shader, prolog);
4121    emit_prolog_inputs(cmd_buffer, vs_shader, nontrivial_divisors);
4122 
4123    cmd_buffer->shader_upload_seq = MAX2(cmd_buffer->shader_upload_seq, prolog->upload_seq);
4124 
4125    cmd_buffer->state.emitted_vs_prolog = prolog;
4126 
4127    if (radv_device_fault_detection_enabled(cmd_buffer->device))
4128       radv_save_vs_prolog(cmd_buffer, prolog);
4129 }
4130 
4131 static void
4132 radv_emit_tess_domain_origin(struct radv_cmd_buffer *cmd_buffer)
4133 {
4134    const struct radv_physical_device *pdevice = cmd_buffer->device->physical_device;
4135    const struct radv_shader *tes = radv_get_shader(cmd_buffer->state.shaders, MESA_SHADER_TESS_EVAL);
4136    const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
4137    unsigned type = 0, partitioning = 0, distribution_mode = 0;
4138    unsigned topology;
4139 
4140    switch (tes->info.tes._primitive_mode) {
4141    case TESS_PRIMITIVE_TRIANGLES:
4142       type = V_028B6C_TESS_TRIANGLE;
4143       break;
4144    case TESS_PRIMITIVE_QUADS:
4145       type = V_028B6C_TESS_QUAD;
4146       break;
4147    case TESS_PRIMITIVE_ISOLINES:
4148       type = V_028B6C_TESS_ISOLINE;
4149       break;
4150    default:
4151       unreachable("Invalid tess primitive type");
4152    }
4153 
4154    switch (tes->info.tes.spacing) {
4155    case TESS_SPACING_EQUAL:
4156       partitioning = V_028B6C_PART_INTEGER;
4157       break;
4158    case TESS_SPACING_FRACTIONAL_ODD:
4159       partitioning = V_028B6C_PART_FRAC_ODD;
4160       break;
4161    case TESS_SPACING_FRACTIONAL_EVEN:
4162       partitioning = V_028B6C_PART_FRAC_EVEN;
4163       break;
4164    default:
4165       unreachable("Invalid tess spacing type");
4166    }
4167 
4168    if (pdevice->rad_info.has_distributed_tess) {
4169       if (pdevice->rad_info.family == CHIP_FIJI || pdevice->rad_info.family >= CHIP_POLARIS10)
4170          distribution_mode = V_028B6C_TRAPEZOIDS;
4171       else
4172          distribution_mode = V_028B6C_DONUTS;
4173    } else {
4174       distribution_mode = V_028B6C_NO_DIST;
4175    }
4176 
4177    if (tes->info.tes.point_mode) {
4178       topology = V_028B6C_OUTPUT_POINT;
4179    } else if (tes->info.tes._primitive_mode == TESS_PRIMITIVE_ISOLINES) {
4180       topology = V_028B6C_OUTPUT_LINE;
4181    } else {
4182       bool ccw = tes->info.tes.ccw;
4183 
4184       if (d->vk.ts.domain_origin != VK_TESSELLATION_DOMAIN_ORIGIN_UPPER_LEFT) {
4185          ccw = !ccw;
4186       }
4187 
4188       topology = ccw ? V_028B6C_OUTPUT_TRIANGLE_CCW : V_028B6C_OUTPUT_TRIANGLE_CW;
4189    }
4190 
4191    radeon_set_context_reg(cmd_buffer->cs, R_028B6C_VGT_TF_PARAM,
4192                           S_028B6C_TYPE(type) | S_028B6C_PARTITIONING(partitioning) | S_028B6C_TOPOLOGY(topology) |
4193                              S_028B6C_DISTRIBUTION_MODE(distribution_mode));
4194 }
4195 
4196 static void
4197 radv_emit_alpha_to_coverage_enable(struct radv_cmd_buffer *cmd_buffer)
4198 {
4199    const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
4200    unsigned db_alpha_to_mask = 0;
4201 
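   /* The ALPHA_TO_MASK_OFFSET0..3 fields appear to bias the alpha-to-coverage threshold per
    * pixel of a 2x2 quad; the default path below uses different offsets plus OFFSET_ROUND to
    * dither the coverage, while the debug path uses one uniform offset so coverage depends on
    * alpha alone.
    */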
4202    if (cmd_buffer->device->instance->debug_flags & RADV_DEBUG_NO_ATOC_DITHERING) {
4203       db_alpha_to_mask = S_028B70_ALPHA_TO_MASK_OFFSET0(2) | S_028B70_ALPHA_TO_MASK_OFFSET1(2) |
4204                          S_028B70_ALPHA_TO_MASK_OFFSET2(2) | S_028B70_ALPHA_TO_MASK_OFFSET3(2) |
4205                          S_028B70_OFFSET_ROUND(0);
4206    } else {
4207       db_alpha_to_mask = S_028B70_ALPHA_TO_MASK_OFFSET0(3) | S_028B70_ALPHA_TO_MASK_OFFSET1(1) |
4208                          S_028B70_ALPHA_TO_MASK_OFFSET2(0) | S_028B70_ALPHA_TO_MASK_OFFSET3(2) |
4209                          S_028B70_OFFSET_ROUND(1);
4210    }
4211 
4212    db_alpha_to_mask |= S_028B70_ALPHA_TO_MASK_ENABLE(d->vk.ms.alpha_to_coverage_enable);
4213 
4214    radeon_set_context_reg(cmd_buffer->cs, R_028B70_DB_ALPHA_TO_MASK, db_alpha_to_mask);
4215 }
4216 
4217 static void
4218 radv_emit_sample_mask(struct radv_cmd_buffer *cmd_buffer)
4219 {
4220    const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
4221 
4222    radeon_set_context_reg_seq(cmd_buffer->cs, R_028C38_PA_SC_AA_MASK_X0Y0_X1Y0, 2);
4223    radeon_emit(cmd_buffer->cs, d->vk.ms.sample_mask | ((uint32_t)d->vk.ms.sample_mask << 16));
4224    radeon_emit(cmd_buffer->cs, d->vk.ms.sample_mask | ((uint32_t)d->vk.ms.sample_mask << 16));
4225 }
4226 
4227 static void
4228 radv_emit_color_blend(struct radv_cmd_buffer *cmd_buffer)
4229 {
4230    const struct radv_physical_device *pdevice = cmd_buffer->device->physical_device;
4231    const enum amd_gfx_level gfx_level = pdevice->rad_info.gfx_level;
4232    const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
4233    unsigned cb_blend_control[MAX_RTS], sx_mrt_blend_opt[MAX_RTS];
4234    bool mrt0_is_dual_src = radv_is_mrt0_dual_src(cmd_buffer);
4235 
4236    for (unsigned i = 0; i < MAX_RTS; i++) {
4237       VkBlendOp eqRGB = d->vk.cb.attachments[i].color_blend_op;
4238       VkBlendFactor srcRGB = d->vk.cb.attachments[i].src_color_blend_factor;
4239       VkBlendFactor dstRGB = d->vk.cb.attachments[i].dst_color_blend_factor;
4240       VkBlendOp eqA = d->vk.cb.attachments[i].alpha_blend_op;
4241       VkBlendFactor srcA = d->vk.cb.attachments[i].src_alpha_blend_factor;
4242       VkBlendFactor dstA = d->vk.cb.attachments[i].dst_alpha_blend_factor;
4243       unsigned srcRGB_opt, dstRGB_opt, srcA_opt, dstA_opt;
4244       unsigned blend_cntl = 0;
4245 
4246       cb_blend_control[i] = sx_mrt_blend_opt[i] = 0;
4247 
4248       /* Ignore other blend targets if dual-source blending is enabled to prevent wrong behaviour.
4249        */
4250       if (i > 0 && mrt0_is_dual_src)
4251          continue;
4252 
4253       if (!d->vk.cb.attachments[i].blend_enable) {
4254          sx_mrt_blend_opt[i] |= S_028760_COLOR_COMB_FCN(V_028760_OPT_COMB_BLEND_DISABLED) |
4255                                 S_028760_ALPHA_COMB_FCN(V_028760_OPT_COMB_BLEND_DISABLED);
4256          continue;
4257       }
4258 
4259       radv_normalize_blend_factor(eqRGB, &srcRGB, &dstRGB);
4260       radv_normalize_blend_factor(eqA, &srcA, &dstA);
4261 
4262       /* Blending optimizations for RB+.
4263        * These transformations don't change the behavior.
4264        *
4265        * First, get rid of DST in the blend factors:
4266        *    func(src * DST, dst * 0) ---> func(src * 0, dst * SRC)
4267        */
4268       radv_blend_remove_dst(&eqRGB, &srcRGB, &dstRGB, VK_BLEND_FACTOR_DST_COLOR, VK_BLEND_FACTOR_SRC_COLOR);
4269 
4270       radv_blend_remove_dst(&eqA, &srcA, &dstA, VK_BLEND_FACTOR_DST_COLOR, VK_BLEND_FACTOR_SRC_COLOR);
4271 
4272       radv_blend_remove_dst(&eqA, &srcA, &dstA, VK_BLEND_FACTOR_DST_ALPHA, VK_BLEND_FACTOR_SRC_ALPHA);
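      /* Concrete example of the transformation above: ADD with (srcFactor = DST_COLOR,
       * dstFactor = ZERO) computes src * dst, and so does ADD with (srcFactor = ZERO,
       * dstFactor = SRC_COLOR); the rewritten form no longer reads DST in the source factor,
       * which is what the RB+ optimizations below rely on.
       */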
4273 
4274       /* Look up the ideal settings from tables. */
4275       srcRGB_opt = radv_translate_blend_opt_factor(srcRGB, false);
4276       dstRGB_opt = radv_translate_blend_opt_factor(dstRGB, false);
4277       srcA_opt = radv_translate_blend_opt_factor(srcA, true);
4278       dstA_opt = radv_translate_blend_opt_factor(dstA, true);
4279 
4280       /* Handle interdependencies. */
4281       if (radv_blend_factor_uses_dst(srcRGB))
4282          dstRGB_opt = V_028760_BLEND_OPT_PRESERVE_NONE_IGNORE_NONE;
4283       if (radv_blend_factor_uses_dst(srcA))
4284          dstA_opt = V_028760_BLEND_OPT_PRESERVE_NONE_IGNORE_NONE;
4285 
4286       if (srcRGB == VK_BLEND_FACTOR_SRC_ALPHA_SATURATE &&
4287           (dstRGB == VK_BLEND_FACTOR_ZERO || dstRGB == VK_BLEND_FACTOR_SRC_ALPHA ||
4288            dstRGB == VK_BLEND_FACTOR_SRC_ALPHA_SATURATE))
4289          dstRGB_opt = V_028760_BLEND_OPT_PRESERVE_NONE_IGNORE_A0;
4290 
4291       /* Set the final value. */
4292       sx_mrt_blend_opt[i] = S_028760_COLOR_SRC_OPT(srcRGB_opt) | S_028760_COLOR_DST_OPT(dstRGB_opt) |
4293                             S_028760_COLOR_COMB_FCN(radv_translate_blend_opt_function(eqRGB)) |
4294                             S_028760_ALPHA_SRC_OPT(srcA_opt) | S_028760_ALPHA_DST_OPT(dstA_opt) |
4295                             S_028760_ALPHA_COMB_FCN(radv_translate_blend_opt_function(eqA));
4296 
4297       blend_cntl |= S_028780_ENABLE(1);
4298       blend_cntl |= S_028780_COLOR_COMB_FCN(radv_translate_blend_function(eqRGB));
4299       blend_cntl |= S_028780_COLOR_SRCBLEND(radv_translate_blend_factor(gfx_level, srcRGB));
4300       blend_cntl |= S_028780_COLOR_DESTBLEND(radv_translate_blend_factor(gfx_level, dstRGB));
4301       if (srcA != srcRGB || dstA != dstRGB || eqA != eqRGB) {
4302          blend_cntl |= S_028780_SEPARATE_ALPHA_BLEND(1);
4303          blend_cntl |= S_028780_ALPHA_COMB_FCN(radv_translate_blend_function(eqA));
4304          blend_cntl |= S_028780_ALPHA_SRCBLEND(radv_translate_blend_factor(gfx_level, srcA));
4305          blend_cntl |= S_028780_ALPHA_DESTBLEND(radv_translate_blend_factor(gfx_level, dstA));
4306       }
4307       cb_blend_control[i] = blend_cntl;
4308    }
4309 
4310    if (pdevice->rad_info.has_rbplus) {
4311       /* Disable RB+ blend optimizations for dual source blending. */
4312       if (mrt0_is_dual_src) {
4313          for (unsigned i = 0; i < MAX_RTS; i++) {
4314             sx_mrt_blend_opt[i] =
4315                S_028760_COLOR_COMB_FCN(V_028760_OPT_COMB_NONE) | S_028760_ALPHA_COMB_FCN(V_028760_OPT_COMB_NONE);
4316          }
4317       }
4318 
4319       /* Disable RB+ blend optimizations on GFX11 when alpha-to-coverage is enabled. */
4320       if (gfx_level >= GFX11 && d->vk.ms.alpha_to_coverage_enable) {
4321          sx_mrt_blend_opt[0] =
4322             S_028760_COLOR_COMB_FCN(V_028760_OPT_COMB_NONE) | S_028760_ALPHA_COMB_FCN(V_028760_OPT_COMB_NONE);
4323       }
4324    }
4325 
4326    radeon_set_context_reg_seq(cmd_buffer->cs, R_028780_CB_BLEND0_CONTROL, MAX_RTS);
4327    radeon_emit_array(cmd_buffer->cs, cb_blend_control, MAX_RTS);
4328 
4329    if (pdevice->rad_info.has_rbplus) {
4330       radeon_set_context_reg_seq(cmd_buffer->cs, R_028760_SX_MRT0_BLEND_OPT, MAX_RTS);
4331       radeon_emit_array(cmd_buffer->cs, sx_mrt_blend_opt, MAX_RTS);
4332    }
4333 }
4334 
4335 static struct radv_shader_part *
4336 lookup_ps_epilog(struct radv_cmd_buffer *cmd_buffer)
4337 {
4338    const struct radv_shader *ps = cmd_buffer->state.shaders[MESA_SHADER_FRAGMENT];
4339    const struct radv_rendering_state *render = &cmd_buffer->state.render;
4340    const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
4341    struct radv_device *device = cmd_buffer->device;
4342    struct radv_ps_epilog_state state = {0};
4343 
4344    state.color_attachment_count = render->color_att_count;
4345    for (unsigned i = 0; i < render->color_att_count; ++i) {
4346       state.color_attachment_formats[i] = render->color_att[i].format;
4347    }
4348 
4349    for (unsigned i = 0; i < MAX_RTS; i++) {
4350       VkBlendOp eqRGB = d->vk.cb.attachments[i].color_blend_op;
4351       VkBlendFactor srcRGB = d->vk.cb.attachments[i].src_color_blend_factor;
4352       VkBlendFactor dstRGB = d->vk.cb.attachments[i].dst_color_blend_factor;
4353 
4354       state.color_write_mask |= d->vk.cb.attachments[i].write_mask << (4 * i);
4355       state.color_blend_enable |= d->vk.cb.attachments[i].blend_enable << (4 * i);
4356 
4357       radv_normalize_blend_factor(eqRGB, &srcRGB, &dstRGB);
4358 
4359       if (srcRGB == VK_BLEND_FACTOR_SRC_ALPHA || dstRGB == VK_BLEND_FACTOR_SRC_ALPHA ||
4360           srcRGB == VK_BLEND_FACTOR_SRC_ALPHA_SATURATE || dstRGB == VK_BLEND_FACTOR_SRC_ALPHA_SATURATE ||
4361           srcRGB == VK_BLEND_FACTOR_ONE_MINUS_SRC_ALPHA || dstRGB == VK_BLEND_FACTOR_ONE_MINUS_SRC_ALPHA)
4362          state.need_src_alpha |= 1 << i;
4363    }
4364 
4365    state.mrt0_is_dual_src = radv_is_mrt0_dual_src(cmd_buffer);
4366 
4367    if (d->vk.ms.alpha_to_coverage_enable) {
4368       /* Select a color export format with alpha when alpha to coverage is enabled. */
4369       state.need_src_alpha |= 0x1;
4370    }
4371 
4372    if (ps) {
4373       state.colors_written = ps->info.ps.colors_written;
4374 
4375       if (ps->info.ps.exports_mrtz_via_epilog) {
4376          assert(device->physical_device->rad_info.gfx_level >= GFX11);
4377          state.export_depth = ps->info.ps.writes_z;
4378          state.export_stencil = ps->info.ps.writes_stencil;
4379          state.export_sample_mask = ps->info.ps.writes_sample_mask;
4380          state.alpha_to_coverage_via_mrtz = d->vk.ms.alpha_to_coverage_enable;
4381       }
4382    }
4383 
4384    struct radv_ps_epilog_key key = radv_generate_ps_epilog_key(device, &state);
4385 
4386    /* Clear color attachments that aren't exported by the FS to match IO shader arguments. */
4387    key.spi_shader_col_format &= ps->info.ps.colors_written;
4388 
4389    return radv_shader_part_cache_get(device, &device->ps_epilogs, &cmd_buffer->ps_epilogs, &key);
4390 }
4391 
4392 static struct radv_shader_part *
4393 lookup_tcs_epilog(struct radv_cmd_buffer *cmd_buffer)
4394 {
4395    const struct radv_shader *tcs = cmd_buffer->state.shaders[MESA_SHADER_TESS_CTRL];
4396    const struct radv_shader *tes = radv_get_shader(cmd_buffer->state.shaders, MESA_SHADER_TESS_EVAL);
4397    struct radv_device *device = cmd_buffer->device;
4398 
4399    struct radv_tcs_epilog_key key = {
4400       .primitive_mode = tes->info.tes._primitive_mode,
4401       .tes_reads_tessfactors = tes->info.tes.reads_tess_factors,
4402       .tcs_out_patch_fits_subgroup = tcs->info.wave_size % tcs->info.tcs.tcs_vertices_out == 0,
4403    };
4404 
4405    return radv_shader_part_cache_get(device, &device->tcs_epilogs, &cmd_buffer->tcs_epilogs, &key);
4406 }
4407 
4408 static void
4409 radv_emit_msaa_state(struct radv_cmd_buffer *cmd_buffer)
4410 {
4411    const struct radv_physical_device *pdevice = cmd_buffer->device->physical_device;
4412    const struct radv_shader *ps = cmd_buffer->state.shaders[MESA_SHADER_FRAGMENT];
4413    unsigned rasterization_samples = radv_get_rasterization_samples(cmd_buffer);
4414    const struct radv_rendering_state *render = &cmd_buffer->state.render;
4415    const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
4416    unsigned log_samples = util_logbase2(rasterization_samples);
4417    unsigned pa_sc_aa_config = 0;
4418    unsigned max_sample_dist = 0;
4419    unsigned db_eqaa;
4420 
4421    db_eqaa = S_028804_HIGH_QUALITY_INTERSECTIONS(1) | S_028804_INCOHERENT_EQAA_READS(1) |
4422              S_028804_STATIC_ANCHOR_ASSOCIATIONS(1);
4423 
4424    if (pdevice->rad_info.gfx_level >= GFX9 &&
4425        d->vk.rs.conservative_mode != VK_CONSERVATIVE_RASTERIZATION_MODE_DISABLED_EXT) {
4426       /* Adjust MSAA state if conservative rasterization is enabled. */
4427       db_eqaa |= S_028804_OVERRASTERIZATION_AMOUNT(4);
4428       pa_sc_aa_config |= S_028BE0_AA_MASK_CENTROID_DTMN(1);
4429    }
4430 
4431    if (!d->sample_location.count) {
4432       max_sample_dist = radv_get_default_max_sample_dist(log_samples);
4433    } else {
4434       uint32_t num_samples = (uint32_t)d->sample_location.per_pixel;
4435       VkOffset2D sample_locs[4][8]; /* 8 is the max. sample count supported */
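      /* The four arrays correspond to the (0,0), (1,0), (0,1) and (1,1) pixels of the 2x2
       * grid programmed by VK_EXT_sample_locations, matching the four conversions below.
       */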
4436 
4437       /* Convert the user sample locations to hardware sample locations. */
4438       radv_convert_user_sample_locs(&d->sample_location, 0, 0, sample_locs[0]);
4439       radv_convert_user_sample_locs(&d->sample_location, 1, 0, sample_locs[1]);
4440       radv_convert_user_sample_locs(&d->sample_location, 0, 1, sample_locs[2]);
4441       radv_convert_user_sample_locs(&d->sample_location, 1, 1, sample_locs[3]);
4442 
4443       /* Compute the maximum sample distance from the specified locations. */
4444       for (unsigned i = 0; i < 4; ++i) {
4445          for (uint32_t j = 0; j < num_samples; j++) {
4446             VkOffset2D offset = sample_locs[i][j];
4447             max_sample_dist = MAX2(max_sample_dist, MAX2(abs(offset.x), abs(offset.y)));
4448          }
4449       }
4450    }
4451 
4452    if (rasterization_samples > 1) {
4453       unsigned z_samples = MAX2(render->ds_samples, rasterization_samples);
4454       unsigned ps_iter_samples = radv_get_ps_iter_samples(cmd_buffer);
4455       unsigned log_z_samples = util_logbase2(z_samples);
4456       unsigned log_ps_iter_samples = util_logbase2(ps_iter_samples);
4457       bool uses_underestimate = d->vk.rs.conservative_mode == VK_CONSERVATIVE_RASTERIZATION_MODE_UNDERESTIMATE_EXT;
4458 
4459       db_eqaa |= S_028804_MAX_ANCHOR_SAMPLES(log_z_samples) | S_028804_PS_ITER_SAMPLES(log_ps_iter_samples) |
4460                  S_028804_MASK_EXPORT_NUM_SAMPLES(log_samples) | S_028804_ALPHA_TO_MASK_NUM_SAMPLES(log_samples);
4461 
4462       pa_sc_aa_config |= S_028BE0_MSAA_NUM_SAMPLES(uses_underestimate ? 0 : log_samples) |
4463                          S_028BE0_MAX_SAMPLE_DIST(max_sample_dist) | S_028BE0_MSAA_EXPOSED_SAMPLES(log_samples) |
4464                          S_028BE0_COVERED_CENTROID_IS_CENTER(pdevice->rad_info.gfx_level >= GFX10_3);
4465 
4466       if (d->vk.rs.line.mode == VK_LINE_RASTERIZATION_MODE_RECTANGULAR_SMOOTH_KHR)
4467          db_eqaa |= S_028804_OVERRASTERIZATION_AMOUNT(log_samples);
4468    }
4469 
4470    pa_sc_aa_config |= S_028BE0_COVERAGE_TO_SHADER_SELECT(ps && ps->info.ps.reads_fully_covered);
4471 
4472    /* On GFX11, DB_Z_INFO.NUM_SAMPLES should always match MSAA_EXPOSED_SAMPLES. It affects VRS,
4473     * occlusion queries and Primitive Ordered Pixel Shading if depth and stencil are not bound.
4474     * This is normally emitted as framebuffer state, but if no attachments are bound the sample
4475     * count is independent of the framebuffer state and hence may need to be updated with MSAA
4476     * state.
4477     * Checking the format, not the image view, because the latter may not exist in a secondary
4478     * command buffer.
4479     */
4480    if (pdevice->rad_info.gfx_level == GFX11 && render->ds_att.format == VK_FORMAT_UNDEFINED) {
4481       assert(!render->ds_att.iview);
4482       radeon_set_context_reg(cmd_buffer->cs, R_028040_DB_Z_INFO,
4483                              S_028040_FORMAT(V_028040_Z_INVALID) | S_028040_NUM_SAMPLES(log_samples));
4484    }
4485    radeon_set_context_reg(cmd_buffer->cs, R_028804_DB_EQAA, db_eqaa);
4486    radeon_set_context_reg(cmd_buffer->cs, R_028BE0_PA_SC_AA_CONFIG, pa_sc_aa_config);
4487    radeon_set_context_reg(
4488       cmd_buffer->cs, R_028A48_PA_SC_MODE_CNTL_0,
4489       S_028A48_ALTERNATE_RBS_PER_TILE(pdevice->rad_info.gfx_level >= GFX9) | S_028A48_VPORT_SCISSOR_ENABLE(1) |
4490          S_028A48_LINE_STIPPLE_ENABLE(d->vk.rs.line.stipple.enable) | S_028A48_MSAA_ENABLE(rasterization_samples > 1));
4491 }
4492 
4493 static void
4494 radv_emit_line_rasterization_mode(struct radv_cmd_buffer *cmd_buffer)
4495 {
4496    const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
4497 
4498    /* The DX10 diamond test is unnecessary with Vulkan and it decreases line rasterization
4499     * performance.
4500     */
4501    radeon_set_context_reg(
4502       cmd_buffer->cs, R_028BDC_PA_SC_LINE_CNTL,
4503       S_028BDC_PERPENDICULAR_ENDCAP_ENA(d->vk.rs.line.mode == VK_LINE_RASTERIZATION_MODE_RECTANGULAR_KHR));
4504 }
4505 
4506 static void
4507 radv_cmd_buffer_flush_dynamic_state(struct radv_cmd_buffer *cmd_buffer, const uint64_t states)
4508 {
4509    if (states & (RADV_CMD_DIRTY_DYNAMIC_VIEWPORT | RADV_CMD_DIRTY_DYNAMIC_DEPTH_CLIP_ENABLE |
4510                  RADV_CMD_DIRTY_DYNAMIC_DEPTH_CLIP_NEGATIVE_ONE_TO_ONE | RADV_CMD_DIRTY_DYNAMIC_DEPTH_CLAMP_ENABLE))
4511       radv_emit_viewport(cmd_buffer);
4512 
4513    if (states & (RADV_CMD_DIRTY_DYNAMIC_SCISSOR | RADV_CMD_DIRTY_DYNAMIC_VIEWPORT) &&
4514        !cmd_buffer->device->physical_device->rad_info.has_gfx9_scissor_bug)
4515       radv_emit_scissor(cmd_buffer);
4516 
4517    if (states & RADV_CMD_DIRTY_DYNAMIC_LINE_WIDTH)
4518       radv_emit_line_width(cmd_buffer);
4519 
4520    if (states & RADV_CMD_DIRTY_DYNAMIC_BLEND_CONSTANTS)
4521       radv_emit_blend_constants(cmd_buffer);
4522 
4523    if (states & (RADV_CMD_DIRTY_DYNAMIC_STENCIL_REFERENCE | RADV_CMD_DIRTY_DYNAMIC_STENCIL_WRITE_MASK |
4524                  RADV_CMD_DIRTY_DYNAMIC_STENCIL_COMPARE_MASK))
4525       radv_emit_stencil(cmd_buffer);
4526 
4527    if (states & RADV_CMD_DIRTY_DYNAMIC_DEPTH_BOUNDS)
4528       radv_emit_depth_bounds(cmd_buffer);
4529 
4530    if (states & RADV_CMD_DIRTY_DYNAMIC_DEPTH_BIAS)
4531       radv_emit_depth_bias(cmd_buffer);
4532 
4533    if (states & (RADV_CMD_DIRTY_DYNAMIC_DISCARD_RECTANGLE | RADV_CMD_DIRTY_DYNAMIC_DISCARD_RECTANGLE_ENABLE |
4534                  RADV_CMD_DIRTY_DYNAMIC_DISCARD_RECTANGLE_MODE))
4535       radv_emit_discard_rectangle(cmd_buffer);
4536 
4537    if (states & RADV_CMD_DIRTY_DYNAMIC_CONSERVATIVE_RAST_MODE)
4538       radv_emit_conservative_rast_mode(cmd_buffer);
4539 
4540    if (states & RADV_CMD_DIRTY_DYNAMIC_SAMPLE_LOCATIONS)
4541       radv_emit_sample_locations(cmd_buffer);
4542 
4543    if (states & RADV_CMD_DIRTY_DYNAMIC_LINE_STIPPLE)
4544       radv_emit_line_stipple(cmd_buffer);
4545 
4546    if (states & (RADV_CMD_DIRTY_DYNAMIC_CULL_MODE | RADV_CMD_DIRTY_DYNAMIC_FRONT_FACE |
4547                  RADV_CMD_DIRTY_DYNAMIC_DEPTH_BIAS_ENABLE | RADV_CMD_DIRTY_DYNAMIC_POLYGON_MODE |
4548                  RADV_CMD_DIRTY_DYNAMIC_PROVOKING_VERTEX_MODE | RADV_CMD_DIRTY_DYNAMIC_LINE_RASTERIZATION_MODE))
4549       radv_emit_culling(cmd_buffer);
4550 
4551    if (states & (RADV_CMD_DIRTY_DYNAMIC_PROVOKING_VERTEX_MODE | RADV_CMD_DIRTY_DYNAMIC_PRIMITIVE_TOPOLOGY))
4552       radv_emit_provoking_vertex_mode(cmd_buffer);
4553 
4554    if (states & RADV_CMD_DIRTY_DYNAMIC_PRIMITIVE_TOPOLOGY)
4555       radv_emit_primitive_topology(cmd_buffer);
4556 
4557    if (states & (RADV_CMD_DIRTY_DYNAMIC_DEPTH_TEST_ENABLE | RADV_CMD_DIRTY_DYNAMIC_DEPTH_WRITE_ENABLE |
4558                  RADV_CMD_DIRTY_DYNAMIC_DEPTH_COMPARE_OP | RADV_CMD_DIRTY_DYNAMIC_DEPTH_BOUNDS_TEST_ENABLE |
4559                  RADV_CMD_DIRTY_DYNAMIC_STENCIL_TEST_ENABLE | RADV_CMD_DIRTY_DYNAMIC_STENCIL_OP))
4560       radv_emit_depth_control(cmd_buffer);
4561 
4562    if (states & RADV_CMD_DIRTY_DYNAMIC_STENCIL_OP)
4563       radv_emit_stencil_control(cmd_buffer);
4564 
4565    if (states & RADV_CMD_DIRTY_DYNAMIC_FRAGMENT_SHADING_RATE)
4566       radv_emit_fragment_shading_rate(cmd_buffer);
4567 
4568    if (states & RADV_CMD_DIRTY_DYNAMIC_PRIMITIVE_RESTART_ENABLE)
4569       radv_emit_primitive_restart_enable(cmd_buffer);
4570 
4571    if (states & (RADV_CMD_DIRTY_DYNAMIC_RASTERIZER_DISCARD_ENABLE | RADV_CMD_DIRTY_DYNAMIC_DEPTH_CLIP_ENABLE |
4572                  RADV_CMD_DIRTY_DYNAMIC_DEPTH_CLIP_NEGATIVE_ONE_TO_ONE | RADV_CMD_DIRTY_DYNAMIC_DEPTH_CLAMP_ENABLE))
4573       radv_emit_clipping(cmd_buffer);
4574 
4575    if (states & (RADV_CMD_DIRTY_DYNAMIC_LOGIC_OP | RADV_CMD_DIRTY_DYNAMIC_LOGIC_OP_ENABLE |
4576                  RADV_CMD_DIRTY_DYNAMIC_COLOR_WRITE_MASK | RADV_CMD_DIRTY_DYNAMIC_COLOR_BLEND_ENABLE |
4577                  RADV_CMD_DIRTY_DYNAMIC_COLOR_BLEND_EQUATION))
4578       radv_emit_logic_op(cmd_buffer);
4579 
4580    if (states & (RADV_CMD_DIRTY_DYNAMIC_COLOR_WRITE_ENABLE | RADV_CMD_DIRTY_DYNAMIC_COLOR_WRITE_MASK))
4581       radv_emit_color_write(cmd_buffer);
4582 
4583    if (states & RADV_CMD_DIRTY_DYNAMIC_VERTEX_INPUT)
4584       radv_emit_vertex_input(cmd_buffer);
4585 
4586    if (states & RADV_CMD_DIRTY_DYNAMIC_PATCH_CONTROL_POINTS)
4587       radv_emit_patch_control_points(cmd_buffer);
4588 
4589    if (states & RADV_CMD_DIRTY_DYNAMIC_TESS_DOMAIN_ORIGIN)
4590       radv_emit_tess_domain_origin(cmd_buffer);
4591 
4592    if (states & RADV_CMD_DIRTY_DYNAMIC_ALPHA_TO_COVERAGE_ENABLE)
4593       radv_emit_alpha_to_coverage_enable(cmd_buffer);
4594 
4595    if (states & RADV_CMD_DIRTY_DYNAMIC_SAMPLE_MASK)
4596       radv_emit_sample_mask(cmd_buffer);
4597 
4598    if (states & (RADV_CMD_DIRTY_DYNAMIC_DEPTH_CLAMP_ENABLE | RADV_CMD_DIRTY_DYNAMIC_DEPTH_CLIP_ENABLE))
4599       radv_emit_depth_clamp_enable(cmd_buffer);
4600 
4601    if (states & (RADV_CMD_DIRTY_DYNAMIC_COLOR_BLEND_ENABLE | RADV_CMD_DIRTY_DYNAMIC_COLOR_WRITE_MASK |
4602                  RADV_CMD_DIRTY_DYNAMIC_COLOR_BLEND_EQUATION | RADV_CMD_DIRTY_DYNAMIC_ALPHA_TO_COVERAGE_ENABLE))
4603       radv_emit_color_blend(cmd_buffer);
4604 
4605    if (states & RADV_CMD_DIRTY_DYNAMIC_LINE_RASTERIZATION_MODE)
4606       radv_emit_line_rasterization_mode(cmd_buffer);
4607 
4608    if (states & (RADV_CMD_DIRTY_DYNAMIC_RASTERIZATION_SAMPLES | RADV_CMD_DIRTY_DYNAMIC_LINE_RASTERIZATION_MODE))
4609       radv_emit_rasterization_samples(cmd_buffer);
4610 
4611    if (states & (RADV_CMD_DIRTY_DYNAMIC_LINE_STIPPLE_ENABLE | RADV_CMD_DIRTY_DYNAMIC_CONSERVATIVE_RAST_MODE |
4612                  RADV_CMD_DIRTY_DYNAMIC_SAMPLE_LOCATIONS | RADV_CMD_DIRTY_DYNAMIC_RASTERIZATION_SAMPLES |
4613                  RADV_CMD_DIRTY_DYNAMIC_LINE_RASTERIZATION_MODE))
4614       radv_emit_msaa_state(cmd_buffer);
4615 
4616    /* RADV_CMD_DIRTY_DYNAMIC_ATTACHMENT_FEEDBACK_LOOP_ENABLE is handled by radv_emit_db_shader_control. */
4617 
4618    cmd_buffer->state.dirty &= ~states;
4619 }
4620 
4621 static void
4622 radv_flush_push_descriptors(struct radv_cmd_buffer *cmd_buffer, struct radv_descriptor_state *descriptors_state)
4623 {
4624    struct radv_descriptor_set *set = (struct radv_descriptor_set *)&descriptors_state->push_set.set;
4625    unsigned bo_offset;
4626 
4627    if (!radv_cmd_buffer_upload_data(cmd_buffer, set->header.size, set->header.mapped_ptr, &bo_offset))
4628       return;
4629 
4630    set->header.va = radv_buffer_get_va(cmd_buffer->upload.upload_bo);
4631    set->header.va += bo_offset;
4632 }
4633 
4634 static void
4635 radv_flush_indirect_descriptor_sets(struct radv_cmd_buffer *cmd_buffer, VkPipelineBindPoint bind_point)
4636 {
4637    struct radv_descriptor_state *descriptors_state = radv_get_descriptors_state(cmd_buffer, bind_point);
4638    uint32_t size = MAX_SETS * 4;
4639    uint32_t offset;
4640    void *ptr;
4641 
4642    if (!radv_cmd_buffer_upload_alloc(cmd_buffer, size, &offset, &ptr))
4643       return;
4644 
4645    for (unsigned i = 0; i < MAX_SETS; i++) {
4646       uint32_t *uptr = ((uint32_t *)ptr) + i;
4647       uint64_t set_va = 0;
4648       if (descriptors_state->valid & (1u << i))
4649          set_va = radv_descriptor_get_va(descriptors_state, i);
4650 
4651       uptr[0] = set_va & 0xffffffff;
4652    }
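   /* Only the low 32 bits of each set address are stored; presumably the high 32 bits are a
    * known constant for descriptor memory, so shaders can reconstruct the full address from
    * this compact table.
    */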
4653 
4654    struct radeon_cmdbuf *cs = cmd_buffer->cs;
4655    struct radv_device *device = cmd_buffer->device;
4656    uint64_t va = radv_buffer_get_va(cmd_buffer->upload.upload_bo);
4657    va += offset;
4658 
4659    ASSERTED unsigned cdw_max = radeon_check_space(device->ws, cs, MESA_VULKAN_SHADER_STAGES * 3);
4660 
4661    if (bind_point == VK_PIPELINE_BIND_POINT_GRAPHICS) {
4662       for (unsigned s = MESA_SHADER_VERTEX; s <= MESA_SHADER_FRAGMENT; s++)
4663          if (radv_cmdbuf_has_stage(cmd_buffer, s))
4664             radv_emit_userdata_address(device, cs, cmd_buffer->state.shaders[s],
4665                                        cmd_buffer->state.shaders[s]->info.user_data_0, AC_UD_INDIRECT_DESCRIPTOR_SETS,
4666                                        va);
4667 
4668       if (radv_cmdbuf_has_stage(cmd_buffer, MESA_SHADER_MESH))
4669          radv_emit_userdata_address(device, cs, cmd_buffer->state.shaders[MESA_SHADER_MESH],
4670                                     cmd_buffer->state.shaders[MESA_SHADER_MESH]->info.user_data_0,
4671                                     AC_UD_INDIRECT_DESCRIPTOR_SETS, va);
4672 
4673       if (radv_cmdbuf_has_stage(cmd_buffer, MESA_SHADER_TASK)) {
4674          radeon_check_space(device->ws, cmd_buffer->gang.cs, 3);
4675          radv_emit_userdata_address(device, cmd_buffer->gang.cs, cmd_buffer->state.shaders[MESA_SHADER_TASK],
4676                                     cmd_buffer->state.shaders[MESA_SHADER_TASK]->info.user_data_0,
4677                                     AC_UD_INDIRECT_DESCRIPTOR_SETS, va);
4678       }
4679    } else {
4680       struct radv_shader *compute_shader = bind_point == VK_PIPELINE_BIND_POINT_COMPUTE
4681                                               ? cmd_buffer->state.shaders[MESA_SHADER_COMPUTE]
4682                                               : cmd_buffer->state.rt_prolog;
4683 
4684       radv_emit_userdata_address(device, cs, compute_shader, compute_shader->info.user_data_0,
4685                                  AC_UD_INDIRECT_DESCRIPTOR_SETS, va);
4686    }
4687 
4688    assert(cmd_buffer->cs->cdw <= cdw_max);
4689 }
4690 
4691 ALWAYS_INLINE static void
4692 radv_flush_descriptors(struct radv_cmd_buffer *cmd_buffer, VkShaderStageFlags stages, VkPipelineBindPoint bind_point)
4693 {
4694    struct radv_descriptor_state *descriptors_state = radv_get_descriptors_state(cmd_buffer, bind_point);
4695    struct radv_device *device = cmd_buffer->device;
4696    struct radeon_cmdbuf *cs = cmd_buffer->cs;
4697    bool flush_indirect_descriptors;
4698 
4699    if (!descriptors_state->dirty)
4700       return;
4701 
4702    flush_indirect_descriptors = descriptors_state->need_indirect_descriptor_sets;
4703 
4704    if (flush_indirect_descriptors)
4705       radv_flush_indirect_descriptor_sets(cmd_buffer, bind_point);
4706 
4707    ASSERTED unsigned cdw_max = radeon_check_space(device->ws, cs, MAX_SETS * MESA_VULKAN_SHADER_STAGES * 4);
4708 
4709    if (stages & VK_SHADER_STAGE_COMPUTE_BIT) {
4710       struct radv_shader *compute_shader = bind_point == VK_PIPELINE_BIND_POINT_COMPUTE
4711                                               ? cmd_buffer->state.shaders[MESA_SHADER_COMPUTE]
4712                                               : cmd_buffer->state.rt_prolog;
4713 
4714       radv_emit_descriptor_pointers(device, cs, compute_shader, compute_shader->info.user_data_0, descriptors_state);
4715    } else {
4716       radv_foreach_stage(stage, stages & ~VK_SHADER_STAGE_TASK_BIT_EXT)
4717       {
4718          if (!cmd_buffer->state.shaders[stage])
4719             continue;
4720 
4721          radv_emit_descriptor_pointers(device, cs, cmd_buffer->state.shaders[stage],
4722                                        cmd_buffer->state.shaders[stage]->info.user_data_0, descriptors_state);
4723       }
4724 
4725       if (stages & VK_SHADER_STAGE_TASK_BIT_EXT) {
4726          radv_emit_descriptor_pointers(device, cmd_buffer->gang.cs, cmd_buffer->state.shaders[MESA_SHADER_TASK],
4727                                        cmd_buffer->state.shaders[MESA_SHADER_TASK]->info.user_data_0,
4728                                        descriptors_state);
4729       }
4730    }
4731 
4732    descriptors_state->dirty = 0;
4733 
4734    assert(cmd_buffer->cs->cdw <= cdw_max);
4735 
4736    if (radv_device_fault_detection_enabled(cmd_buffer->device))
4737       radv_save_descriptors(cmd_buffer, bind_point);
4738 }
4739 
4740 static void
4741 radv_emit_all_inline_push_consts(struct radv_device *device, struct radeon_cmdbuf *cs, struct radv_shader *shader,
4742                                  uint32_t base_reg, uint32_t *values, bool *need_push_constants)
4743 {
4744    if (radv_get_user_sgpr(shader, AC_UD_PUSH_CONSTANTS)->sgpr_idx != -1)
4745       *need_push_constants |= true;
4746 
4747    const uint64_t mask = shader->info.inline_push_constant_mask;
4748    if (!mask)
4749       return;
4750 
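   /* If the set bits of the mask form a single contiguous run, the inline values can be copied
    * straight from the push-constant buffer; otherwise they are gathered into a packed array first.
    */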
4751    const uint8_t base = ffs(mask) - 1;
4752    if (mask == u_bit_consecutive64(base, util_last_bit64(mask) - base)) {
4753       /* consecutive inline push constants */
4754       radv_emit_inline_push_consts(device, cs, shader, base_reg, AC_UD_INLINE_PUSH_CONSTANTS, values + base);
4755    } else {
4756       /* sparse inline push constants */
4757       uint32_t consts[AC_MAX_INLINE_PUSH_CONSTS];
4758       unsigned num_consts = 0;
4759       u_foreach_bit64 (idx, mask)
4760          consts[num_consts++] = values[idx];
4761       radv_emit_inline_push_consts(device, cs, shader, base_reg, AC_UD_INLINE_PUSH_CONSTANTS, consts);
4762    }
4763 }
4764 
4765 ALWAYS_INLINE static VkShaderStageFlags
4766 radv_must_flush_constants(const struct radv_cmd_buffer *cmd_buffer, VkShaderStageFlags stages,
4767                           VkPipelineBindPoint bind_point)
4768 {
4769    const struct radv_push_constant_state *push_constants = radv_get_push_constants_state(cmd_buffer, bind_point);
4770 
4771    if (push_constants->size || push_constants->dynamic_offset_count)
4772       return stages & cmd_buffer->push_constant_stages;
4773 
4774    return 0;
4775 }
4776 
4777 static void
4778 radv_flush_constants(struct radv_cmd_buffer *cmd_buffer, VkShaderStageFlags stages, VkPipelineBindPoint bind_point)
4779 {
4780    struct radv_device *device = cmd_buffer->device;
4781    struct radeon_cmdbuf *cs = cmd_buffer->cs;
4782    struct radv_descriptor_state *descriptors_state = radv_get_descriptors_state(cmd_buffer, bind_point);
4783    const struct radv_push_constant_state *push_constants = radv_get_push_constants_state(cmd_buffer, bind_point);
4784    struct radv_shader *shader, *prev_shader;
4785    bool need_push_constants = false;
4786    unsigned offset;
4787    void *ptr;
4788    uint64_t va;
4789    uint32_t internal_stages = stages;
4790    uint32_t dirty_stages = 0;
4791 
4792    switch (bind_point) {
4793    case VK_PIPELINE_BIND_POINT_GRAPHICS:
4794       break;
4795    case VK_PIPELINE_BIND_POINT_COMPUTE:
4796       dirty_stages = RADV_RT_STAGE_BITS;
4797       break;
4798    case VK_PIPELINE_BIND_POINT_RAY_TRACING_KHR:
4799       internal_stages = VK_SHADER_STAGE_COMPUTE_BIT;
4800       dirty_stages = VK_SHADER_STAGE_COMPUTE_BIT;
4801       break;
4802    default:
4803       unreachable("Unhandled bind point");
4804    }
4805 
4806    if (internal_stages & VK_SHADER_STAGE_COMPUTE_BIT) {
4807       struct radv_shader *compute_shader = bind_point == VK_PIPELINE_BIND_POINT_COMPUTE
4808                                               ? cmd_buffer->state.shaders[MESA_SHADER_COMPUTE]
4809                                               : cmd_buffer->state.rt_prolog;
4810 
4811       radv_emit_all_inline_push_consts(device, cs, compute_shader, compute_shader->info.user_data_0,
4812                                        (uint32_t *)cmd_buffer->push_constants, &need_push_constants);
4813    } else {
4814       radv_foreach_stage(stage, internal_stages & ~VK_SHADER_STAGE_TASK_BIT_EXT)
4815       {
4816          shader = radv_get_shader(cmd_buffer->state.shaders, stage);
4817 
4818          if (!shader)
4819             continue;
4820 
4821          radv_emit_all_inline_push_consts(device, cs, shader, shader->info.user_data_0,
4822                                           (uint32_t *)cmd_buffer->push_constants, &need_push_constants);
4823       }
4824 
4825       if (internal_stages & VK_SHADER_STAGE_TASK_BIT_EXT) {
4826          radv_emit_all_inline_push_consts(device, cmd_buffer->gang.cs, cmd_buffer->state.shaders[MESA_SHADER_TASK],
4827                                           cmd_buffer->state.shaders[MESA_SHADER_TASK]->info.user_data_0,
4828                                           (uint32_t *)cmd_buffer->push_constants, &need_push_constants);
4829       }
4830    }
4831 
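   /* Upload layout: the raw push-constant data, followed by the dynamic buffer descriptors
    * (16 bytes per dynamic offset). The resulting VA is handed to each stage via AC_UD_PUSH_CONSTANTS.
    */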
4832    if (need_push_constants) {
4833       if (!radv_cmd_buffer_upload_alloc(cmd_buffer, push_constants->size + 16 * push_constants->dynamic_offset_count,
4834                                         &offset, &ptr))
4835          return;
4836 
4837       memcpy(ptr, cmd_buffer->push_constants, push_constants->size);
4838       memcpy((char *)ptr + push_constants->size, descriptors_state->dynamic_buffers,
4839              16 * push_constants->dynamic_offset_count);
4840 
4841       va = radv_buffer_get_va(cmd_buffer->upload.upload_bo);
4842       va += offset;
4843 
4844       ASSERTED unsigned cdw_max =
4845          radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, MESA_VULKAN_SHADER_STAGES * 4);
4846 
4847       if (internal_stages & VK_SHADER_STAGE_COMPUTE_BIT) {
4848          struct radv_shader *compute_shader = bind_point == VK_PIPELINE_BIND_POINT_COMPUTE
4849                                                  ? cmd_buffer->state.shaders[MESA_SHADER_COMPUTE]
4850                                                  : cmd_buffer->state.rt_prolog;
4851 
4852          radv_emit_userdata_address(device, cs, compute_shader, compute_shader->info.user_data_0, AC_UD_PUSH_CONSTANTS,
4853                                     va);
4854       } else {
4855          prev_shader = NULL;
4856          radv_foreach_stage(stage, internal_stages & ~VK_SHADER_STAGE_TASK_BIT_EXT)
4857          {
4858             shader = radv_get_shader(cmd_buffer->state.shaders, stage);
4859 
4860             /* Avoid redundantly emitting the address for merged stages. */
4861             if (shader && shader != prev_shader) {
4862                radv_emit_userdata_address(device, cs, shader, shader->info.user_data_0, AC_UD_PUSH_CONSTANTS, va);
4863 
4864                prev_shader = shader;
4865             }
4866          }
4867 
4868          if (internal_stages & VK_SHADER_STAGE_TASK_BIT_EXT) {
4869             radv_emit_userdata_address(device, cmd_buffer->gang.cs, cmd_buffer->state.shaders[MESA_SHADER_TASK],
4870                                        cmd_buffer->state.shaders[MESA_SHADER_TASK]->info.user_data_0,
4871                                        AC_UD_PUSH_CONSTANTS, va);
4872          }
4873       }
4874 
4875       assert(cmd_buffer->cs->cdw <= cdw_max);
4876    }
4877 
4878    cmd_buffer->push_constant_stages &= ~stages;
4879    cmd_buffer->push_constant_stages |= dirty_stages;
4880 }
4881 
4882 void
4883 radv_write_vertex_descriptors(const struct radv_cmd_buffer *cmd_buffer, const struct radv_graphics_pipeline *pipeline,
4884                               bool full_null_descriptors, void *vb_ptr)
4885 {
4886    struct radv_shader *vs_shader = radv_get_shader(cmd_buffer->state.shaders, MESA_SHADER_VERTEX);
4887    enum amd_gfx_level chip = cmd_buffer->device->physical_device->rad_info.gfx_level;
4888    enum radeon_family family = cmd_buffer->device->physical_device->rad_info.family;
4889    unsigned desc_index = 0;
4890    uint32_t mask = vs_shader->info.vs.vb_desc_usage_mask;
4891    uint64_t va;
4892    const struct radv_vs_input_state *vs_state =
4893       vs_shader->info.vs.dynamic_inputs ? &cmd_buffer->state.dynamic_vs_input : NULL;
4894    assert(!vs_state || vs_shader->info.vs.use_per_attribute_vb_descs);
4895 
4896    const struct ac_vtx_format_info *vtx_info_table = vs_state ? ac_get_vtx_format_info_table(chip, family) : NULL;
4897 
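   /* Each vertex buffer descriptor is 4 dwords: base address, address high bits + stride,
    * num_records, and the format/dst_sel word (rsrc_word3).
    */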
4898    while (mask) {
4899       unsigned i = u_bit_scan(&mask);
4900       uint32_t *desc = &((uint32_t *)vb_ptr)[desc_index++ * 4];
4901       uint32_t offset, rsrc_word3;
4902 
4903       if (vs_state && !(vs_state->attribute_mask & BITFIELD_BIT(i))) {
4904          /* No vertex attribute description given: assume that the shader doesn't use this
4905           * location (vb_desc_usage_mask can be larger than attribute usage) and use a null
4906           * descriptor to avoid hangs (prologs load all attributes, even if there are holes).
4907           */
4908          memset(desc, 0, 4 * 4);
4909          continue;
4910       }
4911 
4912       unsigned binding = vs_state ? cmd_buffer->state.dynamic_vs_input.bindings[i]
4913                                   : (vs_shader->info.vs.use_per_attribute_vb_descs ? pipeline->attrib_bindings[i] : i);
4914       struct radv_buffer *buffer = cmd_buffer->vertex_binding_buffers[binding];
4915       unsigned num_records;
4916       unsigned stride;
4917 
4918       if (vs_state && !(vs_state->nontrivial_formats & BITFIELD_BIT(i))) {
4919          const struct ac_vtx_format_info *vtx_info = &vtx_info_table[vs_state->formats[i]];
4920          unsigned hw_format = vtx_info->hw_format[vtx_info->num_channels - 1];
4921 
4922          if (chip >= GFX10) {
4923             rsrc_word3 = vtx_info->dst_sel | S_008F0C_FORMAT(hw_format);
4924          } else {
4925             rsrc_word3 =
4926                vtx_info->dst_sel | S_008F0C_NUM_FORMAT((hw_format >> 4) & 0x7) | S_008F0C_DATA_FORMAT(hw_format & 0xf);
4927          }
4928       } else {
4929          rsrc_word3 = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
4930                       S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W);
4931          if (chip >= GFX10)
4932             rsrc_word3 |= S_008F0C_FORMAT(V_008F0C_GFX10_FORMAT_32_UINT);
4933          else
4934             rsrc_word3 |=
4935                S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_UINT) | S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
4936       }
4937 
4938       if (cmd_buffer->state.uses_dynamic_vertex_binding_stride) {
4939          stride = cmd_buffer->vertex_bindings[binding].stride;
4940       } else {
4941          stride = pipeline->binding_stride[binding];
4942       }
4943 
4944       if (!buffer) {
4945          if (full_null_descriptors) {
4946             /* Put all the info in for the DGC generation shader in case the VBO gets overridden. */
4947             desc[0] = 0;
4948             desc[1] = S_008F04_STRIDE(stride);
4949             desc[2] = 0;
4950             desc[3] = rsrc_word3;
4951          } else if (vs_state) {
4952             /* Stride needs to be non-zero on GFX9, or else bounds checking is disabled. We need
4953              * to include the format/word3 so that the alpha channel is 1 for formats without an
4954              * alpha channel.
4955              */
4956             desc[0] = 0;
4957             desc[1] = S_008F04_STRIDE(16);
4958             desc[2] = 0;
4959             desc[3] = rsrc_word3;
4960          } else {
4961             memset(desc, 0, 4 * 4);
4962          }
4963 
4964          continue;
4965       }
4966 
4967       va = radv_buffer_get_va(buffer->bo);
4968 
4969       offset = cmd_buffer->vertex_bindings[binding].offset;
4970       va += offset + buffer->offset;
4971       if (vs_state)
4972          va += vs_state->offsets[i];
4973 
4974       if (cmd_buffer->vertex_bindings[binding].size) {
4975          num_records = cmd_buffer->vertex_bindings[binding].size;
4976       } else {
4977          num_records = vk_buffer_range(&buffer->vk, offset, VK_WHOLE_SIZE);
4978       }
4979 
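      /* With per-attribute descriptors, num_records starts out as the bound range in bytes and is
       * converted into the number of complete vertices that fit, based on the attribute end offset
       * and the binding stride.
       */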
4980       if (vs_shader->info.vs.use_per_attribute_vb_descs) {
4981          uint32_t attrib_end = vs_state ? vs_state->offsets[i] + vs_state->format_sizes[i] : pipeline->attrib_ends[i];
4982 
4983          if (num_records < attrib_end) {
4984             num_records = 0; /* not enough space for one vertex */
4985          } else if (stride == 0) {
4986             num_records = 1; /* only one vertex */
4987          } else {
4988             num_records = (num_records - attrib_end) / stride + 1;
4989             /* If attrib_offset>stride, then the compiler will increase the vertex index by
4990              * attrib_offset/stride and decrease the offset by attrib_offset%stride. This is
4991              * only allowed with static strides.
4992              */
4993             num_records += pipeline ? pipeline->attrib_index_offset[i] : 0;
4994          }
4995 
4996          /* GFX10 uses OOB_SELECT_RAW if stride==0, so convert num_records from elements
4997           * into bytes in that case. GFX8 always uses bytes.
4998           */
4999          if (num_records && (chip == GFX8 || (chip != GFX9 && !stride))) {
5000             num_records = (num_records - 1) * stride + attrib_end;
5001          } else if (!num_records) {
5002             /* On GFX9, it seems bounds checking is disabled if both
5003              * num_records and stride are zero. This doesn't seem necessary on GFX8, GFX10 and
5004              * GFX10.3 but it doesn't hurt.
5005              */
5006             if (full_null_descriptors) {
5007                /* Put all the info in for the DGC generation shader in case the VBO gets overridden.
5008                 */
5009                desc[0] = 0;
5010                desc[1] = S_008F04_STRIDE(stride);
5011                desc[2] = 0;
5012                desc[3] = rsrc_word3;
5013             } else if (vs_state) {
5014                desc[0] = 0;
5015                desc[1] = S_008F04_STRIDE(16);
5016                desc[2] = 0;
5017                desc[3] = rsrc_word3;
5018             } else {
5019                memset(desc, 0, 16);
5020             }
5021 
5022             continue;
5023          }
5024       } else {
5025          if (chip != GFX8 && stride)
5026             num_records = DIV_ROUND_UP(num_records, stride);
5027       }
5028 
5029       if (chip >= GFX10) {
5030          /* OOB_SELECT chooses the out-of-bounds check:
5031           * - 1: index >= NUM_RECORDS (Structured)
5032           * - 3: offset >= NUM_RECORDS (Raw)
5033           */
5034          int oob_select = stride ? V_008F0C_OOB_SELECT_STRUCTURED : V_008F0C_OOB_SELECT_RAW;
5035          rsrc_word3 |= S_008F0C_OOB_SELECT(oob_select) | S_008F0C_RESOURCE_LEVEL(chip < GFX11);
5036       }
5037 
5038       desc[0] = va;
5039       desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32) | S_008F04_STRIDE(stride);
5040       desc[2] = num_records;
5041       desc[3] = rsrc_word3;
5042    }
5043 }
5044 
5045 static void
5046 radv_flush_vertex_descriptors(struct radv_cmd_buffer *cmd_buffer)
5047 {
5048    struct radv_shader *vs = radv_get_shader(cmd_buffer->state.shaders, MESA_SHADER_VERTEX);
5049 
5050    if (!vs->info.vs.vb_desc_usage_mask)
5051       return;
5052 
5053    /* Mesh shaders don't have vertex descriptors. */
5054    assert(!cmd_buffer->state.mesh_shading);
5055 
5056    struct radv_graphics_pipeline *pipeline = cmd_buffer->state.graphics_pipeline;
5057    unsigned vb_desc_alloc_size = util_bitcount(vs->info.vs.vb_desc_usage_mask) * 16;
5058    unsigned vb_offset;
5059    void *vb_ptr;
5060    uint64_t va;
5061 
5062    /* Allocate some descriptor state for vertex buffers. */
5063    if (!radv_cmd_buffer_upload_alloc(cmd_buffer, vb_desc_alloc_size, &vb_offset, &vb_ptr))
5064       return;
5065 
5066    radv_write_vertex_descriptors(cmd_buffer, pipeline, false, vb_ptr);
5067 
5068    va = radv_buffer_get_va(cmd_buffer->upload.upload_bo);
5069    va += vb_offset;
5070 
5071    radv_emit_userdata_address(cmd_buffer->device, cmd_buffer->cs, vs, vs->info.user_data_0, AC_UD_VS_VERTEX_BUFFERS,
5072                               va);
5073 
5074    cmd_buffer->state.vb_va = va;
5075    cmd_buffer->state.vb_size = vb_desc_alloc_size;
5076    cmd_buffer->state.prefetch_L2_mask |= RADV_PREFETCH_VBO_DESCRIPTORS;
5077 
5078    if (radv_device_fault_detection_enabled(cmd_buffer->device))
5079       radv_save_vertex_descriptors(cmd_buffer, (uintptr_t)vb_ptr);
5080 
5081    cmd_buffer->state.dirty &= ~RADV_CMD_DIRTY_VERTEX_BUFFER;
5082 }
5083 
5084 static void
5085 radv_emit_streamout_buffers(struct radv_cmd_buffer *cmd_buffer, uint64_t va)
5086 {
5087    const struct radv_shader *last_vgt_shader = cmd_buffer->state.last_vgt_shader;
5088    const struct radv_userdata_info *loc = radv_get_user_sgpr(last_vgt_shader, AC_UD_STREAMOUT_BUFFERS);
5089    uint32_t base_reg;
5090 
5091    if (loc->sgpr_idx == -1)
5092       return;
5093 
5094    base_reg = last_vgt_shader->info.user_data_0;
5095 
5096    radv_emit_shader_pointer(cmd_buffer->device, cmd_buffer->cs, base_reg + loc->sgpr_idx * 4, va, false);
5097 
5098    if (cmd_buffer->state.gs_copy_shader) {
5099       loc = &cmd_buffer->state.gs_copy_shader->info.user_sgprs_locs.shader_data[AC_UD_STREAMOUT_BUFFERS];
5100       if (loc->sgpr_idx != -1) {
5101          base_reg = R_00B130_SPI_SHADER_USER_DATA_VS_0;
5102 
5103          radv_emit_shader_pointer(cmd_buffer->device, cmd_buffer->cs, base_reg + loc->sgpr_idx * 4, va, false);
5104       }
5105    }
5106 }
5107 
5108 static void
5109 radv_flush_streamout_descriptors(struct radv_cmd_buffer *cmd_buffer)
5110 {
5111    if (cmd_buffer->state.dirty & RADV_CMD_DIRTY_STREAMOUT_BUFFER) {
5112       struct radv_streamout_binding *sb = cmd_buffer->streamout_bindings;
5113       struct radv_streamout_state *so = &cmd_buffer->state.streamout;
5114       unsigned so_offset;
5115       uint64_t desc_va;
5116       void *so_ptr;
5117 
5118       /* Allocate some descriptor state for streamout buffers. */
5119       if (!radv_cmd_buffer_upload_alloc(cmd_buffer, MAX_SO_BUFFERS * 16, &so_offset, &so_ptr))
5120          return;
5121 
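      /* Build one 4-dword buffer descriptor per streamout target; disabled targets get a null
       * VA and, with NGG streamout, a size of 0.
       */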
5122       for (uint32_t i = 0; i < MAX_SO_BUFFERS; i++) {
5123          struct radv_buffer *buffer = sb[i].buffer;
5124          uint32_t *desc = &((uint32_t *)so_ptr)[i * 4];
5125          uint32_t size = 0;
5126          uint64_t va = 0;
5127 
5128          if (so->enabled_mask & (1 << i)) {
5129             va = radv_buffer_get_va(buffer->bo) + buffer->offset;
5130 
5131             va += sb[i].offset;
5132 
5133             /* Set the descriptor.
5134              *
5135              * On GFX8, the format must be non-INVALID, otherwise
5136              * the buffer will be considered not bound and store
5137              * instructions will be no-ops.
5138              */
5139             size = 0xffffffff;
5140 
5141             if (cmd_buffer->device->physical_device->use_ngg_streamout) {
5142                /* With NGG streamout, the buffer size is used to determine the max emit per buffer
5143                 * and also acts as a disable bit when it's 0.
5144                 */
5145                size = radv_is_streamout_enabled(cmd_buffer) ? sb[i].size : 0;
5146             }
5147          }
5148 
5149          uint32_t rsrc_word3 = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
5150                                S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W);
5151 
5152          if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX11) {
5153             rsrc_word3 |=
5154                S_008F0C_FORMAT(V_008F0C_GFX11_FORMAT_32_FLOAT) | S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW);
5155          } else if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX10) {
5156             rsrc_word3 |= S_008F0C_FORMAT(V_008F0C_GFX10_FORMAT_32_FLOAT) |
5157                           S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) | S_008F0C_RESOURCE_LEVEL(1);
5158          } else {
5159             rsrc_word3 |= S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
5160          }
5161 
5162          desc[0] = va;
5163          desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32);
5164          desc[2] = size;
5165          desc[3] = rsrc_word3;
5166       }
5167 
5168       desc_va = radv_buffer_get_va(cmd_buffer->upload.upload_bo);
5169       desc_va += so_offset;
5170 
5171       radv_emit_streamout_buffers(cmd_buffer, desc_va);
5172    }
5173 
5174    cmd_buffer->state.dirty &= ~RADV_CMD_DIRTY_STREAMOUT_BUFFER;
5175 }
5176 
5177 static void
5178 radv_flush_shader_query_state_gfx(struct radv_cmd_buffer *cmd_buffer)
5179 {
5180    const struct radv_shader *last_vgt_shader = cmd_buffer->state.last_vgt_shader;
5181    const struct radv_userdata_info *loc = radv_get_user_sgpr(last_vgt_shader, AC_UD_SHADER_QUERY_STATE);
5182    enum radv_shader_query_state shader_query_state = radv_shader_query_none;
5183    uint32_t base_reg;
5184 
5185    if (loc->sgpr_idx == -1)
5186       return;
5187 
5188    assert(last_vgt_shader->info.is_ngg || last_vgt_shader->info.stage == MESA_SHADER_GEOMETRY);
5189 
5190    /* By default shader queries are disabled but they are enabled if the command buffer has active GDS
5191     * queries or if it's a secondary command buffer that inherits the number of generated
5192     * primitives.
5193     */
5194    if (cmd_buffer->state.active_pipeline_gds_queries ||
5195        (cmd_buffer->state.inherited_pipeline_statistics &
5196         (VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_PRIMITIVES_BIT |
5197          VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_INVOCATIONS_BIT)) ||
5198        (cmd_buffer->device->physical_device->emulate_mesh_shader_queries &&
5199         (cmd_buffer->state.inherited_pipeline_statistics & VK_QUERY_PIPELINE_STATISTIC_MESH_SHADER_INVOCATIONS_BIT_EXT)))
5200       shader_query_state |= radv_shader_query_pipeline_stat;
5201 
5202    if (cmd_buffer->state.active_prims_gen_gds_queries)
5203       shader_query_state |= radv_shader_query_prim_gen;
5204 
5205    if (cmd_buffer->state.active_prims_xfb_gds_queries && radv_is_streamout_enabled(cmd_buffer)) {
5206       shader_query_state |= radv_shader_query_prim_xfb | radv_shader_query_prim_gen;
5207    }
5208 
5209    base_reg = last_vgt_shader->info.user_data_0;
5210    assert(loc->sgpr_idx != -1);
5211 
5212    radeon_set_sh_reg(cmd_buffer->cs, base_reg + loc->sgpr_idx * 4, shader_query_state);
5213 }
5214 
5215 static void
5216 radv_flush_shader_query_state_ace(struct radv_cmd_buffer *cmd_buffer, struct radv_shader *task_shader)
5217 {
5218    const struct radv_userdata_info *loc = radv_get_user_sgpr(task_shader, AC_UD_SHADER_QUERY_STATE);
5219    enum radv_shader_query_state shader_query_state = radv_shader_query_none;
5220    uint32_t base_reg;
5221 
5222    if (loc->sgpr_idx == -1)
5223       return;
5224 
5225    /* By default shader queries are disabled but they are enabled if the command buffer has active ACE
5226     * queries or if it's a secondary command buffer that inherits the number of task shader
5227     * invocations query.
5228     */
5229    if (cmd_buffer->state.active_pipeline_ace_queries ||
5230        (cmd_buffer->state.inherited_pipeline_statistics & VK_QUERY_PIPELINE_STATISTIC_TASK_SHADER_INVOCATIONS_BIT_EXT))
5231       shader_query_state |= radv_shader_query_pipeline_stat;
5232 
5233    base_reg = task_shader->info.user_data_0;
5234    assert(loc->sgpr_idx != -1);
5235 
5236    radeon_set_sh_reg(cmd_buffer->gang.cs, base_reg + loc->sgpr_idx * 4, shader_query_state);
5237 }
5238 
5239 static void
5240 radv_flush_shader_query_state(struct radv_cmd_buffer *cmd_buffer)
5241 {
5242    radv_flush_shader_query_state_gfx(cmd_buffer);
5243 
5244    if (radv_cmdbuf_has_stage(cmd_buffer, MESA_SHADER_TASK) &&
5245        cmd_buffer->device->physical_device->emulate_mesh_shader_queries)
5246       radv_flush_shader_query_state_ace(cmd_buffer, cmd_buffer->state.shaders[MESA_SHADER_TASK]);
5247 
5248    cmd_buffer->state.dirty &= ~RADV_CMD_DIRTY_SHADER_QUERY;
5249 }
5250 
5251 static void
5252 radv_flush_force_vrs_state(struct radv_cmd_buffer *cmd_buffer)
5253 {
5254    const struct radv_shader *last_vgt_shader = cmd_buffer->state.last_vgt_shader;
5255 
5256    if (!last_vgt_shader->info.force_vrs_per_vertex) {
5257       /* Un-set the SGPR index so we know to re-emit it later. */
5258       cmd_buffer->state.last_vrs_rates_sgpr_idx = -1;
5259       return;
5260    }
5261 
5262    const struct radv_userdata_info *loc;
5263    uint32_t base_reg;
5264 
5265    if (cmd_buffer->state.gs_copy_shader) {
5266       loc = &cmd_buffer->state.gs_copy_shader->info.user_sgprs_locs.shader_data[AC_UD_FORCE_VRS_RATES];
5267       base_reg = R_00B130_SPI_SHADER_USER_DATA_VS_0;
5268    } else {
5269       loc = radv_get_user_sgpr(last_vgt_shader, AC_UD_FORCE_VRS_RATES);
5270       base_reg = last_vgt_shader->info.user_data_0;
5271    }
5272 
5273    assert(loc->sgpr_idx != -1);
5274 
5275    enum amd_gfx_level gfx_level = cmd_buffer->device->physical_device->rad_info.gfx_level;
5276    uint32_t vrs_rates = 0;
5277 
5278    switch (cmd_buffer->device->force_vrs) {
5279    case RADV_FORCE_VRS_2x2:
5280       vrs_rates = gfx_level >= GFX11 ? V_0283D0_VRS_SHADING_RATE_2X2 : (1u << 2) | (1u << 4);
5281       break;
5282    case RADV_FORCE_VRS_2x1:
5283       vrs_rates = gfx_level >= GFX11 ? V_0283D0_VRS_SHADING_RATE_2X1 : (1u << 2) | (0u << 4);
5284       break;
5285    case RADV_FORCE_VRS_1x2:
5286       vrs_rates = gfx_level >= GFX11 ? V_0283D0_VRS_SHADING_RATE_1X2 : (0u << 2) | (1u << 4);
5287       break;
5288    default:
5289       break;
5290    }
5291 
5292    if (cmd_buffer->state.last_vrs_rates != vrs_rates || cmd_buffer->state.last_vrs_rates_sgpr_idx != loc->sgpr_idx) {
5293       radeon_set_sh_reg(cmd_buffer->cs, base_reg + loc->sgpr_idx * 4, vrs_rates);
5294    }
5295 
5296    cmd_buffer->state.last_vrs_rates = vrs_rates;
5297    cmd_buffer->state.last_vrs_rates_sgpr_idx = loc->sgpr_idx;
5298 }
5299 
5300 static void
5301 radv_upload_graphics_shader_descriptors(struct radv_cmd_buffer *cmd_buffer)
5302 {
5303    if (cmd_buffer->state.dirty & RADV_CMD_DIRTY_VERTEX_BUFFER)
5304       radv_flush_vertex_descriptors(cmd_buffer);
5305 
5306    radv_flush_streamout_descriptors(cmd_buffer);
5307 
5308    VkShaderStageFlags stages = VK_SHADER_STAGE_ALL_GRAPHICS;
5309    radv_flush_descriptors(cmd_buffer, stages, VK_PIPELINE_BIND_POINT_GRAPHICS);
5310 
5311    const VkShaderStageFlags pc_stages = radv_must_flush_constants(cmd_buffer, stages, VK_PIPELINE_BIND_POINT_GRAPHICS);
5312    if (pc_stages)
5313       radv_flush_constants(cmd_buffer, pc_stages, VK_PIPELINE_BIND_POINT_GRAPHICS);
5314 
5315    radv_flush_force_vrs_state(cmd_buffer);
5316 }
5317 
5318 struct radv_draw_info {
5319    /**
5320     * Number of vertices.
5321     */
5322    uint32_t count;
5323 
5324    /**
5325     * First instance id.
5326     */
5327    uint32_t first_instance;
5328 
5329    /**
5330     * Number of instances.
5331     */
5332    uint32_t instance_count;
5333 
5334    /**
5335     * Whether it's an indexed draw.
5336     */
5337    bool indexed;
5338 
5339    /**
5340     * Indirect draw parameters resource.
5341     */
5342    struct radv_buffer *indirect;
5343    uint64_t indirect_offset;
5344    uint32_t stride;
5345 
5346    /**
5347     * Draw count parameters resource.
5348     */
5349    struct radv_buffer *count_buffer;
5350    uint64_t count_buffer_offset;
5351 
5352    /**
5353     * Stream output parameters resource.
5354     */
5355    struct radv_buffer *strmout_buffer;
5356    uint64_t strmout_buffer_offset;
5357 };
5358 
5359 static void
5360 radv_emit_ia_multi_vgt_param(struct radv_cmd_buffer *cmd_buffer, bool instanced_draw, bool indirect_draw,
5361                              bool count_from_stream_output, uint32_t draw_vertex_count)
5362 {
5363    const struct radeon_info *info = &cmd_buffer->device->physical_device->rad_info;
5364    struct radv_cmd_state *state = &cmd_buffer->state;
5365    const unsigned patch_control_points = state->dynamic.vk.ts.patch_control_points;
5366    const unsigned topology = state->dynamic.vk.ia.primitive_topology;
5367    const bool prim_restart_enable = state->dynamic.vk.ia.primitive_restart_enable;
5368    struct radeon_cmdbuf *cs = cmd_buffer->cs;
5369    unsigned ia_multi_vgt_param;
5370 
5371    ia_multi_vgt_param = radv_get_ia_multi_vgt_param(cmd_buffer, instanced_draw, indirect_draw, count_from_stream_output,
5372                                                     draw_vertex_count, topology, prim_restart_enable,
5373                                                     patch_control_points, state->tess_num_patches);
5374 
5375    if (state->last_ia_multi_vgt_param != ia_multi_vgt_param) {
5376       if (info->gfx_level == GFX9) {
5377          radeon_set_uconfig_reg_idx(cmd_buffer->device->physical_device, cs, R_030960_IA_MULTI_VGT_PARAM, 4,
5378                                     ia_multi_vgt_param);
5379       } else if (info->gfx_level >= GFX7) {
5380          radeon_set_context_reg_idx(cs, R_028AA8_IA_MULTI_VGT_PARAM, 1, ia_multi_vgt_param);
5381       } else {
5382          radeon_set_context_reg(cs, R_028AA8_IA_MULTI_VGT_PARAM, ia_multi_vgt_param);
5383       }
5384       state->last_ia_multi_vgt_param = ia_multi_vgt_param;
5385    }
5386 }
5387 
5388 static void
5389 gfx10_emit_ge_cntl(struct radv_cmd_buffer *cmd_buffer)
5390 {
5391    const struct radv_shader *last_vgt_shader = cmd_buffer->state.last_vgt_shader;
5392    struct radv_cmd_state *state = &cmd_buffer->state;
5393    bool break_wave_at_eoi = false;
5394    unsigned primgroup_size;
5395    unsigned ge_cntl;
5396 
5397    if (last_vgt_shader->info.is_ngg)
5398       return;
5399 
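   /* Legacy (non-NGG) path: pick the primitive group size from the number of tess patches, the
    * on-chip GS layout, or the recommended default of 128.
    */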
5400    if (radv_cmdbuf_has_stage(cmd_buffer, MESA_SHADER_TESS_CTRL)) {
5401       primgroup_size = state->tess_num_patches;
5402 
5403       if (cmd_buffer->state.shaders[MESA_SHADER_TESS_CTRL]->info.uses_prim_id ||
5404           radv_get_shader(cmd_buffer->state.shaders, MESA_SHADER_TESS_EVAL)->info.uses_prim_id) {
5405          break_wave_at_eoi = true;
5406       }
5407    } else if (radv_cmdbuf_has_stage(cmd_buffer, MESA_SHADER_GEOMETRY)) {
5408       const struct radv_legacy_gs_info *gs_state = &cmd_buffer->state.shaders[MESA_SHADER_GEOMETRY]->info.gs_ring_info;
5409       primgroup_size = G_028A44_GS_PRIMS_PER_SUBGRP(gs_state->vgt_gs_onchip_cntl);
5410    } else {
5411       primgroup_size = 128; /* recommended without a GS and tess */
5412    }
5413 
5414    ge_cntl = S_03096C_PRIM_GRP_SIZE_GFX10(primgroup_size) | S_03096C_VERT_GRP_SIZE(256) | /* disable vertex grouping */
5415              S_03096C_PACKET_TO_ONE_PA(0) /* this should only be set if LINE_STIPPLE_TEX_ENA == 1 */ |
5416              S_03096C_BREAK_WAVE_AT_EOI(break_wave_at_eoi);
5417 
5418    if (state->last_ge_cntl != ge_cntl) {
5419       radeon_set_uconfig_reg(cmd_buffer->cs, R_03096C_GE_CNTL, ge_cntl);
5420       state->last_ge_cntl = ge_cntl;
5421    }
5422 }
5423 
5424 static void
5425 radv_emit_draw_registers(struct radv_cmd_buffer *cmd_buffer, const struct radv_draw_info *draw_info)
5426 {
5427    const struct radeon_info *info = &cmd_buffer->device->physical_device->rad_info;
5428    struct radv_cmd_state *state = &cmd_buffer->state;
5429    struct radeon_cmdbuf *cs = cmd_buffer->cs;
5430    uint32_t topology = state->dynamic.vk.ia.primitive_topology;
5431    bool disable_instance_packing = false;
5432 
5433    /* Draw state. */
5434    if (info->gfx_level >= GFX10) {
5435       gfx10_emit_ge_cntl(cmd_buffer);
5436    } else {
5437       radv_emit_ia_multi_vgt_param(cmd_buffer, draw_info->instance_count > 1, draw_info->indirect,
5438                                    !!draw_info->strmout_buffer, draw_info->indirect ? 0 : draw_info->count);
5439    }
5440 
5441    /* RDNA2 is affected by a hardware bug: when instance packing is enabled for adjacent primitive
5442     * topologies and instance_count > 1, the pipeline stats generated by GE are incorrect. The
5443     * workaround needs to be applied for indexed and non-indexed draws.
5444     */
5445    if (info->gfx_level == GFX10_3 && state->active_pipeline_queries > 0 &&
5446        (draw_info->instance_count > 1 || draw_info->indirect) &&
5447        (topology == V_008958_DI_PT_LINELIST_ADJ || topology == V_008958_DI_PT_LINESTRIP_ADJ ||
5448         topology == V_008958_DI_PT_TRILIST_ADJ || topology == V_008958_DI_PT_TRISTRIP_ADJ)) {
5449       disable_instance_packing = true;
5450    }
5451 
5452    if ((draw_info->indexed && state->index_type != state->last_index_type) ||
5453        (info->gfx_level == GFX10_3 &&
5454         (state->last_index_type == -1 ||
5455          disable_instance_packing != G_028A7C_DISABLE_INSTANCE_PACKING(state->last_index_type)))) {
5456       uint32_t index_type = state->index_type | S_028A7C_DISABLE_INSTANCE_PACKING(disable_instance_packing);
5457 
5458       if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX9) {
5459          radeon_set_uconfig_reg_idx(cmd_buffer->device->physical_device, cs, R_03090C_VGT_INDEX_TYPE, 2, index_type);
5460       } else {
5461          radeon_emit(cs, PKT3(PKT3_INDEX_TYPE, 0, 0));
5462          radeon_emit(cs, index_type);
5463       }
5464 
5465       state->last_index_type = index_type;
5466    }
5467 }
5468 
5469 static void
5470 radv_stage_flush(struct radv_cmd_buffer *cmd_buffer, VkPipelineStageFlags2 src_stage_mask)
5471 {
5472    /* For simplicity, if the barrier wants to wait for the task shader,
5473     * just make it wait for the mesh shader too.
5474     */
5475    if (src_stage_mask & VK_PIPELINE_STAGE_2_TASK_SHADER_BIT_EXT)
5476       src_stage_mask |= VK_PIPELINE_STAGE_2_MESH_SHADER_BIT_EXT;
5477 
5478    if (src_stage_mask & (VK_PIPELINE_STAGE_2_COPY_BIT | VK_PIPELINE_STAGE_2_RESOLVE_BIT | VK_PIPELINE_STAGE_2_BLIT_BIT |
5479                          VK_PIPELINE_STAGE_2_CLEAR_BIT)) {
5480       /* Be conservative for now. */
5481       src_stage_mask |= VK_PIPELINE_STAGE_2_ALL_TRANSFER_BIT;
5482    }
5483 
5484    if (src_stage_mask &
5485        (VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT | VK_PIPELINE_STAGE_2_ALL_TRANSFER_BIT |
5486         VK_PIPELINE_STAGE_2_COMMAND_PREPROCESS_BIT_NV | VK_PIPELINE_STAGE_2_ACCELERATION_STRUCTURE_BUILD_BIT_KHR |
5487         VK_PIPELINE_STAGE_2_ACCELERATION_STRUCTURE_COPY_BIT_KHR | VK_PIPELINE_STAGE_2_RAY_TRACING_SHADER_BIT_KHR |
5488         VK_PIPELINE_STAGE_2_BOTTOM_OF_PIPE_BIT | VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT)) {
5489       cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_CS_PARTIAL_FLUSH;
5490    }
5491 
5492    if (src_stage_mask & (VK_PIPELINE_STAGE_2_FRAGMENT_SHADER_BIT | VK_PIPELINE_STAGE_2_EARLY_FRAGMENT_TESTS_BIT |
5493                          VK_PIPELINE_STAGE_2_LATE_FRAGMENT_TESTS_BIT | VK_PIPELINE_STAGE_2_COLOR_ATTACHMENT_OUTPUT_BIT |
5494                          VK_PIPELINE_STAGE_2_ALL_TRANSFER_BIT | VK_PIPELINE_STAGE_2_BOTTOM_OF_PIPE_BIT |
5495                          VK_PIPELINE_STAGE_2_ALL_GRAPHICS_BIT | VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT)) {
5496       cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_PS_PARTIAL_FLUSH;
5497    } else if (src_stage_mask &
5498               (VK_PIPELINE_STAGE_2_DRAW_INDIRECT_BIT | VK_PIPELINE_STAGE_2_VERTEX_INPUT_BIT |
5499                VK_PIPELINE_STAGE_2_VERTEX_SHADER_BIT | VK_PIPELINE_STAGE_2_TESSELLATION_CONTROL_SHADER_BIT |
5500                VK_PIPELINE_STAGE_2_TESSELLATION_EVALUATION_SHADER_BIT | VK_PIPELINE_STAGE_2_GEOMETRY_SHADER_BIT |
5501                VK_PIPELINE_STAGE_2_MESH_SHADER_BIT_EXT | VK_PIPELINE_STAGE_2_TRANSFORM_FEEDBACK_BIT_EXT |
5502                VK_PIPELINE_STAGE_2_PRE_RASTERIZATION_SHADERS_BIT)) {
5503       cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_VS_PARTIAL_FLUSH;
5504    }
5505 }
5506 
5507 static bool
5508 can_skip_buffer_l2_flushes(struct radv_device *device)
5509 {
5510    return device->physical_device->rad_info.gfx_level == GFX9 ||
5511           (device->physical_device->rad_info.gfx_level >= GFX10 &&
5512            !device->physical_device->rad_info.tcc_rb_non_coherent);
5513 }
5514 
5515 /*
5516  * In Vulkan, barriers have two kinds of operations:
5517  *
5518  * - availability (implemented with radv_src_access_flush)
5519  * - visibility (implemented with radv_dst_access_flush)
5520  *
5521  * For a memory operation to observe the result of a previous memory operation,
5522  * one needs to do an availability operation on the source memory and then a
5523  * visibility operation on the target memory.
5524  *
5525  * The complication is that the availability and visibility operations do not
5526  * need to be in the same barrier.
5527  *
5528  * The cleanest way to implement this is to define the availability operation as
5529  * bringing the caches to a "state of rest", in which none of the caches below
5530  * that level are dirty.
5531  *
5532  * For GFX8 and earlier this would be VRAM/GTT with none of the caches dirty.
5533  *
5534  * For GFX9+ we can define the state at rest to be L2 instead of VRAM for all
5535  * buffers and for images marked as coherent, and VRAM/GTT for non-coherent
5536  * images. However, given the existence of memory barriers which do not specify
5537  * the image/buffer, it often devolves to just VRAM/GTT anyway.
5538  *
5539  * To help reduce the invalidations for GPUs that have L2 coherency between the
5540  * RB and the shader caches, we always invalidate L2 on the src side, as we can
5541  * use our knowledge of past usage to optimize flushes away.
5542  */
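
/* Illustrative example (not tied to a specific caller): for a color attachment write followed by
 * a sampled-image read, radv_src_access_flush(COLOR_ATTACHMENT_WRITE) requests FLUSH_AND_INV_CB
 * (plus CB metadata flushes when present) and radv_dst_access_flush(SHADER_SAMPLED_READ) requests
 * INV_VCACHE (plus L2/metadata invalidations as needed), so the read observes the flushed data.
 */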
5543 
5544 enum radv_cmd_flush_bits
5545 radv_src_access_flush(struct radv_cmd_buffer *cmd_buffer, VkAccessFlags2 src_flags, const struct radv_image *image)
5546 {
5547    bool has_CB_meta = true, has_DB_meta = true;
5548    bool image_is_coherent = image ? image->l2_coherent : false;
5549    enum radv_cmd_flush_bits flush_bits = 0;
5550 
5551    if (image) {
5552       if (!radv_image_has_CB_metadata(image))
5553          has_CB_meta = false;
5554       if (!radv_image_has_htile(image))
5555          has_DB_meta = false;
5556    }
5557 
5558    u_foreach_bit64 (b, src_flags) {
5559       switch ((VkAccessFlags2)BITFIELD64_BIT(b)) {
5560       case VK_ACCESS_2_COMMAND_PREPROCESS_WRITE_BIT_NV:
5561          flush_bits |= RADV_CMD_FLAG_INV_L2;
5562          break;
5563       case VK_ACCESS_2_SHADER_WRITE_BIT:
5564       case VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT:
5565          /* Since the STORAGE bit isn't set, we know that this is a meta operation.
5566           * On the dst flush side we skip CB/DB flushes without the STORAGE bit, so
5567           * set it here. */
5568          if (image && !(image->vk.usage & VK_IMAGE_USAGE_STORAGE_BIT)) {
5569             if (vk_format_is_depth_or_stencil(image->vk.format)) {
5570                flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB;
5571             } else {
5572                flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB;
5573             }
5574          }
5575 
5576          if (!image_is_coherent)
5577             flush_bits |= RADV_CMD_FLAG_INV_L2;
5578          break;
5579       case VK_ACCESS_2_ACCELERATION_STRUCTURE_WRITE_BIT_KHR:
5580       case VK_ACCESS_2_TRANSFORM_FEEDBACK_WRITE_BIT_EXT:
5581       case VK_ACCESS_2_TRANSFORM_FEEDBACK_COUNTER_WRITE_BIT_EXT:
5582          if (!image_is_coherent)
5583             flush_bits |= RADV_CMD_FLAG_WB_L2;
5584          break;
5585       case VK_ACCESS_2_COLOR_ATTACHMENT_WRITE_BIT:
5586          flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB;
5587          if (has_CB_meta)
5588             flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB_META;
5589          break;
5590       case VK_ACCESS_2_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT:
5591          flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB;
5592          if (has_DB_meta)
5593             flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB_META;
5594          break;
5595       case VK_ACCESS_2_TRANSFER_WRITE_BIT:
5596          flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB | RADV_CMD_FLAG_FLUSH_AND_INV_DB;
5597 
5598          if (!image_is_coherent)
5599             flush_bits |= RADV_CMD_FLAG_INV_L2;
5600          if (has_CB_meta)
5601             flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB_META;
5602          if (has_DB_meta)
5603             flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB_META;
5604          break;
5605       case VK_ACCESS_2_MEMORY_WRITE_BIT:
5606          flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB | RADV_CMD_FLAG_FLUSH_AND_INV_DB;
5607 
5608          if (!image_is_coherent)
5609             flush_bits |= RADV_CMD_FLAG_INV_L2;
5610          if (has_CB_meta)
5611             flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB_META;
5612          if (has_DB_meta)
5613             flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB_META;
5614          break;
5615       default:
5616          break;
5617       }
5618    }
5619    return flush_bits;
5620 }
5621 
5622 enum radv_cmd_flush_bits
5623 radv_dst_access_flush(struct radv_cmd_buffer *cmd_buffer, VkAccessFlags2 dst_flags, const struct radv_image *image)
5624 {
5625    bool has_CB_meta = true, has_DB_meta = true;
5626    enum radv_cmd_flush_bits flush_bits = 0;
5627    bool flush_CB = true, flush_DB = true;
5628    bool image_is_coherent = image ? image->l2_coherent : false;
5629 
5630    if (image) {
5631       if (!(image->vk.usage & VK_IMAGE_USAGE_STORAGE_BIT)) {
5632          flush_CB = false;
5633          flush_DB = false;
5634       }
5635 
5636       if (!radv_image_has_CB_metadata(image))
5637          has_CB_meta = false;
5638       if (!radv_image_has_htile(image))
5639          has_DB_meta = false;
5640    }
5641 
5642    /* None of the L2 invalidations below apply to the CB/DB caches. So if there are no incoherent
5643     * images in the L2 cache in CB/DB mode, the data is already usable by all the other L2 clients. */
5644    image_is_coherent |= can_skip_buffer_l2_flushes(cmd_buffer->device) && !cmd_buffer->state.rb_noncoherent_dirty;
5645 
5646    u_foreach_bit64 (b, dst_flags) {
5647       switch ((VkAccessFlags2)BITFIELD64_BIT(b)) {
5648       case VK_ACCESS_2_INDIRECT_COMMAND_READ_BIT:
5649          /* SMEM loads are used to read compute dispatch size in shaders */
5650          if (!cmd_buffer->device->load_grid_size_from_user_sgpr)
5651             flush_bits |= RADV_CMD_FLAG_INV_SCACHE;
5652 
5653          /* Ensure the DGC meta shader can read the commands. */
5654          if (radv_uses_device_generated_commands(cmd_buffer->device)) {
5655             flush_bits |= RADV_CMD_FLAG_INV_SCACHE | RADV_CMD_FLAG_INV_VCACHE;
5656 
5657             if (cmd_buffer->device->physical_device->rad_info.gfx_level < GFX9)
5658                flush_bits |= RADV_CMD_FLAG_INV_L2;
5659          }
5660 
5661          break;
5662       case VK_ACCESS_2_INDEX_READ_BIT:
5663       case VK_ACCESS_2_TRANSFORM_FEEDBACK_COUNTER_WRITE_BIT_EXT:
5664          break;
5665       case VK_ACCESS_2_UNIFORM_READ_BIT:
5666          flush_bits |= RADV_CMD_FLAG_INV_VCACHE | RADV_CMD_FLAG_INV_SCACHE;
5667          break;
5668       case VK_ACCESS_2_VERTEX_ATTRIBUTE_READ_BIT:
5669       case VK_ACCESS_2_INPUT_ATTACHMENT_READ_BIT:
5670       case VK_ACCESS_2_TRANSFER_READ_BIT:
5671       case VK_ACCESS_2_TRANSFER_WRITE_BIT:
5672          flush_bits |= RADV_CMD_FLAG_INV_VCACHE;
5673 
5674          if (has_CB_meta || has_DB_meta)
5675             flush_bits |= RADV_CMD_FLAG_INV_L2_METADATA;
5676          if (!image_is_coherent)
5677             flush_bits |= RADV_CMD_FLAG_INV_L2;
5678          break;
5679       case VK_ACCESS_2_DESCRIPTOR_BUFFER_READ_BIT_EXT:
5680          flush_bits |= RADV_CMD_FLAG_INV_SCACHE;
5681          break;
5682       case VK_ACCESS_2_SHADER_BINDING_TABLE_READ_BIT_KHR:
5683       case VK_ACCESS_2_SHADER_READ_BIT:
5684       case VK_ACCESS_2_SHADER_STORAGE_READ_BIT:
5685          /* Unlike LLVM, ACO uses SMEM for SSBOs and we have to
5686           * invalidate the scalar cache. */
5687          if (!cmd_buffer->device->physical_device->use_llvm && !image)
5688             flush_bits |= RADV_CMD_FLAG_INV_SCACHE;
5689          FALLTHROUGH;
5690       case VK_ACCESS_2_SHADER_SAMPLED_READ_BIT:
5691          flush_bits |= RADV_CMD_FLAG_INV_VCACHE;
5692          if (has_CB_meta || has_DB_meta)
5693             flush_bits |= RADV_CMD_FLAG_INV_L2_METADATA;
5694          if (!image_is_coherent)
5695             flush_bits |= RADV_CMD_FLAG_INV_L2;
5696          break;
5697       case VK_ACCESS_2_COMMAND_PREPROCESS_READ_BIT_NV:
5698       case VK_ACCESS_2_ACCELERATION_STRUCTURE_READ_BIT_KHR:
5699          flush_bits |= RADV_CMD_FLAG_INV_VCACHE;
5700          if (cmd_buffer->device->physical_device->rad_info.gfx_level < GFX9)
5701             flush_bits |= RADV_CMD_FLAG_INV_L2;
5702          break;
5703       case VK_ACCESS_2_SHADER_WRITE_BIT:
5704       case VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT:
5705       case VK_ACCESS_2_ACCELERATION_STRUCTURE_WRITE_BIT_KHR:
5706          break;
5707       case VK_ACCESS_2_COLOR_ATTACHMENT_READ_BIT:
5708       case VK_ACCESS_2_COLOR_ATTACHMENT_WRITE_BIT:
5709          if (flush_CB)
5710             flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB;
5711          if (has_CB_meta)
5712             flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB_META;
5713          break;
5714       case VK_ACCESS_2_DEPTH_STENCIL_ATTACHMENT_READ_BIT:
5715       case VK_ACCESS_2_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT:
5716          if (flush_DB)
5717             flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB;
5718          if (has_DB_meta)
5719             flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB_META;
5720          break;
5721       case VK_ACCESS_2_MEMORY_READ_BIT:
5722       case VK_ACCESS_2_MEMORY_WRITE_BIT:
5723          flush_bits |= RADV_CMD_FLAG_INV_VCACHE | RADV_CMD_FLAG_INV_SCACHE;
5724          if (!image_is_coherent)
5725             flush_bits |= RADV_CMD_FLAG_INV_L2;
5726          if (flush_CB)
5727             flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB;
5728          if (has_CB_meta)
5729             flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB_META;
5730          if (flush_DB)
5731             flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB;
5732          if (has_DB_meta)
5733             flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB_META;
5734          break;
5735       default:
5736          break;
5737       }
5738    }
5739    return flush_bits;
5740 }
5741 
5742 void
5743 radv_emit_resolve_barrier(struct radv_cmd_buffer *cmd_buffer, const struct radv_resolve_barrier *barrier)
5744 {
5745    struct radv_rendering_state *render = &cmd_buffer->state.render;
5746 
5747    for (uint32_t i = 0; i < render->color_att_count; i++) {
5748       struct radv_image_view *iview = render->color_att[i].iview;
5749       if (!iview)
5750          continue;
5751 
5752       cmd_buffer->state.flush_bits |= radv_src_access_flush(cmd_buffer, barrier->src_access_mask, iview->image);
5753    }
5754    if (render->ds_att.iview) {
5755       cmd_buffer->state.flush_bits |=
5756          radv_src_access_flush(cmd_buffer, barrier->src_access_mask, render->ds_att.iview->image);
5757    }
5758 
5759    radv_stage_flush(cmd_buffer, barrier->src_stage_mask);
5760 
5761    for (uint32_t i = 0; i < render->color_att_count; i++) {
5762       struct radv_image_view *iview = render->color_att[i].iview;
5763       if (!iview)
5764          continue;
5765 
5766       cmd_buffer->state.flush_bits |= radv_dst_access_flush(cmd_buffer, barrier->dst_access_mask, iview->image);
5767    }
5768    if (render->ds_att.iview) {
5769       cmd_buffer->state.flush_bits |=
5770          radv_dst_access_flush(cmd_buffer, barrier->dst_access_mask, render->ds_att.iview->image);
5771    }
5772 
5773    radv_gang_barrier(cmd_buffer, barrier->src_stage_mask, barrier->dst_stage_mask);
5774 }
5775 
5776 static void
5777 radv_handle_image_transition_separate(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image,
5778                                       VkImageLayout src_layout, VkImageLayout dst_layout,
5779                                       VkImageLayout src_stencil_layout, VkImageLayout dst_stencil_layout,
5780                                       uint32_t src_family_index, uint32_t dst_family_index,
5781                                       const VkImageSubresourceRange *range,
5782                                       struct radv_sample_locations_state *sample_locs)
5783 {
5784    /* If we have a stencil layout that's different from depth, we need to
5785     * perform the stencil transition separately.
5786     */
5787    if ((range->aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT) &&
5788        (src_layout != src_stencil_layout || dst_layout != dst_stencil_layout)) {
5789       VkImageSubresourceRange aspect_range = *range;
5790       /* Depth-only transitions. */
5791       if (range->aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT) {
5792          aspect_range.aspectMask = VK_IMAGE_ASPECT_DEPTH_BIT;
5793          radv_handle_image_transition(cmd_buffer, image, src_layout, dst_layout, src_family_index, dst_family_index,
5794                                       &aspect_range, sample_locs);
5795       }
5796 
5797       /* Stencil-only transitions. */
5798       aspect_range.aspectMask = VK_IMAGE_ASPECT_STENCIL_BIT;
5799       radv_handle_image_transition(cmd_buffer, image, src_stencil_layout, dst_stencil_layout, src_family_index,
5800                                    dst_family_index, &aspect_range, sample_locs);
5801    } else {
5802       radv_handle_image_transition(cmd_buffer, image, src_layout, dst_layout, src_family_index, dst_family_index, range,
5803                                    sample_locs);
5804    }
5805 }
5806 
5807 static void
5808 radv_handle_rendering_image_transition(struct radv_cmd_buffer *cmd_buffer, struct radv_image_view *view,
5809                                        uint32_t layer_count, uint32_t view_mask, VkImageLayout initial_layout,
5810                                        VkImageLayout initial_stencil_layout, VkImageLayout final_layout,
5811                                        VkImageLayout final_stencil_layout,
5812                                        struct radv_sample_locations_state *sample_locs)
5813 {
5814    VkImageSubresourceRange range;
5815    range.aspectMask = view->image->vk.aspects;
5816    range.baseMipLevel = view->vk.base_mip_level;
5817    range.levelCount = 1;
5818 
5819    if (view_mask) {
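      /* Multiview: transition each contiguous run of layers selected by the view mask. */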
5820       while (view_mask) {
5821          int start, count;
5822          u_bit_scan_consecutive_range(&view_mask, &start, &count);
5823 
5824          range.baseArrayLayer = view->vk.base_array_layer + start;
5825          range.layerCount = count;
5826 
5827          radv_handle_image_transition_separate(cmd_buffer, view->image, initial_layout, final_layout,
5828                                                initial_stencil_layout, final_stencil_layout, 0, 0, &range, sample_locs);
5829       }
5830    } else {
5831       range.baseArrayLayer = view->vk.base_array_layer;
5832       range.layerCount = layer_count;
5833       radv_handle_image_transition_separate(cmd_buffer, view->image, initial_layout, final_layout,
5834                                             initial_stencil_layout, final_stencil_layout, 0, 0, &range, sample_locs);
5835    }
5836 }
5837 
5838 VKAPI_ATTR VkResult VKAPI_CALL
5839 radv_BeginCommandBuffer(VkCommandBuffer commandBuffer, const VkCommandBufferBeginInfo *pBeginInfo)
5840 {
5841    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5842    VkResult result = VK_SUCCESS;
5843 
5844    vk_command_buffer_begin(&cmd_buffer->vk, pBeginInfo);
5845 
5846    if (cmd_buffer->qf == RADV_QUEUE_SPARSE)
5847       return result;
5848 
5849    memset(&cmd_buffer->state, 0, sizeof(cmd_buffer->state));
5850    cmd_buffer->state.last_index_type = -1;
5851    cmd_buffer->state.last_num_instances = -1;
5852    cmd_buffer->state.last_vertex_offset_valid = false;
5853    cmd_buffer->state.last_first_instance = -1;
5854    cmd_buffer->state.last_drawid = -1;
5855    cmd_buffer->state.last_subpass_color_count = MAX_RTS;
5856    cmd_buffer->state.predication_type = -1;
5857    cmd_buffer->state.last_sx_ps_downconvert = -1;
5858    cmd_buffer->state.last_sx_blend_opt_epsilon = -1;
5859    cmd_buffer->state.last_sx_blend_opt_control = -1;
5860    cmd_buffer->state.mesh_shading = false;
5861    cmd_buffer->state.last_vrs_rates = -1;
5862    cmd_buffer->state.last_vrs_rates_sgpr_idx = -1;
5863    cmd_buffer->state.last_pa_sc_binner_cntl_0 = -1;
5864    cmd_buffer->state.last_db_count_control = -1;
5865    cmd_buffer->state.last_db_shader_control = -1;
5866    cmd_buffer->usage_flags = pBeginInfo->flags;
5867 
5868    cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_ALL | RADV_CMD_DIRTY_GUARDBAND | RADV_CMD_DIRTY_OCCLUSION_QUERY |
5869                               RADV_CMD_DIRTY_DB_SHADER_CONTROL;
5870 
5871    if (cmd_buffer->qf == RADV_QUEUE_GENERAL)
5872       vk_dynamic_graphics_state_init(&cmd_buffer->state.dynamic.vk);
5873 
5874    if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX7) {
5875       uint32_t pred_value = 0;
5876       uint32_t pred_offset;
5877       if (!radv_cmd_buffer_upload_data(cmd_buffer, 4, &pred_value, &pred_offset))
5878          vk_command_buffer_set_error(&cmd_buffer->vk, VK_ERROR_OUT_OF_HOST_MEMORY);
5879 
5880       cmd_buffer->mec_inv_pred_emitted = false;
5881       cmd_buffer->mec_inv_pred_va = radv_buffer_get_va(cmd_buffer->upload.upload_bo) + pred_offset;
5882    }
5883 
5884    if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX9 && cmd_buffer->qf == RADV_QUEUE_GENERAL) {
5885       unsigned num_db = cmd_buffer->device->physical_device->rad_info.max_render_backends;
5886       unsigned fence_offset, eop_bug_offset;
5887       void *fence_ptr;
5888 
5889       radv_cmd_buffer_upload_alloc(cmd_buffer, 8, &fence_offset, &fence_ptr);
5890       memset(fence_ptr, 0, 8);
5891 
5892       cmd_buffer->gfx9_fence_va = radv_buffer_get_va(cmd_buffer->upload.upload_bo);
5893       cmd_buffer->gfx9_fence_va += fence_offset;
5894 
5895       radv_emit_clear_data(cmd_buffer, V_370_PFP, cmd_buffer->gfx9_fence_va, 8);
5896 
5897       if (cmd_buffer->device->physical_device->rad_info.gfx_level == GFX9) {
5898          /* Allocate a buffer for the EOP bug on GFX9. */
5899          radv_cmd_buffer_upload_alloc(cmd_buffer, 16 * num_db, &eop_bug_offset, &fence_ptr);
5900          memset(fence_ptr, 0, 16 * num_db);
5901          cmd_buffer->gfx9_eop_bug_va = radv_buffer_get_va(cmd_buffer->upload.upload_bo);
5902          cmd_buffer->gfx9_eop_bug_va += eop_bug_offset;
5903 
5904          radv_emit_clear_data(cmd_buffer, V_370_PFP, cmd_buffer->gfx9_eop_bug_va, 16 * num_db);
5905       }
5906    }
5907 
5908    if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY &&
5909        (pBeginInfo->flags & VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT)) {
5910 
5911       char gcbiar_data[VK_GCBIARR_DATA_SIZE(MAX_RTS)];
5912       const VkRenderingInfo *resume_info =
5913          vk_get_command_buffer_inheritance_as_rendering_resume(cmd_buffer->vk.level, pBeginInfo, gcbiar_data);
5914       if (resume_info) {
5915          radv_CmdBeginRendering(commandBuffer, resume_info);
5916       } else {
5917          const VkCommandBufferInheritanceRenderingInfo *inheritance_info =
5918             vk_get_command_buffer_inheritance_rendering_info(cmd_buffer->vk.level, pBeginInfo);
5919 
5920          radv_cmd_buffer_reset_rendering(cmd_buffer);
5921          struct radv_rendering_state *render = &cmd_buffer->state.render;
5922          render->active = true;
5923          render->view_mask = inheritance_info->viewMask;
5924          render->max_samples = inheritance_info->rasterizationSamples;
5925          render->color_att_count = inheritance_info->colorAttachmentCount;
5926          for (uint32_t i = 0; i < render->color_att_count; i++) {
5927             render->color_att[i] = (struct radv_attachment){
5928                .format = inheritance_info->pColorAttachmentFormats[i],
5929             };
5930          }
5931          assert(inheritance_info->depthAttachmentFormat == VK_FORMAT_UNDEFINED ||
5932                 inheritance_info->stencilAttachmentFormat == VK_FORMAT_UNDEFINED ||
5933                 inheritance_info->depthAttachmentFormat == inheritance_info->stencilAttachmentFormat);
5934          render->ds_att = (struct radv_attachment){.iview = NULL};
5935          if (inheritance_info->depthAttachmentFormat != VK_FORMAT_UNDEFINED)
5936             render->ds_att.format = inheritance_info->depthAttachmentFormat;
5937          if (inheritance_info->stencilAttachmentFormat != VK_FORMAT_UNDEFINED)
5938             render->ds_att.format = inheritance_info->stencilAttachmentFormat;
5939 
5940          if (vk_format_has_depth(render->ds_att.format))
5941             render->ds_att_aspects |= VK_IMAGE_ASPECT_DEPTH_BIT;
5942          if (vk_format_has_stencil(render->ds_att.format))
5943             render->ds_att_aspects |= VK_IMAGE_ASPECT_STENCIL_BIT;
5944       }
5945 
5946       cmd_buffer->state.inherited_pipeline_statistics = pBeginInfo->pInheritanceInfo->pipelineStatistics;
5947 
5948       if (cmd_buffer->state.inherited_pipeline_statistics &
5949           (VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_PRIMITIVES_BIT |
5950            VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_INVOCATIONS_BIT))
5951          cmd_buffer->state.dirty |= RADV_CMD_DIRTY_SHADER_QUERY;
5952 
5953       cmd_buffer->state.inherited_occlusion_queries = pBeginInfo->pInheritanceInfo->occlusionQueryEnable;
5954       cmd_buffer->state.inherited_query_control_flags = pBeginInfo->pInheritanceInfo->queryFlags;
5955       if (cmd_buffer->state.inherited_occlusion_queries)
5956          cmd_buffer->state.dirty |= RADV_CMD_DIRTY_OCCLUSION_QUERY;
5957    }
5958 
5959    if (radv_device_fault_detection_enabled(cmd_buffer->device))
5960       radv_cmd_buffer_trace_emit(cmd_buffer);
5961 
5962    radv_describe_begin_cmd_buffer(cmd_buffer);
5963 
5964    return result;
5965 }
5966 
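/*
 * Illustrative sketch (editorial addition, application-side pseudo-usage):
 * the structures consumed by the RENDER_PASS_CONTINUE path above when a
 * secondary command buffer inherits a dynamic rendering instance.  All
 * formats, sample counts and omitted inheritance-info fields are example
 * data, not driver requirements.
 */
static inline void
radv_example_begin_secondary(VkCommandBuffer secondary)
{
   const VkFormat color_format = VK_FORMAT_B8G8R8A8_UNORM;

   const VkCommandBufferInheritanceRenderingInfo rendering_info = {
      .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_INHERITANCE_RENDERING_INFO,
      .viewMask = 0,
      .colorAttachmentCount = 1,
      .pColorAttachmentFormats = &color_format,
      .depthAttachmentFormat = VK_FORMAT_D32_SFLOAT,
      .stencilAttachmentFormat = VK_FORMAT_UNDEFINED,
      .rasterizationSamples = VK_SAMPLE_COUNT_1_BIT,
   };

   const VkCommandBufferInheritanceInfo inheritance = {
      .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_INHERITANCE_INFO,
      .pNext = &rendering_info,
      .occlusionQueryEnable = VK_FALSE,
   };

   const VkCommandBufferBeginInfo begin_info = {
      .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO,
      .flags = VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT,
      .pInheritanceInfo = &inheritance,
   };

   vkBeginCommandBuffer(secondary, &begin_info);
}
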
5967 VKAPI_ATTR void VKAPI_CALL
5968 radv_CmdBindVertexBuffers2(VkCommandBuffer commandBuffer, uint32_t firstBinding, uint32_t bindingCount,
5969                            const VkBuffer *pBuffers, const VkDeviceSize *pOffsets, const VkDeviceSize *pSizes,
5970                            const VkDeviceSize *pStrides)
5971 {
5972    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5973    struct radv_vertex_binding *vb = cmd_buffer->vertex_bindings;
5974    const struct radv_vs_input_state *state = &cmd_buffer->state.dynamic_vs_input;
5975 
5976    /* We have to defer setting up the vertex buffer descriptors since we
5977     * need the buffer stride from the pipeline. */
5978 
5979    assert(firstBinding + bindingCount <= MAX_VBS);
5980    enum amd_gfx_level chip = cmd_buffer->device->physical_device->rad_info.gfx_level;
5981 
5982    if (firstBinding + bindingCount > cmd_buffer->used_vertex_bindings)
5983       cmd_buffer->used_vertex_bindings = firstBinding + bindingCount;
5984 
5985    uint32_t misaligned_mask_invalid = 0;
5986 
5987    for (uint32_t i = 0; i < bindingCount; i++) {
5988       RADV_FROM_HANDLE(radv_buffer, buffer, pBuffers[i]);
5989       uint32_t idx = firstBinding + i;
5990       VkDeviceSize size = pSizes ? pSizes[i] : 0;
5991       /* If pStrides is NULL, don't overwrite the strides specified by vkCmdSetVertexInputEXT(). */
5992       VkDeviceSize stride = pStrides ? pStrides[i] : vb[idx].stride;
5993 
5994       if (!!cmd_buffer->vertex_binding_buffers[idx] != !!buffer ||
5995           (buffer && ((vb[idx].offset & 0x3) != (pOffsets[i] & 0x3) || (vb[idx].stride & 0x3) != (stride & 0x3)))) {
5996          misaligned_mask_invalid |= state->bindings_match_attrib ? BITFIELD_BIT(idx) : 0xffffffff;
5997       }
5998 
5999       cmd_buffer->vertex_binding_buffers[idx] = buffer;
6000       vb[idx].offset = pOffsets[i];
6001       vb[idx].size = buffer ? vk_buffer_range(&buffer->vk, pOffsets[i], size) : size;
6002       vb[idx].stride = stride;
6003 
6004       uint32_t bit = BITFIELD_BIT(idx);
6005       if (buffer) {
6006          radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, cmd_buffer->vertex_binding_buffers[idx]->bo);
6007          cmd_buffer->state.vbo_bound_mask |= bit;
6008       } else {
6009          cmd_buffer->state.vbo_bound_mask &= ~bit;
6010       }
6011    }
6012 
6013    if ((chip == GFX6 || chip >= GFX10) && misaligned_mask_invalid) {
6014       cmd_buffer->state.vbo_misaligned_mask_invalid = misaligned_mask_invalid;
6015       cmd_buffer->state.vbo_misaligned_mask &= ~misaligned_mask_invalid;
6016    }
6017 
6018    cmd_buffer->state.dirty |= RADV_CMD_DIRTY_VERTEX_BUFFER | RADV_CMD_DIRTY_DYNAMIC_VERTEX_INPUT;
6019 }
6020 
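/*
 * Illustrative sketch (editorial addition, hypothetical helper): the check
 * radv_CmdBindVertexBuffers2() uses above to decide whether the per-attribute
 * misalignment tracking has to be recomputed.  Only the low two bits of the
 * binding offset and stride matter, because GFX6 and GFX10+ select a
 * different vertex-fetch path for attributes that are not dword-aligned.
 */
static inline bool
radv_example_vbo_alignment_changed(VkDeviceSize old_offset, VkDeviceSize new_offset, VkDeviceSize old_stride,
                                   VkDeviceSize new_stride)
{
   return (old_offset & 0x3) != (new_offset & 0x3) || (old_stride & 0x3) != (new_stride & 0x3);
}
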
6021 static uint32_t
6022 vk_to_index_type(VkIndexType type)
6023 {
6024    switch (type) {
6025    case VK_INDEX_TYPE_UINT8_KHR:
6026       return V_028A7C_VGT_INDEX_8;
6027    case VK_INDEX_TYPE_UINT16:
6028       return V_028A7C_VGT_INDEX_16;
6029    case VK_INDEX_TYPE_UINT32:
6030       return V_028A7C_VGT_INDEX_32;
6031    default:
6032       unreachable("invalid index type");
6033    }
6034 }
6035 
6036 uint32_t
6037 radv_get_vgt_index_size(uint32_t type)
6038 {
6039    uint32_t index_type = G_028A7C_INDEX_TYPE(type);
6040    switch (index_type) {
6041    case V_028A7C_VGT_INDEX_8:
6042       return 1;
6043    case V_028A7C_VGT_INDEX_16:
6044       return 2;
6045    case V_028A7C_VGT_INDEX_32:
6046       return 4;
6047    default:
6048       unreachable("invalid index type");
6049    }
6050 }
6051 
6052 VKAPI_ATTR void VKAPI_CALL
6053 radv_CmdBindIndexBuffer2KHR(VkCommandBuffer commandBuffer, VkBuffer buffer, VkDeviceSize offset, VkDeviceSize size,
6054                             VkIndexType indexType)
6055 {
6056    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
6057    RADV_FROM_HANDLE(radv_buffer, index_buffer, buffer);
6058 
6059    cmd_buffer->state.index_type = vk_to_index_type(indexType);
6060 
6061    if (index_buffer) {
6062       cmd_buffer->state.index_va = radv_buffer_get_va(index_buffer->bo);
6063       cmd_buffer->state.index_va += index_buffer->offset + offset;
6064 
6065       int index_size = radv_get_vgt_index_size(vk_to_index_type(indexType));
6066       cmd_buffer->state.max_index_count = (vk_buffer_range(&index_buffer->vk, offset, size)) / index_size;
6067       radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, index_buffer->bo);
6068    } else {
6069       cmd_buffer->state.index_va = 0;
6070       cmd_buffer->state.max_index_count = 0;
6071    }
6072 
6073    cmd_buffer->state.dirty |= RADV_CMD_DIRTY_INDEX_BUFFER;
6074 
6075    /* Primitive restart state depends on the index type. */
6076    if (cmd_buffer->state.dynamic.vk.ia.primitive_restart_enable)
6077       cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_PRIMITIVE_RESTART_ENABLE;
6078 }
6079 
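/*
 * Illustrative sketch (editorial addition, hypothetical helper): the
 * max_index_count math done by radv_CmdBindIndexBuffer2KHR() above.  For a
 * 64 KiB index buffer bound at offset 16 with VK_INDEX_TYPE_UINT16 and
 * size = VK_WHOLE_SIZE, the addressable range is 65536 - 16 = 65520 bytes,
 * i.e. 32760 indices, starting at buffer_va + buffer->offset + 16.
 */
static inline uint32_t
radv_example_max_index_count(VkDeviceSize buffer_size, VkDeviceSize bind_offset, VkIndexType index_type)
{
   /* vk_buffer_range() resolves VK_WHOLE_SIZE to "the rest of the buffer";
    * the explicit-size case is omitted here for brevity.
    */
   const uint32_t index_size = index_type == VK_INDEX_TYPE_UINT32 ? 4 : index_type == VK_INDEX_TYPE_UINT16 ? 2 : 1;

   return (buffer_size - bind_offset) / index_size;
}
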
6080 static void
6081 radv_bind_descriptor_set(struct radv_cmd_buffer *cmd_buffer, VkPipelineBindPoint bind_point,
6082                          struct radv_descriptor_set *set, unsigned idx)
6083 {
6084    struct radeon_winsys *ws = cmd_buffer->device->ws;
6085 
6086    radv_set_descriptor_set(cmd_buffer, bind_point, set, idx);
6087 
6088    assert(set);
6089    assert(!(set->header.layout->flags & VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR));
6090 
6091    if (!cmd_buffer->device->use_global_bo_list) {
6092       for (unsigned j = 0; j < set->header.buffer_count; ++j)
6093          if (set->descriptors[j])
6094             radv_cs_add_buffer(ws, cmd_buffer->cs, set->descriptors[j]);
6095    }
6096 
6097    if (set->header.bo)
6098       radv_cs_add_buffer(ws, cmd_buffer->cs, set->header.bo);
6099 }
6100 
6101 static void
6102 radv_bind_descriptor_sets(struct radv_cmd_buffer *cmd_buffer,
6103                           const VkBindDescriptorSetsInfoKHR *pBindDescriptorSetsInfo, VkPipelineBindPoint bind_point)
6104 {
6105    RADV_FROM_HANDLE(radv_pipeline_layout, layout, pBindDescriptorSetsInfo->layout);
6106    const bool no_dynamic_bounds = cmd_buffer->device->instance->debug_flags & RADV_DEBUG_NO_DYNAMIC_BOUNDS;
6107    struct radv_descriptor_state *descriptors_state = radv_get_descriptors_state(cmd_buffer, bind_point);
6108    unsigned dyn_idx = 0;
6109 
6110    for (unsigned i = 0; i < pBindDescriptorSetsInfo->descriptorSetCount; ++i) {
6111       unsigned set_idx = i + pBindDescriptorSetsInfo->firstSet;
6112       RADV_FROM_HANDLE(radv_descriptor_set, set, pBindDescriptorSetsInfo->pDescriptorSets[i]);
6113 
6114       if (!set)
6115          continue;
6116 
6117       /* If the set is already bound, we only need to update the
6118        * (potentially changed) dynamic offsets. */
6119       if (descriptors_state->sets[set_idx] != set || !(descriptors_state->valid & (1u << set_idx))) {
6120          radv_bind_descriptor_set(cmd_buffer, bind_point, set, set_idx);
6121       }
6122 
6123       for (unsigned j = 0; j < set->header.layout->dynamic_offset_count; ++j, ++dyn_idx) {
6124          unsigned idx = j + layout->set[i + pBindDescriptorSetsInfo->firstSet].dynamic_offset_start;
6125          uint32_t *dst = descriptors_state->dynamic_buffers + idx * 4;
6126          assert(dyn_idx < pBindDescriptorSetsInfo->dynamicOffsetCount);
6127 
6128          struct radv_descriptor_range *range = set->header.dynamic_descriptors + j;
6129 
6130          if (!range->va) {
6131             memset(dst, 0, 4 * 4);
6132          } else {
6133             uint64_t va = range->va + pBindDescriptorSetsInfo->pDynamicOffsets[dyn_idx];
6134             dst[0] = va;
6135             dst[1] = S_008F04_BASE_ADDRESS_HI(va >> 32);
6136             dst[2] = no_dynamic_bounds ? 0xffffffffu : range->size;
6137             dst[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
6138                      S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W);
6139 
6140             if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX11) {
6141                dst[3] |= S_008F0C_FORMAT(V_008F0C_GFX11_FORMAT_32_FLOAT) | S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW);
6142             } else if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX10) {
6143                dst[3] |= S_008F0C_FORMAT(V_008F0C_GFX10_FORMAT_32_FLOAT) |
6144                          S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) | S_008F0C_RESOURCE_LEVEL(1);
6145             } else {
6146                dst[3] |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
6147                          S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
6148             }
6149          }
6150 
6151          cmd_buffer->push_constant_stages |= set->header.layout->dynamic_shader_stages;
6152       }
6153    }
6154 }
6155 
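/*
 * Illustrative sketch (editorial addition, application-side pseudo-usage):
 * how the flat pDynamicOffsets array consumed above lines up with the bound
 * sets.  Offsets are consumed in set order and, within a set, in binding
 * order, which is exactly how dyn_idx advances in radv_bind_descriptor_sets().
 * The layout, sets and offset values are example data.
 */
static inline void
radv_example_bind_sets_with_dynamic_offsets(VkCommandBuffer cmd_buffer, VkPipelineLayout layout,
                                            const VkDescriptorSet sets[2])
{
   /* Set 0 has one dynamic UBO and set 1 has two, so three offsets total. */
   const uint32_t dynamic_offsets[3] = {0, 256, 512};

   const VkBindDescriptorSetsInfoKHR info = {
      .sType = VK_STRUCTURE_TYPE_BIND_DESCRIPTOR_SETS_INFO_KHR,
      .stageFlags = VK_SHADER_STAGE_ALL_GRAPHICS,
      .layout = layout,
      .firstSet = 0,
      .descriptorSetCount = 2,
      .pDescriptorSets = sets,
      .dynamicOffsetCount = 3,
      .pDynamicOffsets = dynamic_offsets,
   };

   vkCmdBindDescriptorSets2KHR(cmd_buffer, &info);
}
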
6156 VKAPI_ATTR void VKAPI_CALL
6157 radv_CmdBindDescriptorSets2KHR(VkCommandBuffer commandBuffer,
6158                                const VkBindDescriptorSetsInfoKHR *pBindDescriptorSetsInfo)
6159 {
6160    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
6161 
6162    if (pBindDescriptorSetsInfo->stageFlags & VK_SHADER_STAGE_COMPUTE_BIT) {
6163       radv_bind_descriptor_sets(cmd_buffer, pBindDescriptorSetsInfo, VK_PIPELINE_BIND_POINT_COMPUTE);
6164    }
6165 
6166    if (pBindDescriptorSetsInfo->stageFlags & RADV_GRAPHICS_STAGE_BITS) {
6167       radv_bind_descriptor_sets(cmd_buffer, pBindDescriptorSetsInfo, VK_PIPELINE_BIND_POINT_GRAPHICS);
6168    }
6169 
6170    if (pBindDescriptorSetsInfo->stageFlags & RADV_RT_STAGE_BITS) {
6171       radv_bind_descriptor_sets(cmd_buffer, pBindDescriptorSetsInfo, VK_PIPELINE_BIND_POINT_RAY_TRACING_KHR);
6172    }
6173 }
6174 
6175 static bool
6176 radv_init_push_descriptor_set(struct radv_cmd_buffer *cmd_buffer, struct radv_descriptor_set *set,
6177                               struct radv_descriptor_set_layout *layout, VkPipelineBindPoint bind_point)
6178 {
6179    struct radv_descriptor_state *descriptors_state = radv_get_descriptors_state(cmd_buffer, bind_point);
6180    set->header.size = layout->size;
6181 
6182    if (set->header.layout != layout) {
6183       if (set->header.layout)
6184          vk_descriptor_set_layout_unref(&cmd_buffer->device->vk, &set->header.layout->vk);
6185       vk_descriptor_set_layout_ref(&layout->vk);
6186       set->header.layout = layout;
6187    }
6188 
6189    if (descriptors_state->push_set.capacity < set->header.size) {
6190       size_t new_size = MAX2(set->header.size, 1024);
6191       new_size = MAX2(new_size, 2 * descriptors_state->push_set.capacity);
6192       new_size = MIN2(new_size, 96 * MAX_PUSH_DESCRIPTORS);
6193 
6194       free(set->header.mapped_ptr);
6195       set->header.mapped_ptr = malloc(new_size);
6196 
6197       if (!set->header.mapped_ptr) {
6198          descriptors_state->push_set.capacity = 0;
6199          vk_command_buffer_set_error(&cmd_buffer->vk, VK_ERROR_OUT_OF_HOST_MEMORY);
6200          return false;
6201       }
6202 
6203       descriptors_state->push_set.capacity = new_size;
6204    }
6205 
6206    return true;
6207 }
6208 
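/*
 * Illustrative sketch (editorial addition, hypothetical helper): the growth
 * policy used by radv_init_push_descriptor_set() above for the host copy of a
 * push descriptor set.  The storage starts at no less than 1 KiB, at least
 * doubles on each growth, and is capped at the 96 * MAX_PUSH_DESCRIPTORS
 * worst case used above.  E.g. required = 1500 with a current capacity of
 * 1024 grows to 2048 bytes.
 */
static inline size_t
radv_example_push_set_capacity(size_t required, size_t current_capacity)
{
   size_t new_size = MAX2(required, (size_t)1024);
   new_size = MAX2(new_size, 2 * current_capacity);
   return MIN2(new_size, (size_t)96 * MAX_PUSH_DESCRIPTORS);
}
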
6209 void
6210 radv_meta_push_descriptor_set(struct radv_cmd_buffer *cmd_buffer, VkPipelineBindPoint pipelineBindPoint,
6211                               VkPipelineLayout _layout, uint32_t set, uint32_t descriptorWriteCount,
6212                               const VkWriteDescriptorSet *pDescriptorWrites)
6213 {
6214    RADV_FROM_HANDLE(radv_pipeline_layout, layout, _layout);
6215    struct radv_descriptor_set *push_set = (struct radv_descriptor_set *)&cmd_buffer->meta_push_descriptors;
6216    unsigned bo_offset;
6217 
6218    assert(set == 0);
6219    assert(layout->set[set].layout->flags & VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR);
6220 
6221    push_set->header.size = layout->set[set].layout->size;
6222    push_set->header.layout = layout->set[set].layout;
6223 
6224    if (!radv_cmd_buffer_upload_alloc(cmd_buffer, push_set->header.size, &bo_offset,
6225                                      (void **)&push_set->header.mapped_ptr))
6226       return;
6227 
6228    push_set->header.va = radv_buffer_get_va(cmd_buffer->upload.upload_bo);
6229    push_set->header.va += bo_offset;
6230 
6231    radv_cmd_update_descriptor_sets(cmd_buffer->device, cmd_buffer, radv_descriptor_set_to_handle(push_set),
6232                                    descriptorWriteCount, pDescriptorWrites, 0, NULL);
6233 
6234    radv_set_descriptor_set(cmd_buffer, pipelineBindPoint, push_set, set);
6235 }
6236 
6237 static void
6238 radv_push_descriptor_set(struct radv_cmd_buffer *cmd_buffer, const VkPushDescriptorSetInfoKHR *pPushDescriptorSetInfo,
6239                          VkPipelineBindPoint bind_point)
6240 {
6241    RADV_FROM_HANDLE(radv_pipeline_layout, layout, pPushDescriptorSetInfo->layout);
6242    struct radv_descriptor_state *descriptors_state = radv_get_descriptors_state(cmd_buffer, bind_point);
6243    struct radv_descriptor_set *push_set = (struct radv_descriptor_set *)&descriptors_state->push_set.set;
6244 
6245    assert(layout->set[pPushDescriptorSetInfo->set].layout->flags &
6246           VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR);
6247 
6248    if (!radv_init_push_descriptor_set(cmd_buffer, push_set, layout->set[pPushDescriptorSetInfo->set].layout,
6249                                       bind_point))
6250       return;
6251 
6252    /* Check that there are no inline uniform block updates when calling vkCmdPushDescriptorSetKHR()
6253     * because they are invalid according to the Vulkan spec.
6254     */
6255    for (int i = 0; i < pPushDescriptorSetInfo->descriptorWriteCount; i++) {
6256       ASSERTED const VkWriteDescriptorSet *writeset = &pPushDescriptorSetInfo->pDescriptorWrites[i];
6257       assert(writeset->descriptorType != VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK);
6258    }
6259 
6260    radv_cmd_update_descriptor_sets(cmd_buffer->device, cmd_buffer, radv_descriptor_set_to_handle(push_set),
6261                                    pPushDescriptorSetInfo->descriptorWriteCount,
6262                                    pPushDescriptorSetInfo->pDescriptorWrites, 0, NULL);
6263 
6264    radv_set_descriptor_set(cmd_buffer, bind_point, push_set, pPushDescriptorSetInfo->set);
6265 
6266    radv_flush_push_descriptors(cmd_buffer, descriptors_state);
6267 }
6268 
6269 VKAPI_ATTR void VKAPI_CALL
6270 radv_CmdPushDescriptorSet2KHR(VkCommandBuffer commandBuffer, const VkPushDescriptorSetInfoKHR *pPushDescriptorSetInfo)
6271 {
6272    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
6273 
6274    if (pPushDescriptorSetInfo->stageFlags & VK_SHADER_STAGE_COMPUTE_BIT) {
6275       radv_push_descriptor_set(cmd_buffer, pPushDescriptorSetInfo, VK_PIPELINE_BIND_POINT_COMPUTE);
6276    }
6277 
6278    if (pPushDescriptorSetInfo->stageFlags & RADV_GRAPHICS_STAGE_BITS) {
6279       radv_push_descriptor_set(cmd_buffer, pPushDescriptorSetInfo, VK_PIPELINE_BIND_POINT_GRAPHICS);
6280    }
6281 
6282    if (pPushDescriptorSetInfo->stageFlags & RADV_RT_STAGE_BITS) {
6283       radv_push_descriptor_set(cmd_buffer, pPushDescriptorSetInfo, VK_PIPELINE_BIND_POINT_RAY_TRACING_KHR);
6284    }
6285 }
6286 
6287 VKAPI_ATTR void VKAPI_CALL
6288 radv_CmdPushDescriptorSetWithTemplate2KHR(
6289    VkCommandBuffer commandBuffer, const VkPushDescriptorSetWithTemplateInfoKHR *pPushDescriptorSetWithTemplateInfo)
6290 {
6291    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
6292    RADV_FROM_HANDLE(radv_pipeline_layout, layout, pPushDescriptorSetWithTemplateInfo->layout);
6293    RADV_FROM_HANDLE(radv_descriptor_update_template, templ,
6294                     pPushDescriptorSetWithTemplateInfo->descriptorUpdateTemplate);
6295    struct radv_descriptor_state *descriptors_state = radv_get_descriptors_state(cmd_buffer, templ->bind_point);
6296    struct radv_descriptor_set *push_set = (struct radv_descriptor_set *)&descriptors_state->push_set.set;
6297 
6298    assert(layout->set[pPushDescriptorSetWithTemplateInfo->set].layout->flags &
6299           VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR);
6300 
6301    if (!radv_init_push_descriptor_set(cmd_buffer, push_set, layout->set[pPushDescriptorSetWithTemplateInfo->set].layout,
6302                                       templ->bind_point))
6303       return;
6304 
6305    radv_cmd_update_descriptor_set_with_template(cmd_buffer->device, cmd_buffer, push_set,
6306                                                 pPushDescriptorSetWithTemplateInfo->descriptorUpdateTemplate,
6307                                                 pPushDescriptorSetWithTemplateInfo->pData);
6308 
6309    radv_set_descriptor_set(cmd_buffer, templ->bind_point, push_set, pPushDescriptorSetWithTemplateInfo->set);
6310 
6311    radv_flush_push_descriptors(cmd_buffer, descriptors_state);
6312 }
6313 
6314 VKAPI_ATTR void VKAPI_CALL
6315 radv_CmdPushConstants2KHR(VkCommandBuffer commandBuffer, const VkPushConstantsInfoKHR *pPushConstantsInfo)
6316 {
6317    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
6318    memcpy(cmd_buffer->push_constants + pPushConstantsInfo->offset, pPushConstantsInfo->pValues,
6319           pPushConstantsInfo->size);
6320    cmd_buffer->push_constant_stages |= pPushConstantsInfo->stageFlags;
6321 }
6322 
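/*
 * Illustrative sketch (editorial addition, application-side pseudo-usage):
 * the call that lands in radv_CmdPushConstants2KHR() above.  The driver only
 * memcpy()s the bytes into its CPU-side cache and records the affected
 * stages; the actual upload is deferred to the next draw/dispatch that
 * flushes push constants.  All values are example data.
 */
static inline void
radv_example_push_constants(VkCommandBuffer cmd_buffer, VkPipelineLayout layout)
{
   const float color[4] = {1.0f, 0.5f, 0.25f, 1.0f};

   const VkPushConstantsInfoKHR info = {
      .sType = VK_STRUCTURE_TYPE_PUSH_CONSTANTS_INFO_KHR,
      .layout = layout,
      .stageFlags = VK_SHADER_STAGE_FRAGMENT_BIT,
      .offset = 0,
      .size = sizeof(color),
      .pValues = color,
   };

   vkCmdPushConstants2KHR(cmd_buffer, &info);
}
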
6323 VKAPI_ATTR VkResult VKAPI_CALL
6324 radv_EndCommandBuffer(VkCommandBuffer commandBuffer)
6325 {
6326    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
6327 
6328    if (cmd_buffer->qf == RADV_QUEUE_SPARSE)
6329       return vk_command_buffer_end(&cmd_buffer->vk);
6330 
6331    radv_emit_mip_change_flush_default(cmd_buffer);
6332 
6333    const bool is_gfx_or_ace = cmd_buffer->qf == RADV_QUEUE_GENERAL || cmd_buffer->qf == RADV_QUEUE_COMPUTE;
6334 
6335    if (is_gfx_or_ace) {
6336       if (cmd_buffer->device->physical_device->rad_info.gfx_level == GFX6)
6337          cmd_buffer->state.flush_bits |=
6338             RADV_CMD_FLAG_CS_PARTIAL_FLUSH | RADV_CMD_FLAG_PS_PARTIAL_FLUSH | RADV_CMD_FLAG_WB_L2;
6339 
6340       /* Make sure to sync all pending active queries at the end of
6341        * the command buffer.
6342        */
6343       cmd_buffer->state.flush_bits |= cmd_buffer->active_query_flush_bits;
6344 
6345       /* Flush noncoherent images on GFX9+ so we can assume they're clean at the start of a
6346        * command buffer.
6347        */
6348       if (cmd_buffer->state.rb_noncoherent_dirty && !can_skip_buffer_l2_flushes(cmd_buffer->device))
6349          cmd_buffer->state.flush_bits |= radv_src_access_flush(
6350             cmd_buffer, VK_ACCESS_2_COLOR_ATTACHMENT_WRITE_BIT | VK_ACCESS_2_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT, NULL);
6351 
6352       /* Since NGG streamout uses GDS, we need to make GDS idle when
6353        * we leave the IB, otherwise another process might overwrite
6354        * it while our shaders are busy.
6355        */
6356       if (cmd_buffer->gds_needed)
6357          cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_PS_PARTIAL_FLUSH;
6358    }
6359 
6360    /* Finalize the internal compute command stream, if it exists. */
6361    if (cmd_buffer->gang.cs) {
6362       VkResult result = radv_gang_finalize(cmd_buffer);
6363       if (result != VK_SUCCESS)
6364          return vk_error(cmd_buffer, result);
6365    }
6366 
6367    if (is_gfx_or_ace) {
6368       radv_emit_cache_flush(cmd_buffer);
6369 
6370       /* Make sure CP DMA is idle at the end of IBs because the kernel
6371        * doesn't wait for it.
6372        */
6373       radv_cp_dma_wait_for_idle(cmd_buffer);
6374    }
6375 
6376    radv_describe_end_cmd_buffer(cmd_buffer);
6377 
6378    VkResult result = cmd_buffer->device->ws->cs_finalize(cmd_buffer->cs);
6379    if (result != VK_SUCCESS)
6380       return vk_error(cmd_buffer, result);
6381 
6382    return vk_command_buffer_end(&cmd_buffer->vk);
6383 }
6384 
6385 static void
6386 radv_emit_compute_pipeline(struct radv_cmd_buffer *cmd_buffer, struct radv_compute_pipeline *pipeline)
6387 {
6388    if (pipeline == cmd_buffer->state.emitted_compute_pipeline)
6389       return;
6390 
6391    assert(!pipeline->base.ctx_cs.cdw);
6392 
6393    cmd_buffer->state.emitted_compute_pipeline = pipeline;
6394 
6395    radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, pipeline->base.cs.cdw);
6396    radeon_emit_array(cmd_buffer->cs, pipeline->base.cs.buf, pipeline->base.cs.cdw);
6397 
6398    if (pipeline->base.type == RADV_PIPELINE_COMPUTE) {
6399       radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, cmd_buffer->state.shaders[MESA_SHADER_COMPUTE]->bo);
6400    } else {
6401       radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, cmd_buffer->state.rt_prolog->bo);
6402 
6403       if (cmd_buffer->state.shaders[MESA_SHADER_INTERSECTION])
6404          radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs,
6405                             cmd_buffer->state.shaders[MESA_SHADER_INTERSECTION]->bo);
6406 
6407       struct radv_ray_tracing_pipeline *rt_pipeline = radv_pipeline_to_ray_tracing(&pipeline->base);
6408       for (unsigned i = 0; i < rt_pipeline->stage_count; ++i) {
6409          struct radv_shader *shader = rt_pipeline->stages[i].shader;
6410          if (shader)
6411             radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, shader->bo);
6412       }
6413    }
6414 
6415    if (radv_device_fault_detection_enabled(cmd_buffer->device))
6416       radv_save_pipeline(cmd_buffer, &pipeline->base);
6417 }
6418 
6419 static void
6420 radv_mark_descriptor_sets_dirty(struct radv_cmd_buffer *cmd_buffer, VkPipelineBindPoint bind_point)
6421 {
6422    struct radv_descriptor_state *descriptors_state = radv_get_descriptors_state(cmd_buffer, bind_point);
6423 
6424    descriptors_state->dirty |= descriptors_state->valid;
6425 }
6426 
6427 static void
6428 radv_bind_vs_input_state(struct radv_cmd_buffer *cmd_buffer, const struct radv_graphics_pipeline *pipeline)
6429 {
6430    const struct radv_shader *vs_shader = radv_get_shader(cmd_buffer->state.shaders, MESA_SHADER_VERTEX);
6431    const struct radv_vs_input_state *src = &pipeline->vs_input_state;
6432 
6433    /* Bind the vertex input state from the pipeline when the VS has a prolog and the state isn't
6434     * dynamic. This can happen when the pre-rasterization stages and the vertex input state are from
6435     * two different libraries. Otherwise, if the VS has a prolog, the state is dynamic and there is
6436     * nothing to bind.
6437     */
6438    if (!vs_shader || !vs_shader->info.vs.has_prolog || (pipeline->dynamic_states & RADV_DYNAMIC_VERTEX_INPUT))
6439       return;
6440 
6441    cmd_buffer->state.dynamic_vs_input = *src;
6442 
6443    if (cmd_buffer->device->physical_device->rad_info.gfx_level == GFX6 ||
6444        cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX10) {
6445       cmd_buffer->state.vbo_misaligned_mask = 0;
6446       cmd_buffer->state.vbo_misaligned_mask_invalid = src->attribute_mask;
6447    }
6448 
6449    cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_VERTEX_INPUT;
6450 }
6451 
6452 static void
6453 radv_bind_multisample_state(struct radv_cmd_buffer *cmd_buffer, const struct radv_multisample_state *ms)
6454 {
6455    if (ms->sample_shading_enable) {
6456       cmd_buffer->state.ms.sample_shading_enable = true;
6457       cmd_buffer->state.ms.min_sample_shading = ms->min_sample_shading;
6458    }
6459 }
6460 
6461 static void
6462 radv_bind_custom_blend_mode(struct radv_cmd_buffer *cmd_buffer, unsigned custom_blend_mode)
6463 {
6464    /* Re-emit CB_COLOR_CONTROL when the custom blending mode changes. */
6465    if (cmd_buffer->state.custom_blend_mode != custom_blend_mode)
6466       cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_LOGIC_OP | RADV_CMD_DIRTY_DYNAMIC_LOGIC_OP_ENABLE;
6467 
6468    cmd_buffer->state.custom_blend_mode = custom_blend_mode;
6469 }
6470 
6471 static void
6472 radv_bind_pre_rast_shader(struct radv_cmd_buffer *cmd_buffer, const struct radv_shader *shader)
6473 {
6474    bool mesh_shading = shader->info.stage == MESA_SHADER_MESH;
6475    const struct radv_userdata_info *loc;
6476 
6477    assert(shader->info.stage == MESA_SHADER_VERTEX || shader->info.stage == MESA_SHADER_TESS_CTRL ||
6478           shader->info.stage == MESA_SHADER_TESS_EVAL || shader->info.stage == MESA_SHADER_GEOMETRY ||
6479           shader->info.stage == MESA_SHADER_MESH);
6480 
6481    if (radv_get_user_sgpr(shader, AC_UD_NGG_PROVOKING_VTX)->sgpr_idx != -1) {
6482       /* Re-emit the provoking vertex mode state because the SGPR idx can be different. */
6483       cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_PROVOKING_VERTEX_MODE;
6484    }
6485 
6486    if (radv_get_user_sgpr(shader, AC_UD_STREAMOUT_BUFFERS)->sgpr_idx != -1) {
6487       /* Re-emit the streamout buffers because the SGPR idx can be different and with NGG streamout
6488        * they always need to be emitted because a buffer size of 0 is used to disable streamout.
6489        */
6490       cmd_buffer->state.dirty |= RADV_CMD_DIRTY_STREAMOUT_BUFFER;
6491 
6492       if (cmd_buffer->device->physical_device->use_ngg_streamout) {
6493          /* GFX11 needs GDS OA for streamout. */
6494          cmd_buffer->gds_oa_needed = true;
6495       }
6496    }
6497 
6498    if (radv_get_user_sgpr(shader, AC_UD_NUM_VERTS_PER_PRIM)->sgpr_idx != -1) {
6499       /* Re-emit the primitive topology because the SGPR idx can be different. */
6500       cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_PRIMITIVE_TOPOLOGY;
6501    }
6502 
6503    if (radv_get_user_sgpr(shader, AC_UD_SHADER_QUERY_STATE)->sgpr_idx != -1) {
6504       /* Re-emit the shader query state when the SGPR exists but its location potentially changed. */
6505       cmd_buffer->state.dirty |= RADV_CMD_DIRTY_SHADER_QUERY;
6506    }
6507 
6508    const bool needs_vtx_sgpr =
6509       shader->info.stage == MESA_SHADER_VERTEX || shader->info.stage == MESA_SHADER_MESH ||
6510       (shader->info.stage == MESA_SHADER_GEOMETRY && !shader->info.merged_shader_compiled_separately) ||
6511       (shader->info.stage == MESA_SHADER_TESS_CTRL && !shader->info.merged_shader_compiled_separately);
6512 
6513    loc = radv_get_user_sgpr(shader, AC_UD_VS_BASE_VERTEX_START_INSTANCE);
6514    if (needs_vtx_sgpr && loc->sgpr_idx != -1) {
6515       cmd_buffer->state.vtx_base_sgpr = shader->info.user_data_0 + loc->sgpr_idx * 4;
6516       cmd_buffer->state.vtx_emit_num = loc->num_sgprs;
6517       cmd_buffer->state.uses_drawid = shader->info.vs.needs_draw_id;
6518       cmd_buffer->state.uses_baseinstance = shader->info.vs.needs_base_instance;
6519 
6520       if (shader->info.merged_shader_compiled_separately) {
6521          /* Merged shaders compiled separately (e.g. VS+TCS) always declare these user SGPRs
6522           * because the input arguments must match.
6523           */
6524          cmd_buffer->state.uses_drawid = true;
6525          cmd_buffer->state.uses_baseinstance = true;
6526       }
6527 
6528       /* Re-emit some vertex states because the SGPR idx can be different. */
6529       cmd_buffer->state.last_first_instance = -1;
6530       cmd_buffer->state.last_vertex_offset_valid = false;
6531       cmd_buffer->state.last_drawid = -1;
6532    }
6533 
6534    if (mesh_shading != cmd_buffer->state.mesh_shading) {
6535       /* Re-emit VRS state because the combiner is different (vertex vs primitive). Re-emit
6536        * primitive topology because the mesh shading pipeline clobbered it.
6537        */
6538       cmd_buffer->state.dirty |=
6539          RADV_CMD_DIRTY_DYNAMIC_FRAGMENT_SHADING_RATE | RADV_CMD_DIRTY_DYNAMIC_PRIMITIVE_TOPOLOGY;
6540    }
6541 
6542    cmd_buffer->state.mesh_shading = mesh_shading;
6543 }
6544 
6545 static void
6546 radv_bind_vertex_shader(struct radv_cmd_buffer *cmd_buffer, const struct radv_shader *vs)
6547 {
6548    radv_bind_pre_rast_shader(cmd_buffer, vs);
6549 
6550    /* Re-emit states that need to be updated when the vertex shader is compiled separately
6551     * because shader configs are combined.
6552     */
6553    if (vs->info.merged_shader_compiled_separately && vs->info.next_stage == MESA_SHADER_TESS_CTRL) {
6554       cmd_buffer->state.emitted_tcs_epilog = NULL;
6555       cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_PATCH_CONTROL_POINTS;
6556    }
6557 
6558    /* Can't put anything else here due to merged shaders */
6559 }
6560 
6561 static void
6562 radv_bind_tess_ctrl_shader(struct radv_cmd_buffer *cmd_buffer, const struct radv_shader *tcs)
6563 {
6564    radv_bind_pre_rast_shader(cmd_buffer, tcs);
6565 
6566    cmd_buffer->tess_rings_needed = true;
6567 
6568    /* Always re-emit patch control points/domain origin when a new pipeline with tessellation is
6569     * bound because a bunch of parameters (user SGPRs, TCS vertices out, ccw, etc) can be different.
6570     */
6571    cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_PATCH_CONTROL_POINTS | RADV_CMD_DIRTY_DYNAMIC_TESS_DOMAIN_ORIGIN;
6572 
6573    /* Re-emit the TCS epilog when a new tessellation control shader is bound. */
6574    if (tcs->info.has_epilog)
6575       cmd_buffer->state.emitted_tcs_epilog = NULL;
6576 
6577    /* Re-emit the VS prolog when the tessellation control shader is compiled separately because
6578     * shader configs are combined and need to be updated.
6579     */
6580    if (tcs->info.merged_shader_compiled_separately)
6581       cmd_buffer->state.emitted_vs_prolog = NULL;
6582 }
6583 
6584 static void
6585 radv_bind_tess_eval_shader(struct radv_cmd_buffer *cmd_buffer, const struct radv_shader *tes)
6586 {
6587    radv_bind_pre_rast_shader(cmd_buffer, tes);
6588 
6589    /* Can't put anything else here due to merged shaders */
6590 }
6591 
6592 static void
6593 radv_bind_geometry_shader(struct radv_cmd_buffer *cmd_buffer, const struct radv_shader *gs)
6594 {
6595    radv_bind_pre_rast_shader(cmd_buffer, gs);
6596 
6597    cmd_buffer->esgs_ring_size_needed = MAX2(cmd_buffer->esgs_ring_size_needed, gs->info.gs_ring_info.esgs_ring_size);
6598    cmd_buffer->gsvs_ring_size_needed = MAX2(cmd_buffer->gsvs_ring_size_needed, gs->info.gs_ring_info.gsvs_ring_size);
6599 
6600    /* Re-emit the VS prolog when the geometry shader is compiled separately because shader configs
6601     * are combined and need to be updated.
6602     */
6603    if (gs->info.merged_shader_compiled_separately)
6604       cmd_buffer->state.emitted_vs_prolog = NULL;
6605 }
6606 
6607 static void
6608 radv_bind_mesh_shader(struct radv_cmd_buffer *cmd_buffer, const struct radv_shader *ms)
6609 {
6610    radv_bind_pre_rast_shader(cmd_buffer, ms);
6611 
6612    cmd_buffer->mesh_scratch_ring_needed |= ms->info.ms.needs_ms_scratch_ring;
6613 }
6614 
6615 static void
6616 radv_bind_fragment_shader(struct radv_cmd_buffer *cmd_buffer, const struct radv_shader *ps)
6617 {
6618    const enum amd_gfx_level gfx_level = cmd_buffer->device->physical_device->rad_info.gfx_level;
6619    const struct radv_shader *previous_ps = cmd_buffer->state.shaders[MESA_SHADER_FRAGMENT];
6620    const float min_sample_shading = 1.0f;
6621 
6622    if (ps->info.ps.needs_sample_positions) {
6623       cmd_buffer->sample_positions_needed = true;
6624    }
6625 
6626    /* Re-emit the FS state because the SGPR idx can be different. */
6627    if (radv_get_user_sgpr(ps, AC_UD_PS_STATE)->sgpr_idx != -1) {
6628       cmd_buffer->state.dirty |=
6629          RADV_CMD_DIRTY_DYNAMIC_RASTERIZATION_SAMPLES | RADV_CMD_DIRTY_DYNAMIC_LINE_RASTERIZATION_MODE;
6630    }
6631 
6632    /* Re-emit the conservative rasterization mode because inner coverage is different. */
6633    if (!previous_ps || previous_ps->info.ps.reads_fully_covered != ps->info.ps.reads_fully_covered)
6634       cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_CONSERVATIVE_RAST_MODE;
6635 
6636    if (gfx_level >= GFX10_3 && (!previous_ps || previous_ps->info.ps.force_sample_iter_shading_rate !=
6637                                                    ps->info.ps.force_sample_iter_shading_rate))
6638       cmd_buffer->state.dirty |=
6639          RADV_CMD_DIRTY_DYNAMIC_RASTERIZATION_SAMPLES | RADV_CMD_DIRTY_DYNAMIC_FRAGMENT_SHADING_RATE;
6640 
6641    if (cmd_buffer->state.ms.sample_shading_enable != ps->info.ps.uses_sample_shading) {
6642       cmd_buffer->state.ms.sample_shading_enable = ps->info.ps.uses_sample_shading;
6643       cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_RASTERIZATION_SAMPLES;
6644 
6645       if (gfx_level >= GFX10_3)
6646          cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_FRAGMENT_SHADING_RATE;
6647    }
6648 
6649    if (cmd_buffer->state.ms.min_sample_shading != min_sample_shading) {
6650       cmd_buffer->state.ms.min_sample_shading = min_sample_shading;
6651       cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_RASTERIZATION_SAMPLES;
6652    }
6653 
6654    if (!previous_ps || previous_ps->info.ps.db_shader_control != ps->info.ps.db_shader_control ||
6655        previous_ps->info.ps.pops_is_per_sample != ps->info.ps.pops_is_per_sample)
6656       cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DB_SHADER_CONTROL;
6657 
6658    /* Re-emit the PS epilog when a new fragment shader is bound. */
6659    if (ps->info.has_epilog)
6660       cmd_buffer->state.emitted_ps_epilog = NULL;
6661 }
6662 
6663 static void
6664 radv_bind_task_shader(struct radv_cmd_buffer *cmd_buffer, const struct radv_shader *ts)
6665 {
6666    if (!radv_gang_init(cmd_buffer))
6667       return;
6668 
6669    cmd_buffer->task_rings_needed = true;
6670 }
6671 
6672 /* This function binds/unbinds a shader to/from the command buffer state. */
6673 static void
6674 radv_bind_shader(struct radv_cmd_buffer *cmd_buffer, struct radv_shader *shader, gl_shader_stage stage)
6675 {
6676    const struct radv_device *device = cmd_buffer->device;
6677 
6678    if (!shader) {
6679       cmd_buffer->state.shaders[stage] = NULL;
6680       cmd_buffer->state.active_stages &= ~mesa_to_vk_shader_stage(stage);
6681 
6682       /* Reset some dynamic states when a shader stage is unbound. */
6683       switch (stage) {
6684       case MESA_SHADER_FRAGMENT:
6685          cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_CONSERVATIVE_RAST_MODE |
6686                                     RADV_CMD_DIRTY_DYNAMIC_RASTERIZATION_SAMPLES |
6687                                     RADV_CMD_DIRTY_DYNAMIC_FRAGMENT_SHADING_RATE | RADV_CMD_DIRTY_DB_SHADER_CONTROL;
6688          break;
6689       default:
6690          break;
6691       }
6692       return;
6693    }
6694 
6695    switch (stage) {
6696    case MESA_SHADER_VERTEX:
6697       radv_bind_vertex_shader(cmd_buffer, shader);
6698       break;
6699    case MESA_SHADER_TESS_CTRL:
6700       radv_bind_tess_ctrl_shader(cmd_buffer, shader);
6701       break;
6702    case MESA_SHADER_TESS_EVAL:
6703       radv_bind_tess_eval_shader(cmd_buffer, shader);
6704       break;
6705    case MESA_SHADER_GEOMETRY:
6706       radv_bind_geometry_shader(cmd_buffer, shader);
6707       break;
6708    case MESA_SHADER_FRAGMENT:
6709       radv_bind_fragment_shader(cmd_buffer, shader);
6710       break;
6711    case MESA_SHADER_MESH:
6712       radv_bind_mesh_shader(cmd_buffer, shader);
6713       break;
6714    case MESA_SHADER_TASK:
6715       radv_bind_task_shader(cmd_buffer, shader);
6716       break;
6717    case MESA_SHADER_COMPUTE: {
6718       cmd_buffer->compute_scratch_size_per_wave_needed =
6719          MAX2(cmd_buffer->compute_scratch_size_per_wave_needed, shader->config.scratch_bytes_per_wave);
6720 
6721       const unsigned max_stage_waves = radv_get_max_scratch_waves(device, shader);
6722       cmd_buffer->compute_scratch_waves_wanted = MAX2(cmd_buffer->compute_scratch_waves_wanted, max_stage_waves);
6723       break;
6724    }
6725    case MESA_SHADER_INTERSECTION:
6726       /* no-op */
6727       break;
6728    default:
6729       unreachable("invalid shader stage");
6730    }
6731 
6732    cmd_buffer->state.shaders[stage] = shader;
6733    cmd_buffer->state.active_stages |= mesa_to_vk_shader_stage(stage);
6734 
6735    if (mesa_to_vk_shader_stage(stage) & RADV_GRAPHICS_STAGE_BITS) {
6736       cmd_buffer->scratch_size_per_wave_needed =
6737          MAX2(cmd_buffer->scratch_size_per_wave_needed, shader->config.scratch_bytes_per_wave);
6738 
6739       const unsigned max_stage_waves = radv_get_max_scratch_waves(device, shader);
6740       cmd_buffer->scratch_waves_wanted = MAX2(cmd_buffer->scratch_waves_wanted, max_stage_waves);
6741    }
6742 }
6743 
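/*
 * Illustrative sketch (editorial addition, hypothetical helper): the scratch
 * bookkeeping performed by radv_bind_shader() above.  The command buffer only
 * tracks worst cases: the largest per-wave scratch allocation of any bound
 * shader and the largest wave count any of them may want; the scratch
 * allocations are then sized from these maxima at submit time.
 */
static inline void
radv_example_track_scratch(uint32_t *size_per_wave_needed, uint32_t *waves_wanted, uint32_t shader_scratch_per_wave,
                           uint32_t shader_max_waves)
{
   *size_per_wave_needed = MAX2(*size_per_wave_needed, shader_scratch_per_wave);
   *waves_wanted = MAX2(*waves_wanted, shader_max_waves);
}
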
6744 static void
6745 radv_reset_shader_object_state(struct radv_cmd_buffer *cmd_buffer, VkPipelineBindPoint pipelineBindPoint)
6746 {
6747    switch (pipelineBindPoint) {
6748    case VK_PIPELINE_BIND_POINT_COMPUTE:
6749       if (cmd_buffer->state.shader_objs[MESA_SHADER_COMPUTE]) {
6750          radv_bind_shader(cmd_buffer, NULL, MESA_SHADER_COMPUTE);
6751          cmd_buffer->state.shader_objs[MESA_SHADER_COMPUTE] = NULL;
6752       }
6753       break;
6754    case VK_PIPELINE_BIND_POINT_GRAPHICS:
6755       radv_foreach_stage(s, RADV_GRAPHICS_STAGE_BITS)
6756       {
6757          if (cmd_buffer->state.shader_objs[s]) {
6758             radv_bind_shader(cmd_buffer, NULL, s);
6759             cmd_buffer->state.shader_objs[s] = NULL;
6760          }
6761       }
6762       break;
6763    default:
6764       break;
6765    }
6766 
6767    cmd_buffer->state.dirty &= ~RADV_CMD_DIRTY_GRAPHICS_SHADERS;
6768 }
6769 
6770 VKAPI_ATTR void VKAPI_CALL
6771 radv_CmdBindPipeline(VkCommandBuffer commandBuffer, VkPipelineBindPoint pipelineBindPoint, VkPipeline _pipeline)
6772 {
6773    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
6774    RADV_FROM_HANDLE(radv_pipeline, pipeline, _pipeline);
6775 
6776    radv_reset_shader_object_state(cmd_buffer, pipelineBindPoint);
6777 
6778    switch (pipelineBindPoint) {
6779    case VK_PIPELINE_BIND_POINT_COMPUTE: {
6780       struct radv_compute_pipeline *compute_pipeline = radv_pipeline_to_compute(pipeline);
6781 
6782       if (cmd_buffer->state.compute_pipeline == compute_pipeline)
6783          return;
6784       radv_mark_descriptor_sets_dirty(cmd_buffer, pipelineBindPoint);
6785 
6786       radv_bind_shader(cmd_buffer, compute_pipeline->base.shaders[MESA_SHADER_COMPUTE], MESA_SHADER_COMPUTE);
6787 
6788       cmd_buffer->state.compute_pipeline = compute_pipeline;
6789       cmd_buffer->push_constant_stages |= VK_SHADER_STAGE_COMPUTE_BIT;
6790       break;
6791    }
6792    case VK_PIPELINE_BIND_POINT_RAY_TRACING_KHR: {
6793       struct radv_ray_tracing_pipeline *rt_pipeline = radv_pipeline_to_ray_tracing(pipeline);
6794 
6795       if (cmd_buffer->state.rt_pipeline == rt_pipeline)
6796          return;
6797       radv_mark_descriptor_sets_dirty(cmd_buffer, pipelineBindPoint);
6798 
6799       radv_bind_shader(cmd_buffer, rt_pipeline->base.base.shaders[MESA_SHADER_INTERSECTION], MESA_SHADER_INTERSECTION);
6800       cmd_buffer->state.rt_prolog = rt_pipeline->prolog;
6801 
6802       cmd_buffer->state.rt_pipeline = rt_pipeline;
6803       cmd_buffer->push_constant_stages |= RADV_RT_STAGE_BITS;
6804 
6805       /* Bind the stack size when it's not dynamic. */
6806       if (rt_pipeline->stack_size != -1u)
6807          cmd_buffer->state.rt_stack_size = rt_pipeline->stack_size;
6808 
6809       const unsigned max_scratch_waves = radv_get_max_scratch_waves(cmd_buffer->device, rt_pipeline->prolog);
6810       cmd_buffer->compute_scratch_waves_wanted = MAX2(cmd_buffer->compute_scratch_waves_wanted, max_scratch_waves);
6811       break;
6812    }
6813    case VK_PIPELINE_BIND_POINT_GRAPHICS: {
6814       struct radv_graphics_pipeline *graphics_pipeline = radv_pipeline_to_graphics(pipeline);
6815 
6816       /* Bind the non-dynamic graphics state from the pipeline unconditionally because some of its
6817        * state might have been overwritten between two binds of the same pipeline.
6818        */
6819       radv_bind_dynamic_state(cmd_buffer, &graphics_pipeline->dynamic_state);
6820 
6821       if (cmd_buffer->state.graphics_pipeline == graphics_pipeline)
6822          return;
6823       radv_mark_descriptor_sets_dirty(cmd_buffer, pipelineBindPoint);
6824 
6825       radv_foreach_stage(
6826          stage, (cmd_buffer->state.active_stages | graphics_pipeline->active_stages) & RADV_GRAPHICS_STAGE_BITS)
6827       {
6828          radv_bind_shader(cmd_buffer, graphics_pipeline->base.shaders[stage], stage);
6829       }
6830 
6831       cmd_buffer->state.gs_copy_shader = graphics_pipeline->base.gs_copy_shader;
6832       cmd_buffer->state.last_vgt_shader = graphics_pipeline->base.shaders[graphics_pipeline->last_vgt_api_stage];
6833 
6834       cmd_buffer->state.graphics_pipeline = graphics_pipeline;
6835 
6836       cmd_buffer->state.has_nggc = graphics_pipeline->has_ngg_culling;
6837       cmd_buffer->state.dirty |= RADV_CMD_DIRTY_PIPELINE;
6838       cmd_buffer->push_constant_stages |= graphics_pipeline->active_stages;
6839 
6840       /* Prefetch all pipeline shaders at first draw time. */
6841       cmd_buffer->state.prefetch_L2_mask |= RADV_PREFETCH_SHADERS;
6842 
6843       if (cmd_buffer->device->physical_device->rad_info.has_vgt_flush_ngg_legacy_bug &&
6844           cmd_buffer->state.emitted_graphics_pipeline && cmd_buffer->state.emitted_graphics_pipeline->is_ngg &&
6845           !cmd_buffer->state.graphics_pipeline->is_ngg) {
6846          /* Transitioning from NGG to legacy GS requires
6847           * VGT_FLUSH on GFX10 and Navi21. VGT_FLUSH
6848           * is also emitted at the beginning of IBs when legacy
6849           * GS ring pointers are set.
6850           */
6851          cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_VGT_FLUSH;
6852       }
6853 
6854       cmd_buffer->state.uses_dynamic_patch_control_points =
6855          !!(graphics_pipeline->dynamic_states & RADV_DYNAMIC_PATCH_CONTROL_POINTS);
6856 
6857       if (graphics_pipeline->active_stages & VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT) {
6858          if (!cmd_buffer->state.uses_dynamic_patch_control_points) {
6859             /* Bind the tessellation state from the pipeline when it's not dynamic. */
6860             struct radv_shader *tcs = cmd_buffer->state.shaders[MESA_SHADER_TESS_CTRL];
6861 
6862             cmd_buffer->state.tess_num_patches = tcs->info.num_tess_patches;
6863             cmd_buffer->state.tess_lds_size = tcs->info.tcs.num_lds_blocks;
6864          }
6865       }
6866 
6867       const struct radv_shader *vs = radv_get_shader(graphics_pipeline->base.shaders, MESA_SHADER_VERTEX);
6868       if (vs) {
6869          /* Re-emit the VS prolog when a new vertex shader is bound. */
6870          if (vs->info.vs.has_prolog) {
6871             cmd_buffer->state.emitted_vs_prolog = NULL;
6872             cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_VERTEX_INPUT;
6873          }
6874 
6875          /* Re-emit the vertex buffer descriptors because they are really tied to the pipeline. */
6876          if (vs->info.vs.vb_desc_usage_mask) {
6877             cmd_buffer->state.dirty |= RADV_CMD_DIRTY_VERTEX_BUFFER;
6878          }
6879       }
6880 
6881       if (cmd_buffer->device->physical_device->rad_info.rbplus_allowed &&
6882           (!cmd_buffer->state.emitted_graphics_pipeline ||
6883            cmd_buffer->state.col_format_non_compacted != graphics_pipeline->col_format_non_compacted)) {
6884          cmd_buffer->state.col_format_non_compacted = graphics_pipeline->col_format_non_compacted;
6885          cmd_buffer->state.dirty |= RADV_CMD_DIRTY_RBPLUS;
6886       }
6887 
6888       radv_bind_vs_input_state(cmd_buffer, graphics_pipeline);
6889 
6890       radv_bind_multisample_state(cmd_buffer, &graphics_pipeline->ms);
6891 
6892       radv_bind_custom_blend_mode(cmd_buffer, graphics_pipeline->custom_blend_mode);
6893 
6894       cmd_buffer->state.db_render_control = graphics_pipeline->db_render_control;
6895 
6896       cmd_buffer->state.rast_prim = graphics_pipeline->rast_prim;
6897 
6898       cmd_buffer->state.ia_multi_vgt_param = graphics_pipeline->ia_multi_vgt_param;
6899 
6900       cmd_buffer->state.uses_out_of_order_rast = graphics_pipeline->uses_out_of_order_rast;
6901       cmd_buffer->state.uses_vrs_attachment = graphics_pipeline->uses_vrs_attachment;
6902       cmd_buffer->state.uses_dynamic_vertex_binding_stride =
6903          !!(graphics_pipeline->dynamic_states & (RADV_DYNAMIC_VERTEX_INPUT_BINDING_STRIDE | RADV_DYNAMIC_VERTEX_INPUT));
6904       break;
6905    }
6906    default:
6907       assert(!"invalid bind point");
6908       break;
6909    }
6910 
6911    cmd_buffer->push_constant_state[vk_to_bind_point(pipelineBindPoint)].size = pipeline->push_constant_size;
6912    cmd_buffer->push_constant_state[vk_to_bind_point(pipelineBindPoint)].dynamic_offset_count =
6913       pipeline->dynamic_offset_count;
6914    cmd_buffer->descriptors[vk_to_bind_point(pipelineBindPoint)].need_indirect_descriptor_sets =
6915       pipeline->need_indirect_descriptor_sets;
6916 
6917    if (cmd_buffer->device->shader_use_invisible_vram)
6918       cmd_buffer->shader_upload_seq = MAX2(cmd_buffer->shader_upload_seq, pipeline->shader_upload_seq);
6919 }
6920 
6921 VKAPI_ATTR void VKAPI_CALL
6922 radv_CmdSetViewport(VkCommandBuffer commandBuffer, uint32_t firstViewport, uint32_t viewportCount,
6923                     const VkViewport *pViewports)
6924 {
6925    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
6926    struct radv_cmd_state *state = &cmd_buffer->state;
6927    ASSERTED const uint32_t total_count = firstViewport + viewportCount;
6928 
6929    assert(firstViewport < MAX_VIEWPORTS);
6930    assert(total_count >= 1 && total_count <= MAX_VIEWPORTS);
6931 
6932    if (state->dynamic.vk.vp.viewport_count < total_count)
6933       state->dynamic.vk.vp.viewport_count = total_count;
6934 
6935    memcpy(state->dynamic.vk.vp.viewports + firstViewport, pViewports, viewportCount * sizeof(*pViewports));
6936    for (unsigned i = 0; i < viewportCount; i++) {
6937       radv_get_viewport_xform(&pViewports[i], state->dynamic.hw_vp.xform[i + firstViewport].scale,
6938                               state->dynamic.hw_vp.xform[i + firstViewport].translate);
6939    }
6940 
6941    state->dirty |= RADV_CMD_DIRTY_DYNAMIC_VIEWPORT | RADV_CMD_DIRTY_GUARDBAND;
6942 }
6943 
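/*
 * Illustrative sketch (editorial addition, assuming the standard Vulkan
 * viewport mapping): the scale/translate pair that radv_get_viewport_xform()
 * derives from each VkViewport above.  Because the scale feeds the guardband
 * computation, RADV_CMD_DIRTY_GUARDBAND is set together with the viewport
 * dirty bit.
 */
static inline void
radv_example_viewport_xform(const VkViewport *vp, float scale[3], float translate[3])
{
   scale[0] = vp->width * 0.5f;
   scale[1] = vp->height * 0.5f;
   scale[2] = vp->maxDepth - vp->minDepth;

   translate[0] = vp->x + vp->width * 0.5f;
   translate[1] = vp->y + vp->height * 0.5f;
   translate[2] = vp->minDepth;
}
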
6944 VKAPI_ATTR void VKAPI_CALL
6945 radv_CmdSetScissor(VkCommandBuffer commandBuffer, uint32_t firstScissor, uint32_t scissorCount,
6946                    const VkRect2D *pScissors)
6947 {
6948    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
6949    struct radv_cmd_state *state = &cmd_buffer->state;
6950    ASSERTED const uint32_t total_count = firstScissor + scissorCount;
6951 
6952    assert(firstScissor < MAX_SCISSORS);
6953    assert(total_count >= 1 && total_count <= MAX_SCISSORS);
6954 
6955    if (state->dynamic.vk.vp.scissor_count < total_count)
6956       state->dynamic.vk.vp.scissor_count = total_count;
6957 
6958    memcpy(state->dynamic.vk.vp.scissors + firstScissor, pScissors, scissorCount * sizeof(*pScissors));
6959 
6960    state->dirty |= RADV_CMD_DIRTY_DYNAMIC_SCISSOR;
6961 }
6962 
6963 VKAPI_ATTR void VKAPI_CALL
6964 radv_CmdSetLineWidth(VkCommandBuffer commandBuffer, float lineWidth)
6965 {
6966    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
6967    struct radv_cmd_state *state = &cmd_buffer->state;
6968 
6969    state->dynamic.vk.rs.line.width = lineWidth;
6970 
6971    state->dirty |= RADV_CMD_DIRTY_DYNAMIC_LINE_WIDTH | RADV_CMD_DIRTY_GUARDBAND;
6972 }
6973 
6974 VKAPI_ATTR void VKAPI_CALL
6975 radv_CmdSetBlendConstants(VkCommandBuffer commandBuffer, const float blendConstants[4])
6976 {
6977    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
6978    struct radv_cmd_state *state = &cmd_buffer->state;
6979 
6980    memcpy(state->dynamic.vk.cb.blend_constants, blendConstants, sizeof(float) * 4);
6981 
6982    state->dirty |= RADV_CMD_DIRTY_DYNAMIC_BLEND_CONSTANTS;
6983 }
6984 
6985 VKAPI_ATTR void VKAPI_CALL
6986 radv_CmdSetDepthBounds(VkCommandBuffer commandBuffer, float minDepthBounds, float maxDepthBounds)
6987 {
6988    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
6989    struct radv_cmd_state *state = &cmd_buffer->state;
6990 
6991    state->dynamic.vk.ds.depth.bounds_test.min = minDepthBounds;
6992    state->dynamic.vk.ds.depth.bounds_test.max = maxDepthBounds;
6993 
6994    state->dirty |= RADV_CMD_DIRTY_DYNAMIC_DEPTH_BOUNDS;
6995 }
6996 
6997 VKAPI_ATTR void VKAPI_CALL
6998 radv_CmdSetStencilCompareMask(VkCommandBuffer commandBuffer, VkStencilFaceFlags faceMask, uint32_t compareMask)
6999 {
7000    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
7001    struct radv_cmd_state *state = &cmd_buffer->state;
7002 
7003    if (faceMask & VK_STENCIL_FACE_FRONT_BIT)
7004       state->dynamic.vk.ds.stencil.front.compare_mask = compareMask;
7005    if (faceMask & VK_STENCIL_FACE_BACK_BIT)
7006       state->dynamic.vk.ds.stencil.back.compare_mask = compareMask;
7007 
7008    state->dirty |= RADV_CMD_DIRTY_DYNAMIC_STENCIL_COMPARE_MASK;
7009 }
7010 
7011 VKAPI_ATTR void VKAPI_CALL
7012 radv_CmdSetStencilWriteMask(VkCommandBuffer commandBuffer, VkStencilFaceFlags faceMask, uint32_t writeMask)
7013 {
7014    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
7015    struct radv_cmd_state *state = &cmd_buffer->state;
7016 
7017    if (faceMask & VK_STENCIL_FACE_FRONT_BIT)
7018       state->dynamic.vk.ds.stencil.front.write_mask = writeMask;
7019    if (faceMask & VK_STENCIL_FACE_BACK_BIT)
7020       state->dynamic.vk.ds.stencil.back.write_mask = writeMask;
7021 
7022    state->dirty |= RADV_CMD_DIRTY_DYNAMIC_STENCIL_WRITE_MASK;
7023 }
7024 
7025 VKAPI_ATTR void VKAPI_CALL
7026 radv_CmdSetStencilReference(VkCommandBuffer commandBuffer, VkStencilFaceFlags faceMask, uint32_t reference)
7027 {
7028    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
7029    struct radv_cmd_state *state = &cmd_buffer->state;
7030 
7031    if (faceMask & VK_STENCIL_FACE_FRONT_BIT)
7032       state->dynamic.vk.ds.stencil.front.reference = reference;
7033    if (faceMask & VK_STENCIL_FACE_BACK_BIT)
7034       state->dynamic.vk.ds.stencil.back.reference = reference;
7035 
7036    state->dirty |= RADV_CMD_DIRTY_DYNAMIC_STENCIL_REFERENCE;
7037 }
7038 
7039 VKAPI_ATTR void VKAPI_CALL
7040 radv_CmdSetDiscardRectangleEXT(VkCommandBuffer commandBuffer, uint32_t firstDiscardRectangle,
7041                                uint32_t discardRectangleCount, const VkRect2D *pDiscardRectangles)
7042 {
7043    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
7044    struct radv_cmd_state *state = &cmd_buffer->state;
7045    ASSERTED const uint32_t total_count = firstDiscardRectangle + discardRectangleCount;
7046 
7047    assert(firstDiscardRectangle < MAX_DISCARD_RECTANGLES);
7048    assert(total_count >= 1 && total_count <= MAX_DISCARD_RECTANGLES);
7049 
7050    typed_memcpy(&state->dynamic.vk.dr.rectangles[firstDiscardRectangle], pDiscardRectangles, discardRectangleCount);
7051 
7052    state->dirty |= RADV_CMD_DIRTY_DYNAMIC_DISCARD_RECTANGLE;
7053 }
7054 
7055 VKAPI_ATTR void VKAPI_CALL
7056 radv_CmdSetSampleLocationsEXT(VkCommandBuffer commandBuffer, const VkSampleLocationsInfoEXT *pSampleLocationsInfo)
7057 {
7058    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
7059    struct radv_cmd_state *state = &cmd_buffer->state;
7060 
7061    assert(pSampleLocationsInfo->sampleLocationsCount <= MAX_SAMPLE_LOCATIONS);
7062 
7063    state->dynamic.sample_location.per_pixel = pSampleLocationsInfo->sampleLocationsPerPixel;
7064    state->dynamic.sample_location.grid_size = pSampleLocationsInfo->sampleLocationGridSize;
7065    state->dynamic.sample_location.count = pSampleLocationsInfo->sampleLocationsCount;
7066    typed_memcpy(&state->dynamic.sample_location.locations[0], pSampleLocationsInfo->pSampleLocations,
7067                 pSampleLocationsInfo->sampleLocationsCount);
7068 
7069    state->dirty |= RADV_CMD_DIRTY_DYNAMIC_SAMPLE_LOCATIONS;
7070 }
7071 
7072 VKAPI_ATTR void VKAPI_CALL
7073 radv_CmdSetLineStippleKHR(VkCommandBuffer commandBuffer, uint32_t lineStippleFactor, uint16_t lineStipplePattern)
7074 {
7075    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
7076    struct radv_cmd_state *state = &cmd_buffer->state;
7077 
7078    state->dynamic.vk.rs.line.stipple.factor = lineStippleFactor;
7079    state->dynamic.vk.rs.line.stipple.pattern = lineStipplePattern;
7080 
7081    state->dirty |= RADV_CMD_DIRTY_DYNAMIC_LINE_STIPPLE;
7082 }
7083 
7084 VKAPI_ATTR void VKAPI_CALL
7085 radv_CmdSetCullMode(VkCommandBuffer commandBuffer, VkCullModeFlags cullMode)
7086 {
7087    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
7088    struct radv_cmd_state *state = &cmd_buffer->state;
7089 
7090    state->dynamic.vk.rs.cull_mode = cullMode;
7091 
7092    state->dirty |= RADV_CMD_DIRTY_DYNAMIC_CULL_MODE;
7093 }
7094 
7095 VKAPI_ATTR void VKAPI_CALL
7096 radv_CmdSetFrontFace(VkCommandBuffer commandBuffer, VkFrontFace frontFace)
7097 {
7098    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
7099    struct radv_cmd_state *state = &cmd_buffer->state;
7100 
7101    state->dynamic.vk.rs.front_face = frontFace;
7102 
7103    state->dirty |= RADV_CMD_DIRTY_DYNAMIC_FRONT_FACE;
7104 }
7105 
7106 VKAPI_ATTR void VKAPI_CALL
7107 radv_CmdSetPrimitiveTopology(VkCommandBuffer commandBuffer, VkPrimitiveTopology primitiveTopology)
7108 {
7109    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
7110    struct radv_cmd_state *state = &cmd_buffer->state;
7111    unsigned primitive_topology = radv_translate_prim(primitiveTopology);
7112 
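   /* Toggling between line-list and non-line-list topologies can change how the
    * line stipple is emitted, and toggling between point/line and triangle
    * topologies changes the guardband, so dirty those states as well.
    */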
7113    if (radv_primitive_topology_is_line_list(state->dynamic.vk.ia.primitive_topology) !=
7114        radv_primitive_topology_is_line_list(primitive_topology))
7115       state->dirty |= RADV_CMD_DIRTY_DYNAMIC_LINE_STIPPLE;
7116 
7117    if (radv_prim_is_points_or_lines(state->dynamic.vk.ia.primitive_topology) !=
7118        radv_prim_is_points_or_lines(primitive_topology))
7119       state->dirty |= RADV_CMD_DIRTY_GUARDBAND;
7120 
7121    state->dynamic.vk.ia.primitive_topology = primitive_topology;
7122 
7123    state->dirty |= RADV_CMD_DIRTY_DYNAMIC_PRIMITIVE_TOPOLOGY;
7124 }
7125 
7126 VKAPI_ATTR void VKAPI_CALL
7127 radv_CmdSetViewportWithCount(VkCommandBuffer commandBuffer, uint32_t viewportCount, const VkViewport *pViewports)
7128 {
7129    radv_CmdSetViewport(commandBuffer, 0, viewportCount, pViewports);
7130 }
7131 
7132 VKAPI_ATTR void VKAPI_CALL
7133 radv_CmdSetScissorWithCount(VkCommandBuffer commandBuffer, uint32_t scissorCount, const VkRect2D *pScissors)
7134 {
7135    radv_CmdSetScissor(commandBuffer, 0, scissorCount, pScissors);
7136 }
7137 
7138 VKAPI_ATTR void VKAPI_CALL
7139 radv_CmdSetDepthTestEnable(VkCommandBuffer commandBuffer, VkBool32 depthTestEnable)
7140 
7141 {
7142    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
7143    struct radv_cmd_state *state = &cmd_buffer->state;
7144 
7145    state->dynamic.vk.ds.depth.test_enable = depthTestEnable;
7146 
7147    state->dirty |= RADV_CMD_DIRTY_DYNAMIC_DEPTH_TEST_ENABLE;
7148 }
7149 
7150 VKAPI_ATTR void VKAPI_CALL
7151 radv_CmdSetDepthWriteEnable(VkCommandBuffer commandBuffer, VkBool32 depthWriteEnable)
7152 {
7153    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
7154    struct radv_cmd_state *state = &cmd_buffer->state;
7155 
7156    state->dynamic.vk.ds.depth.write_enable = depthWriteEnable;
7157 
7158    state->dirty |= RADV_CMD_DIRTY_DYNAMIC_DEPTH_WRITE_ENABLE;
7159 }
7160 
7161 VKAPI_ATTR void VKAPI_CALL
7162 radv_CmdSetDepthCompareOp(VkCommandBuffer commandBuffer, VkCompareOp depthCompareOp)
7163 {
7164    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
7165    struct radv_cmd_state *state = &cmd_buffer->state;
7166 
7167    state->dynamic.vk.ds.depth.compare_op = depthCompareOp;
7168 
7169    state->dirty |= RADV_CMD_DIRTY_DYNAMIC_DEPTH_COMPARE_OP;
7170 }
7171 
7172 VKAPI_ATTR void VKAPI_CALL
7173 radv_CmdSetDepthBoundsTestEnable(VkCommandBuffer commandBuffer, VkBool32 depthBoundsTestEnable)
7174 {
7175    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
7176    struct radv_cmd_state *state = &cmd_buffer->state;
7177 
7178    state->dynamic.vk.ds.depth.bounds_test.enable = depthBoundsTestEnable;
7179 
7180    state->dirty |= RADV_CMD_DIRTY_DYNAMIC_DEPTH_BOUNDS_TEST_ENABLE;
7181 }
7182 
7183 VKAPI_ATTR void VKAPI_CALL
7184 radv_CmdSetStencilTestEnable(VkCommandBuffer commandBuffer, VkBool32 stencilTestEnable)
7185 {
7186    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
7187    struct radv_cmd_state *state = &cmd_buffer->state;
7188 
7189    state->dynamic.vk.ds.stencil.test_enable = stencilTestEnable;
7190 
7191    state->dirty |= RADV_CMD_DIRTY_DYNAMIC_STENCIL_TEST_ENABLE;
7192 }
7193 
7194 VKAPI_ATTR void VKAPI_CALL
7195 radv_CmdSetStencilOp(VkCommandBuffer commandBuffer, VkStencilFaceFlags faceMask, VkStencilOp failOp, VkStencilOp passOp,
7196                      VkStencilOp depthFailOp, VkCompareOp compareOp)
7197 {
7198    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
7199    struct radv_cmd_state *state = &cmd_buffer->state;
7200 
7201    if (faceMask & VK_STENCIL_FACE_FRONT_BIT) {
7202       state->dynamic.vk.ds.stencil.front.op.fail = failOp;
7203       state->dynamic.vk.ds.stencil.front.op.pass = passOp;
7204       state->dynamic.vk.ds.stencil.front.op.depth_fail = depthFailOp;
7205       state->dynamic.vk.ds.stencil.front.op.compare = compareOp;
7206    }
7207 
7208    if (faceMask & VK_STENCIL_FACE_BACK_BIT) {
7209       state->dynamic.vk.ds.stencil.back.op.fail = failOp;
7210       state->dynamic.vk.ds.stencil.back.op.pass = passOp;
7211       state->dynamic.vk.ds.stencil.back.op.depth_fail = depthFailOp;
7212       state->dynamic.vk.ds.stencil.back.op.compare = compareOp;
7213    }
7214 
7215    state->dirty |= RADV_CMD_DIRTY_DYNAMIC_STENCIL_OP;
7216 }
7217 
7218 VKAPI_ATTR void VKAPI_CALL
7219 radv_CmdSetFragmentShadingRateKHR(VkCommandBuffer commandBuffer, const VkExtent2D *pFragmentSize,
7220                                   const VkFragmentShadingRateCombinerOpKHR combinerOps[2])
7221 {
7222    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
7223    struct radv_cmd_state *state = &cmd_buffer->state;
7224 
7225    state->dynamic.vk.fsr.fragment_size = *pFragmentSize;
7226    for (unsigned i = 0; i < 2; i++)
7227       state->dynamic.vk.fsr.combiner_ops[i] = combinerOps[i];
7228 
7229    state->dirty |= RADV_CMD_DIRTY_DYNAMIC_FRAGMENT_SHADING_RATE;
7230 }
7231 
7232 VKAPI_ATTR void VKAPI_CALL
7233 radv_CmdSetDepthBiasEnable(VkCommandBuffer commandBuffer, VkBool32 depthBiasEnable)
7234 {
7235    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
7236    struct radv_cmd_state *state = &cmd_buffer->state;
7237 
7238    state->dynamic.vk.rs.depth_bias.enable = depthBiasEnable;
7239 
7240    state->dirty |= RADV_CMD_DIRTY_DYNAMIC_DEPTH_BIAS_ENABLE;
7241 }
7242 
7243 VKAPI_ATTR void VKAPI_CALL
7244 radv_CmdSetPrimitiveRestartEnable(VkCommandBuffer commandBuffer, VkBool32 primitiveRestartEnable)
7245 {
7246    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
7247    struct radv_cmd_state *state = &cmd_buffer->state;
7248 
7249    state->dynamic.vk.ia.primitive_restart_enable = primitiveRestartEnable;
7250 
7251    state->dirty |= RADV_CMD_DIRTY_DYNAMIC_PRIMITIVE_RESTART_ENABLE;
7252 }
7253 
7254 VKAPI_ATTR void VKAPI_CALL
7255 radv_CmdSetRasterizerDiscardEnable(VkCommandBuffer commandBuffer, VkBool32 rasterizerDiscardEnable)
7256 {
7257    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
7258    struct radv_cmd_state *state = &cmd_buffer->state;
7259 
7260    state->dynamic.vk.rs.rasterizer_discard_enable = rasterizerDiscardEnable;
7261 
7262    state->dirty |= RADV_CMD_DIRTY_DYNAMIC_RASTERIZER_DISCARD_ENABLE;
7263 }
7264 
7265 VKAPI_ATTR void VKAPI_CALL
7266 radv_CmdSetPatchControlPointsEXT(VkCommandBuffer commandBuffer, uint32_t patchControlPoints)
7267 {
7268    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
7269    struct radv_cmd_state *state = &cmd_buffer->state;
7270 
7271    state->dynamic.vk.ts.patch_control_points = patchControlPoints;
7272 
7273    state->dirty |= RADV_CMD_DIRTY_DYNAMIC_PATCH_CONTROL_POINTS;
7274 }
7275 
7276 VKAPI_ATTR void VKAPI_CALL
7277 radv_CmdSetLogicOpEXT(VkCommandBuffer commandBuffer, VkLogicOp logicOp)
7278 {
7279    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
7280    struct radv_cmd_state *state = &cmd_buffer->state;
7281    unsigned logic_op = radv_translate_blend_logic_op(logicOp);
7282 
7283    state->dynamic.vk.cb.logic_op = logic_op;
7284 
7285    state->dirty |= RADV_CMD_DIRTY_DYNAMIC_LOGIC_OP;
7286 }
7287 
7288 VKAPI_ATTR void VKAPI_CALL
7289 radv_CmdSetColorWriteEnableEXT(VkCommandBuffer commandBuffer, uint32_t attachmentCount,
7290                                const VkBool32 *pColorWriteEnables)
7291 {
7292    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
7293    struct radv_cmd_state *state = &cmd_buffer->state;
7294    uint8_t color_write_enable = 0;
7295 
7296    assert(attachmentCount <= MAX_RTS);
7297 
7298    for (uint32_t i = 0; i < attachmentCount; i++) {
7299       if (pColorWriteEnables[i]) {
7300          color_write_enable |= BITFIELD_BIT(i);
7301       }
7302    }
7303 
7304    state->dynamic.vk.cb.color_write_enables = color_write_enable;
7305 
7306    state->dirty |= RADV_CMD_DIRTY_DYNAMIC_COLOR_WRITE_ENABLE;
7307 }
7308 
7309 VKAPI_ATTR void VKAPI_CALL
7310 radv_CmdSetVertexInputEXT(VkCommandBuffer commandBuffer, uint32_t vertexBindingDescriptionCount,
7311                           const VkVertexInputBindingDescription2EXT *pVertexBindingDescriptions,
7312                           uint32_t vertexAttributeDescriptionCount,
7313                           const VkVertexInputAttributeDescription2EXT *pVertexAttributeDescriptions)
7314 {
7315    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
7316    struct radv_cmd_state *state = &cmd_buffer->state;
7317    struct radv_vs_input_state *vs_state = &state->dynamic_vs_input;
7318 
7319    const VkVertexInputBindingDescription2EXT *bindings[MAX_VBS];
7320    for (unsigned i = 0; i < vertexBindingDescriptionCount; i++)
7321       bindings[pVertexBindingDescriptions[i].binding] = &pVertexBindingDescriptions[i];
7322 
7323    state->vbo_misaligned_mask = 0;
7324    state->vbo_misaligned_mask_invalid = 0;
7325 
7326    vs_state->attribute_mask = 0;
7327    vs_state->instance_rate_inputs = 0;
7328    vs_state->nontrivial_divisors = 0;
7329    vs_state->zero_divisors = 0;
7330    vs_state->post_shuffle = 0;
7331    vs_state->alpha_adjust_lo = 0;
7332    vs_state->alpha_adjust_hi = 0;
7333    vs_state->nontrivial_formats = 0;
7334    vs_state->bindings_match_attrib = true;
7335 
7336    enum amd_gfx_level chip = cmd_buffer->device->physical_device->rad_info.gfx_level;
7337    enum radeon_family family = cmd_buffer->device->physical_device->rad_info.family;
7338    const struct ac_vtx_format_info *vtx_info_table = ac_get_vtx_format_info_table(chip, family);
7339 
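   /* Walk the attribute descriptions and derive everything the vertex fetch
    * path needs per location: binding, instance divisor, format info, alignment
    * requirement and (on GFX6 and GFX10+) the misaligned-VBO mask, which the
    * fetch code uses to handle bindings that violate the alignment rules.
    */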
7340    for (unsigned i = 0; i < vertexAttributeDescriptionCount; i++) {
7341       const VkVertexInputAttributeDescription2EXT *attrib = &pVertexAttributeDescriptions[i];
7342       const VkVertexInputBindingDescription2EXT *binding = bindings[attrib->binding];
7343       unsigned loc = attrib->location;
7344 
7345       vs_state->attribute_mask |= 1u << loc;
7346       vs_state->bindings[loc] = attrib->binding;
7347       if (attrib->binding != loc)
7348          vs_state->bindings_match_attrib = false;
7349       if (binding->inputRate == VK_VERTEX_INPUT_RATE_INSTANCE) {
7350          vs_state->instance_rate_inputs |= 1u << loc;
7351          vs_state->divisors[loc] = binding->divisor;
7352          if (binding->divisor == 0) {
7353             vs_state->zero_divisors |= 1u << loc;
7354          } else if (binding->divisor > 1) {
7355             vs_state->nontrivial_divisors |= 1u << loc;
7356          }
7357       }
7358       cmd_buffer->vertex_bindings[attrib->binding].stride = binding->stride;
7359       vs_state->offsets[loc] = attrib->offset;
7360 
7361       enum pipe_format format = vk_format_map[attrib->format];
7362       const struct ac_vtx_format_info *vtx_info = &vtx_info_table[format];
7363 
7364       vs_state->formats[loc] = format;
7365       uint8_t align_req_minus_1 = vtx_info->chan_byte_size >= 4 ? 3 : (vtx_info->element_size - 1);
7366       vs_state->format_align_req_minus_1[loc] = align_req_minus_1;
7367       vs_state->format_sizes[loc] = vtx_info->element_size;
7368       vs_state->alpha_adjust_lo |= (vtx_info->alpha_adjust & 0x1) << loc;
7369       vs_state->alpha_adjust_hi |= (vtx_info->alpha_adjust >> 1) << loc;
7370       if (G_008F0C_DST_SEL_X(vtx_info->dst_sel) == V_008F0C_SQ_SEL_Z)
7371          vs_state->post_shuffle |= BITFIELD_BIT(loc);
7372 
7373       if (!(vtx_info->has_hw_format & BITFIELD_BIT(vtx_info->num_channels - 1)))
7374          vs_state->nontrivial_formats |= BITFIELD_BIT(loc);
7375 
7376       if ((chip == GFX6 || chip >= GFX10) && state->vbo_bound_mask & BITFIELD_BIT(attrib->binding)) {
7377          if (binding->stride & align_req_minus_1) {
7378             state->vbo_misaligned_mask |= BITFIELD_BIT(loc);
7379          } else if ((cmd_buffer->vertex_bindings[attrib->binding].offset + vs_state->offsets[loc]) &
7380                     align_req_minus_1) {
7381             state->vbo_misaligned_mask |= BITFIELD_BIT(loc);
7382          }
7383       }
7384    }
7385 
7386    state->dirty |= RADV_CMD_DIRTY_VERTEX_BUFFER | RADV_CMD_DIRTY_DYNAMIC_VERTEX_INPUT;
7387 }
7388 
7389 VKAPI_ATTR void VKAPI_CALL
7390 radv_CmdSetPolygonModeEXT(VkCommandBuffer commandBuffer, VkPolygonMode polygonMode)
7391 {
7392    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
7393    struct radv_cmd_state *state = &cmd_buffer->state;
7394    unsigned polygon_mode = radv_translate_fill(polygonMode);
7395 
7396    if (radv_polygon_mode_is_points_or_lines(state->dynamic.vk.rs.polygon_mode) !=
7397        radv_polygon_mode_is_points_or_lines(polygon_mode))
7398       state->dirty |= RADV_CMD_DIRTY_GUARDBAND;
7399 
7400    state->dynamic.vk.rs.polygon_mode = polygon_mode;
7401 
7402    state->dirty |= RADV_CMD_DIRTY_DYNAMIC_POLYGON_MODE;
7403 }
7404 
7405 VKAPI_ATTR void VKAPI_CALL
7406 radv_CmdSetTessellationDomainOriginEXT(VkCommandBuffer commandBuffer, VkTessellationDomainOrigin domainOrigin)
7407 {
7408    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
7409    struct radv_cmd_state *state = &cmd_buffer->state;
7410 
7411    state->dynamic.vk.ts.domain_origin = domainOrigin;
7412 
7413    state->dirty |= RADV_CMD_DIRTY_DYNAMIC_TESS_DOMAIN_ORIGIN;
7414 }
7415 
7416 VKAPI_ATTR void VKAPI_CALL
7417 radv_CmdSetLogicOpEnableEXT(VkCommandBuffer commandBuffer, VkBool32 logicOpEnable)
7418 {
7419    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
7420    struct radv_cmd_state *state = &cmd_buffer->state;
7421 
7422    state->dynamic.vk.cb.logic_op_enable = logicOpEnable;
7423 
7424    state->dirty |= RADV_CMD_DIRTY_DYNAMIC_LOGIC_OP_ENABLE;
7425 }
7426 
7427 VKAPI_ATTR void VKAPI_CALL
7428 radv_CmdSetLineStippleEnableEXT(VkCommandBuffer commandBuffer, VkBool32 stippledLineEnable)
7429 {
7430    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
7431    struct radv_cmd_state *state = &cmd_buffer->state;
7432 
7433    state->dynamic.vk.rs.line.stipple.enable = stippledLineEnable;
7434 
7435    state->dirty |= RADV_CMD_DIRTY_DYNAMIC_LINE_STIPPLE_ENABLE;
7436 }
7437 
7438 VKAPI_ATTR void VKAPI_CALL
7439 radv_CmdSetAlphaToCoverageEnableEXT(VkCommandBuffer commandBuffer, VkBool32 alphaToCoverageEnable)
7440 {
7441    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
7442    struct radv_cmd_state *state = &cmd_buffer->state;
7443 
7444    state->dynamic.vk.ms.alpha_to_coverage_enable = alphaToCoverageEnable;
7445 
7446    state->dirty |= RADV_CMD_DIRTY_DYNAMIC_ALPHA_TO_COVERAGE_ENABLE;
7447 }
7448 
7449 VKAPI_ATTR void VKAPI_CALL
7450 radv_CmdSetSampleMaskEXT(VkCommandBuffer commandBuffer, VkSampleCountFlagBits samples, const VkSampleMask *pSampleMask)
7451 {
7452    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
7453    struct radv_cmd_state *state = &cmd_buffer->state;
7454 
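   /* Only the low 16 bits of the first mask word are kept; that covers the
    * maximum sample count supported here, so a single word is enough.
    */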
7455    state->dynamic.vk.ms.sample_mask = pSampleMask[0] & 0xffff;
7456 
7457    state->dirty |= RADV_CMD_DIRTY_DYNAMIC_SAMPLE_MASK;
7458 }
7459 
7460 VKAPI_ATTR void VKAPI_CALL
7461 radv_CmdSetDepthClipEnableEXT(VkCommandBuffer commandBuffer, VkBool32 depthClipEnable)
7462 {
7463    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
7464    struct radv_cmd_state *state = &cmd_buffer->state;
7465 
7466    state->dynamic.vk.rs.depth_clip_enable = depthClipEnable;
7467 
7468    state->dirty |= RADV_CMD_DIRTY_DYNAMIC_DEPTH_CLIP_ENABLE;
7469 }
7470 
7471 VKAPI_ATTR void VKAPI_CALL
7472 radv_CmdSetConservativeRasterizationModeEXT(VkCommandBuffer commandBuffer,
7473                                             VkConservativeRasterizationModeEXT conservativeRasterizationMode)
7474 {
7475    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
7476    struct radv_cmd_state *state = &cmd_buffer->state;
7477 
7478    state->dynamic.vk.rs.conservative_mode = conservativeRasterizationMode;
7479 
7480    state->dirty |= RADV_CMD_DIRTY_DYNAMIC_CONSERVATIVE_RAST_MODE;
7481 }
7482 
7483 VKAPI_ATTR void VKAPI_CALL
7484 radv_CmdSetDepthClipNegativeOneToOneEXT(VkCommandBuffer commandBuffer, VkBool32 negativeOneToOne)
7485 {
7486    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
7487    struct radv_cmd_state *state = &cmd_buffer->state;
7488 
7489    state->dynamic.vk.vp.depth_clip_negative_one_to_one = negativeOneToOne;
7490 
7491    state->dirty |= RADV_CMD_DIRTY_DYNAMIC_DEPTH_CLIP_NEGATIVE_ONE_TO_ONE;
7492 }
7493 
7494 VKAPI_ATTR void VKAPI_CALL
7495 radv_CmdSetProvokingVertexModeEXT(VkCommandBuffer commandBuffer, VkProvokingVertexModeEXT provokingVertexMode)
7496 {
7497    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
7498    struct radv_cmd_state *state = &cmd_buffer->state;
7499 
7500    state->dynamic.vk.rs.provoking_vertex = provokingVertexMode;
7501 
7502    state->dirty |= RADV_CMD_DIRTY_DYNAMIC_PROVOKING_VERTEX_MODE;
7503 }
7504 
7505 VKAPI_ATTR void VKAPI_CALL
7506 radv_CmdSetDepthClampEnableEXT(VkCommandBuffer commandBuffer, VkBool32 depthClampEnable)
7507 {
7508    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
7509    struct radv_cmd_state *state = &cmd_buffer->state;
7510 
7511    state->dynamic.vk.rs.depth_clamp_enable = depthClampEnable;
7512 
7513    state->dirty |= RADV_CMD_DIRTY_DYNAMIC_DEPTH_CLAMP_ENABLE;
7514 }
7515 
7516 VKAPI_ATTR void VKAPI_CALL
7517 radv_CmdSetColorWriteMaskEXT(VkCommandBuffer commandBuffer, uint32_t firstAttachment, uint32_t attachmentCount,
7518                              const VkColorComponentFlags *pColorWriteMasks)
7519 {
7520    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
7521    struct radv_cmd_state *state = &cmd_buffer->state;
7522 
7523    assert(firstAttachment + attachmentCount <= MAX_RTS);
7524 
7525    for (uint32_t i = 0; i < attachmentCount; i++) {
7526       uint32_t idx = firstAttachment + i;
7527 
7528       state->dynamic.vk.cb.attachments[idx].write_mask = pColorWriteMasks[i];
7529    }
7530 
7531    state->dirty |= RADV_CMD_DIRTY_DYNAMIC_COLOR_WRITE_MASK;
7532 
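   /* RB+ (rbplus) register state depends on the color write masks, so it has to
    * be re-emitted when the masks change and the hardware supports RB+.
    */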
7533    if (cmd_buffer->device->physical_device->rad_info.rbplus_allowed)
7534       state->dirty |= RADV_CMD_DIRTY_RBPLUS;
7535 }
7536 
7537 VKAPI_ATTR void VKAPI_CALL
7538 radv_CmdSetColorBlendEnableEXT(VkCommandBuffer commandBuffer, uint32_t firstAttachment, uint32_t attachmentCount,
7539                                const VkBool32 *pColorBlendEnables)
7540 {
7541    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
7542    struct radv_cmd_state *state = &cmd_buffer->state;
7543 
7544    assert(firstAttachment + attachmentCount <= MAX_RTS);
7545 
7546    for (uint32_t i = 0; i < attachmentCount; i++) {
7547       uint32_t idx = firstAttachment + i;
7548 
7549       state->dynamic.vk.cb.attachments[idx].blend_enable = pColorBlendEnables[i];
7550    }
7551 
7552    state->dirty |= RADV_CMD_DIRTY_DYNAMIC_COLOR_BLEND_ENABLE;
7553 }
7554 
7555 VKAPI_ATTR void VKAPI_CALL
7556 radv_CmdSetRasterizationSamplesEXT(VkCommandBuffer commandBuffer, VkSampleCountFlagBits rasterizationSamples)
7557 {
7558    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
7559    struct radv_cmd_state *state = &cmd_buffer->state;
7560 
7561    state->dynamic.vk.ms.rasterization_samples = rasterizationSamples;
7562 
7563    state->dirty |= RADV_CMD_DIRTY_DYNAMIC_RASTERIZATION_SAMPLES;
7564 }
7565 
7566 VKAPI_ATTR void VKAPI_CALL
7567 radv_CmdSetLineRasterizationModeEXT(VkCommandBuffer commandBuffer, VkLineRasterizationModeKHR lineRasterizationMode)
7568 {
7569    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
7570    struct radv_cmd_state *state = &cmd_buffer->state;
7571 
7572    state->dynamic.vk.rs.line.mode = lineRasterizationMode;
7573 
7574    state->dirty |= RADV_CMD_DIRTY_DYNAMIC_LINE_RASTERIZATION_MODE;
7575 }
7576 
7577 VKAPI_ATTR void VKAPI_CALL
7578 radv_CmdSetColorBlendEquationEXT(VkCommandBuffer commandBuffer, uint32_t firstAttachment, uint32_t attachmentCount,
7579                                  const VkColorBlendEquationEXT *pColorBlendEquations)
7580 {
7581    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
7582    struct radv_cmd_state *state = &cmd_buffer->state;
7583 
7584    assert(firstAttachment + attachmentCount <= MAX_RTS);
7585    for (uint32_t i = 0; i < attachmentCount; i++) {
7586       unsigned idx = firstAttachment + i;
7587 
7588       state->dynamic.vk.cb.attachments[idx].src_color_blend_factor = pColorBlendEquations[i].srcColorBlendFactor;
7589       state->dynamic.vk.cb.attachments[idx].dst_color_blend_factor = pColorBlendEquations[i].dstColorBlendFactor;
7590       state->dynamic.vk.cb.attachments[idx].color_blend_op = pColorBlendEquations[i].colorBlendOp;
7591       state->dynamic.vk.cb.attachments[idx].src_alpha_blend_factor = pColorBlendEquations[i].srcAlphaBlendFactor;
7592       state->dynamic.vk.cb.attachments[idx].dst_alpha_blend_factor = pColorBlendEquations[i].dstAlphaBlendFactor;
7593       state->dynamic.vk.cb.attachments[idx].alpha_blend_op = pColorBlendEquations[i].alphaBlendOp;
7594    }
7595 
7596    state->dirty |= RADV_CMD_DIRTY_DYNAMIC_COLOR_BLEND_EQUATION;
7597 }
7598 
7599 VKAPI_ATTR void VKAPI_CALL
7600 radv_CmdSetSampleLocationsEnableEXT(VkCommandBuffer commandBuffer, VkBool32 sampleLocationsEnable)
7601 {
7602    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
7603    struct radv_cmd_state *state = &cmd_buffer->state;
7604 
7605    state->dynamic.vk.ms.sample_locations_enable = sampleLocationsEnable;
7606 
7607    state->dirty |= RADV_CMD_DIRTY_DYNAMIC_SAMPLE_LOCATIONS_ENABLE;
7608 }
7609 
7610 VKAPI_ATTR void VKAPI_CALL
7611 radv_CmdSetDiscardRectangleEnableEXT(VkCommandBuffer commandBuffer, VkBool32 discardRectangleEnable)
7612 {
7613    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
7614    struct radv_cmd_state *state = &cmd_buffer->state;
7615 
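   /* The rectangle count isn't provided when the enable is set dynamically, so
    * conservatively assume the maximum number of discard rectangles.
    */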
7616    state->dynamic.vk.dr.enable = discardRectangleEnable;
7617    state->dynamic.vk.dr.rectangle_count = discardRectangleEnable ? MAX_DISCARD_RECTANGLES : 0;
7618 
7619    state->dirty |= RADV_CMD_DIRTY_DYNAMIC_DISCARD_RECTANGLE_ENABLE;
7620 }
7621 
7622 VKAPI_ATTR void VKAPI_CALL
7623 radv_CmdSetDiscardRectangleModeEXT(VkCommandBuffer commandBuffer, VkDiscardRectangleModeEXT discardRectangleMode)
7624 {
7625    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
7626    struct radv_cmd_state *state = &cmd_buffer->state;
7627 
7628    state->dynamic.vk.dr.mode = discardRectangleMode;
7629 
7630    state->dirty |= RADV_CMD_DIRTY_DYNAMIC_DISCARD_RECTANGLE_MODE;
7631 }
7632 
7633 VKAPI_ATTR void VKAPI_CALL
7634 radv_CmdSetAttachmentFeedbackLoopEnableEXT(VkCommandBuffer commandBuffer, VkImageAspectFlags aspectMask)
7635 {
7636    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
7637    struct radv_cmd_state *state = &cmd_buffer->state;
7638 
7639    state->dynamic.feedback_loop_aspects = aspectMask;
7640 
7641    state->dirty |= RADV_CMD_DIRTY_DYNAMIC_ATTACHMENT_FEEDBACK_LOOP_ENABLE;
7642 }
7643 
7644 VKAPI_ATTR void VKAPI_CALL
7645 radv_CmdSetDepthBias2EXT(VkCommandBuffer commandBuffer, const VkDepthBiasInfoEXT *pDepthBiasInfo)
7646 {
7647    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
7648    struct radv_cmd_state *state = &cmd_buffer->state;
7649 
7650    const VkDepthBiasRepresentationInfoEXT *dbr_info =
7651       vk_find_struct_const(pDepthBiasInfo->pNext, DEPTH_BIAS_REPRESENTATION_INFO_EXT);
7652 
7653    state->dynamic.vk.rs.depth_bias.constant = pDepthBiasInfo->depthBiasConstantFactor;
7654    state->dynamic.vk.rs.depth_bias.clamp = pDepthBiasInfo->depthBiasClamp;
7655    state->dynamic.vk.rs.depth_bias.slope = pDepthBiasInfo->depthBiasSlopeFactor;
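   /* Without a chained VkDepthBiasRepresentationInfoEXT, the default
    * representation (least representable value of the format) applies.
    */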
7656    state->dynamic.vk.rs.depth_bias.representation =
7657       dbr_info ? dbr_info->depthBiasRepresentation : VK_DEPTH_BIAS_REPRESENTATION_LEAST_REPRESENTABLE_VALUE_FORMAT_EXT;
7658 
7659    state->dirty |= RADV_CMD_DIRTY_DYNAMIC_DEPTH_BIAS;
7660 }
7661 
7662 VKAPI_ATTR void VKAPI_CALL
7663 radv_CmdExecuteCommands(VkCommandBuffer commandBuffer, uint32_t commandBufferCount, const VkCommandBuffer *pCmdBuffers)
7664 {
7665    RADV_FROM_HANDLE(radv_cmd_buffer, primary, commandBuffer);
7666 
7667    assert(commandBufferCount > 0);
7668 
7669    radv_emit_mip_change_flush_default(primary);
7670 
7671    /* Emit pending flushes on primary prior to executing secondary */
7672    radv_emit_cache_flush(primary);
7673 
7674    /* Make sure CP DMA is idle on primary prior to executing secondary. */
7675    radv_cp_dma_wait_for_idle(primary);
7676 
7677    for (uint32_t i = 0; i < commandBufferCount; i++) {
7678       RADV_FROM_HANDLE(radv_cmd_buffer, secondary, pCmdBuffers[i]);
7679 
7680       /* Do not launch an IB2 for secondary command buffers that contain
7681        * DRAW_{INDEX}_INDIRECT_{MULTI} on GFX6-7 because it's illegal and hangs the GPU.
7682        */
7683       const bool allow_ib2 =
7684          !secondary->state.uses_draw_indirect || secondary->device->physical_device->rad_info.gfx_level >= GFX8;
7685 
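      /* Propagate the secondary's resource requirements (scratch sizes, rings,
       * GDS, shader upload counter) so the primary's submission reserves enough.
       */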
7686       primary->scratch_size_per_wave_needed =
7687          MAX2(primary->scratch_size_per_wave_needed, secondary->scratch_size_per_wave_needed);
7688       primary->scratch_waves_wanted = MAX2(primary->scratch_waves_wanted, secondary->scratch_waves_wanted);
7689       primary->compute_scratch_size_per_wave_needed =
7690          MAX2(primary->compute_scratch_size_per_wave_needed, secondary->compute_scratch_size_per_wave_needed);
7691       primary->compute_scratch_waves_wanted =
7692          MAX2(primary->compute_scratch_waves_wanted, secondary->compute_scratch_waves_wanted);
7693 
7694       if (secondary->esgs_ring_size_needed > primary->esgs_ring_size_needed)
7695          primary->esgs_ring_size_needed = secondary->esgs_ring_size_needed;
7696       if (secondary->gsvs_ring_size_needed > primary->gsvs_ring_size_needed)
7697          primary->gsvs_ring_size_needed = secondary->gsvs_ring_size_needed;
7698       if (secondary->tess_rings_needed)
7699          primary->tess_rings_needed = true;
7700       if (secondary->task_rings_needed)
7701          primary->task_rings_needed = true;
7702       if (secondary->mesh_scratch_ring_needed)
7703          primary->mesh_scratch_ring_needed = true;
7704       if (secondary->sample_positions_needed)
7705          primary->sample_positions_needed = true;
7706       if (secondary->gds_needed)
7707          primary->gds_needed = true;
7708       if (secondary->gds_oa_needed)
7709          primary->gds_oa_needed = true;
7710 
7711       primary->shader_upload_seq = MAX2(primary->shader_upload_seq, secondary->shader_upload_seq);
7712 
7713       if (!secondary->state.render.has_image_views && primary->state.render.active &&
7714           (primary->state.dirty & RADV_CMD_DIRTY_FRAMEBUFFER)) {
7715          /* Emit the framebuffer state from primary if secondary
7716           * has been recorded without a framebuffer, otherwise
7717           * fast color/depth clears can't work.
7718           */
7719          radv_emit_framebuffer_state(primary);
7720       }
7721 
7722       if (secondary->gang.cs) {
7723          if (!radv_gang_init(primary))
7724             return;
7725 
7726          struct radeon_cmdbuf *ace_primary = primary->gang.cs;
7727          struct radeon_cmdbuf *ace_secondary = secondary->gang.cs;
7728 
7729          /* Emit pending flushes on primary prior to executing secondary. */
7730          radv_gang_cache_flush(primary);
7731 
7732          /* Wait for gang semaphores, if necessary. */
7733          if (radv_flush_gang_leader_semaphore(primary))
7734             radv_wait_gang_leader(primary);
7735          if (radv_flush_gang_follower_semaphore(primary))
7736             radv_wait_gang_follower(primary);
7737 
7738          /* Execute the secondary compute cmdbuf.
7739           * Don't use IB2 packets because they are not supported on compute queues.
7740           */
7741          primary->device->ws->cs_execute_secondary(ace_primary, ace_secondary, false);
7742       }
7743 
7744       /* Update pending ACE internal flush bits from the secondary cmdbuf */
7745       primary->gang.flush_bits |= secondary->gang.flush_bits;
7746 
7747       /* Increment gang semaphores if secondary was dirty.
7748        * This happens when the secondary cmdbuf has a barrier which
7749        * isn't consumed by a draw call.
7750        */
7751       if (radv_gang_leader_sem_dirty(secondary))
7752          primary->gang.sem.leader_value++;
7753       if (radv_gang_follower_sem_dirty(secondary))
7754          primary->gang.sem.follower_value++;
7755 
7756       primary->device->ws->cs_execute_secondary(primary->cs, secondary->cs, allow_ib2);
7757 
7758       /* When the secondary command buffer is compute only we don't
7759        * need to re-emit the current graphics pipeline.
7760        */
7761       if (secondary->state.emitted_graphics_pipeline) {
7762          primary->state.emitted_graphics_pipeline = secondary->state.emitted_graphics_pipeline;
7763       }
7764 
7765       /* When the secondary command buffer is graphics only we don't
7766        * need to re-emit the current compute pipeline.
7767        */
7768       if (secondary->state.emitted_compute_pipeline) {
7769          primary->state.emitted_compute_pipeline = secondary->state.emitted_compute_pipeline;
7770       }
7771 
7772       if (secondary->state.last_primitive_reset_index) {
7773          primary->state.last_primitive_reset_index = secondary->state.last_primitive_reset_index;
7774       }
7775 
7776       if (secondary->state.last_ia_multi_vgt_param) {
7777          primary->state.last_ia_multi_vgt_param = secondary->state.last_ia_multi_vgt_param;
7778       }
7779 
7780       if (secondary->state.last_ge_cntl) {
7781          primary->state.last_ge_cntl = secondary->state.last_ge_cntl;
7782       }
7783 
7784       primary->state.last_num_instances = secondary->state.last_num_instances;
7785       primary->state.last_subpass_color_count = secondary->state.last_subpass_color_count;
7786       primary->state.last_sx_ps_downconvert = secondary->state.last_sx_ps_downconvert;
7787       primary->state.last_sx_blend_opt_epsilon = secondary->state.last_sx_blend_opt_epsilon;
7788       primary->state.last_sx_blend_opt_control = secondary->state.last_sx_blend_opt_control;
7789 
7790       if (secondary->state.last_index_type != -1) {
7791          primary->state.last_index_type = secondary->state.last_index_type;
7792       }
7793 
7794       primary->state.last_vrs_rates = secondary->state.last_vrs_rates;
7795       primary->state.last_vrs_rates_sgpr_idx = secondary->state.last_vrs_rates_sgpr_idx;
7796 
7797       primary->state.last_pa_sc_binner_cntl_0 = secondary->state.last_pa_sc_binner_cntl_0;
7798 
7799       primary->state.last_db_shader_control = secondary->state.last_db_shader_control;
7800 
7801       primary->state.rb_noncoherent_dirty |= secondary->state.rb_noncoherent_dirty;
7802    }
7803 
7804    /* After executing commands from secondary buffers we have to dirty
7805     * some states.
7806     */
7807    primary->state.dirty |= RADV_CMD_DIRTY_PIPELINE | RADV_CMD_DIRTY_INDEX_BUFFER | RADV_CMD_DIRTY_GUARDBAND |
7808                            RADV_CMD_DIRTY_DYNAMIC_ALL | RADV_CMD_DIRTY_SHADER_QUERY | RADV_CMD_DIRTY_OCCLUSION_QUERY |
7809                            RADV_CMD_DIRTY_DB_SHADER_CONTROL;
7810    radv_mark_descriptor_sets_dirty(primary, VK_PIPELINE_BIND_POINT_GRAPHICS);
7811    radv_mark_descriptor_sets_dirty(primary, VK_PIPELINE_BIND_POINT_COMPUTE);
7812 
7813    primary->state.last_first_instance = -1;
7814    primary->state.last_drawid = -1;
7815    primary->state.last_vertex_offset_valid = false;
7816    primary->state.last_db_count_control = -1;
7817 }
7818 
7819 static void
7820 radv_mark_noncoherent_rb(struct radv_cmd_buffer *cmd_buffer)
7821 {
7822    struct radv_rendering_state *render = &cmd_buffer->state.render;
7823 
7824    /* Have to be conservative in cmdbuffers with inherited attachments. */
7825    if (!render->has_image_views) {
7826       cmd_buffer->state.rb_noncoherent_dirty = true;
7827       return;
7828    }
7829 
7830    for (uint32_t i = 0; i < render->color_att_count; i++) {
7831       if (render->color_att[i].iview && !render->color_att[i].iview->image->l2_coherent) {
7832          cmd_buffer->state.rb_noncoherent_dirty = true;
7833          return;
7834       }
7835    }
7836    if (render->ds_att.iview && !render->ds_att.iview->image->l2_coherent)
7837       cmd_buffer->state.rb_noncoherent_dirty = true;
7838 }
7839 
7840 static VkImageLayout
7841 attachment_initial_layout(const VkRenderingAttachmentInfo *att)
7842 {
7843    const VkRenderingAttachmentInitialLayoutInfoMESA *layout_info =
7844       vk_find_struct_const(att->pNext, RENDERING_ATTACHMENT_INITIAL_LAYOUT_INFO_MESA);
7845    if (layout_info != NULL)
7846       return layout_info->initialLayout;
7847 
7848    return att->imageLayout;
7849 }
7850 
7851 VKAPI_ATTR void VKAPI_CALL
7852 radv_CmdBeginRendering(VkCommandBuffer commandBuffer, const VkRenderingInfo *pRenderingInfo)
7853 {
7854    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
7855 
7856    const struct VkSampleLocationsInfoEXT *sample_locs_info =
7857       vk_find_struct_const(pRenderingInfo->pNext, SAMPLE_LOCATIONS_INFO_EXT);
7858 
7859    struct radv_sample_locations_state sample_locations = {
7860       .count = 0,
7861    };
7862    if (sample_locs_info) {
7863       sample_locations = (struct radv_sample_locations_state){
7864          .per_pixel = sample_locs_info->sampleLocationsPerPixel,
7865          .grid_size = sample_locs_info->sampleLocationGridSize,
7866          .count = sample_locs_info->sampleLocationsCount,
7867       };
7868       typed_memcpy(sample_locations.locations, sample_locs_info->pSampleLocations,
7869                    sample_locs_info->sampleLocationsCount);
7870    }
7871 
7872    /* Dynamic rendering does not have implicit transitions, so limit the marker to
7873     * when a render pass is used.
7874     * Additionally, some internal meta operations called inside a barrier may issue
7875     * render calls (with dynamic rendering), so this makes sure those cases don't
7876     * create a nested barrier scope.
7877     */
7878    if (cmd_buffer->vk.render_pass)
7879       radv_describe_barrier_start(cmd_buffer, RGP_BARRIER_EXTERNAL_RENDER_PASS_SYNC);
7880    uint32_t color_samples = 0, ds_samples = 0;
7881    struct radv_attachment color_att[MAX_RTS];
7882    for (uint32_t i = 0; i < pRenderingInfo->colorAttachmentCount; i++) {
7883       const VkRenderingAttachmentInfo *att_info = &pRenderingInfo->pColorAttachments[i];
7884 
7885       color_att[i] = (struct radv_attachment){.iview = NULL};
7886       if (att_info->imageView == VK_NULL_HANDLE)
7887          continue;
7888 
7889       VK_FROM_HANDLE(radv_image_view, iview, att_info->imageView);
7890       color_att[i].format = iview->vk.format;
7891       color_att[i].iview = iview;
7892       color_att[i].layout = att_info->imageLayout;
7893       radv_initialise_color_surface(cmd_buffer->device, &color_att[i].cb, iview);
7894 
7895       if (att_info->resolveMode != VK_RESOLVE_MODE_NONE && att_info->resolveImageView != VK_NULL_HANDLE) {
7896          color_att[i].resolve_mode = att_info->resolveMode;
7897          color_att[i].resolve_iview = radv_image_view_from_handle(att_info->resolveImageView);
7898          color_att[i].resolve_layout = att_info->resolveImageLayout;
7899       }
7900 
7901       color_samples = MAX2(color_samples, color_att[i].iview->vk.image->samples);
7902 
7903       VkImageLayout initial_layout = attachment_initial_layout(att_info);
7904       if (initial_layout != color_att[i].layout) {
7905          assert(!(pRenderingInfo->flags & VK_RENDERING_RESUMING_BIT));
7906          radv_handle_rendering_image_transition(cmd_buffer, color_att[i].iview, pRenderingInfo->layerCount,
7907                                                 pRenderingInfo->viewMask, initial_layout, VK_IMAGE_LAYOUT_UNDEFINED,
7908                                                 color_att[i].layout, VK_IMAGE_LAYOUT_UNDEFINED, &sample_locations);
7909       }
7910    }
7911 
7912    struct radv_attachment ds_att = {.iview = NULL};
7913    VkImageAspectFlags ds_att_aspects = 0;
7914    const VkRenderingAttachmentInfo *d_att_info = pRenderingInfo->pDepthAttachment;
7915    const VkRenderingAttachmentInfo *s_att_info = pRenderingInfo->pStencilAttachment;
7916    if ((d_att_info != NULL && d_att_info->imageView != VK_NULL_HANDLE) ||
7917        (s_att_info != NULL && s_att_info->imageView != VK_NULL_HANDLE)) {
7918       struct radv_image_view *d_iview = NULL, *s_iview = NULL;
7919       struct radv_image_view *d_res_iview = NULL, *s_res_iview = NULL;
7920       VkImageLayout initial_depth_layout = VK_IMAGE_LAYOUT_UNDEFINED;
7921       VkImageLayout initial_stencil_layout = VK_IMAGE_LAYOUT_UNDEFINED;
7922 
7923       if (d_att_info != NULL && d_att_info->imageView != VK_NULL_HANDLE) {
7924          d_iview = radv_image_view_from_handle(d_att_info->imageView);
7925          initial_depth_layout = attachment_initial_layout(d_att_info);
7926          ds_att.layout = d_att_info->imageLayout;
7927 
7928          if (d_att_info->resolveMode != VK_RESOLVE_MODE_NONE && d_att_info->resolveImageView != VK_NULL_HANDLE) {
7929             d_res_iview = radv_image_view_from_handle(d_att_info->resolveImageView);
7930             ds_att.resolve_mode = d_att_info->resolveMode;
7931             ds_att.resolve_layout = d_att_info->resolveImageLayout;
7932          }
7933       }
7934 
7935       if (s_att_info != NULL && s_att_info->imageView != VK_NULL_HANDLE) {
7936          s_iview = radv_image_view_from_handle(s_att_info->imageView);
7937          initial_stencil_layout = attachment_initial_layout(s_att_info);
7938          ds_att.stencil_layout = s_att_info->imageLayout;
7939 
7940          if (s_att_info->resolveMode != VK_RESOLVE_MODE_NONE && s_att_info->resolveImageView != VK_NULL_HANDLE) {
7941             s_res_iview = radv_image_view_from_handle(s_att_info->resolveImageView);
7942             ds_att.stencil_resolve_mode = s_att_info->resolveMode;
7943             ds_att.stencil_resolve_layout = s_att_info->resolveImageLayout;
7944          }
7945       }
7946 
7947       assert(d_iview == NULL || s_iview == NULL || d_iview == s_iview);
7948       ds_att.iview = d_iview ? d_iview : s_iview, ds_att.format = ds_att.iview->vk.format;
7949 
7950       if (d_iview && s_iview) {
7951          ds_att_aspects = VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT;
7952       } else if (d_iview) {
7953          ds_att_aspects = VK_IMAGE_ASPECT_DEPTH_BIT;
7954       } else {
7955          ds_att_aspects = VK_IMAGE_ASPECT_STENCIL_BIT;
7956       }
7957 
7958       radv_initialise_ds_surface(cmd_buffer->device, &ds_att.ds, ds_att.iview, ds_att_aspects);
7959 
7960       assert(d_res_iview == NULL || s_res_iview == NULL || d_res_iview == s_res_iview);
7961       ds_att.resolve_iview = d_res_iview ? d_res_iview : s_res_iview;
7962 
7963       ds_samples = ds_att.iview->vk.image->samples;
7964 
7965       if (initial_depth_layout != ds_att.layout || initial_stencil_layout != ds_att.stencil_layout) {
7966          assert(!(pRenderingInfo->flags & VK_RENDERING_RESUMING_BIT));
7967          radv_handle_rendering_image_transition(cmd_buffer, ds_att.iview, pRenderingInfo->layerCount,
7968                                                 pRenderingInfo->viewMask, initial_depth_layout, initial_stencil_layout,
7969                                                 ds_att.layout, ds_att.stencil_layout, &sample_locations);
7970       }
7971    }
7972    if (cmd_buffer->vk.render_pass)
7973       radv_describe_barrier_end(cmd_buffer);
7974 
7975    const VkRenderingFragmentShadingRateAttachmentInfoKHR *fsr_info =
7976       vk_find_struct_const(pRenderingInfo->pNext, RENDERING_FRAGMENT_SHADING_RATE_ATTACHMENT_INFO_KHR);
7977    struct radv_attachment vrs_att = {.iview = NULL};
7978    VkExtent2D vrs_texel_size = {.width = 0};
7979    if (fsr_info && fsr_info->imageView) {
7980       VK_FROM_HANDLE(radv_image_view, iview, fsr_info->imageView);
7981       vrs_att = (struct radv_attachment){
7982          .format = iview->vk.format,
7983          .iview = iview,
7984          .layout = fsr_info->imageLayout,
7985       };
7986       vrs_texel_size = fsr_info->shadingRateAttachmentTexelSize;
7987    }
7988 
7989    /* Now that we've done any layout transitions which may invoke meta, we can
7990     * fill out the actual rendering info and set up for the client's render pass.
7991     */
7992    radv_cmd_buffer_reset_rendering(cmd_buffer);
7993 
7994    struct radv_rendering_state *render = &cmd_buffer->state.render;
7995    render->active = true;
7996    render->has_image_views = true;
7997    render->area = pRenderingInfo->renderArea;
7998    render->view_mask = pRenderingInfo->viewMask;
7999    render->layer_count = pRenderingInfo->layerCount;
8000    render->color_samples = color_samples;
8001    render->ds_samples = ds_samples;
8002    render->max_samples = MAX2(color_samples, ds_samples);
8003    render->sample_locations = sample_locations;
8004    render->color_att_count = pRenderingInfo->colorAttachmentCount;
8005    typed_memcpy(render->color_att, color_att, render->color_att_count);
8006    render->ds_att = ds_att;
8007    render->ds_att_aspects = ds_att_aspects;
8008    render->vrs_att = vrs_att;
8009    render->vrs_texel_size = vrs_texel_size;
8010    cmd_buffer->state.dirty |= RADV_CMD_DIRTY_FRAMEBUFFER;
8011 
8012    if (cmd_buffer->device->physical_device->rad_info.rbplus_allowed)
8013       cmd_buffer->state.dirty |= RADV_CMD_DIRTY_RBPLUS;
8014 
8015    cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_DEPTH_BIAS | RADV_CMD_DIRTY_DYNAMIC_STENCIL_TEST_ENABLE;
8016 
8017    if (render->vrs_att.iview && cmd_buffer->device->physical_device->rad_info.gfx_level == GFX10_3) {
8018       if (render->ds_att.iview &&
8019           radv_htile_enabled(render->ds_att.iview->image, render->ds_att.iview->vk.base_mip_level)) {
8020          /* When we have a VRS attachment and a depth/stencil attachment, we just need to copy the
8021           * VRS rates to the HTILE buffer of the attachment.
8022           */
8023          struct radv_image_view *ds_iview = render->ds_att.iview;
8024          struct radv_image *ds_image = ds_iview->image;
8025          uint32_t level = ds_iview->vk.base_mip_level;
8026 
8027          /* HTILE buffer */
8028          uint64_t htile_offset = ds_image->bindings[0].offset + ds_image->planes[0].surface.meta_offset +
8029                                  ds_image->planes[0].surface.u.gfx9.meta_levels[level].offset;
8030          uint64_t htile_size = ds_image->planes[0].surface.u.gfx9.meta_levels[level].size;
8031          struct radv_buffer htile_buffer;
8032 
8033          radv_buffer_init(&htile_buffer, cmd_buffer->device, ds_image->bindings[0].bo, htile_size, htile_offset);
8034 
8035          assert(render->area.offset.x + render->area.extent.width <= ds_image->vk.extent.width &&
8036                 render->area.offset.y + render->area.extent.height <= ds_image->vk.extent.height);
8037 
8038          /* Copy the VRS rates to the HTILE buffer. */
8039          radv_copy_vrs_htile(cmd_buffer, render->vrs_att.iview->image, &render->area, ds_image, &htile_buffer, true);
8040 
8041          radv_buffer_finish(&htile_buffer);
8042       } else {
8043          /* When a subpass uses a VRS attachment without binding a depth/stencil attachment, or when
8044           * HTILE isn't enabled, we use a fallback that copies the VRS rates to our internal HTILE buffer.
8045           */
8046          struct radv_image *ds_image = radv_cmd_buffer_get_vrs_image(cmd_buffer);
8047 
8048          if (ds_image && render->area.offset.x < ds_image->vk.extent.width &&
8049              render->area.offset.y < ds_image->vk.extent.height) {
8050             /* HTILE buffer */
8051             struct radv_buffer *htile_buffer = cmd_buffer->device->vrs.buffer;
8052 
8053             VkRect2D area = render->area;
8054             area.extent.width = MIN2(area.extent.width, ds_image->vk.extent.width - area.offset.x);
8055             area.extent.height = MIN2(area.extent.height, ds_image->vk.extent.height - area.offset.y);
8056 
8057             /* Copy the VRS rates to the HTILE buffer. */
8058             radv_copy_vrs_htile(cmd_buffer, render->vrs_att.iview->image, &area, ds_image, htile_buffer, false);
8059          }
8060       }
8061    }
8062 
8063    radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 6);
8064    radeon_set_context_reg(cmd_buffer->cs, R_028204_PA_SC_WINDOW_SCISSOR_TL,
8065                           S_028204_TL_X(render->area.offset.x) | S_028204_TL_Y(render->area.offset.y));
8066    radeon_set_context_reg(cmd_buffer->cs, R_028208_PA_SC_WINDOW_SCISSOR_BR,
8067                           S_028208_BR_X(render->area.offset.x + render->area.extent.width) |
8068                              S_028208_BR_Y(render->area.offset.y + render->area.extent.height));
8069 
8070    radv_emit_fb_mip_change_flush(cmd_buffer);
8071 
8072    if (!(pRenderingInfo->flags & VK_RENDERING_RESUMING_BIT))
8073       radv_cmd_buffer_clear_rendering(cmd_buffer, pRenderingInfo);
8074 }
8075 
8076 VKAPI_ATTR void VKAPI_CALL
8077 radv_CmdEndRendering(VkCommandBuffer commandBuffer)
8078 {
8079    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
8080 
8081    radv_mark_noncoherent_rb(cmd_buffer);
8082    radv_cmd_buffer_resolve_rendering(cmd_buffer);
8083    radv_cmd_buffer_reset_rendering(cmd_buffer);
8084 }
8085 
8086 static void
8087 radv_emit_view_index_per_stage(struct radeon_cmdbuf *cs, const struct radv_shader *shader, uint32_t base_reg,
8088                                unsigned index)
8089 {
8090    const struct radv_userdata_info *loc = radv_get_user_sgpr(shader, AC_UD_VIEW_INDEX);
8091 
8092    if (loc->sgpr_idx == -1)
8093       return;
8094 
8095    radeon_set_sh_reg(cs, base_reg + loc->sgpr_idx * 4, index);
8096 }
8097 
8098 static void
8099 radv_emit_view_index(struct radv_cmd_buffer *cmd_buffer, unsigned index)
8100 {
8101    struct radeon_cmdbuf *cs = cmd_buffer->cs;
8102 
8103    radv_foreach_stage(stage, cmd_buffer->state.active_stages & ~VK_SHADER_STAGE_TASK_BIT_EXT)
8104    {
8105       const struct radv_shader *shader = radv_get_shader(cmd_buffer->state.shaders, stage);
8106 
8107       radv_emit_view_index_per_stage(cs, shader, shader->info.user_data_0, index);
8108    }
8109 
8110    if (cmd_buffer->state.gs_copy_shader) {
8111       radv_emit_view_index_per_stage(cs, cmd_buffer->state.gs_copy_shader, R_00B130_SPI_SHADER_USER_DATA_VS_0, index);
8112    }
8113 
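   /* The task shader executes on the ACE (compute) queue, so its view index is written to the
    * gang command stream instead of the main GFX command stream.
    */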
8114    if (cmd_buffer->state.active_stages & VK_SHADER_STAGE_TASK_BIT_EXT) {
8115       radv_emit_view_index_per_stage(cmd_buffer->gang.cs, cmd_buffer->state.shaders[MESA_SHADER_TASK],
8116                                      cmd_buffer->state.shaders[MESA_SHADER_TASK]->info.user_data_0, index);
8117    }
8118 }
8119 
8120 /**
8121  * Emulates predication for MEC using COND_EXEC.
8122  * When the current command buffer is predicating, emit a COND_EXEC packet
8123  * so that the MEC skips the next few dwords worth of packets.
8124  *
8125  * To make it work with inverted conditional rendering, we allocate
8126  * space in the upload BO and emit some packets to invert the condition.
8127  */
8128 static void
8129 radv_cs_emit_compute_predication(struct radv_cmd_state *state, struct radeon_cmdbuf *cs, uint64_t inv_va,
8130                                  bool *inv_emitted, unsigned dwords)
8131 {
8132    if (!state->predicating)
8133       return;
8134 
8135    uint64_t va = state->predication_va;
8136 
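   /* A predication_type of 0 corresponds to inverted conditional rendering (render when the
    * condition value is zero), which a single COND_EXEC can't express; emulate it by keeping an
    * inverted copy of the condition at inv_va.
    */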
8137    if (!state->predication_type) {
8138       /* Invert the condition the first time it is needed. */
8139       if (!*inv_emitted) {
8140          *inv_emitted = true;
8141 
8142          /* Write 1 to the inverted predication VA. */
8143          radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
8144          radeon_emit(cs,
8145                      COPY_DATA_SRC_SEL(COPY_DATA_IMM) | COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) | COPY_DATA_WR_CONFIRM);
8146          radeon_emit(cs, 1);
8147          radeon_emit(cs, 0);
8148          radeon_emit(cs, inv_va);
8149          radeon_emit(cs, inv_va >> 32);
8150 
8151          /* If the API predication VA == 0, skip next command. */
8152          radeon_emit(cs, PKT3(PKT3_COND_EXEC, 3, 0));
8153          radeon_emit(cs, va);
8154          radeon_emit(cs, va >> 32);
8155          radeon_emit(cs, 0);
8156          radeon_emit(cs, 6); /* 1x COPY_DATA size */
8157 
8158          /* Write 0 to the new predication VA (when the API condition != 0) */
8159          radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
8160          radeon_emit(cs,
8161                      COPY_DATA_SRC_SEL(COPY_DATA_IMM) | COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) | COPY_DATA_WR_CONFIRM);
8162          radeon_emit(cs, 0);
8163          radeon_emit(cs, 0);
8164          radeon_emit(cs, inv_va);
8165          radeon_emit(cs, inv_va >> 32);
8166       }
8167 
8168       va = inv_va;
8169    }
8170 
8171    radeon_emit(cs, PKT3(PKT3_COND_EXEC, 3, 0));
8172    radeon_emit(cs, va);
8173    radeon_emit(cs, va >> 32);
8174    radeon_emit(cs, 0);      /* Cache policy */
8175    radeon_emit(cs, dwords); /* Size of the predicated packet(s) in DWORDs. */
8176 }
8177 
8178 static void
8179 radv_cs_emit_draw_packet(struct radv_cmd_buffer *cmd_buffer, uint32_t vertex_count, uint32_t use_opaque)
8180 {
8181    radeon_emit(cmd_buffer->cs, PKT3(PKT3_DRAW_INDEX_AUTO, 1, cmd_buffer->state.predicating));
8182    radeon_emit(cmd_buffer->cs, vertex_count);
8183    radeon_emit(cmd_buffer->cs, V_0287F0_DI_SRC_SEL_AUTO_INDEX | use_opaque);
8184 }
8185 
8186 /**
8187  * Emit a PKT3_DRAW_INDEX_2 packet to render "index_count" vertices.
8188  *
8189  * The starting address "index_va" may point anywhere within the index buffer. The number of
8190  * indexes allocated in the index buffer *past that point* is specified by "max_index_count".
8191  * Hardware uses this information to return 0 for out-of-bounds reads.
8192  */
8193 static void
8194 radv_cs_emit_draw_indexed_packet(struct radv_cmd_buffer *cmd_buffer, uint64_t index_va, uint32_t max_index_count,
8195                                  uint32_t index_count, bool not_eop)
8196 {
8197    radeon_emit(cmd_buffer->cs, PKT3(PKT3_DRAW_INDEX_2, 4, cmd_buffer->state.predicating));
8198    radeon_emit(cmd_buffer->cs, max_index_count);
8199    radeon_emit(cmd_buffer->cs, index_va);
8200    radeon_emit(cmd_buffer->cs, index_va >> 32);
8201    radeon_emit(cmd_buffer->cs, index_count);
8202    /* NOT_EOP allows merging multiple draws into 1 wave, but only user VGPRs
8203     * can be changed between draws and GS fast launch must be disabled.
8204     * NOT_EOP doesn't work on gfx9 and older.
8205     */
8206    radeon_emit(cmd_buffer->cs, V_0287F0_DI_SRC_SEL_DMA | S_0287F0_NOT_EOP(not_eop));
8207 }
8208 
8209 /* MUST inline this function to avoid massive perf loss in drawoverhead */
8210 ALWAYS_INLINE static void
8211 radv_cs_emit_indirect_draw_packet(struct radv_cmd_buffer *cmd_buffer, bool indexed, uint32_t draw_count,
8212                                   uint64_t count_va, uint32_t stride)
8213 {
8214    struct radeon_cmdbuf *cs = cmd_buffer->cs;
8215    const unsigned di_src_sel = indexed ? V_0287F0_DI_SRC_SEL_DMA : V_0287F0_DI_SRC_SEL_AUTO_INDEX;
8216    bool draw_id_enable = cmd_buffer->state.uses_drawid;
8217    uint32_t base_reg = cmd_buffer->state.vtx_base_sgpr;
8218    uint32_t vertex_offset_reg, start_instance_reg = 0, draw_id_reg = 0;
8219    bool predicating = cmd_buffer->state.predicating;
8220    bool mesh = cmd_buffer->state.mesh_shading;
8221    assert(base_reg);
8222 
8223    /* just reset draw state for vertex data */
8224    cmd_buffer->state.last_first_instance = -1;
8225    cmd_buffer->state.last_num_instances = -1;
8226    cmd_buffer->state.last_drawid = -1;
8227    cmd_buffer->state.last_vertex_offset_valid = false;
8228 
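   /* The indirect draw packets take user SGPR locations encoded as dword offsets from
    * SI_SH_REG_OFFSET; the CP writes the vertex offset and, when enabled, the draw ID and
    * start instance into those SGPRs for every draw it launches.
    */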
8229    vertex_offset_reg = (base_reg - SI_SH_REG_OFFSET) >> 2;
8230    if (cmd_buffer->state.uses_baseinstance)
8231       start_instance_reg = ((base_reg + (draw_id_enable ? 8 : 4)) - SI_SH_REG_OFFSET) >> 2;
8232    if (draw_id_enable)
8233       draw_id_reg = ((base_reg + mesh * 12 + 4) - SI_SH_REG_OFFSET) >> 2;
8234 
8235    if (draw_count == 1 && !count_va && !draw_id_enable) {
8236       radeon_emit(cs, PKT3(indexed ? PKT3_DRAW_INDEX_INDIRECT : PKT3_DRAW_INDIRECT, 3, predicating));
8237       radeon_emit(cs, 0);
8238       radeon_emit(cs, vertex_offset_reg);
8239       radeon_emit(cs, start_instance_reg);
8240       radeon_emit(cs, di_src_sel);
8241    } else {
8242       radeon_emit(cs, PKT3(indexed ? PKT3_DRAW_INDEX_INDIRECT_MULTI : PKT3_DRAW_INDIRECT_MULTI, 8, predicating));
8243       radeon_emit(cs, 0);
8244       radeon_emit(cs, vertex_offset_reg);
8245       radeon_emit(cs, start_instance_reg);
8246       radeon_emit(cs, draw_id_reg | S_2C3_DRAW_INDEX_ENABLE(draw_id_enable) | S_2C3_COUNT_INDIRECT_ENABLE(!!count_va));
8247       radeon_emit(cs, draw_count); /* count */
8248       radeon_emit(cs, count_va);   /* count_addr */
8249       radeon_emit(cs, count_va >> 32);
8250       radeon_emit(cs, stride); /* stride */
8251       radeon_emit(cs, di_src_sel);
8252    }
8253 
8254    cmd_buffer->state.uses_draw_indirect = true;
8255 }
8256 
8257 ALWAYS_INLINE static void
8258 radv_cs_emit_indirect_mesh_draw_packet(struct radv_cmd_buffer *cmd_buffer, uint32_t draw_count, uint64_t count_va,
8259                                        uint32_t stride)
8260 {
8261    const struct radv_shader *mesh_shader = cmd_buffer->state.shaders[MESA_SHADER_MESH];
8262    struct radeon_cmdbuf *cs = cmd_buffer->cs;
8263    uint32_t base_reg = cmd_buffer->state.vtx_base_sgpr;
8264    bool predicating = cmd_buffer->state.predicating;
8265    assert(base_reg || (!cmd_buffer->state.uses_drawid && !mesh_shader->info.cs.uses_grid_size));
8266 
8267    /* Reset draw state. */
8268    cmd_buffer->state.last_first_instance = -1;
8269    cmd_buffer->state.last_num_instances = -1;
8270    cmd_buffer->state.last_drawid = -1;
8271    cmd_buffer->state.last_vertex_offset_valid = false;
8272 
8273    uint32_t xyz_dim_enable = mesh_shader->info.cs.uses_grid_size;
8274    uint32_t xyz_dim_reg = !xyz_dim_enable ? 0 : (base_reg - SI_SH_REG_OFFSET) >> 2;
8275    uint32_t draw_id_enable = !!cmd_buffer->state.uses_drawid;
8276    uint32_t draw_id_reg = !draw_id_enable ? 0 : (base_reg + (xyz_dim_enable ? 12 : 0) - SI_SH_REG_OFFSET) >> 2;
8277 
8278    uint32_t mode1_enable = !cmd_buffer->device->physical_device->mesh_fast_launch_2;
8279 
8280    radeon_emit(cs, PKT3(PKT3_DISPATCH_MESH_INDIRECT_MULTI, 7, predicating) | PKT3_RESET_FILTER_CAM_S(1));
8281    radeon_emit(cs, 0); /* data_offset */
8282    radeon_emit(cs, S_4C1_XYZ_DIM_REG(xyz_dim_reg) | S_4C1_DRAW_INDEX_REG(draw_id_reg));
8283    if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX11)
8284       radeon_emit(cs, S_4C2_DRAW_INDEX_ENABLE(draw_id_enable) | S_4C2_COUNT_INDIRECT_ENABLE(!!count_va) |
8285                          S_4C2_XYZ_DIM_ENABLE(xyz_dim_enable) | S_4C2_MODE1_ENABLE(mode1_enable));
8286    else
8287       radeon_emit(cs, S_4C2_DRAW_INDEX_ENABLE(draw_id_enable) | S_4C2_COUNT_INDIRECT_ENABLE(!!count_va));
8288    radeon_emit(cs, draw_count);
8289    radeon_emit(cs, count_va & 0xFFFFFFFF);
8290    radeon_emit(cs, count_va >> 32);
8291    radeon_emit(cs, stride);
8292    radeon_emit(cs, V_0287F0_DI_SRC_SEL_AUTO_INDEX);
8293 }
8294 
8295 ALWAYS_INLINE static void
8296 radv_cs_emit_dispatch_taskmesh_direct_ace_packet(struct radv_cmd_buffer *cmd_buffer, const uint32_t x, const uint32_t y,
8297                                                  const uint32_t z)
8298 {
8299    struct radv_shader *task_shader = cmd_buffer->state.shaders[MESA_SHADER_TASK];
8300    struct radeon_cmdbuf *cs = cmd_buffer->gang.cs;
8301    const bool predicating = cmd_buffer->state.predicating;
8302    const uint32_t dispatch_initiator =
8303       cmd_buffer->device->dispatch_initiator_task | S_00B800_CS_W32_EN(task_shader->info.wave_size == 32);
8304 
8305    const struct radv_userdata_info *ring_entry_loc = radv_get_user_sgpr(task_shader, AC_UD_TASK_RING_ENTRY);
8306    assert(ring_entry_loc && ring_entry_loc->sgpr_idx != -1 && ring_entry_loc->num_sgprs == 1);
8307 
8308    uint32_t ring_entry_reg = (R_00B900_COMPUTE_USER_DATA_0 + ring_entry_loc->sgpr_idx * 4 - SI_SH_REG_OFFSET) >> 2;
8309 
8310    radeon_emit(cs, PKT3(PKT3_DISPATCH_TASKMESH_DIRECT_ACE, 4, predicating) | PKT3_SHADER_TYPE_S(1));
8311    radeon_emit(cs, x);
8312    radeon_emit(cs, y);
8313    radeon_emit(cs, z);
8314    radeon_emit(cs, dispatch_initiator);
8315    radeon_emit(cs, ring_entry_reg & 0xFFFF);
8316 }
8317 
8318 ALWAYS_INLINE static void
8319 radv_cs_emit_dispatch_taskmesh_indirect_multi_ace_packet(struct radv_cmd_buffer *cmd_buffer, uint64_t data_va,
8320                                                          uint32_t draw_count, uint64_t count_va, uint32_t stride)
8321 {
8322    assert((data_va & 0x03) == 0);
8323    assert((count_va & 0x03) == 0);
8324 
8325    struct radv_shader *task_shader = cmd_buffer->state.shaders[MESA_SHADER_TASK];
8326    struct radeon_cmdbuf *cs = cmd_buffer->gang.cs;
8327 
8328    const uint32_t xyz_dim_enable = task_shader->info.cs.uses_grid_size;
8329    const uint32_t draw_id_enable = task_shader->info.vs.needs_draw_id;
8330    const uint32_t dispatch_initiator =
8331       cmd_buffer->device->dispatch_initiator_task | S_00B800_CS_W32_EN(task_shader->info.wave_size == 32);
8332 
8333    const struct radv_userdata_info *ring_entry_loc = radv_get_user_sgpr(task_shader, AC_UD_TASK_RING_ENTRY);
8334    const struct radv_userdata_info *xyz_dim_loc = radv_get_user_sgpr(task_shader, AC_UD_CS_GRID_SIZE);
8335    const struct radv_userdata_info *draw_id_loc = radv_get_user_sgpr(task_shader, AC_UD_CS_TASK_DRAW_ID);
8336 
8337    assert(ring_entry_loc->sgpr_idx != -1 && ring_entry_loc->num_sgprs == 1);
8338    assert(!xyz_dim_enable || (xyz_dim_loc->sgpr_idx != -1 && xyz_dim_loc->num_sgprs == 3));
8339    assert(!draw_id_enable || (draw_id_loc->sgpr_idx != -1 && draw_id_loc->num_sgprs == 1));
8340 
8341    const uint32_t ring_entry_reg =
8342       (R_00B900_COMPUTE_USER_DATA_0 + ring_entry_loc->sgpr_idx * 4 - SI_SH_REG_OFFSET) >> 2;
8343    const uint32_t xyz_dim_reg =
8344       !xyz_dim_enable ? 0 : (R_00B900_COMPUTE_USER_DATA_0 + xyz_dim_loc->sgpr_idx * 4 - SI_SH_REG_OFFSET) >> 2;
8345    const uint32_t draw_id_reg =
8346       !draw_id_enable ? 0 : (R_00B900_COMPUTE_USER_DATA_0 + draw_id_loc->sgpr_idx * 4 - SI_SH_REG_OFFSET) >> 2;
8347 
8348    radeon_emit(cs, PKT3(PKT3_DISPATCH_TASKMESH_INDIRECT_MULTI_ACE, 9, 0) | PKT3_SHADER_TYPE_S(1));
8349    radeon_emit(cs, data_va);
8350    radeon_emit(cs, data_va >> 32);
8351    radeon_emit(cs, S_AD2_RING_ENTRY_REG(ring_entry_reg));
8352    radeon_emit(cs, S_AD3_COUNT_INDIRECT_ENABLE(!!count_va) | S_AD3_DRAW_INDEX_ENABLE(draw_id_enable) |
8353                       S_AD3_XYZ_DIM_ENABLE(xyz_dim_enable) | S_AD3_DRAW_INDEX_REG(draw_id_reg));
8354    radeon_emit(cs, S_AD4_XYZ_DIM_REG(xyz_dim_reg));
8355    radeon_emit(cs, draw_count);
8356    radeon_emit(cs, count_va);
8357    radeon_emit(cs, count_va >> 32);
8358    radeon_emit(cs, stride);
8359    radeon_emit(cs, dispatch_initiator);
8360 }
8361 
8362 ALWAYS_INLINE static void
8363 radv_cs_emit_dispatch_taskmesh_gfx_packet(struct radv_cmd_buffer *cmd_buffer)
8364 {
8365    const struct radv_shader *mesh_shader = cmd_buffer->state.shaders[MESA_SHADER_MESH];
8366    struct radeon_cmdbuf *cs = cmd_buffer->cs;
8367    bool predicating = cmd_buffer->state.predicating;
8368 
8369    const struct radv_userdata_info *ring_entry_loc =
8370       radv_get_user_sgpr(cmd_buffer->state.last_vgt_shader, AC_UD_TASK_RING_ENTRY);
8371 
8372    assert(ring_entry_loc->sgpr_idx != -1);
8373 
8374    uint32_t xyz_dim_en = mesh_shader->info.cs.uses_grid_size;
8375    uint32_t xyz_dim_reg = !xyz_dim_en ? 0 : (cmd_buffer->state.vtx_base_sgpr - SI_SH_REG_OFFSET) >> 2;
8376    uint32_t ring_entry_reg = ((mesh_shader->info.user_data_0 - SI_SH_REG_OFFSET) >> 2) + ring_entry_loc->sgpr_idx;
8377    uint32_t mode1_en = !cmd_buffer->device->physical_device->mesh_fast_launch_2;
8378    uint32_t linear_dispatch_en = cmd_buffer->state.shaders[MESA_SHADER_TASK]->info.cs.linear_taskmesh_dispatch;
8379    const bool sqtt_en = !!cmd_buffer->device->sqtt.bo;
8380 
8381    radeon_emit(cs, PKT3(PKT3_DISPATCH_TASKMESH_GFX, 2, predicating) | PKT3_RESET_FILTER_CAM_S(1));
8382    radeon_emit(cs, S_4D0_RING_ENTRY_REG(ring_entry_reg) | S_4D0_XYZ_DIM_REG(xyz_dim_reg));
8383    if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX11)
8384       radeon_emit(cs, S_4D1_XYZ_DIM_ENABLE(xyz_dim_en) | S_4D1_MODE1_ENABLE(mode1_en) |
8385                          S_4D1_LINEAR_DISPATCH_ENABLE(linear_dispatch_en) | S_4D1_THREAD_TRACE_MARKER_ENABLE(sqtt_en));
8386    else
8387       radeon_emit(cs, S_4D1_THREAD_TRACE_MARKER_ENABLE(sqtt_en));
8388    radeon_emit(cs, V_0287F0_DI_SRC_SEL_AUTO_INDEX);
8389 }
8390 
8391 ALWAYS_INLINE static void
8392 radv_emit_userdata_vertex_internal(struct radv_cmd_buffer *cmd_buffer, const struct radv_draw_info *info,
8393                                    const uint32_t vertex_offset)
8394 {
8395    struct radv_cmd_state *state = &cmd_buffer->state;
8396    struct radeon_cmdbuf *cs = cmd_buffer->cs;
8397    const bool uses_baseinstance = state->uses_baseinstance;
8398    const bool uses_drawid = state->uses_drawid;
8399 
8400    radeon_set_sh_reg_seq(cs, state->vtx_base_sgpr, state->vtx_emit_num);
8401 
8402    radeon_emit(cs, vertex_offset);
8403    state->last_vertex_offset_valid = true;
8404    state->last_vertex_offset = vertex_offset;
8405    if (uses_drawid) {
8406       radeon_emit(cs, 0);
8407       state->last_drawid = 0;
8408    }
8409    if (uses_baseinstance) {
8410       radeon_emit(cs, info->first_instance);
8411       state->last_first_instance = info->first_instance;
8412    }
8413 }
8414 
8415 ALWAYS_INLINE static void
8416 radv_emit_userdata_vertex(struct radv_cmd_buffer *cmd_buffer, const struct radv_draw_info *info,
8417                           const uint32_t vertex_offset)
8418 {
8419    const struct radv_cmd_state *state = &cmd_buffer->state;
8420    const bool uses_baseinstance = state->uses_baseinstance;
8421    const bool uses_drawid = state->uses_drawid;
8422 
8423    if (!state->last_vertex_offset_valid || vertex_offset != state->last_vertex_offset ||
8424        (uses_drawid && 0 != state->last_drawid) ||
8425        (uses_baseinstance && info->first_instance != state->last_first_instance))
8426       radv_emit_userdata_vertex_internal(cmd_buffer, info, vertex_offset);
8427 }
8428 
8429 ALWAYS_INLINE static void
8430 radv_emit_userdata_vertex_drawid(struct radv_cmd_buffer *cmd_buffer, uint32_t vertex_offset, uint32_t drawid)
8431 {
8432    struct radv_cmd_state *state = &cmd_buffer->state;
8433    struct radeon_cmdbuf *cs = cmd_buffer->cs;
8434    radeon_set_sh_reg_seq(cs, state->vtx_base_sgpr, 1 + !!drawid);
8435    radeon_emit(cs, vertex_offset);
8436    state->last_vertex_offset_valid = true;
8437    state->last_vertex_offset = vertex_offset;
8438    if (drawid)
8439       radeon_emit(cs, drawid);
8440 }
8441 
8442 ALWAYS_INLINE static void
8443 radv_emit_userdata_mesh(struct radv_cmd_buffer *cmd_buffer, const uint32_t x, const uint32_t y, const uint32_t z)
8444 {
8445    struct radv_cmd_state *state = &cmd_buffer->state;
8446    const struct radv_shader *mesh_shader = state->shaders[MESA_SHADER_MESH];
8447    struct radeon_cmdbuf *cs = cmd_buffer->cs;
8448    const bool uses_drawid = state->uses_drawid;
8449    const bool uses_grid_size = mesh_shader->info.cs.uses_grid_size;
8450 
8451    if (!uses_drawid && !uses_grid_size)
8452       return;
8453 
8454    radeon_set_sh_reg_seq(cs, state->vtx_base_sgpr, state->vtx_emit_num);
8455    if (uses_grid_size) {
8456       radeon_emit(cs, x);
8457       radeon_emit(cs, y);
8458       radeon_emit(cs, z);
8459    }
8460    if (uses_drawid) {
8461       radeon_emit(cs, 0);
8462       state->last_drawid = 0;
8463    }
8464 }
8465 
8466 ALWAYS_INLINE static void
8467 radv_emit_userdata_task(struct radv_cmd_buffer *cmd_buffer, uint32_t x, uint32_t y, uint32_t z, uint32_t draw_id)
8468 {
8469    struct radv_shader *task_shader = cmd_buffer->state.shaders[MESA_SHADER_TASK];
8470    struct radeon_cmdbuf *cs = cmd_buffer->gang.cs;
8471 
8472    const struct radv_userdata_info *xyz_loc = radv_get_user_sgpr(task_shader, AC_UD_CS_GRID_SIZE);
8473    const struct radv_userdata_info *draw_id_loc = radv_get_user_sgpr(task_shader, AC_UD_CS_TASK_DRAW_ID);
8474 
8475    if (xyz_loc->sgpr_idx != -1) {
8476       assert(xyz_loc->num_sgprs == 3);
8477       unsigned xyz_reg = R_00B900_COMPUTE_USER_DATA_0 + xyz_loc->sgpr_idx * 4;
8478 
8479       radeon_set_sh_reg_seq(cs, xyz_reg, 3);
8480       radeon_emit(cs, x);
8481       radeon_emit(cs, y);
8482       radeon_emit(cs, z);
8483    }
8484 
8485    if (draw_id_loc->sgpr_idx != -1) {
8486       assert(draw_id_loc->num_sgprs == 1);
8487       unsigned draw_id_reg = R_00B900_COMPUTE_USER_DATA_0 + draw_id_loc->sgpr_idx * 4;
8488 
8489       radeon_set_sh_reg_seq(cs, draw_id_reg, 1);
8490       radeon_emit(cs, draw_id);
8491    }
8492 }
8493 
8494 ALWAYS_INLINE static void
8495 radv_emit_draw_packets_indexed(struct radv_cmd_buffer *cmd_buffer, const struct radv_draw_info *info,
8496                                uint32_t drawCount, const VkMultiDrawIndexedInfoEXT *minfo, uint32_t stride,
8497                                const int32_t *vertexOffset)
8498 
8499 {
8500    struct radv_cmd_state *state = &cmd_buffer->state;
8501    struct radeon_cmdbuf *cs = cmd_buffer->cs;
8502    const int index_size = radv_get_vgt_index_size(state->index_type);
8503    unsigned i = 0;
8504    const bool uses_drawid = state->uses_drawid;
8505    const bool can_eop = !uses_drawid && cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX10;
8506 
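   /* The four paths below differ in whether a per-draw DrawID must be emitted and whether all
    * draws share one vertex offset (vertexOffset != NULL) or each VkMultiDrawIndexedInfoEXT
    * entry provides its own.
    */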
8507    if (uses_drawid) {
8508       if (vertexOffset) {
8509          radv_emit_userdata_vertex(cmd_buffer, info, *vertexOffset);
8510          vk_foreach_multi_draw_indexed (draw, i, minfo, drawCount, stride) {
8511             uint32_t remaining_indexes = MAX2(state->max_index_count, draw->firstIndex) - draw->firstIndex;
8512             uint64_t index_va = state->index_va + draw->firstIndex * index_size;
8513 
8514             /* Handle draw calls with 0-sized index buffers if the GPU can't support them. */
8515             if (!remaining_indexes && cmd_buffer->device->physical_device->rad_info.has_zero_index_buffer_bug)
8516                radv_handle_zero_index_buffer_bug(cmd_buffer, &index_va, &remaining_indexes);
8517 
8518             if (i > 0)
8519                radeon_set_sh_reg(cs, state->vtx_base_sgpr + sizeof(uint32_t), i);
8520 
8521             if (!state->render.view_mask) {
8522                radv_cs_emit_draw_indexed_packet(cmd_buffer, index_va, remaining_indexes, draw->indexCount, false);
8523             } else {
8524                u_foreach_bit (view, state->render.view_mask) {
8525                   radv_emit_view_index(cmd_buffer, view);
8526 
8527                   radv_cs_emit_draw_indexed_packet(cmd_buffer, index_va, remaining_indexes, draw->indexCount, false);
8528                }
8529             }
8530          }
8531       } else {
8532          vk_foreach_multi_draw_indexed (draw, i, minfo, drawCount, stride) {
8533             uint32_t remaining_indexes = MAX2(state->max_index_count, draw->firstIndex) - draw->firstIndex;
8534             uint64_t index_va = state->index_va + draw->firstIndex * index_size;
8535 
8536             /* Handle draw calls with 0-sized index buffers if the GPU can't support them. */
8537             if (!remaining_indexes && cmd_buffer->device->physical_device->rad_info.has_zero_index_buffer_bug)
8538                radv_handle_zero_index_buffer_bug(cmd_buffer, &index_va, &remaining_indexes);
8539 
8540             if (i > 0) {
8541                assert(state->last_vertex_offset_valid);
8542                if (state->last_vertex_offset != draw->vertexOffset)
8543                   radv_emit_userdata_vertex_drawid(cmd_buffer, draw->vertexOffset, i);
8544                else
8545                   radeon_set_sh_reg(cs, state->vtx_base_sgpr + sizeof(uint32_t), i);
8546             } else
8547                radv_emit_userdata_vertex(cmd_buffer, info, draw->vertexOffset);
8548 
8549             if (!state->render.view_mask) {
8550                radv_cs_emit_draw_indexed_packet(cmd_buffer, index_va, remaining_indexes, draw->indexCount, false);
8551             } else {
8552                u_foreach_bit (view, state->render.view_mask) {
8553                   radv_emit_view_index(cmd_buffer, view);
8554 
8555                   radv_cs_emit_draw_indexed_packet(cmd_buffer, index_va, remaining_indexes, draw->indexCount, false);
8556                }
8557             }
8558          }
8559       }
8560       if (drawCount > 1) {
8561          state->last_drawid = drawCount - 1;
8562       }
8563    } else {
8564       if (vertexOffset) {
8565          if (cmd_buffer->device->physical_device->rad_info.gfx_level == GFX10) {
8566             /* GFX10 has a bug where, in a chain of draw packets using NOT_EOP, the final
8567              * draw (the one without NOT_EOP) must not have count == 0.
8568              */
8569             while (drawCount > 1) {
8570                const VkMultiDrawIndexedInfoEXT *last =
8571                   (const VkMultiDrawIndexedInfoEXT *)(((const uint8_t *)minfo) + (drawCount - 1) * stride);
8572                if (last->indexCount)
8573                   break;
8574                drawCount--;
8575             }
8576          }
8577 
8578          radv_emit_userdata_vertex(cmd_buffer, info, *vertexOffset);
8579          vk_foreach_multi_draw_indexed (draw, i, minfo, drawCount, stride) {
8580             uint32_t remaining_indexes = MAX2(state->max_index_count, draw->firstIndex) - draw->firstIndex;
8581             uint64_t index_va = state->index_va + draw->firstIndex * index_size;
8582 
8583             /* Handle draw calls with 0-sized index buffers if the GPU can't support them. */
8584             if (!remaining_indexes && cmd_buffer->device->physical_device->rad_info.has_zero_index_buffer_bug)
8585                radv_handle_zero_index_buffer_bug(cmd_buffer, &index_va, &remaining_indexes);
8586 
8587             if (!state->render.view_mask) {
8588                radv_cs_emit_draw_indexed_packet(cmd_buffer, index_va, remaining_indexes, draw->indexCount,
8589                                                 can_eop && i < drawCount - 1);
8590             } else {
8591                u_foreach_bit (view, state->render.view_mask) {
8592                   radv_emit_view_index(cmd_buffer, view);
8593 
8594                   radv_cs_emit_draw_indexed_packet(cmd_buffer, index_va, remaining_indexes, draw->indexCount, false);
8595                }
8596             }
8597          }
8598       } else {
8599          vk_foreach_multi_draw_indexed (draw, i, minfo, drawCount, stride) {
8600             uint32_t remaining_indexes = MAX2(state->max_index_count, draw->firstIndex) - draw->firstIndex;
8601             uint64_t index_va = state->index_va + draw->firstIndex * index_size;
8602 
8603             /* Handle draw calls with 0-sized index buffers if the GPU can't support them. */
8604             if (!remaining_indexes && cmd_buffer->device->physical_device->rad_info.has_zero_index_buffer_bug)
8605                radv_handle_zero_index_buffer_bug(cmd_buffer, &index_va, &remaining_indexes);
8606 
8607             const VkMultiDrawIndexedInfoEXT *next =
8608                (const VkMultiDrawIndexedInfoEXT *)(i < drawCount - 1 ? ((uint8_t *)draw + stride) : NULL);
8609             const bool offset_changes = next && next->vertexOffset != draw->vertexOffset;
8610             radv_emit_userdata_vertex(cmd_buffer, info, draw->vertexOffset);
8611 
8612             if (!state->render.view_mask) {
8613                radv_cs_emit_draw_indexed_packet(cmd_buffer, index_va, remaining_indexes, draw->indexCount,
8614                                                 can_eop && !offset_changes && i < drawCount - 1);
8615             } else {
8616                u_foreach_bit (view, state->render.view_mask) {
8617                   radv_emit_view_index(cmd_buffer, view);
8618 
8619                   radv_cs_emit_draw_indexed_packet(cmd_buffer, index_va, remaining_indexes, draw->indexCount, false);
8620                }
8621             }
8622          }
8623       }
8624       if (drawCount > 1) {
8625          state->last_drawid = drawCount - 1;
8626       }
8627    }
8628 }
8629 
8630 ALWAYS_INLINE static void
8631 radv_emit_direct_draw_packets(struct radv_cmd_buffer *cmd_buffer, const struct radv_draw_info *info, uint32_t drawCount,
8632                               const VkMultiDrawInfoEXT *minfo, uint32_t use_opaque, uint32_t stride)
8633 {
8634    unsigned i = 0;
8635    const uint32_t view_mask = cmd_buffer->state.render.view_mask;
8636    const bool uses_drawid = cmd_buffer->state.uses_drawid;
8637    uint32_t last_start = 0;
8638 
8639    vk_foreach_multi_draw (draw, i, minfo, drawCount, stride) {
8640       if (!i)
8641          radv_emit_userdata_vertex(cmd_buffer, info, draw->firstVertex);
8642       else
8643          radv_emit_userdata_vertex_drawid(cmd_buffer, draw->firstVertex, uses_drawid ? i : 0);
8644 
8645       if (!view_mask) {
8646          radv_cs_emit_draw_packet(cmd_buffer, draw->vertexCount, use_opaque);
8647       } else {
8648          u_foreach_bit (view, view_mask) {
8649             radv_emit_view_index(cmd_buffer, view);
8650             radv_cs_emit_draw_packet(cmd_buffer, draw->vertexCount, use_opaque);
8651          }
8652       }
8653       last_start = draw->firstVertex;
8654    }
8655    if (drawCount > 1) {
8656       struct radv_cmd_state *state = &cmd_buffer->state;
8657       assert(state->last_vertex_offset_valid);
8658       state->last_vertex_offset = last_start;
8659       if (uses_drawid)
8660          state->last_drawid = drawCount - 1;
8661    }
8662 }
8663 
8664 static void
8665 radv_cs_emit_mesh_dispatch_packet(struct radv_cmd_buffer *cmd_buffer, uint32_t x, uint32_t y, uint32_t z)
8666 {
8667    radeon_emit(cmd_buffer->cs, PKT3(PKT3_DISPATCH_MESH_DIRECT, 3, cmd_buffer->state.predicating));
8668    radeon_emit(cmd_buffer->cs, x);
8669    radeon_emit(cmd_buffer->cs, y);
8670    radeon_emit(cmd_buffer->cs, z);
8671    radeon_emit(cmd_buffer->cs, S_0287F0_SOURCE_SELECT(V_0287F0_DI_SRC_SEL_AUTO_INDEX));
8672 }
8673 
8674 ALWAYS_INLINE static void
8675 radv_emit_direct_mesh_draw_packet(struct radv_cmd_buffer *cmd_buffer, uint32_t x, uint32_t y, uint32_t z)
8676 {
8677    const uint32_t view_mask = cmd_buffer->state.render.view_mask;
8678 
8679    radv_emit_userdata_mesh(cmd_buffer, x, y, z);
8680 
8681    if (cmd_buffer->device->physical_device->mesh_fast_launch_2) {
8682       if (!view_mask) {
8683          radv_cs_emit_mesh_dispatch_packet(cmd_buffer, x, y, z);
8684       } else {
8685          u_foreach_bit (view, view_mask) {
8686             radv_emit_view_index(cmd_buffer, view);
8687             radv_cs_emit_mesh_dispatch_packet(cmd_buffer, x, y, z);
8688          }
8689       }
8690    } else {
8691       const uint32_t count = x * y * z;
8692       if (!view_mask) {
8693          radv_cs_emit_draw_packet(cmd_buffer, count, 0);
8694       } else {
8695          u_foreach_bit (view, view_mask) {
8696             radv_emit_view_index(cmd_buffer, view);
8697             radv_cs_emit_draw_packet(cmd_buffer, count, 0);
8698          }
8699       }
8700    }
8701 }
8702 
8703 ALWAYS_INLINE static void
8704 radv_emit_indirect_mesh_draw_packets(struct radv_cmd_buffer *cmd_buffer, const struct radv_draw_info *info)
8705 {
8706    const struct radv_cmd_state *state = &cmd_buffer->state;
8707    struct radeon_winsys *ws = cmd_buffer->device->ws;
8708    struct radeon_cmdbuf *cs = cmd_buffer->cs;
8709    const uint64_t va = radv_buffer_get_va(info->indirect->bo) + info->indirect->offset + info->indirect_offset;
8710    const uint64_t count_va = !info->count_buffer ? 0
8711                                                  : radv_buffer_get_va(info->count_buffer->bo) +
8712                                                       info->count_buffer->offset + info->count_buffer_offset;
8713 
8714    radv_cs_add_buffer(ws, cs, info->indirect->bo);
8715 
8716    if (info->count_buffer) {
8717       radv_cs_add_buffer(ws, cs, info->count_buffer->bo);
8718    }
8719 
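   /* SET_BASE with base index 1 programs the base address used by the following indirect
    * dispatch packet; its data_offset is relative to this base.
    */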
8720    radeon_emit(cs, PKT3(PKT3_SET_BASE, 2, 0));
8721    radeon_emit(cs, 1);
8722    radeon_emit(cs, va);
8723    radeon_emit(cs, va >> 32);
8724 
8725    if (state->uses_drawid) {
8726       const struct radv_shader *mesh_shader = state->shaders[MESA_SHADER_MESH];
8727       unsigned reg = state->vtx_base_sgpr + (mesh_shader->info.cs.uses_grid_size ? 12 : 0);
8728       radeon_set_sh_reg_seq(cs, reg, 1);
8729       radeon_emit(cs, 0);
8730    }
8731 
8732    if (!state->render.view_mask) {
8733       radv_cs_emit_indirect_mesh_draw_packet(cmd_buffer, info->count, count_va, info->stride);
8734    } else {
8735       u_foreach_bit (i, state->render.view_mask) {
8736          radv_emit_view_index(cmd_buffer, i);
8737          radv_cs_emit_indirect_mesh_draw_packet(cmd_buffer, info->count, count_va, info->stride);
8738       }
8739    }
8740 }
8741 
8742 ALWAYS_INLINE static void
8743 radv_emit_direct_taskmesh_draw_packets(struct radv_cmd_buffer *cmd_buffer, uint32_t x, uint32_t y, uint32_t z)
8744 {
8745    const uint32_t view_mask = cmd_buffer->state.render.view_mask;
8746    const unsigned num_views = MAX2(1, util_bitcount(view_mask));
8747    unsigned ace_predication_size = num_views * 6; /* DISPATCH_TASKMESH_DIRECT_ACE size */
8748 
8749    radv_emit_userdata_task(cmd_buffer, x, y, z, 0);
8750    radv_cs_emit_compute_predication(&cmd_buffer->state, cmd_buffer->gang.cs, cmd_buffer->mec_inv_pred_va,
8751                                     &cmd_buffer->mec_inv_pred_emitted, ace_predication_size);
8752 
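   /* Each view needs a matching pair of packets: a task dispatch on the ACE queue and a
    * mesh dispatch on the GFX queue.
    */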
8753    if (!view_mask) {
8754       radv_cs_emit_dispatch_taskmesh_direct_ace_packet(cmd_buffer, x, y, z);
8755       radv_cs_emit_dispatch_taskmesh_gfx_packet(cmd_buffer);
8756    } else {
8757       u_foreach_bit (view, view_mask) {
8758          radv_emit_view_index(cmd_buffer, view);
8759          radv_cs_emit_dispatch_taskmesh_direct_ace_packet(cmd_buffer, x, y, z);
8760          radv_cs_emit_dispatch_taskmesh_gfx_packet(cmd_buffer);
8761       }
8762    }
8763 }
8764 
8765 static void
8766 radv_emit_indirect_taskmesh_draw_packets(struct radv_cmd_buffer *cmd_buffer, const struct radv_draw_info *info)
8767 {
8768    const uint32_t view_mask = cmd_buffer->state.render.view_mask;
8769    struct radeon_winsys *ws = cmd_buffer->device->ws;
8770    const unsigned num_views = MAX2(1, util_bitcount(view_mask));
8771    unsigned ace_predication_size = num_views * 11; /* DISPATCH_TASKMESH_INDIRECT_MULTI_ACE size */
8772    struct radeon_cmdbuf *ace_cs = cmd_buffer->gang.cs;
8773 
8774    const uint64_t va = radv_buffer_get_va(info->indirect->bo) + info->indirect->offset + info->indirect_offset;
8775    const uint64_t count_va = !info->count_buffer ? 0
8776                                                  : radv_buffer_get_va(info->count_buffer->bo) +
8777                                                       info->count_buffer->offset + info->count_buffer_offset;
8778    uint64_t workaround_cond_va = 0;
8779 
8780    if (num_views > 1)
8781       ace_predication_size += num_views * 3; /* SET_SH_REG size (view index SGPR) */
8782 
8783    if (count_va)
8784       radv_cs_add_buffer(ws, cmd_buffer->gang.cs, info->count_buffer->bo);
8785 
8786    if (cmd_buffer->device->physical_device->rad_info.has_taskmesh_indirect0_bug && count_va) {
8787       /* MEC firmware bug workaround.
8788        * When the count buffer contains zero, DISPATCH_TASKMESH_INDIRECT_MULTI_ACE hangs.
8789        * - We must ensure that DISPATCH_TASKMESH_INDIRECT_MULTI_ACE
8790        *   is only executed when the count buffer contains non-zero.
8791        * - Furthermore, we must also ensure that each DISPATCH_TASKMESH_GFX packet
8792        *   has a matching ACE packet.
8793        *
8794        * As a workaround:
8795        * - Reserve a dword in the upload buffer and initialize it to 1 for the workaround
8796        * - When count != 0, write 0 to the workaround BO and execute the indirect dispatch
8797        * - When workaround BO != 0 (count was 0), execute an empty direct dispatch
8798        */
8799 
8800       uint32_t workaround_cond_init = 0;
8801       uint32_t workaround_cond_off;
8802       if (!radv_cmd_buffer_upload_data(cmd_buffer, 4, &workaround_cond_init, &workaround_cond_off))
8803          vk_command_buffer_set_error(&cmd_buffer->vk, VK_ERROR_OUT_OF_HOST_MEMORY);
8804 
8805       workaround_cond_va = radv_buffer_get_va(cmd_buffer->upload.upload_bo) + workaround_cond_off;
8806 
8807       radeon_emit(ace_cs, PKT3(PKT3_COPY_DATA, 4, 0));
8808       radeon_emit(ace_cs,
8809                   COPY_DATA_SRC_SEL(COPY_DATA_IMM) | COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) | COPY_DATA_WR_CONFIRM);
8810       radeon_emit(ace_cs, 1);
8811       radeon_emit(ace_cs, 0);
8812       radeon_emit(ace_cs, workaround_cond_va);
8813       radeon_emit(ace_cs, workaround_cond_va >> 32);
8814 
8815       /* 2x COND_EXEC + 1x COPY_DATA + Nx DISPATCH_TASKMESH_DIRECT_ACE */
8816       ace_predication_size += 2 * 5 + 6 + 6 * num_views;
8817    }
8818 
8819    radv_cs_add_buffer(ws, cmd_buffer->gang.cs, info->indirect->bo);
8820    radv_cs_emit_compute_predication(&cmd_buffer->state, cmd_buffer->gang.cs, cmd_buffer->mec_inv_pred_va,
8821                                     &cmd_buffer->mec_inv_pred_emitted, ace_predication_size);
8822 
8823    if (workaround_cond_va) {
8824       radeon_emit(ace_cs, PKT3(PKT3_COND_EXEC, 3, 0));
8825       radeon_emit(ace_cs, count_va);
8826       radeon_emit(ace_cs, count_va >> 32);
8827       radeon_emit(ace_cs, 0);
8828       radeon_emit(ace_cs, 6 + 11 * num_views); /* 1x COPY_DATA + Nx DISPATCH_TASKMESH_INDIRECT_MULTI_ACE */
8829 
8830       radeon_emit(ace_cs, PKT3(PKT3_COPY_DATA, 4, 0));
8831       radeon_emit(ace_cs,
8832                   COPY_DATA_SRC_SEL(COPY_DATA_IMM) | COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) | COPY_DATA_WR_CONFIRM);
8833       radeon_emit(ace_cs, 0);
8834       radeon_emit(ace_cs, 0);
8835       radeon_emit(ace_cs, workaround_cond_va);
8836       radeon_emit(ace_cs, workaround_cond_va >> 32);
8837    }
8838 
8839    if (!view_mask) {
8840       radv_cs_emit_dispatch_taskmesh_indirect_multi_ace_packet(cmd_buffer, va, info->count, count_va, info->stride);
8841       radv_cs_emit_dispatch_taskmesh_gfx_packet(cmd_buffer);
8842    } else {
8843       u_foreach_bit (view, view_mask) {
8844          radv_emit_view_index(cmd_buffer, view);
8845          radv_cs_emit_dispatch_taskmesh_indirect_multi_ace_packet(cmd_buffer, va, info->count, count_va, info->stride);
8846          radv_cs_emit_dispatch_taskmesh_gfx_packet(cmd_buffer);
8847       }
8848    }
8849 
8850    if (workaround_cond_va) {
8851       radeon_emit(ace_cs, PKT3(PKT3_COND_EXEC, 3, 0));
8852       radeon_emit(ace_cs, workaround_cond_va);
8853       radeon_emit(ace_cs, workaround_cond_va >> 32);
8854       radeon_emit(ace_cs, 0);
8855       radeon_emit(ace_cs, 6 * num_views); /* Nx DISPATCH_TASKMESH_DIRECT_ACE */
8856 
8857       for (unsigned v = 0; v < num_views; ++v) {
8858          radv_cs_emit_dispatch_taskmesh_direct_ace_packet(cmd_buffer, 0, 0, 0);
8859       }
8860    }
8861 }
8862 
8863 static void
8864 radv_emit_indirect_draw_packets(struct radv_cmd_buffer *cmd_buffer, const struct radv_draw_info *info)
8865 {
8866    const struct radv_cmd_state *state = &cmd_buffer->state;
8867    struct radeon_winsys *ws = cmd_buffer->device->ws;
8868    struct radeon_cmdbuf *cs = cmd_buffer->cs;
8869    const uint64_t va = radv_buffer_get_va(info->indirect->bo) + info->indirect->offset + info->indirect_offset;
8870    const uint64_t count_va = info->count_buffer ? radv_buffer_get_va(info->count_buffer->bo) +
8871                                                      info->count_buffer->offset + info->count_buffer_offset
8872                                                 : 0;
8873 
8874    radv_cs_add_buffer(ws, cs, info->indirect->bo);
8875 
8876    radeon_emit(cs, PKT3(PKT3_SET_BASE, 2, 0));
8877    radeon_emit(cs, 1);
8878    radeon_emit(cs, va);
8879    radeon_emit(cs, va >> 32);
8880 
8881    if (info->count_buffer) {
8882       radv_cs_add_buffer(ws, cs, info->count_buffer->bo);
8883    }
8884 
8885    if (!state->render.view_mask) {
8886       radv_cs_emit_indirect_draw_packet(cmd_buffer, info->indexed, info->count, count_va, info->stride);
8887    } else {
8888       u_foreach_bit (i, state->render.view_mask) {
8889          radv_emit_view_index(cmd_buffer, i);
8890 
8891          radv_cs_emit_indirect_draw_packet(cmd_buffer, info->indexed, info->count, count_va, info->stride);
8892       }
8893    }
8894 }
8895 
8896 static uint64_t
8897 radv_get_needed_dynamic_states(struct radv_cmd_buffer *cmd_buffer)
8898 {
8899    uint64_t dynamic_states = RADV_DYNAMIC_ALL;
8900 
8901    if (cmd_buffer->state.graphics_pipeline)
8902       return cmd_buffer->state.graphics_pipeline->needed_dynamic_state;
8903 
8904    /* Clear unnecessary dynamic states for shader objects. */
8905    if (!cmd_buffer->state.shaders[MESA_SHADER_TESS_CTRL])
8906       dynamic_states &= ~(RADV_DYNAMIC_PATCH_CONTROL_POINTS | RADV_DYNAMIC_TESS_DOMAIN_ORIGIN);
8907 
8908    if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX10_3) {
8909       if (cmd_buffer->state.shaders[MESA_SHADER_MESH])
8910          dynamic_states &= ~(RADV_DYNAMIC_VERTEX_INPUT | RADV_DYNAMIC_VERTEX_INPUT_BINDING_STRIDE |
8911                              RADV_DYNAMIC_PRIMITIVE_RESTART_ENABLE | RADV_DYNAMIC_PRIMITIVE_TOPOLOGY);
8912    } else {
8913       dynamic_states &= ~RADV_DYNAMIC_FRAGMENT_SHADING_RATE;
8914    }
8915 
8916    return dynamic_states;
8917 }
8918 
8919 /*
8920  * Vega and raven have a bug which triggers if there are multiple context
8921  * register contexts active at the same time with different scissor values.
8922  *
8923  * There are two possible workarounds:
8924  * 1) Wait for PS_PARTIAL_FLUSH every time the scissor is changed. That way
8925  *    there is only ever 1 active set of scissor values at the same time.
8926  *
8927  * 2) Whenever the hardware switches contexts we have to set the scissor
8928  *    registers again even if it is a noop. That way the new context gets
8929  *    the correct scissor values.
8930  *
8931  * This implements option 2. radv_need_late_scissor_emission needs to
8932  * return true on affected HW if radv_emit_all_graphics_states sets
8933  * any context registers.
8934  */
8935 static bool
8936 radv_need_late_scissor_emission(struct radv_cmd_buffer *cmd_buffer, const struct radv_draw_info *info)
8937 {
8938    if (cmd_buffer->state.context_roll_without_scissor_emitted || info->strmout_buffer)
8939       return true;
8940 
8941    uint64_t used_states = radv_get_needed_dynamic_states(cmd_buffer) | ~RADV_CMD_DIRTY_DYNAMIC_ALL;
8942 
8943    /* Index, vertex and streamout buffers don't change context regs.
8944     * We assume that any other dirty flag causes context rolls.
8945     */
8946    used_states &= ~(RADV_CMD_DIRTY_INDEX_BUFFER | RADV_CMD_DIRTY_VERTEX_BUFFER | RADV_CMD_DIRTY_DYNAMIC_VERTEX_INPUT |
8947                     RADV_CMD_DIRTY_STREAMOUT_BUFFER);
8948 
8949    return cmd_buffer->state.dirty & used_states;
8950 }
8951 
8952 ALWAYS_INLINE static uint32_t
8953 radv_get_ngg_culling_settings(struct radv_cmd_buffer *cmd_buffer, bool vp_y_inverted)
8954 {
8955    const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
8956 
8957    /* Disable shader culling entirely when conservative overestimate is used.
8958     * The face culling algorithm can delete very tiny triangles (even if unintended).
8959     */
8960    if (d->vk.rs.conservative_mode == VK_CONSERVATIVE_RASTERIZATION_MODE_OVERESTIMATE_EXT)
8961       return radv_nggc_none;
8962 
8963    /* With graphics pipeline library, NGG culling is unconditionally compiled into shaders
8964     * because we don't know the primitive topology at compile time, so we should
8965     * disable it dynamically for points or lines.
8966     */
8967    const unsigned num_vertices_per_prim = radv_conv_prim_to_gs_out(d->vk.ia.primitive_topology, true) + 1;
8968    if (num_vertices_per_prim != 3)
8969       return radv_nggc_none;
8970 
8971    /* Cull every triangle when rasterizer discard is enabled. */
8972    if (d->vk.rs.rasterizer_discard_enable)
8973       return radv_nggc_front_face | radv_nggc_back_face;
8974 
8975    uint32_t nggc_settings = radv_nggc_none;
8976 
8977    /* The culling code needs to know whether face is CW or CCW. */
8978    bool ccw = d->vk.rs.front_face == VK_FRONT_FACE_COUNTER_CLOCKWISE;
8979 
8980    /* Take inverted viewport into account. */
8981    ccw ^= vp_y_inverted;
8982 
8983    if (ccw)
8984       nggc_settings |= radv_nggc_face_is_ccw;
8985 
8986    /* Face culling settings. */
8987    if (d->vk.rs.cull_mode & VK_CULL_MODE_FRONT_BIT)
8988       nggc_settings |= radv_nggc_front_face;
8989    if (d->vk.rs.cull_mode & VK_CULL_MODE_BACK_BIT)
8990       nggc_settings |= radv_nggc_back_face;
8991 
8992    /* Small primitive culling assumes a sample position at (0.5, 0.5)
8993     * so don't enable it with user sample locations.
8994     */
8995    if (!d->vk.ms.sample_locations_enable) {
8996       nggc_settings |= radv_nggc_small_primitives;
8997 
8998       /* small_prim_precision = num_samples / 2^subpixel_bits
8999        * num_samples is also always a power of two, so the small prim precision can only be
9000        * a power of two between 2^-2 and 2^-6, therefore it's enough to remember the exponent.
9001        */
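      /* Worked example: at 4x MSAA, the computation below gives log2(4) - log2(256) = -6,
       * i.e. a precision of 2^-6, and only this exponent is packed into the top 8 bits of
       * the culling settings.
       */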
9002       unsigned rasterization_samples = radv_get_rasterization_samples(cmd_buffer);
9003       unsigned subpixel_bits = 256;
9004       int32_t small_prim_precision_log2 = util_logbase2(rasterization_samples) - util_logbase2(subpixel_bits);
9005       nggc_settings |= ((uint32_t)small_prim_precision_log2 << 24u);
9006    }
9007 
9008    return nggc_settings;
9009 }
9010 
9011 static void
9012 radv_emit_ngg_culling_state(struct radv_cmd_buffer *cmd_buffer)
9013 {
9014    const struct radv_shader *last_vgt_shader = cmd_buffer->state.last_vgt_shader;
9015    const uint32_t base_reg = last_vgt_shader->info.user_data_0;
9016 
9017    /* Get viewport transform. */
9018    float vp_scale[2], vp_translate[2];
9019    memcpy(vp_scale, cmd_buffer->state.dynamic.hw_vp.xform[0].scale, 2 * sizeof(float));
9020    memcpy(vp_translate, cmd_buffer->state.dynamic.hw_vp.xform[0].translate, 2 * sizeof(float));
9021    bool vp_y_inverted = (-vp_scale[1] + vp_translate[1]) > (vp_scale[1] + vp_translate[1]);
9022 
9023    /* Get current culling settings. */
9024    uint32_t nggc_settings = radv_get_ngg_culling_settings(cmd_buffer, vp_y_inverted);
9025 
9026    if (cmd_buffer->state.dirty &
9027        (RADV_CMD_DIRTY_PIPELINE | RADV_CMD_DIRTY_DYNAMIC_VIEWPORT | RADV_CMD_DIRTY_DYNAMIC_RASTERIZATION_SAMPLES)) {
9028       /* Correction for inverted Y */
9029       if (vp_y_inverted) {
9030          vp_scale[1] = -vp_scale[1];
9031          vp_translate[1] = -vp_translate[1];
9032       }
9033 
9034       /* Correction for number of samples per pixel. */
9035       for (unsigned i = 0; i < 2; ++i) {
9036          vp_scale[i] *= (float)cmd_buffer->state.dynamic.vk.ms.rasterization_samples;
9037          vp_translate[i] *= (float)cmd_buffer->state.dynamic.vk.ms.rasterization_samples;
9038       }
9039 
9040       uint32_t vp_reg_values[4] = {fui(vp_scale[0]), fui(vp_scale[1]), fui(vp_translate[0]), fui(vp_translate[1])};
9041       const int8_t vp_sgpr_idx = radv_get_user_sgpr(last_vgt_shader, AC_UD_NGG_VIEWPORT)->sgpr_idx;
9042       assert(vp_sgpr_idx != -1);
9043       radeon_set_sh_reg_seq(cmd_buffer->cs, base_reg + vp_sgpr_idx * 4, 4);
9044       radeon_emit_array(cmd_buffer->cs, vp_reg_values, 4);
9045    }
9046 
9047    const int8_t nggc_sgpr_idx = radv_get_user_sgpr(last_vgt_shader, AC_UD_NGG_CULLING_SETTINGS)->sgpr_idx;
9048    assert(nggc_sgpr_idx != -1);
9049 
9050    radeon_set_sh_reg(cmd_buffer->cs, base_reg + nggc_sgpr_idx * 4, nggc_settings);
9051 }
9052 
9053 static void
9054 radv_emit_fs_state(struct radv_cmd_buffer *cmd_buffer)
9055 {
9056    const struct radv_shader *ps = cmd_buffer->state.shaders[MESA_SHADER_FRAGMENT];
9057    const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
9058    const struct radv_userdata_info *loc;
9059 
9060    if (!ps)
9061       return;
9062 
9063    loc = radv_get_user_sgpr(ps, AC_UD_PS_STATE);
9064    if (loc->sgpr_idx == -1)
9065       return;
9066    assert(loc->num_sgprs == 1);
9067 
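   /* Pack the dynamic state the fragment shader reads at draw time (sample counts, line
    * rasterization mode, rasterization primitive) into a single user SGPR.
    */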
9068    const unsigned rasterization_samples = radv_get_rasterization_samples(cmd_buffer);
9069    const unsigned ps_iter_samples = radv_get_ps_iter_samples(cmd_buffer);
9070    const uint16_t ps_iter_mask = ac_get_ps_iter_mask(ps_iter_samples);
9071    const unsigned rast_prim = radv_get_rasterization_prim(cmd_buffer);
9072    const uint32_t base_reg = ps->info.user_data_0;
9073    const unsigned ps_state = SET_SGPR_FIELD(PS_STATE_NUM_SAMPLES, rasterization_samples) |
9074                              SET_SGPR_FIELD(PS_STATE_PS_ITER_MASK, ps_iter_mask) |
9075                              SET_SGPR_FIELD(PS_STATE_LINE_RAST_MODE, d->vk.rs.line.mode) |
9076                              SET_SGPR_FIELD(PS_STATE_RAST_PRIM, rast_prim);
9077 
9078    radeon_set_sh_reg(cmd_buffer->cs, base_reg + loc->sgpr_idx * 4, ps_state);
9079 }
9080 
9081 static void
9082 radv_emit_db_shader_control(struct radv_cmd_buffer *cmd_buffer)
9083 {
9084    const struct radeon_info *rad_info = &cmd_buffer->device->physical_device->rad_info;
9085    const struct radv_shader *ps = cmd_buffer->state.shaders[MESA_SHADER_FRAGMENT];
9086    const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
9087    const bool uses_ds_feedback_loop =
9088       !!(d->feedback_loop_aspects & (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT));
9089    const unsigned rasterization_samples = radv_get_rasterization_samples(cmd_buffer);
9090 
9091    uint32_t db_shader_control;
9092 
9093    if (ps) {
9094       db_shader_control = ps->info.ps.db_shader_control;
9095    } else {
9096       db_shader_control = S_02880C_CONSERVATIVE_Z_EXPORT(V_02880C_EXPORT_ANY_Z) |
9097                           S_02880C_Z_ORDER(V_02880C_EARLY_Z_THEN_LATE_Z) |
9098                           S_02880C_DUAL_QUAD_DISABLE(rad_info->has_rbplus && !rad_info->rbplus_allowed);
9099    }
9100 
9101    /* When a depth/stencil attachment is used inside feedback loops, use LATE_Z to make sure shader invocations read the
9102     * correct value.
9103     * Also apply the bug workaround for smoothing (overrasterization) on GFX6.
9104     */
9105    if (uses_ds_feedback_loop ||
9106        (rad_info->gfx_level == GFX6 && d->vk.rs.line.mode == VK_LINE_RASTERIZATION_MODE_RECTANGULAR_SMOOTH_KHR))
9107       db_shader_control = (db_shader_control & C_02880C_Z_ORDER) | S_02880C_Z_ORDER(V_02880C_LATE_Z);
9108 
9109    if (ps && ps->info.ps.pops) {
9110       /* POPS_OVERLAP_NUM_SAMPLES (OVERRIDE_INTRINSIC_RATE on GFX11, must always be enabled for POPS) controls the
9111        * interlock granularity.
9112        * PixelInterlock: 1x.
9113        * SampleInterlock: MSAA_EXPOSED_SAMPLES (much faster at common edges of adjacent primitives with MSAA).
9114        */
9115       if (rad_info->gfx_level >= GFX11) {
9116          db_shader_control |= S_02880C_OVERRIDE_INTRINSIC_RATE_ENABLE(1);
9117          if (ps->info.ps.pops_is_per_sample)
9118             db_shader_control |= S_02880C_OVERRIDE_INTRINSIC_RATE(util_logbase2(rasterization_samples));
9119       } else {
9120          if (ps->info.ps.pops_is_per_sample)
9121             db_shader_control |= S_02880C_POPS_OVERLAP_NUM_SAMPLES(util_logbase2(rasterization_samples));
9122 
9123          if (rad_info->has_pops_missed_overlap_bug) {
9124             radeon_set_context_reg(cmd_buffer->cs, R_028060_DB_DFSM_CONTROL,
9125                                    S_028060_PUNCHOUT_MODE(V_028060_FORCE_OFF) |
9126                                       S_028060_POPS_DRAIN_PS_ON_OVERLAP(rasterization_samples >= 8));
9127          }
9128       }
9129    } else if (rad_info->has_export_conflict_bug && rasterization_samples == 1) {
9130       for (uint32_t i = 0; i < MAX_RTS; i++) {
9131          if (d->vk.cb.attachments[i].write_mask && d->vk.cb.attachments[i].blend_enable) {
9132             db_shader_control |= S_02880C_OVERRIDE_INTRINSIC_RATE_ENABLE(1) | S_02880C_OVERRIDE_INTRINSIC_RATE(2);
9133             break;
9134          }
9135       }
9136    }
9137 
9138    if (db_shader_control != cmd_buffer->state.last_db_shader_control) {
9139       radeon_set_context_reg(cmd_buffer->cs, R_02880C_DB_SHADER_CONTROL, db_shader_control);
9140 
9141       cmd_buffer->state.last_db_shader_control = db_shader_control;
9142    }
9143 
9144    cmd_buffer->state.dirty &= ~RADV_CMD_DIRTY_DB_SHADER_CONTROL;
9145 }
9146 
9147 static void
9148 radv_emit_streamout_enable_state(struct radv_cmd_buffer *cmd_buffer)
9149 {
9150    const struct radv_streamout_state *so = &cmd_buffer->state.streamout;
9151    const bool streamout_enabled = radv_is_streamout_enabled(cmd_buffer);
9152    uint32_t enabled_stream_buffers_mask = 0;
9153 
9154    if (streamout_enabled && cmd_buffer->state.last_vgt_shader) {
9155       const struct radv_shader_info *info = &cmd_buffer->state.last_vgt_shader->info;
9156 
9157       enabled_stream_buffers_mask = info->so.enabled_stream_buffers_mask;
9158 
9159       if (!cmd_buffer->device->physical_device->use_ngg_streamout) {
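         /* Legacy (non-NGG) streamout needs the vertex strides programmed in VGT_STRMOUT_VTX_STRIDE_n. */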
9160          u_foreach_bit (i, so->enabled_mask) {
9161             radeon_set_context_reg(cmd_buffer->cs, R_028AD4_VGT_STRMOUT_VTX_STRIDE_0 + 16 * i, info->so.strides[i]);
9162          }
9163       }
9164    }
9165 
9166    radeon_set_context_reg_seq(cmd_buffer->cs, R_028B94_VGT_STRMOUT_CONFIG, 2);
9167    radeon_emit(cmd_buffer->cs, S_028B94_STREAMOUT_0_EN(streamout_enabled) | S_028B94_RAST_STREAM(0) |
9168                                   S_028B94_STREAMOUT_1_EN(streamout_enabled) |
9169                                   S_028B94_STREAMOUT_2_EN(streamout_enabled) |
9170                                   S_028B94_STREAMOUT_3_EN(streamout_enabled));
9171    radeon_emit(cmd_buffer->cs, so->hw_enabled_mask & enabled_stream_buffers_mask);
9172 
9173    cmd_buffer->state.dirty &= ~RADV_CMD_DIRTY_STREAMOUT_ENABLE;
9174 }
9175 
9176 static gl_shader_stage
9177 radv_cmdbuf_get_last_vgt_api_stage(const struct radv_cmd_buffer *cmd_buffer)
9178 {
9179    if (cmd_buffer->state.active_stages & VK_SHADER_STAGE_MESH_BIT_EXT)
9180       return MESA_SHADER_MESH;
9181 
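   /* Otherwise pick the highest enabled pre-rasterization stage (VS/TCS/TES/GS); this relies on the
    * VK_SHADER_STAGE_* bit positions matching the gl_shader_stage values below MESA_SHADER_FRAGMENT.
    */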
9182    return util_last_bit(cmd_buffer->state.active_stages & BITFIELD_MASK(MESA_SHADER_FRAGMENT)) - 1;
9183 }
9184 
9185 static void
9186 radv_emit_graphics_shaders(struct radv_cmd_buffer *cmd_buffer)
9187 {
9188    const gl_shader_stage last_vgt_api_stage = radv_cmdbuf_get_last_vgt_api_stage(cmd_buffer);
9189    const struct radv_shader *last_vgt_shader = cmd_buffer->state.shaders[last_vgt_api_stage];
9190    struct radv_device *device = cmd_buffer->device;
9191    struct radeon_cmdbuf *cs = cmd_buffer->cs;
9192 
9193    radv_foreach_stage(s, cmd_buffer->state.active_stages & RADV_GRAPHICS_STAGE_BITS)
9194    {
9195       struct radv_shader_object *shader_obj = cmd_buffer->state.shader_objs[s];
9196 
9197       switch (s) {
9198       case MESA_SHADER_VERTEX: {
9199          const struct radv_shader *vs = cmd_buffer->state.shaders[MESA_SHADER_VERTEX];
9200          struct radv_shader *next_stage = NULL;
9201 
9202          if (vs->info.merged_shader_compiled_separately) {
9203             assert(vs->info.next_stage == MESA_SHADER_TESS_CTRL || vs->info.next_stage == MESA_SHADER_GEOMETRY);
9204             next_stage = cmd_buffer->state.shaders[vs->info.next_stage];
9205          }
9206 
9207          radv_emit_vertex_shader(device, cs, cs, vs, next_stage);
9208          break;
9209       }
9210       case MESA_SHADER_TESS_CTRL:
9211          radv_emit_tess_ctrl_shader(device, cs, cmd_buffer->state.shaders[MESA_SHADER_TESS_CTRL]);
9212          break;
9213       case MESA_SHADER_TESS_EVAL: {
9214          const struct radv_shader *tes = cmd_buffer->state.shaders[MESA_SHADER_TESS_EVAL];
9215          struct radv_shader *gs = NULL;
9216 
9217          if (tes->info.merged_shader_compiled_separately) {
9218             assert(tes->info.next_stage == MESA_SHADER_GEOMETRY);
9219             gs = cmd_buffer->state.shaders[MESA_SHADER_GEOMETRY];
9220          }
9221 
9222          radv_emit_tess_eval_shader(device, cs, cs, tes, gs);
9223          break;
9224       }
9225       case MESA_SHADER_GEOMETRY: {
9226          struct radv_shader *es = cmd_buffer->state.shaders[MESA_SHADER_TESS_EVAL]
9227                                      ? cmd_buffer->state.shaders[MESA_SHADER_TESS_EVAL]
9228                                      : cmd_buffer->state.shaders[MESA_SHADER_VERTEX];
9229 
9230          radv_emit_geometry_shader(device, cs, cs, cmd_buffer->state.shaders[MESA_SHADER_GEOMETRY], es,
9231                                    shader_obj->gs.copy_shader);
9232          break;
9233       }
9234       case MESA_SHADER_FRAGMENT:
9235          radv_emit_fragment_shader(device, cs, cs, cmd_buffer->state.shaders[MESA_SHADER_FRAGMENT]);
9236          radv_emit_ps_inputs(device, cs, last_vgt_shader, cmd_buffer->state.shaders[MESA_SHADER_FRAGMENT]);
9237          break;
9238       case MESA_SHADER_MESH:
9239          radv_emit_mesh_shader(device, cs, cs, cmd_buffer->state.shaders[MESA_SHADER_MESH]);
9240          break;
9241       case MESA_SHADER_TASK:
9242          radv_emit_compute_shader(device->physical_device, cmd_buffer->gang.cs,
9243                                   cmd_buffer->state.shaders[MESA_SHADER_TASK]);
9244          break;
9245       default:
9246          unreachable("invalid bind stage");
9247       }
9248    }
9249 
9250    /* Emit graphics states related to shaders. */
9251    struct radv_vgt_shader_key vgt_shader_cfg_key = {
9252       .tess = !!cmd_buffer->state.shaders[MESA_SHADER_TESS_CTRL],
9253       .gs = !!cmd_buffer->state.shaders[MESA_SHADER_GEOMETRY],
9254       .ngg = last_vgt_shader->info.is_ngg,
9255       .ngg_passthrough = last_vgt_shader->info.is_ngg_passthrough,
9256       .ngg_streamout = last_vgt_shader->info.is_ngg && last_vgt_shader->info.so.num_outputs > 0,
9257    };
9258 
9259    if (cmd_buffer->state.shaders[MESA_SHADER_MESH]) {
9260       vgt_shader_cfg_key.mesh = 1;
9261       vgt_shader_cfg_key.mesh_scratch_ring = cmd_buffer->state.shaders[MESA_SHADER_MESH]->info.ms.needs_ms_scratch_ring;
9262    }
9263 
9264    radv_emit_vgt_gs_mode(device, cs, last_vgt_shader);
9265    radv_emit_vgt_vertex_reuse(device, cs, radv_get_shader(cmd_buffer->state.shaders, MESA_SHADER_TESS_EVAL));
9266    radv_emit_vgt_shader_config(device, cs, &vgt_shader_cfg_key);
9267    radv_emit_vgt_gs_out(device, cs, radv_get_rasterization_prim(cmd_buffer));
9268 
9269    if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX10_3) {
9270       gfx103_emit_vgt_draw_payload_cntl(cs, cmd_buffer->state.shaders[MESA_SHADER_MESH], false);
9271       gfx103_emit_vrs_state(device, cs, NULL, false, false, false);
9272    }
9273 
9274    cmd_buffer->state.dirty &= ~RADV_CMD_DIRTY_GRAPHICS_SHADERS;
9275 }
9276 
9277 static void
9278 radv_emit_all_graphics_states(struct radv_cmd_buffer *cmd_buffer, const struct radv_draw_info *info)
9279 {
9280    const struct radv_device *device = cmd_buffer->device;
9281    struct radv_shader_part *tcs_epilog = NULL, *ps_epilog = NULL;
9282 
9283    if (cmd_buffer->state.shaders[MESA_SHADER_FRAGMENT] &&
9284        cmd_buffer->state.shaders[MESA_SHADER_FRAGMENT]->info.has_epilog) {
9285       if ((cmd_buffer->state.emitted_graphics_pipeline != cmd_buffer->state.graphics_pipeline ||
9286            (cmd_buffer->state.dirty &
9287             (RADV_CMD_DIRTY_DYNAMIC_COLOR_WRITE_MASK | RADV_CMD_DIRTY_DYNAMIC_COLOR_BLEND_ENABLE |
9288              RADV_CMD_DIRTY_DYNAMIC_ALPHA_TO_COVERAGE_ENABLE | RADV_CMD_DIRTY_DYNAMIC_COLOR_BLEND_EQUATION |
9289              RADV_CMD_DIRTY_GRAPHICS_SHADERS)))) {
9290          ps_epilog = lookup_ps_epilog(cmd_buffer);
9291          if (!ps_epilog) {
9292             vk_command_buffer_set_error(&cmd_buffer->vk, VK_ERROR_OUT_OF_HOST_MEMORY);
9293             return;
9294          }
9295 
9296          cmd_buffer->state.col_format_non_compacted = ps_epilog->spi_shader_col_format;
9297 
9298          bool need_null_export_workaround = radv_needs_null_export_workaround(
9299             device, cmd_buffer->state.shaders[MESA_SHADER_FRAGMENT], cmd_buffer->state.custom_blend_mode);
9300 
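         /* If the null export workaround is needed and no color export format is set, fall back to
          * a dummy 32_R export format.
          */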
9301          if (need_null_export_workaround && !cmd_buffer->state.col_format_non_compacted)
9302             cmd_buffer->state.col_format_non_compacted = V_028714_SPI_SHADER_32_R;
9303          if (device->physical_device->rad_info.rbplus_allowed)
9304             cmd_buffer->state.dirty |= RADV_CMD_DIRTY_RBPLUS;
9305       }
9306    }
9307 
9308    if (cmd_buffer->state.shaders[MESA_SHADER_TESS_CTRL] &&
9309        cmd_buffer->state.shaders[MESA_SHADER_TESS_CTRL]->info.has_epilog) {
9310       tcs_epilog = lookup_tcs_epilog(cmd_buffer);
9311       if (!tcs_epilog) {
9312          vk_command_buffer_set_error(&cmd_buffer->vk, VK_ERROR_OUT_OF_HOST_MEMORY);
9313          return;
9314       }
9315    }
9316 
9317    /* Determine whether the GFX9 late scissor workaround should be applied, based on:
9318     * 1. radv_need_late_scissor_emission
9319     * 2. any dirty dynamic flags that may cause context rolls
9320     */
9321    const bool late_scissor_emission = cmd_buffer->device->physical_device->rad_info.has_gfx9_scissor_bug
9322                                          ? radv_need_late_scissor_emission(cmd_buffer, info)
9323                                          : false;
9324 
9325    if (cmd_buffer->state.dirty & RADV_CMD_DIRTY_RBPLUS)
9326       radv_emit_rbplus_state(cmd_buffer);
9327 
9328    if (cmd_buffer->state.dirty & RADV_CMD_DIRTY_SHADER_QUERY)
9329       radv_flush_shader_query_state(cmd_buffer);
9330 
9331    if (cmd_buffer->state.dirty & RADV_CMD_DIRTY_OCCLUSION_QUERY)
9332       radv_flush_occlusion_query_state(cmd_buffer);
9333 
9334    if ((cmd_buffer->state.dirty &
9335         (RADV_CMD_DIRTY_PIPELINE | RADV_CMD_DIRTY_DYNAMIC_CULL_MODE | RADV_CMD_DIRTY_DYNAMIC_FRONT_FACE |
9336          RADV_CMD_DIRTY_DYNAMIC_RASTERIZER_DISCARD_ENABLE | RADV_CMD_DIRTY_DYNAMIC_VIEWPORT |
9337          RADV_CMD_DIRTY_DYNAMIC_CONSERVATIVE_RAST_MODE | RADV_CMD_DIRTY_DYNAMIC_RASTERIZATION_SAMPLES |
9338          RADV_CMD_DIRTY_DYNAMIC_PRIMITIVE_TOPOLOGY | RADV_CMD_DIRTY_DYNAMIC_SAMPLE_LOCATIONS_ENABLE)) &&
9339        cmd_buffer->state.has_nggc)
9340       radv_emit_ngg_culling_state(cmd_buffer);
9341 
9342    if (cmd_buffer->state.dirty &
9343        (RADV_CMD_DIRTY_FRAMEBUFFER | RADV_CMD_DIRTY_DYNAMIC_COLOR_WRITE_MASK |
9344         RADV_CMD_DIRTY_DYNAMIC_RASTERIZATION_SAMPLES | RADV_CMD_DIRTY_DYNAMIC_LINE_RASTERIZATION_MODE))
9345       radv_emit_binning_state(cmd_buffer);
9346 
9347    if (cmd_buffer->state.dirty & RADV_CMD_DIRTY_PIPELINE) {
9348       radv_emit_graphics_pipeline(cmd_buffer);
9349    } else if (cmd_buffer->state.dirty & RADV_CMD_DIRTY_GRAPHICS_SHADERS) {
9350       radv_emit_graphics_shaders(cmd_buffer);
9351    }
9352 
9353    if (ps_epilog)
9354       radv_emit_ps_epilog_state(cmd_buffer, ps_epilog);
9355 
9356    if (tcs_epilog)
9357       radv_emit_tcs_epilog_state(cmd_buffer, tcs_epilog);
9358 
9359    if (cmd_buffer->state.dirty & RADV_CMD_DIRTY_FRAMEBUFFER)
9360       radv_emit_framebuffer_state(cmd_buffer);
9361 
9362    if (cmd_buffer->state.dirty & RADV_CMD_DIRTY_GUARDBAND)
9363       radv_emit_guardband_state(cmd_buffer);
9364 
9365    if (cmd_buffer->state.dirty &
9366        (RADV_CMD_DIRTY_DB_SHADER_CONTROL | RADV_CMD_DIRTY_DYNAMIC_COLOR_WRITE_MASK |
9367         RADV_CMD_DIRTY_DYNAMIC_COLOR_BLEND_ENABLE | RADV_CMD_DIRTY_DYNAMIC_RASTERIZATION_SAMPLES |
9368         RADV_CMD_DIRTY_DYNAMIC_LINE_RASTERIZATION_MODE | RADV_CMD_DIRTY_DYNAMIC_ATTACHMENT_FEEDBACK_LOOP_ENABLE))
9369       radv_emit_db_shader_control(cmd_buffer);
9370 
9371    if (info->indexed && info->indirect && cmd_buffer->state.dirty & RADV_CMD_DIRTY_INDEX_BUFFER)
9372       radv_emit_index_buffer(cmd_buffer);
9373 
9374    if (cmd_buffer->state.dirty & RADV_CMD_DIRTY_STREAMOUT_ENABLE)
9375       radv_emit_streamout_enable_state(cmd_buffer);
9376 
9377    const uint64_t dynamic_states = cmd_buffer->state.dirty & radv_get_needed_dynamic_states(cmd_buffer);
9378 
9379    if (dynamic_states) {
9380       radv_cmd_buffer_flush_dynamic_state(cmd_buffer, dynamic_states);
9381 
9382       if (dynamic_states &
9383           (RADV_CMD_DIRTY_DYNAMIC_RASTERIZATION_SAMPLES | RADV_CMD_DIRTY_DYNAMIC_LINE_RASTERIZATION_MODE))
9384          radv_emit_fs_state(cmd_buffer);
9385    }
9386 
9387    radv_emit_draw_registers(cmd_buffer, info);
9388 
9389    if (late_scissor_emission) {
9390       radv_emit_scissor(cmd_buffer);
9391       cmd_buffer->state.context_roll_without_scissor_emitted = false;
9392    }
9393 }
9394 
9395 static void
9396 radv_bind_graphics_shaders(struct radv_cmd_buffer *cmd_buffer)
9397 {
9398    const struct radv_device *device = cmd_buffer->device;
9399    uint32_t push_constant_size = 0, dynamic_offset_count = 0;
9400    bool need_indirect_descriptor_sets = false;
9401 
9402    for (unsigned s = 0; s <= MESA_SHADER_MESH; s++) {
9403       const struct radv_shader_object *shader_obj = cmd_buffer->state.shader_objs[s];
9404       struct radv_shader *shader = NULL;
9405 
9406       if (s == MESA_SHADER_COMPUTE)
9407          continue;
9408 
9409       if (!shader_obj) {
9410          radv_bind_shader(cmd_buffer, NULL, s);
9411          continue;
9412       }
9413 
9414       /* Select shader variants. */
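      /* With tessellation and/or geometry, VS runs as the merged LS (before TCS) or ES (before GS)
       * variant, and TES runs as ES when a GS is present.
       */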
9415       if (s == MESA_SHADER_VERTEX && (cmd_buffer->state.shader_objs[MESA_SHADER_TESS_CTRL] ||
9416                                       cmd_buffer->state.shader_objs[MESA_SHADER_GEOMETRY])) {
9417          if (cmd_buffer->state.shader_objs[MESA_SHADER_TESS_CTRL]) {
9418             shader = shader_obj->as_ls.shader;
9419          } else {
9420             shader = shader_obj->as_es.shader;
9421          }
9422       } else if (s == MESA_SHADER_TESS_EVAL && cmd_buffer->state.shader_objs[MESA_SHADER_GEOMETRY]) {
9423          shader = shader_obj->as_es.shader;
9424       } else {
9425          shader = shader_obj->shader;
9426       }
9427 
9428       radv_bind_shader(cmd_buffer, shader, s);
9429       if (!shader)
9430          continue;
9431 
9432       radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, shader->bo);
9433 
9434       /* Compute push constants/indirect descriptors state. */
9435       need_indirect_descriptor_sets |= radv_get_user_sgpr(shader, AC_UD_INDIRECT_DESCRIPTOR_SETS)->sgpr_idx != -1;
9436       push_constant_size += shader_obj->push_constant_size;
9437       dynamic_offset_count += shader_obj->dynamic_offset_count;
9438    }
9439 
9440    /* Determine the last VGT shader. */
9441    const gl_shader_stage last_vgt_api_stage = radv_cmdbuf_get_last_vgt_api_stage(cmd_buffer);
9442 
9443    assume(last_vgt_api_stage != MESA_SHADER_NONE);
9444    cmd_buffer->state.last_vgt_shader = cmd_buffer->state.shaders[last_vgt_api_stage];
9445 
9446    cmd_buffer->state.gs_copy_shader = cmd_buffer->state.shader_objs[MESA_SHADER_GEOMETRY]
9447                                          ? cmd_buffer->state.shader_objs[MESA_SHADER_GEOMETRY]->gs.copy_shader
9448                                          : NULL;
9449    if (cmd_buffer->state.gs_copy_shader) {
9450       radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, cmd_buffer->state.gs_copy_shader->bo);
9451    }
9452 
9453    /* Determine the rasterized primitive. */
9454    if (cmd_buffer->state.active_stages &
9455        (VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT | VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT |
9456         VK_SHADER_STAGE_GEOMETRY_BIT | VK_SHADER_STAGE_MESH_BIT_EXT)) {
9457       cmd_buffer->state.rast_prim = radv_get_vgt_gs_out(cmd_buffer->state.shaders, 0);
9458    }
9459 
9460    const struct radv_shader *vs = radv_get_shader(cmd_buffer->state.shaders, MESA_SHADER_VERTEX);
9461    if (vs) {
9462       /* Re-emit the VS prolog when a new vertex shader is bound. */
9463       if (vs->info.vs.has_prolog) {
9464          cmd_buffer->state.emitted_vs_prolog = NULL;
9465          cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_VERTEX_INPUT;
9466       }
9467 
9468       /* Re-emit the vertex buffer descriptors because they are really tied to the pipeline. */
9469       if (vs->info.vs.vb_desc_usage_mask) {
9470          cmd_buffer->state.dirty |= RADV_CMD_DIRTY_VERTEX_BUFFER;
9471       }
9472    }
9473 
9474    /* Update push constants/indirect descriptors state. */
9475    struct radv_descriptor_state *descriptors_state =
9476       radv_get_descriptors_state(cmd_buffer, VK_PIPELINE_BIND_POINT_GRAPHICS);
9477    struct radv_push_constant_state *pc_state = &cmd_buffer->push_constant_state[VK_PIPELINE_BIND_POINT_GRAPHICS];
9478 
9479    descriptors_state->need_indirect_descriptor_sets = need_indirect_descriptor_sets;
9480    pc_state->size = push_constant_size;
9481    pc_state->dynamic_offset_count = dynamic_offset_count;
9482 
9483    if (device->physical_device->rad_info.gfx_level <= GFX9) {
9484       cmd_buffer->state.ia_multi_vgt_param = radv_compute_ia_multi_vgt_param(device, cmd_buffer->state.shaders);
9485    }
9486 
9487    if (cmd_buffer->state.active_stages &
9488        (VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT | VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT)) {
9489       cmd_buffer->state.uses_dynamic_patch_control_points = true;
9490    }
9491 
9492    cmd_buffer->state.uses_dynamic_vertex_binding_stride = true;
9493 }
9494 
9495 /* MUST inline this function to avoid massive perf loss in drawoverhead */
9496 ALWAYS_INLINE static bool
9497 radv_before_draw(struct radv_cmd_buffer *cmd_buffer, const struct radv_draw_info *info, uint32_t drawCount, bool dgc)
9498 {
9499    const bool has_prefetch = cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX7;
9500 
9501    ASSERTED const unsigned cdw_max =
9502       radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 4096 + 128 * (drawCount - 1));
9503 
9504    if (likely(!info->indirect)) {
9505       /* GFX6-GFX7 treat instance_count==0 as instance_count==1. There is
9506        * no workaround for indirect draws, but we can at least skip
9507        * direct draws.
9508        */
9509       if (unlikely(!info->instance_count))
9510          return false;
9511 
9512       /* Handle count == 0. */
9513       if (unlikely(!info->count && !info->strmout_buffer))
9514          return false;
9515    }
9516 
9517    if (!info->indexed && cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX7) {
9518       /* On GFX7 and later, non-indexed draws overwrite VGT_INDEX_TYPE,
9519        * so the state must be re-emitted before the next indexed
9520        * draw.
9521        */
9522       cmd_buffer->state.last_index_type = -1;
9523    }
9524 
9525    if (cmd_buffer->state.dirty & RADV_CMD_DIRTY_GRAPHICS_SHADERS) {
9526       radv_bind_graphics_shaders(cmd_buffer);
9527    }
9528 
9529    /* Use optimal packet order based on whether we need to sync the
9530     * pipeline.
9531     */
9532    if (cmd_buffer->state.flush_bits & (RADV_CMD_FLAG_FLUSH_AND_INV_CB | RADV_CMD_FLAG_FLUSH_AND_INV_DB |
9533                                        RADV_CMD_FLAG_PS_PARTIAL_FLUSH | RADV_CMD_FLAG_CS_PARTIAL_FLUSH)) {
9534       /* If we have to wait for idle, set all states first, so that
9535        * all SET packets are processed in parallel with previous draw
9536        * calls. Then upload descriptors, set shader pointers, and
9537        * draw, and prefetch at the end. This ensures that the time
9538        * the CUs are idle is very short. (there are only SET_SH
9539        * packets between the wait and the draw)
9540        */
9541       radv_emit_all_graphics_states(cmd_buffer, info);
9542       radv_emit_cache_flush(cmd_buffer);
9543       /* <-- CUs are idle here --> */
9544 
9545       radv_upload_graphics_shader_descriptors(cmd_buffer);
9546    } else {
9547       const bool need_prefetch = has_prefetch && cmd_buffer->state.prefetch_L2_mask;
9548 
9549       /* If we don't wait for idle, start prefetches first, then set
9550        * states, and draw at the end.
9551        */
9552       radv_emit_cache_flush(cmd_buffer);
9553 
9554       if (need_prefetch) {
9555          /* Only prefetch the vertex shader and VBO descriptors
9556           * in order to start the draw as soon as possible.
9557           */
9558          radv_emit_prefetch_L2(cmd_buffer, true);
9559       }
9560 
9561       radv_upload_graphics_shader_descriptors(cmd_buffer);
9562 
9563       radv_emit_all_graphics_states(cmd_buffer, info);
9564    }
9565 
9566    if (!dgc)
9567       radv_describe_draw(cmd_buffer);
9568    if (likely(!info->indirect)) {
9569       struct radv_cmd_state *state = &cmd_buffer->state;
9570       struct radeon_cmdbuf *cs = cmd_buffer->cs;
9571       assert(state->vtx_base_sgpr);
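      /* Only re-emit PKT3_NUM_INSTANCES when the instance count changed since the previous draw. */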
9572       if (state->last_num_instances != info->instance_count) {
9573          radeon_emit(cs, PKT3(PKT3_NUM_INSTANCES, 0, false));
9574          radeon_emit(cs, info->instance_count);
9575          state->last_num_instances = info->instance_count;
9576       }
9577    }
9578    assert(cmd_buffer->cs->cdw <= cdw_max);
9579 
9580    return true;
9581 }
9582 
9583 ALWAYS_INLINE static bool
9584 radv_before_taskmesh_draw(struct radv_cmd_buffer *cmd_buffer, const struct radv_draw_info *info, uint32_t drawCount,
9585                           bool dgc)
9586 {
9587    /* For direct draws, this makes sure we don't draw anything.
9588     * For indirect draws, this is necessary to prevent a GPU hang (on MEC version < 100).
9589     */
9590    if (unlikely(!info->count))
9591       return false;
9592 
9593    if (cmd_buffer->state.dirty & RADV_CMD_DIRTY_GRAPHICS_SHADERS) {
9594       radv_bind_graphics_shaders(cmd_buffer);
9595    }
9596 
9597    struct radeon_cmdbuf *ace_cs = cmd_buffer->gang.cs;
9598    struct radv_shader *task_shader = cmd_buffer->state.shaders[MESA_SHADER_TASK];
9599 
9600    assert(!task_shader || ace_cs);
9601 
9602    const VkShaderStageFlags stages =
9603       VK_SHADER_STAGE_MESH_BIT_EXT | VK_SHADER_STAGE_FRAGMENT_BIT | (task_shader ? VK_SHADER_STAGE_TASK_BIT_EXT : 0);
9604    const bool need_task_semaphore = task_shader && radv_flush_gang_leader_semaphore(cmd_buffer);
9605 
9606    ASSERTED const unsigned cdw_max =
9607       radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 4096 + 128 * (drawCount - 1));
9608    ASSERTED const unsigned ace_cdw_max =
9609       !ace_cs ? 0 : radeon_check_space(cmd_buffer->device->ws, ace_cs, 4096 + 128 * (drawCount - 1));
9610 
9611    radv_emit_all_graphics_states(cmd_buffer, info);
9612 
9613    radv_emit_cache_flush(cmd_buffer);
9614 
9615    if (task_shader) {
9616       radv_gang_cache_flush(cmd_buffer);
9617 
9618       if (need_task_semaphore) {
9619          radv_wait_gang_leader(cmd_buffer);
9620       }
9621    }
9622 
9623    radv_flush_descriptors(cmd_buffer, stages, VK_PIPELINE_BIND_POINT_GRAPHICS);
9624 
9625    const VkShaderStageFlags pc_stages = radv_must_flush_constants(cmd_buffer, stages, VK_PIPELINE_BIND_POINT_GRAPHICS);
9626    if (pc_stages)
9627       radv_flush_constants(cmd_buffer, pc_stages, VK_PIPELINE_BIND_POINT_GRAPHICS);
9628 
9629    if (!dgc)
9630       radv_describe_draw(cmd_buffer);
9631    if (likely(!info->indirect)) {
9632       struct radv_cmd_state *state = &cmd_buffer->state;
9633       if (unlikely(state->last_num_instances != 1)) {
9634          struct radeon_cmdbuf *cs = cmd_buffer->cs;
9635          radeon_emit(cs, PKT3(PKT3_NUM_INSTANCES, 0, false));
9636          radeon_emit(cs, 1);
9637          state->last_num_instances = 1;
9638       }
9639    }
9640 
9641    assert(cmd_buffer->cs->cdw <= cdw_max);
9642    assert(!ace_cs || ace_cs->cdw <= ace_cdw_max);
9643 
9644    cmd_buffer->state.last_index_type = -1;
9645 
9646    return true;
9647 }
9648 
9649 ALWAYS_INLINE static void
9650 radv_after_draw(struct radv_cmd_buffer *cmd_buffer, bool dgc)
9651 {
9652    const struct radeon_info *rad_info = &cmd_buffer->device->physical_device->rad_info;
9653    bool has_prefetch = cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX7;
9654    /* Start prefetches after the draw has been started. Both will
9655     * run in parallel, but starting the draw first is more
9656     * important.
9657     */
9658    if (has_prefetch && cmd_buffer->state.prefetch_L2_mask) {
9659       radv_emit_prefetch_L2(cmd_buffer, false);
9660    }
9661 
9662    /* Workaround for a VGT hang when streamout is enabled.
9663     * It must be done after drawing.
9664     */
9665    if (radv_is_streamout_enabled(cmd_buffer) &&
9666        (rad_info->family == CHIP_HAWAII || rad_info->family == CHIP_TONGA || rad_info->family == CHIP_FIJI)) {
9667       cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_VGT_STREAMOUT_SYNC;
9668    }
9669 
9670    radv_cmd_buffer_after_draw(cmd_buffer, RADV_CMD_FLAG_PS_PARTIAL_FLUSH, dgc);
9671 }
9672 
9673 VKAPI_ATTR void VKAPI_CALL
9674 radv_CmdDraw(VkCommandBuffer commandBuffer, uint32_t vertexCount, uint32_t instanceCount, uint32_t firstVertex,
9675              uint32_t firstInstance)
9676 {
9677    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
9678    struct radv_draw_info info;
9679 
9680    info.count = vertexCount;
9681    info.instance_count = instanceCount;
9682    info.first_instance = firstInstance;
9683    info.strmout_buffer = NULL;
9684    info.indirect = NULL;
9685    info.indexed = false;
9686 
9687    if (!radv_before_draw(cmd_buffer, &info, 1, false))
9688       return;
9689    const VkMultiDrawInfoEXT minfo = {firstVertex, vertexCount};
9690    radv_emit_direct_draw_packets(cmd_buffer, &info, 1, &minfo, 0, 0);
9691    radv_after_draw(cmd_buffer, false);
9692 }
9693 
9694 VKAPI_ATTR void VKAPI_CALL
9695 radv_CmdDrawMultiEXT(VkCommandBuffer commandBuffer, uint32_t drawCount, const VkMultiDrawInfoEXT *pVertexInfo,
9696                      uint32_t instanceCount, uint32_t firstInstance, uint32_t stride)
9697 {
9698    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
9699    struct radv_draw_info info;
9700 
9701    if (!drawCount)
9702       return;
9703 
9704    info.count = pVertexInfo->vertexCount;
9705    info.instance_count = instanceCount;
9706    info.first_instance = firstInstance;
9707    info.strmout_buffer = NULL;
9708    info.indirect = NULL;
9709    info.indexed = false;
9710 
9711    if (!radv_before_draw(cmd_buffer, &info, drawCount, false))
9712       return;
9713    radv_emit_direct_draw_packets(cmd_buffer, &info, drawCount, pVertexInfo, 0, stride);
9714    radv_after_draw(cmd_buffer, false);
9715 }
9716 
9717 VKAPI_ATTR void VKAPI_CALL
9718 radv_CmdDrawIndexed(VkCommandBuffer commandBuffer, uint32_t indexCount, uint32_t instanceCount, uint32_t firstIndex,
9719                     int32_t vertexOffset, uint32_t firstInstance)
9720 {
9721    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
9722    struct radv_draw_info info;
9723 
9724    info.indexed = true;
9725    info.count = indexCount;
9726    info.instance_count = instanceCount;
9727    info.first_instance = firstInstance;
9728    info.strmout_buffer = NULL;
9729    info.indirect = NULL;
9730 
9731    if (!radv_before_draw(cmd_buffer, &info, 1, false))
9732       return;
9733    const VkMultiDrawIndexedInfoEXT minfo = {firstIndex, indexCount, vertexOffset};
9734    radv_emit_draw_packets_indexed(cmd_buffer, &info, 1, &minfo, 0, NULL);
9735    radv_after_draw(cmd_buffer, false);
9736 }
9737 
9738 VKAPI_ATTR void VKAPI_CALL
9739 radv_CmdDrawMultiIndexedEXT(VkCommandBuffer commandBuffer, uint32_t drawCount,
9740                             const VkMultiDrawIndexedInfoEXT *pIndexInfo, uint32_t instanceCount, uint32_t firstInstance,
9741                             uint32_t stride, const int32_t *pVertexOffset)
9742 {
9743    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
9744    struct radv_draw_info info;
9745 
9746    if (!drawCount)
9747       return;
9748 
9749    const VkMultiDrawIndexedInfoEXT *minfo = pIndexInfo;
9750    info.indexed = true;
9751    info.count = minfo->indexCount;
9752    info.instance_count = instanceCount;
9753    info.first_instance = firstInstance;
9754    info.strmout_buffer = NULL;
9755    info.indirect = NULL;
9756 
9757    if (!radv_before_draw(cmd_buffer, &info, drawCount, false))
9758       return;
9759    radv_emit_draw_packets_indexed(cmd_buffer, &info, drawCount, pIndexInfo, stride, pVertexOffset);
9760    radv_after_draw(cmd_buffer, false);
9761 }
9762 
9763 VKAPI_ATTR void VKAPI_CALL
9764 radv_CmdDrawIndirect(VkCommandBuffer commandBuffer, VkBuffer _buffer, VkDeviceSize offset, uint32_t drawCount,
9765                      uint32_t stride)
9766 {
9767    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
9768    RADV_FROM_HANDLE(radv_buffer, buffer, _buffer);
9769    struct radv_draw_info info;
9770 
9771    info.count = drawCount;
9772    info.indirect = buffer;
9773    info.indirect_offset = offset;
9774    info.stride = stride;
9775    info.strmout_buffer = NULL;
9776    info.count_buffer = NULL;
9777    info.indexed = false;
9778    info.instance_count = 0;
9779 
9780    if (!radv_before_draw(cmd_buffer, &info, 1, false))
9781       return;
9782    radv_emit_indirect_draw_packets(cmd_buffer, &info);
9783    radv_after_draw(cmd_buffer, false);
9784 }
9785 
9786 VKAPI_ATTR void VKAPI_CALL
9787 radv_CmdDrawIndexedIndirect(VkCommandBuffer commandBuffer, VkBuffer _buffer, VkDeviceSize offset, uint32_t drawCount,
9788                             uint32_t stride)
9789 {
9790    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
9791    RADV_FROM_HANDLE(radv_buffer, buffer, _buffer);
9792    struct radv_draw_info info;
9793 
9794    info.indexed = true;
9795    info.count = drawCount;
9796    info.indirect = buffer;
9797    info.indirect_offset = offset;
9798    info.stride = stride;
9799    info.count_buffer = NULL;
9800    info.strmout_buffer = NULL;
9801    info.instance_count = 0;
9802 
9803    if (!radv_before_draw(cmd_buffer, &info, 1, false))
9804       return;
9805    radv_emit_indirect_draw_packets(cmd_buffer, &info);
9806    radv_after_draw(cmd_buffer, false);
9807 }
9808 
9809 VKAPI_ATTR void VKAPI_CALL
9810 radv_CmdDrawIndirectCount(VkCommandBuffer commandBuffer, VkBuffer _buffer, VkDeviceSize offset, VkBuffer _countBuffer,
9811                           VkDeviceSize countBufferOffset, uint32_t maxDrawCount, uint32_t stride)
9812 {
9813    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
9814    RADV_FROM_HANDLE(radv_buffer, buffer, _buffer);
9815    RADV_FROM_HANDLE(radv_buffer, count_buffer, _countBuffer);
9816    struct radv_draw_info info;
9817 
9818    info.count = maxDrawCount;
9819    info.indirect = buffer;
9820    info.indirect_offset = offset;
9821    info.count_buffer = count_buffer;
9822    info.count_buffer_offset = countBufferOffset;
9823    info.stride = stride;
9824    info.strmout_buffer = NULL;
9825    info.indexed = false;
9826    info.instance_count = 0;
9827 
9828    if (!radv_before_draw(cmd_buffer, &info, 1, false))
9829       return;
9830    radv_emit_indirect_draw_packets(cmd_buffer, &info);
9831    radv_after_draw(cmd_buffer, false);
9832 }
9833 
9834 VKAPI_ATTR void VKAPI_CALL
9835 radv_CmdDrawIndexedIndirectCount(VkCommandBuffer commandBuffer, VkBuffer _buffer, VkDeviceSize offset,
9836                                  VkBuffer _countBuffer, VkDeviceSize countBufferOffset, uint32_t maxDrawCount,
9837                                  uint32_t stride)
9838 {
9839    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
9840    RADV_FROM_HANDLE(radv_buffer, buffer, _buffer);
9841    RADV_FROM_HANDLE(radv_buffer, count_buffer, _countBuffer);
9842    struct radv_draw_info info;
9843 
9844    info.indexed = true;
9845    info.count = maxDrawCount;
9846    info.indirect = buffer;
9847    info.indirect_offset = offset;
9848    info.count_buffer = count_buffer;
9849    info.count_buffer_offset = countBufferOffset;
9850    info.stride = stride;
9851    info.strmout_buffer = NULL;
9852    info.instance_count = 0;
9853 
9854    if (!radv_before_draw(cmd_buffer, &info, 1, false))
9855       return;
9856    radv_emit_indirect_draw_packets(cmd_buffer, &info);
9857    radv_after_draw(cmd_buffer, false);
9858 }
9859 
9860 VKAPI_ATTR void VKAPI_CALL
9861 radv_CmdDrawMeshTasksEXT(VkCommandBuffer commandBuffer, uint32_t x, uint32_t y, uint32_t z)
9862 {
9863    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
9864    struct radv_draw_info info;
9865 
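   /* count is the total number of mesh/task workgroups here; radv_before_taskmesh_draw uses it to
    * skip empty draws.
    */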
9866    info.count = x * y * z;
9867    info.instance_count = 1;
9868    info.first_instance = 0;
9869    info.stride = 0;
9870    info.indexed = false;
9871    info.strmout_buffer = NULL;
9872    info.count_buffer = NULL;
9873    info.indirect = NULL;
9874 
9875    if (!radv_before_taskmesh_draw(cmd_buffer, &info, 1, false))
9876       return;
9877 
9878    if (radv_cmdbuf_has_stage(cmd_buffer, MESA_SHADER_TASK)) {
9879       radv_emit_direct_taskmesh_draw_packets(cmd_buffer, x, y, z);
9880    } else {
9881       radv_emit_direct_mesh_draw_packet(cmd_buffer, x, y, z);
9882    }
9883 
9884    radv_after_draw(cmd_buffer, false);
9885 }
9886 
9887 VKAPI_ATTR void VKAPI_CALL
9888 radv_CmdDrawMeshTasksIndirectEXT(VkCommandBuffer commandBuffer, VkBuffer _buffer, VkDeviceSize offset,
9889                                  uint32_t drawCount, uint32_t stride)
9890 {
9891    if (!drawCount)
9892       return;
9893 
9894    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
9895    RADV_FROM_HANDLE(radv_buffer, buffer, _buffer);
9896 
9897    struct radv_draw_info info;
9898 
9899    info.indirect = buffer;
9900    info.indirect_offset = offset;
9901    info.stride = stride;
9902    info.count = drawCount;
9903    info.strmout_buffer = NULL;
9904    info.count_buffer = NULL;
9905    info.indexed = false;
9906    info.instance_count = 0;
9907 
9908    if (!radv_before_taskmesh_draw(cmd_buffer, &info, drawCount, false))
9909       return;
9910 
9911    if (radv_cmdbuf_has_stage(cmd_buffer, MESA_SHADER_TASK)) {
9912       radv_emit_indirect_taskmesh_draw_packets(cmd_buffer, &info);
9913    } else {
9914       radv_emit_indirect_mesh_draw_packets(cmd_buffer, &info);
9915    }
9916 
9917    radv_after_draw(cmd_buffer, false);
9918 }
9919 
9920 VKAPI_ATTR void VKAPI_CALL
9921 radv_CmdDrawMeshTasksIndirectCountEXT(VkCommandBuffer commandBuffer, VkBuffer _buffer, VkDeviceSize offset,
9922                                       VkBuffer _countBuffer, VkDeviceSize countBufferOffset, uint32_t maxDrawCount,
9923                                       uint32_t stride)
9924 {
9925 
9926    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
9927    RADV_FROM_HANDLE(radv_buffer, buffer, _buffer);
9928    RADV_FROM_HANDLE(radv_buffer, count_buffer, _countBuffer);
9929 
9930    struct radv_draw_info info;
9931 
9932    info.indirect = buffer;
9933    info.indirect_offset = offset;
9934    info.stride = stride;
9935    info.count = maxDrawCount;
9936    info.strmout_buffer = NULL;
9937    info.count_buffer = count_buffer;
9938    info.count_buffer_offset = countBufferOffset;
9939    info.indexed = false;
9940    info.instance_count = 0;
9941 
9942    if (!radv_before_taskmesh_draw(cmd_buffer, &info, maxDrawCount, false))
9943       return;
9944 
9945    if (radv_cmdbuf_has_stage(cmd_buffer, MESA_SHADER_TASK)) {
9946       radv_emit_indirect_taskmesh_draw_packets(cmd_buffer, &info);
9947    } else {
9948       radv_emit_indirect_mesh_draw_packets(cmd_buffer, &info);
9949    }
9950 
9951    radv_after_draw(cmd_buffer, false);
9952 }
9953 
9954 /* TODO: Use these functions with the normal dispatch path. */
9955 static void radv_dgc_before_dispatch(struct radv_cmd_buffer *cmd_buffer);
9956 static void radv_dgc_after_dispatch(struct radv_cmd_buffer *cmd_buffer);
9957 
9958 VKAPI_ATTR void VKAPI_CALL
9959 radv_CmdExecuteGeneratedCommandsNV(VkCommandBuffer commandBuffer, VkBool32 isPreprocessed,
9960                                    const VkGeneratedCommandsInfoNV *pGeneratedCommandsInfo)
9961 {
9962    VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
9963    VK_FROM_HANDLE(radv_indirect_command_layout, layout, pGeneratedCommandsInfo->indirectCommandsLayout);
9964    VK_FROM_HANDLE(radv_pipeline, pipeline, pGeneratedCommandsInfo->pipeline);
9965    VK_FROM_HANDLE(radv_buffer, prep_buffer, pGeneratedCommandsInfo->preprocessBuffer);
9966    const bool compute = layout->pipeline_bind_point == VK_PIPELINE_BIND_POINT_COMPUTE;
9967    const bool use_predication = radv_use_dgc_predication(cmd_buffer, pGeneratedCommandsInfo);
9968    const struct radv_device *device = cmd_buffer->device;
9969 
9970    /* Secondary command buffers are needed for the full extension but can't use
9971     * PKT3_INDIRECT_BUFFER.
9972     */
9973    assert(cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_PRIMARY);
9974 
9975    if (use_predication) {
9976       VK_FROM_HANDLE(radv_buffer, seq_count_buffer, pGeneratedCommandsInfo->sequencesCountBuffer);
9977       const uint64_t va = radv_buffer_get_va(seq_count_buffer->bo) + seq_count_buffer->offset +
9978                           pGeneratedCommandsInfo->sequencesCountOffset;
9979 
9980       radv_begin_conditional_rendering(cmd_buffer, va, true);
9981    }
9982 
9983    if (!radv_dgc_can_preprocess(layout, pipeline)) {
9984       const bool old_predicating = cmd_buffer->state.predicating;
9985 
9986       if (cmd_buffer->qf == RADV_QUEUE_COMPUTE && cmd_buffer->state.predicating) {
9987          /* Suspend conditional rendering when the DGC execute is called on the compute queue to
9988           * generate a cmdbuf that skips dispatches when necessary. This is because the
9989           * compute queue lacks IB2, which means it's not possible to skip the cmdbuf entirely.
9990           */
9991          cmd_buffer->state.predicating = false;
9992       }
9993 
9994       radv_prepare_dgc(cmd_buffer, pGeneratedCommandsInfo, old_predicating);
9995 
9996       if (cmd_buffer->qf == RADV_QUEUE_COMPUTE) {
9997          cmd_buffer->state.predicating = old_predicating;
9998       }
9999 
10000       cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_CS_PARTIAL_FLUSH | RADV_CMD_FLAG_INV_VCACHE | RADV_CMD_FLAG_INV_L2;
10001    }
10002 
10003    if (compute) {
10004       radv_dgc_before_dispatch(cmd_buffer);
10005    } else {
10006       struct radv_graphics_pipeline *graphics_pipeline = radv_pipeline_to_graphics(pipeline);
10007       struct radv_draw_info info;
10008 
10009       info.count = pGeneratedCommandsInfo->sequencesCount;
10010       info.indirect = prep_buffer; /* We're not really going to use it this way, but it's a good signal
10011                                    that this is not a direct draw. */
10012       info.indirect_offset = 0;
10013       info.stride = 0;
10014       info.strmout_buffer = NULL;
10015       info.count_buffer = NULL;
10016       info.indexed = layout->indexed;
10017       info.instance_count = 0;
10018 
10019       if (radv_pipeline_has_stage(graphics_pipeline, MESA_SHADER_MESH)) {
10020          if (!radv_before_taskmesh_draw(cmd_buffer, &info, 1, true))
10021             return;
10022       } else {
10023          if (!radv_before_draw(cmd_buffer, &info, 1, true))
10024             return;
10025       }
10026    }
10027 
10028    uint32_t cmdbuf_size = radv_get_indirect_cmdbuf_size(pGeneratedCommandsInfo);
10029    struct radeon_winsys_bo *ib_bo = prep_buffer->bo;
10030    const uint64_t ib_offset = prep_buffer->offset + pGeneratedCommandsInfo->preprocessOffset;
10031    const uint32_t view_mask = cmd_buffer->state.render.view_mask;
10032 
10033    if (!radv_cmd_buffer_uses_mec(cmd_buffer)) {
10034       radeon_emit(cmd_buffer->cs, PKT3(PKT3_PFP_SYNC_ME, 0, cmd_buffer->state.predicating));
10035       radeon_emit(cmd_buffer->cs, 0);
10036    }
10037 
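   /* For graphics with multiview, replay the generated IB once per active view and update the view
    * index register in between.
    */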
10038    if (compute || !view_mask) {
10039       device->ws->cs_execute_ib(cmd_buffer->cs, ib_bo, ib_offset, cmdbuf_size >> 2, cmd_buffer->state.predicating);
10040    } else {
10041       u_foreach_bit (view, view_mask) {
10042          radv_emit_view_index(cmd_buffer, view);
10043 
10044          device->ws->cs_execute_ib(cmd_buffer->cs, ib_bo, ib_offset, cmdbuf_size >> 2, cmd_buffer->state.predicating);
10045       }
10046    }
10047 
10048    if (compute) {
10049       cmd_buffer->push_constant_stages |= VK_SHADER_STAGE_COMPUTE_BIT;
10050 
10051       radv_dgc_after_dispatch(cmd_buffer);
10052    } else {
10053       struct radv_graphics_pipeline *graphics_pipeline = radv_pipeline_to_graphics(pipeline);
10054 
10055       if (layout->binds_index_buffer) {
10056          cmd_buffer->state.last_index_type = -1;
10057          cmd_buffer->state.dirty |= RADV_CMD_DIRTY_INDEX_BUFFER;
10058       }
10059 
10060       if (layout->bind_vbo_mask)
10061          cmd_buffer->state.dirty |= RADV_CMD_DIRTY_VERTEX_BUFFER;
10062 
10063       cmd_buffer->push_constant_stages |= graphics_pipeline->active_stages;
10064 
10065       if (!layout->indexed && cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX7) {
10066          /* On GFX7 and later, non-indexed draws overwrite VGT_INDEX_TYPE, so the state must be
10067           * re-emitted before the next indexed draw.
10068           */
10069          cmd_buffer->state.last_index_type = -1;
10070       }
10071 
10072       cmd_buffer->state.last_num_instances = -1;
10073       cmd_buffer->state.last_vertex_offset_valid = false;
10074       cmd_buffer->state.last_first_instance = -1;
10075       cmd_buffer->state.last_drawid = -1;
10076 
10077       radv_after_draw(cmd_buffer, true);
10078    }
10079 
10080    if (use_predication) {
10081       radv_end_conditional_rendering(cmd_buffer);
10082    }
10083 }
10084 
10085 static void
10086 radv_emit_dispatch_packets(struct radv_cmd_buffer *cmd_buffer, const struct radv_shader *compute_shader,
10087                            const struct radv_dispatch_info *info)
10088 {
10089    unsigned dispatch_initiator = cmd_buffer->device->dispatch_initiator;
10090    struct radeon_winsys *ws = cmd_buffer->device->ws;
10091    bool predicating = cmd_buffer->state.predicating;
10092    struct radeon_cmdbuf *cs = cmd_buffer->cs;
10093    const struct radv_userdata_info *loc = radv_get_user_sgpr(compute_shader, AC_UD_CS_GRID_SIZE);
10094 
10095    radv_describe_dispatch(cmd_buffer, info);
10096 
10097    ASSERTED unsigned cdw_max = radeon_check_space(ws, cs, 30);
10098 
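   /* Wave32 compute shaders (GFX10+) need CS_W32_EN set in the dispatch initiator. */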
10099    if (compute_shader->info.wave_size == 32) {
10100       assert(cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX10);
10101       dispatch_initiator |= S_00B800_CS_W32_EN(1);
10102    }
10103 
10104    if (info->ordered)
10105       dispatch_initiator &= ~S_00B800_ORDER_MODE(1);
10106 
10107    if (info->va) {
10108       if (info->indirect)
10109          radv_cs_add_buffer(ws, cs, info->indirect);
10110 
10111       if (info->unaligned) {
10112          radeon_set_sh_reg_seq(cs, R_00B81C_COMPUTE_NUM_THREAD_X, 3);
10113          radeon_emit(cs, S_00B81C_NUM_THREAD_FULL(compute_shader->info.cs.block_size[0]));
10114          radeon_emit(cs, S_00B81C_NUM_THREAD_FULL(compute_shader->info.cs.block_size[1]));
10115          radeon_emit(cs, S_00B81C_NUM_THREAD_FULL(compute_shader->info.cs.block_size[2]));
10116 
10117          dispatch_initiator |= S_00B800_USE_THREAD_DIMENSIONS(1);
10118       }
10119 
10120       if (loc->sgpr_idx != -1) {
10121          unsigned reg = R_00B900_COMPUTE_USER_DATA_0 + loc->sgpr_idx * 4;
10122 
10123          if (cmd_buffer->device->load_grid_size_from_user_sgpr) {
10124             assert(cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX10_3);
10125             radeon_emit(cs, PKT3(PKT3_LOAD_SH_REG_INDEX, 3, 0));
10126             radeon_emit(cs, info->va);
10127             radeon_emit(cs, info->va >> 32);
10128             radeon_emit(cs, (reg - SI_SH_REG_OFFSET) >> 2);
10129             radeon_emit(cs, 3);
10130          } else {
10131             radv_emit_shader_pointer(cmd_buffer->device, cmd_buffer->cs, reg, info->va, true);
10132          }
10133       }
10134 
10135       if (radv_cmd_buffer_uses_mec(cmd_buffer)) {
10136          uint64_t indirect_va = info->va;
10137          const bool needs_align32_workaround =
10138             cmd_buffer->device->physical_device->rad_info.has_async_compute_align32_bug &&
10139             cmd_buffer->qf == RADV_QUEUE_COMPUTE && !radv_is_aligned(indirect_va, 32);
10140          const unsigned ace_predication_size =
10141             4 /* DISPATCH_INDIRECT */ + (needs_align32_workaround ? 6 * 3 /* 3x COPY_DATA */ : 0);
10142 
10143          radv_cs_emit_compute_predication(&cmd_buffer->state, cs, cmd_buffer->mec_inv_pred_va,
10144                                           &cmd_buffer->mec_inv_pred_emitted, ace_predication_size);
10145 
10146          if (needs_align32_workaround) {
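            /* Copy the 12-byte VkDispatchIndirectCommand into a 32-byte aligned scratch allocation
             * with three COPY_DATA packets, since the MEC on affected chips requires an aligned
             * indirect VA.
             */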
10147             const uint64_t unaligned_va = indirect_va;
10148             UNUSED void *ptr;
10149             uint32_t offset;
10150 
10151             if (!radv_cmd_buffer_upload_alloc_aligned(cmd_buffer, sizeof(VkDispatchIndirectCommand), 32, &offset, &ptr))
10152                return;
10153 
10154             indirect_va = radv_buffer_get_va(cmd_buffer->upload.upload_bo) + offset;
10155 
10156             for (uint32_t i = 0; i < 3; i++) {
10157                const uint64_t src_va = unaligned_va + i * 4;
10158                const uint64_t dst_va = indirect_va + i * 4;
10159 
10160                radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
10161                radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_SRC_MEM) | COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) |
10162                                   COPY_DATA_WR_CONFIRM);
10163                radeon_emit(cs, src_va);
10164                radeon_emit(cs, src_va >> 32);
10165                radeon_emit(cs, dst_va);
10166                radeon_emit(cs, dst_va >> 32);
10167             }
10168          }
10169 
10170          radeon_emit(cs, PKT3(PKT3_DISPATCH_INDIRECT, 2, 0) | PKT3_SHADER_TYPE_S(1));
10171          radeon_emit(cs, indirect_va);
10172          radeon_emit(cs, indirect_va >> 32);
10173          radeon_emit(cs, dispatch_initiator);
10174       } else {
10175          radeon_emit(cs, PKT3(PKT3_SET_BASE, 2, 0) | PKT3_SHADER_TYPE_S(1));
10176          radeon_emit(cs, 1);
10177          radeon_emit(cs, info->va);
10178          radeon_emit(cs, info->va >> 32);
10179 
10180          radeon_emit(cs, PKT3(PKT3_DISPATCH_INDIRECT, 1, predicating) | PKT3_SHADER_TYPE_S(1));
10181          radeon_emit(cs, 0);
10182          radeon_emit(cs, dispatch_initiator);
10183       }
10184    } else {
10185       const unsigned *cs_block_size = compute_shader->info.cs.block_size;
10186       unsigned blocks[3] = {info->blocks[0], info->blocks[1], info->blocks[2]};
10187       unsigned offsets[3] = {info->offsets[0], info->offsets[1], info->offsets[2]};
10188 
10189       if (info->unaligned) {
10190          unsigned remainder[3];
10191 
10192          /* If aligned, these should be an entire block size,
10193           * not 0.
10194           */
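         /* Example: blocks[0] = 10 with cs_block_size[0] = 8 gives remainder[0] = 10 + 8 - 16 = 2
          * partial threads and blocks[0] = DIV_ROUND_UP(10, 8) = 2 thread groups.
          */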
10195          remainder[0] = blocks[0] + cs_block_size[0] - align_u32_npot(blocks[0], cs_block_size[0]);
10196          remainder[1] = blocks[1] + cs_block_size[1] - align_u32_npot(blocks[1], cs_block_size[1]);
10197          remainder[2] = blocks[2] + cs_block_size[2] - align_u32_npot(blocks[2], cs_block_size[2]);
10198 
10199          blocks[0] = DIV_ROUND_UP(blocks[0], cs_block_size[0]);
10200          blocks[1] = DIV_ROUND_UP(blocks[1], cs_block_size[1]);
10201          blocks[2] = DIV_ROUND_UP(blocks[2], cs_block_size[2]);
10202 
10203          for (unsigned i = 0; i < 3; ++i) {
10204             assert(offsets[i] % cs_block_size[i] == 0);
10205             offsets[i] /= cs_block_size[i];
10206          }
10207 
10208          radeon_set_sh_reg_seq(cs, R_00B81C_COMPUTE_NUM_THREAD_X, 3);
10209          radeon_emit(cs, S_00B81C_NUM_THREAD_FULL(cs_block_size[0]) | S_00B81C_NUM_THREAD_PARTIAL(remainder[0]));
10210          radeon_emit(cs, S_00B81C_NUM_THREAD_FULL(cs_block_size[1]) | S_00B81C_NUM_THREAD_PARTIAL(remainder[1]));
10211          radeon_emit(cs, S_00B81C_NUM_THREAD_FULL(cs_block_size[2]) | S_00B81C_NUM_THREAD_PARTIAL(remainder[2]));
10212 
10213          dispatch_initiator |= S_00B800_PARTIAL_TG_EN(1);
10214       }
10215 
10216       if (loc->sgpr_idx != -1) {
10217          if (cmd_buffer->device->load_grid_size_from_user_sgpr) {
10218             assert(loc->num_sgprs == 3);
10219 
10220             radeon_set_sh_reg_seq(cs, R_00B900_COMPUTE_USER_DATA_0 + loc->sgpr_idx * 4, 3);
10221             radeon_emit(cs, blocks[0]);
10222             radeon_emit(cs, blocks[1]);
10223             radeon_emit(cs, blocks[2]);
10224          } else {
10225             uint32_t offset;
10226             if (!radv_cmd_buffer_upload_data(cmd_buffer, 12, blocks, &offset))
10227                return;
10228 
10229             uint64_t va = radv_buffer_get_va(cmd_buffer->upload.upload_bo) + offset;
10230             radv_emit_shader_pointer(cmd_buffer->device, cmd_buffer->cs,
10231                                      R_00B900_COMPUTE_USER_DATA_0 + loc->sgpr_idx * 4, va, true);
10232          }
10233       }
10234 
10235       if (offsets[0] || offsets[1] || offsets[2]) {
10236          radeon_set_sh_reg_seq(cs, R_00B810_COMPUTE_START_X, 3);
10237          radeon_emit(cs, offsets[0]);
10238          radeon_emit(cs, offsets[1]);
10239          radeon_emit(cs, offsets[2]);
10240 
10241          /* The blocks in the packet are not counts but end values. */
10242          for (unsigned i = 0; i < 3; ++i)
10243             blocks[i] += offsets[i];
10244       } else {
10245          dispatch_initiator |= S_00B800_FORCE_START_AT_000(1);
10246       }
10247 
10248       if (radv_cmd_buffer_uses_mec(cmd_buffer)) {
10249          radv_cs_emit_compute_predication(&cmd_buffer->state, cs, cmd_buffer->mec_inv_pred_va,
10250                                           &cmd_buffer->mec_inv_pred_emitted, 5 /* DISPATCH_DIRECT size */);
10251          predicating = false;
10252       }
10253 
10254       if (cmd_buffer->device->physical_device->rad_info.has_async_compute_threadgroup_bug &&
10255           cmd_buffer->qf == RADV_QUEUE_COMPUTE) {
10256          for (unsigned i = 0; i < 3; i++) {
10257             if (info->unaligned) {
10258                /* info->blocks is already in thread dimensions for unaligned dispatches. */
10259                blocks[i] = info->blocks[i];
10260             } else {
10261                /* Force the async compute dispatch to be in "thread" dim mode to work around a hw bug. */
10262                blocks[i] *= cs_block_size[i];
10263             }
10264 
10265             dispatch_initiator |= S_00B800_USE_THREAD_DIMENSIONS(1);
10266          }
10267       }
10268 
10269       radeon_emit(cs, PKT3(PKT3_DISPATCH_DIRECT, 3, predicating) | PKT3_SHADER_TYPE_S(1));
10270       radeon_emit(cs, blocks[0]);
10271       radeon_emit(cs, blocks[1]);
10272       radeon_emit(cs, blocks[2]);
10273       radeon_emit(cs, dispatch_initiator);
10274    }
10275 
10276    assert(cmd_buffer->cs->cdw <= cdw_max);
10277 }
10278 
10279 static void
10280 radv_upload_compute_shader_descriptors(struct radv_cmd_buffer *cmd_buffer, VkPipelineBindPoint bind_point)
10281 {
10282    radv_flush_descriptors(cmd_buffer, VK_SHADER_STAGE_COMPUTE_BIT, bind_point);
10283    const VkShaderStageFlags stages =
10284       bind_point == VK_PIPELINE_BIND_POINT_RAY_TRACING_KHR ? RADV_RT_STAGE_BITS : VK_SHADER_STAGE_COMPUTE_BIT;
10285    const VkShaderStageFlags pc_stages = radv_must_flush_constants(cmd_buffer, stages, bind_point);
10286    if (pc_stages)
10287       radv_flush_constants(cmd_buffer, pc_stages, bind_point);
10288 }
10289 
10290 static void
10291 radv_dispatch(struct radv_cmd_buffer *cmd_buffer, const struct radv_dispatch_info *info,
10292               struct radv_compute_pipeline *pipeline, struct radv_shader *compute_shader,
10293               VkPipelineBindPoint bind_point)
10294 {
10295    bool has_prefetch = cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX7;
10296    bool pipeline_is_dirty = pipeline != cmd_buffer->state.emitted_compute_pipeline;
10297 
10298    if (compute_shader->info.cs.regalloc_hang_bug)
10299       cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_PS_PARTIAL_FLUSH | RADV_CMD_FLAG_CS_PARTIAL_FLUSH;
10300 
10301    if (cmd_buffer->state.flush_bits & (RADV_CMD_FLAG_FLUSH_AND_INV_CB | RADV_CMD_FLAG_FLUSH_AND_INV_DB |
10302                                        RADV_CMD_FLAG_PS_PARTIAL_FLUSH | RADV_CMD_FLAG_CS_PARTIAL_FLUSH)) {
10303       /* If we have to wait for idle, set all states first, so that
10304        * all SET packets are processed in parallel with previous draw
10305        * calls. Then upload descriptors, set shader pointers, and
10306        * dispatch, and prefetch at the end. This ensures that the
10307        * time the CUs are idle is very short. (there are only SET_SH
10308        * packets between the wait and the draw)
10309        */
10310       radv_emit_compute_pipeline(cmd_buffer, pipeline);
10311       radv_emit_cache_flush(cmd_buffer);
10312       /* <-- CUs are idle here --> */
10313 
10314       radv_upload_compute_shader_descriptors(cmd_buffer, bind_point);
10315 
10316       radv_emit_dispatch_packets(cmd_buffer, compute_shader, info);
10317       /* <-- CUs are busy here --> */
10318 
10319       /* Start prefetches after the dispatch has been started. Both
10320        * will run in parallel, but starting the dispatch first is
10321        * more important.
10322        */
10323       if (has_prefetch && pipeline_is_dirty) {
10324          radv_emit_shader_prefetch(cmd_buffer, compute_shader);
10325       }
10326    } else {
10327       /* If we don't wait for idle, start prefetches first, then set
10328        * states, and dispatch at the end.
10329        */
10330       radv_emit_cache_flush(cmd_buffer);
10331 
10332       if (has_prefetch && pipeline_is_dirty) {
10333          radv_emit_shader_prefetch(cmd_buffer, compute_shader);
10334       }
10335 
10336       radv_upload_compute_shader_descriptors(cmd_buffer, bind_point);
10337 
10338       radv_emit_compute_pipeline(cmd_buffer, pipeline);
10339       radv_emit_dispatch_packets(cmd_buffer, compute_shader, info);
10340    }
10341 
10342    if (pipeline_is_dirty) {
10343       /* Raytracing uses compute shaders but has separate bind points and pipelines.
10344        * So if we set compute userdata & shader registers we should dirty the raytracing
10345        * ones and the other way around.
10346        *
10347        * We only need to do this when the pipeline is dirty because when we switch between
10348        * the two we always need to switch pipelines.
10349        */
10350       radv_mark_descriptor_sets_dirty(cmd_buffer, bind_point == VK_PIPELINE_BIND_POINT_COMPUTE
10351                                                      ? VK_PIPELINE_BIND_POINT_RAY_TRACING_KHR
10352                                                      : VK_PIPELINE_BIND_POINT_COMPUTE);
10353    }
10354 
10355    if (compute_shader->info.cs.regalloc_hang_bug)
10356       cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_CS_PARTIAL_FLUSH;
10357 
10358    radv_cmd_buffer_after_draw(cmd_buffer, RADV_CMD_FLAG_CS_PARTIAL_FLUSH, false);
10359 }
10360 
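/* Pre-dispatch half of radv_dispatch used for DGC (device-generated commands): emit the compute
 * pipeline, flush caches and upload descriptors before the generated dispatch executes. The
 * post-dispatch half is radv_dgc_after_dispatch() below. */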
10361 static void
10362 radv_dgc_before_dispatch(struct radv_cmd_buffer *cmd_buffer)
10363 {
10364    struct radv_compute_pipeline *pipeline = cmd_buffer->state.compute_pipeline;
10365    struct radv_shader *compute_shader = cmd_buffer->state.shaders[MESA_SHADER_COMPUTE];
10366    bool pipeline_is_dirty = pipeline != cmd_buffer->state.emitted_compute_pipeline;
10367 
10368    /* The DGC patch shaders have already run at this point, so we can assume that there is
10369     * something to flush. Other than that, this is just radv_dispatch split in two: a pre-dispatch
10370     * part and a post-dispatch part. */
10371 
10372    if (compute_shader->info.cs.regalloc_hang_bug)
10373       cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_PS_PARTIAL_FLUSH | RADV_CMD_FLAG_CS_PARTIAL_FLUSH;
10374 
10375    radv_emit_compute_pipeline(cmd_buffer, pipeline);
10376    radv_emit_cache_flush(cmd_buffer);
10377 
10378    radv_upload_compute_shader_descriptors(cmd_buffer, VK_PIPELINE_BIND_POINT_COMPUTE);
10379 
10380    if (pipeline_is_dirty) {
10381       const bool has_prefetch = cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX7;
10382 
10383       if (has_prefetch)
10384          radv_emit_shader_prefetch(cmd_buffer, compute_shader);
10385 
10386       /* Raytracing uses compute shaders but has separate bind points and pipelines.
10387        * So if we set compute userdata & shader registers, we should dirty the raytracing
10388        * ones, and the other way around.
10389        *
10390        * We only need to do this when the pipeline is dirty because when we switch between
10391        * the two we always need to switch pipelines.
10392        */
10393       radv_mark_descriptor_sets_dirty(cmd_buffer, VK_PIPELINE_BIND_POINT_RAY_TRACING_KHR);
10394    }
10395 }
10396 
10397 static void
10398 radv_dgc_after_dispatch(struct radv_cmd_buffer *cmd_buffer)
10399 {
10400    struct radv_shader *compute_shader = cmd_buffer->state.shaders[MESA_SHADER_COMPUTE];
10401 
10402    if (compute_shader->info.cs.regalloc_hang_bug)
10403       cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_CS_PARTIAL_FLUSH;
10404 
10405    radv_cmd_buffer_after_draw(cmd_buffer, RADV_CMD_FLAG_CS_PARTIAL_FLUSH, true);
10406 }
10407 
10408 void
10409 radv_compute_dispatch(struct radv_cmd_buffer *cmd_buffer, const struct radv_dispatch_info *info)
10410 {
10411    radv_dispatch(cmd_buffer, info, cmd_buffer->state.compute_pipeline, cmd_buffer->state.shaders[MESA_SHADER_COMPUTE],
10412                  VK_PIPELINE_BIND_POINT_COMPUTE);
10413 }
10414 
10415 VKAPI_ATTR void VKAPI_CALL
10416 radv_CmdDispatchBase(VkCommandBuffer commandBuffer, uint32_t base_x, uint32_t base_y, uint32_t base_z, uint32_t x,
10417                      uint32_t y, uint32_t z)
10418 {
10419    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
10420    struct radv_dispatch_info info = {0};
10421 
10422    info.blocks[0] = x;
10423    info.blocks[1] = y;
10424    info.blocks[2] = z;
10425 
10426    info.offsets[0] = base_x;
10427    info.offsets[1] = base_y;
10428    info.offsets[2] = base_z;
10429    radv_compute_dispatch(cmd_buffer, &info);
10430 }
10431 
10432 VKAPI_ATTR void VKAPI_CALL
10433 radv_CmdDispatchIndirect(VkCommandBuffer commandBuffer, VkBuffer _buffer, VkDeviceSize offset)
10434 {
10435    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
10436    RADV_FROM_HANDLE(radv_buffer, buffer, _buffer);
10437    struct radv_dispatch_info info = {0};
10438 
10439    info.indirect = buffer->bo;
10440    info.va = radv_buffer_get_va(buffer->bo) + buffer->offset + offset;
10441 
10442    radv_compute_dispatch(cmd_buffer, &info);
10443 }
10444 
10445 void
10446 radv_unaligned_dispatch(struct radv_cmd_buffer *cmd_buffer, uint32_t x, uint32_t y, uint32_t z)
10447 {
10448    struct radv_dispatch_info info = {0};
10449 
10450    info.blocks[0] = x;
10451    info.blocks[1] = y;
10452    info.blocks[2] = z;
10453    info.unaligned = 1;
10454 
10455    radv_compute_dispatch(cmd_buffer, &info);
10456 }
10457 
10458 void
10459 radv_indirect_dispatch(struct radv_cmd_buffer *cmd_buffer, struct radeon_winsys_bo *bo, uint64_t va)
10460 {
10461    struct radv_dispatch_info info = {0};
10462 
10463    info.indirect = bo;
10464    info.va = va;
10465 
10466    radv_compute_dispatch(cmd_buffer, &info);
10467 }
10468 
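/* Record RRA ray-history metadata for a direct trace-rays dispatch (indirect dispatches are
 * skipped) and point the ray-history header at the newly appended dispatch entry. */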
10469 static void
10470 radv_trace_trace_rays(struct radv_cmd_buffer *cmd_buffer, const VkTraceRaysIndirectCommand2KHR *cmd,
10471                       uint64_t indirect_va)
10472 {
10473    if (!cmd || indirect_va)
10474       return;
10475 
10476    struct radv_rra_ray_history_data *data = malloc(sizeof(struct radv_rra_ray_history_data));
10477    if (!data)
10478       return;
10479 
10480    uint32_t width = DIV_ROUND_UP(cmd->width, cmd_buffer->device->rra_trace.ray_history_resolution_scale);
10481    uint32_t height = DIV_ROUND_UP(cmd->height, cmd_buffer->device->rra_trace.ray_history_resolution_scale);
10482    uint32_t depth = DIV_ROUND_UP(cmd->depth, cmd_buffer->device->rra_trace.ray_history_resolution_scale);
10483 
10484    struct radv_rra_ray_history_counter counter = {
10485       .dispatch_size = {width, height, depth},
10486       .hit_shader_count = cmd->hitShaderBindingTableSize / cmd->hitShaderBindingTableStride,
10487       .miss_shader_count = cmd->missShaderBindingTableSize / cmd->missShaderBindingTableStride,
10488       .shader_count = cmd_buffer->state.rt_pipeline->stage_count,
10489       .pipeline_api_hash = cmd_buffer->state.rt_pipeline->base.base.pipeline_hash,
10490       .mode = 1,
10491       .stride = sizeof(uint32_t),
10492       .data_size = 0,
10493       .ray_id_begin = 0,
10494       .ray_id_end = 0xFFFFFFFF,
10495       .pipeline_type = RADV_RRA_PIPELINE_RAY_TRACING,
10496    };
10497 
10498    struct radv_rra_ray_history_dispatch_size dispatch_size = {
10499       .size = {width, height, depth},
10500    };
10501 
10502    struct radv_rra_ray_history_traversal_flags traversal_flags = {0};
10503 
10504    data->metadata = (struct radv_rra_ray_history_metadata){
10505       .counter_info.type = RADV_RRA_COUNTER_INFO,
10506       .counter_info.size = sizeof(struct radv_rra_ray_history_counter),
10507       .counter = counter,
10508 
10509       .dispatch_size_info.type = RADV_RRA_DISPATCH_SIZE,
10510       .dispatch_size_info.size = sizeof(struct radv_rra_ray_history_dispatch_size),
10511       .dispatch_size = dispatch_size,
10512 
10513       .traversal_flags_info.type = RADV_RRA_TRAVERSAL_FLAGS,
10514       .traversal_flags_info.size = sizeof(struct radv_rra_ray_history_traversal_flags),
10515       .traversal_flags = traversal_flags,
10516    };
10517 
10518    uint32_t dispatch_index = util_dynarray_num_elements(&cmd_buffer->ray_history, struct radv_rra_ray_history_data *)
10519                              << 16;
10520 
10521    util_dynarray_append(&cmd_buffer->ray_history, struct radv_rra_ray_history_data *, data);
10522 
10523    cmd_buffer->state.flush_bits |=
10524       RADV_CMD_FLAG_INV_SCACHE | RADV_CMD_FLAG_CS_PARTIAL_FLUSH |
10525       radv_src_access_flush(cmd_buffer, VK_ACCESS_2_SHADER_READ_BIT | VK_ACCESS_2_SHADER_WRITE_BIT, NULL) |
10526       radv_dst_access_flush(cmd_buffer, VK_ACCESS_2_SHADER_READ_BIT | VK_ACCESS_2_SHADER_WRITE_BIT, NULL);
10527 
10528    radv_update_buffer_cp(
10529       cmd_buffer,
10530       cmd_buffer->device->rra_trace.ray_history_addr + offsetof(struct radv_ray_history_header, dispatch_index),
10531       &dispatch_index, sizeof(dispatch_index));
10532 }
10533 
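/* How the trace-rays parameters are sourced:
 *  - direct: the launch size is known on the host; the SBT regions and width/height/depth are
 *    uploaded together by the driver.
 *  - indirect: the SBT regions are uploaded by the driver, the launch size is read from a GPU
 *    address (vkCmdTraceRaysIndirectKHR).
 *  - indirect2: both the SBT regions and the launch size are read from a single GPU buffer laid
 *    out as VkTraceRaysIndirectCommand2KHR (vkCmdTraceRaysIndirect2KHR).
 */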
10534 enum radv_rt_mode {
10535    radv_rt_mode_direct,
10536    radv_rt_mode_indirect,
10537    radv_rt_mode_indirect2,
10538 };
10539 
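/* Upload the SBT regions (and, for direct dispatches, the launch size) to the upload BO and
 * return the resulting GPU addresses through launch_size_va/sbt_va. */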
10540 static void
10541 radv_upload_trace_rays_params(struct radv_cmd_buffer *cmd_buffer, VkTraceRaysIndirectCommand2KHR *tables,
10542                               enum radv_rt_mode mode, uint64_t *launch_size_va, uint64_t *sbt_va)
10543 {
10544    uint32_t upload_size = mode == radv_rt_mode_direct ? sizeof(VkTraceRaysIndirectCommand2KHR)
10545                                                       : offsetof(VkTraceRaysIndirectCommand2KHR, width);
10546 
10547    uint32_t offset;
10548    if (!radv_cmd_buffer_upload_data(cmd_buffer, upload_size, tables, &offset))
10549       return;
10550 
10551    uint64_t upload_va = radv_buffer_get_va(cmd_buffer->upload.upload_bo) + offset;
10552 
10553    if (mode == radv_rt_mode_direct)
10554       *launch_size_va = upload_va + offsetof(VkTraceRaysIndirectCommand2KHR, width);
10555    if (sbt_va)
10556       *sbt_va = upload_va;
10557 }
10558 
10559 static void
10560 radv_trace_rays(struct radv_cmd_buffer *cmd_buffer, VkTraceRaysIndirectCommand2KHR *tables, uint64_t indirect_va,
10561                 enum radv_rt_mode mode)
10562 {
10563    if (cmd_buffer->device->instance->debug_flags & RADV_DEBUG_NO_RT)
10564       return;
10565 
10566    if (unlikely(cmd_buffer->device->rra_trace.ray_history_buffer))
10567       radv_trace_trace_rays(cmd_buffer, tables, indirect_va);
10568 
10569    struct radv_compute_pipeline *pipeline = &cmd_buffer->state.rt_pipeline->base;
10570    struct radv_shader *rt_prolog = cmd_buffer->state.rt_prolog;
10571    uint32_t base_reg = rt_prolog->info.user_data_0;
10572 
10573    /* Reserve scratch for stacks manually since it is not handled by the compute path. */
10574    uint32_t scratch_bytes_per_wave = rt_prolog->config.scratch_bytes_per_wave;
10575    uint32_t wave_size = rt_prolog->info.wave_size;
10576 
10577    /* The hardware register is specified as a multiple of 64 or 256 DWORDS. */
10578    unsigned scratch_alloc_granule = cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX11 ? 256 : 1024;
10579    scratch_bytes_per_wave += align(cmd_buffer->state.rt_stack_size * wave_size, scratch_alloc_granule);
10580 
10581    cmd_buffer->compute_scratch_size_per_wave_needed =
10582       MAX2(cmd_buffer->compute_scratch_size_per_wave_needed, scratch_bytes_per_wave);
10583 
10584    /* Since the workgroup size is 8x4 (or 8x8), 1D dispatches can only fill 8 threads per wave at most. To increase
10585     * occupancy, it's beneficial to convert to a 2D dispatch in these cases. */
10586    if (tables && tables->height == 1 && tables->width >= cmd_buffer->state.rt_prolog->info.cs.block_size[0])
10587       tables->height = ACO_RT_CONVERTED_2D_LAUNCH_SIZE;
10588 
10589    struct radv_dispatch_info info = {0};
10590    info.unaligned = true;
10591 
10592    uint64_t launch_size_va = 0;
10593    uint64_t sbt_va = 0;
10594 
10595    if (mode != radv_rt_mode_indirect2) {
10596       launch_size_va = indirect_va;
10597       radv_upload_trace_rays_params(cmd_buffer, tables, mode, &launch_size_va, &sbt_va);
10598    } else {
10599       launch_size_va = indirect_va + offsetof(VkTraceRaysIndirectCommand2KHR, width);
10600       sbt_va = indirect_va;
10601    }
10602 
10603    uint32_t remaining_ray_count = 0;
10604 
10605    if (mode == radv_rt_mode_direct) {
10606       info.blocks[0] = tables->width;
10607       info.blocks[1] = tables->height;
10608       info.blocks[2] = tables->depth;
10609 
10610       if (tables->height == ACO_RT_CONVERTED_2D_LAUNCH_SIZE) {
10611          /* We need the ray count for the 2D dispatch to be a multiple of the y block size for the division to work, and
10612           * a multiple of the x block size because the invocation offset must be a multiple of the block size when
10613           * dispatching the remaining rays. Fortunately, the x block size is itself a multiple of the y block size, so
10614           * we only need to ensure that the ray count is a multiple of the x block size. */
10615          remaining_ray_count = tables->width % rt_prolog->info.cs.block_size[0];
10616 
10617          uint32_t ray_count = tables->width - remaining_ray_count;
10618          info.blocks[0] = ray_count / rt_prolog->info.cs.block_size[1];
10619          info.blocks[1] = rt_prolog->info.cs.block_size[1];
10620       }
10621    } else
10622       info.va = launch_size_va;
10623 
10624    ASSERTED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 15);
10625 
10626    const struct radv_userdata_info *desc_loc = radv_get_user_sgpr(rt_prolog, AC_UD_CS_SBT_DESCRIPTORS);
10627    if (desc_loc->sgpr_idx != -1) {
10628       radv_emit_shader_pointer(cmd_buffer->device, cmd_buffer->cs, base_reg + desc_loc->sgpr_idx * 4, sbt_va, true);
10629    }
10630 
10631    const struct radv_userdata_info *size_loc = radv_get_user_sgpr(rt_prolog, AC_UD_CS_RAY_LAUNCH_SIZE_ADDR);
10632    if (size_loc->sgpr_idx != -1) {
10633       radv_emit_shader_pointer(cmd_buffer->device, cmd_buffer->cs, base_reg + size_loc->sgpr_idx * 4, launch_size_va,
10634                                true);
10635    }
10636 
10637    const struct radv_userdata_info *base_loc = radv_get_user_sgpr(rt_prolog, AC_UD_CS_RAY_DYNAMIC_CALLABLE_STACK_BASE);
10638    if (base_loc->sgpr_idx != -1) {
10639       const struct radv_shader_info *cs_info = &rt_prolog->info;
10640       radeon_set_sh_reg(cmd_buffer->cs, R_00B900_COMPUTE_USER_DATA_0 + base_loc->sgpr_idx * 4,
10641                         rt_prolog->config.scratch_bytes_per_wave / cs_info->wave_size);
10642    }
10643 
10644    const struct radv_userdata_info *shader_loc = radv_get_user_sgpr(rt_prolog, AC_UD_CS_TRAVERSAL_SHADER_ADDR);
10645    struct radv_shader *traversal_shader = cmd_buffer->state.shaders[MESA_SHADER_INTERSECTION];
10646    if (shader_loc->sgpr_idx != -1 && traversal_shader) {
10647       uint64_t traversal_va = traversal_shader->va | radv_rt_priority_traversal;
10648       radv_emit_shader_pointer(cmd_buffer->device, cmd_buffer->cs, base_reg + shader_loc->sgpr_idx * 4, traversal_va,
10649                                true);
10650    }
10651 
10652    assert(cmd_buffer->cs->cdw <= cdw_max);
10653 
10654    radv_dispatch(cmd_buffer, &info, pipeline, rt_prolog, VK_PIPELINE_BIND_POINT_RAY_TRACING_KHR);
10655 
10656    if (remaining_ray_count) {
10657       info.blocks[0] = remaining_ray_count;
10658       info.blocks[1] = 1;
10659       info.offsets[0] = tables->width - remaining_ray_count;
10660 
10661       /* Reset the ray launch size so the prolog doesn't think this is a converted dispatch */
10662       tables->height = 1;
10663       radv_upload_trace_rays_params(cmd_buffer, tables, mode, &launch_size_va, NULL);
10664       if (size_loc->sgpr_idx != -1) {
10665          radv_emit_shader_pointer(cmd_buffer->device, cmd_buffer->cs, base_reg + size_loc->sgpr_idx * 4, launch_size_va,
10666                                   true);
10667       }
10668 
10669       radv_dispatch(cmd_buffer, &info, pipeline, rt_prolog, VK_PIPELINE_BIND_POINT_RAY_TRACING_KHR);
10670    }
10671 }
10672 
10673 VKAPI_ATTR void VKAPI_CALL
10674 radv_CmdTraceRaysKHR(VkCommandBuffer commandBuffer, const VkStridedDeviceAddressRegionKHR *pRaygenShaderBindingTable,
10675                      const VkStridedDeviceAddressRegionKHR *pMissShaderBindingTable,
10676                      const VkStridedDeviceAddressRegionKHR *pHitShaderBindingTable,
10677                      const VkStridedDeviceAddressRegionKHR *pCallableShaderBindingTable, uint32_t width,
10678                      uint32_t height, uint32_t depth)
10679 {
10680    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
10681 
10682    VkTraceRaysIndirectCommand2KHR tables = {
10683       .raygenShaderRecordAddress = pRaygenShaderBindingTable->deviceAddress,
10684       .raygenShaderRecordSize = pRaygenShaderBindingTable->size,
10685       .missShaderBindingTableAddress = pMissShaderBindingTable->deviceAddress,
10686       .missShaderBindingTableSize = pMissShaderBindingTable->size,
10687       .missShaderBindingTableStride = pMissShaderBindingTable->stride,
10688       .hitShaderBindingTableAddress = pHitShaderBindingTable->deviceAddress,
10689       .hitShaderBindingTableSize = pHitShaderBindingTable->size,
10690       .hitShaderBindingTableStride = pHitShaderBindingTable->stride,
10691       .callableShaderBindingTableAddress = pCallableShaderBindingTable->deviceAddress,
10692       .callableShaderBindingTableSize = pCallableShaderBindingTable->size,
10693       .callableShaderBindingTableStride = pCallableShaderBindingTable->stride,
10694       .width = width,
10695       .height = height,
10696       .depth = depth,
10697    };
10698 
10699    radv_trace_rays(cmd_buffer, &tables, 0, radv_rt_mode_direct);
10700 }
10701 
10702 VKAPI_ATTR void VKAPI_CALL
10703 radv_CmdTraceRaysIndirectKHR(VkCommandBuffer commandBuffer,
10704                              const VkStridedDeviceAddressRegionKHR *pRaygenShaderBindingTable,
10705                              const VkStridedDeviceAddressRegionKHR *pMissShaderBindingTable,
10706                              const VkStridedDeviceAddressRegionKHR *pHitShaderBindingTable,
10707                              const VkStridedDeviceAddressRegionKHR *pCallableShaderBindingTable,
10708                              VkDeviceAddress indirectDeviceAddress)
10709 {
10710    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
10711 
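   /* The launch size is read through a raw device address with no buffer handle to track, so the
    * memory must be resident via the global BO list. */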
10712    assert(cmd_buffer->device->use_global_bo_list);
10713 
10714    VkTraceRaysIndirectCommand2KHR tables = {
10715       .raygenShaderRecordAddress = pRaygenShaderBindingTable->deviceAddress,
10716       .raygenShaderRecordSize = pRaygenShaderBindingTable->size,
10717       .missShaderBindingTableAddress = pMissShaderBindingTable->deviceAddress,
10718       .missShaderBindingTableSize = pMissShaderBindingTable->size,
10719       .missShaderBindingTableStride = pMissShaderBindingTable->stride,
10720       .hitShaderBindingTableAddress = pHitShaderBindingTable->deviceAddress,
10721       .hitShaderBindingTableSize = pHitShaderBindingTable->size,
10722       .hitShaderBindingTableStride = pHitShaderBindingTable->stride,
10723       .callableShaderBindingTableAddress = pCallableShaderBindingTable->deviceAddress,
10724       .callableShaderBindingTableSize = pCallableShaderBindingTable->size,
10725       .callableShaderBindingTableStride = pCallableShaderBindingTable->stride,
10726    };
10727 
10728    radv_trace_rays(cmd_buffer, &tables, indirectDeviceAddress, radv_rt_mode_indirect);
10729 }
10730 
10731 VKAPI_ATTR void VKAPI_CALL
10732 radv_CmdTraceRaysIndirect2KHR(VkCommandBuffer commandBuffer, VkDeviceAddress indirectDeviceAddress)
10733 {
10734    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
10735 
10736    assert(cmd_buffer->device->use_global_bo_list);
10737 
10738    radv_trace_rays(cmd_buffer, NULL, indirectDeviceAddress, radv_rt_mode_indirect2);
10739 }
10740 
10741 VKAPI_ATTR void VKAPI_CALL
10742 radv_CmdSetRayTracingPipelineStackSizeKHR(VkCommandBuffer commandBuffer, uint32_t size)
10743 {
10744    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
10745    cmd_buffer->state.rt_stack_size = size;
10746 }
10747 
10748 /*
10749  * For HTILE we have the following interesting clear words:
10750  *   0xfffff30f: Uncompressed, full depth range, for depth+stencil HTILE
10751  *   0xfffc000f: Uncompressed, full depth range, for depth only HTILE.
10752  *   0xfffffff0: Clear depth to 1.0
10753  *   0x00000000: Clear depth to 0.0
10754  */
10755 static void
10756 radv_initialize_htile(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image,
10757                       const VkImageSubresourceRange *range)
10758 {
10759    struct radv_cmd_state *state = &cmd_buffer->state;
10760    uint32_t htile_value = radv_get_htile_initial_value(cmd_buffer->device, image);
10761    VkClearDepthStencilValue value = {0};
10762    struct radv_barrier_data barrier = {0};
10763 
10764    barrier.layout_transitions.init_mask_ram = 1;
10765    radv_describe_layout_transition(cmd_buffer, &barrier);
10766 
10767    /* When transitioning from the LAYOUT_UNDEFINED layout, not everyone is consistent
10768     * in considering previous rendering work for WAW hazards. */
10769    state->flush_bits |= radv_src_access_flush(cmd_buffer, VK_ACCESS_2_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT, image);
10770 
10771    if (image->planes[0].surface.has_stencil &&
10772        !(range->aspectMask == (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT))) {
10773       /* Flush caches before performing a separate aspect initialization because it's a
10774        * read-modify-write operation.
10775        */
10776       state->flush_bits |= radv_dst_access_flush(cmd_buffer, VK_ACCESS_2_SHADER_READ_BIT, image);
10777    }
10778 
10779    state->flush_bits |= radv_clear_htile(cmd_buffer, image, range, htile_value);
10780 
10781    radv_set_ds_clear_metadata(cmd_buffer, image, range, value, range->aspectMask);
10782 
10783    if (radv_image_is_tc_compat_htile(image) && (range->aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT)) {
10784       /* Initialize the TC-compat metadata value to 0 because by
10785        * default DB_Z_INFO.RANGE_PRECISION is set to 1, and we only
10786        * have to conditionally update its value when performing
10787        * a fast depth clear.
10788        */
10789       radv_set_tc_compat_zrange_metadata(cmd_buffer, image, range, 0);
10790    }
10791 }
10792 
10793 static void
10794 radv_handle_depth_image_transition(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image,
10795                                    VkImageLayout src_layout, VkImageLayout dst_layout, unsigned src_queue_mask,
10796                                    unsigned dst_queue_mask, const VkImageSubresourceRange *range,
10797                                    struct radv_sample_locations_state *sample_locs)
10798 {
10799    struct radv_device *device = cmd_buffer->device;
10800 
10801    if (!radv_htile_enabled(image, range->baseMipLevel))
10802       return;
10803 
10804    if (src_layout == VK_IMAGE_LAYOUT_UNDEFINED) {
10805       radv_initialize_htile(cmd_buffer, image, range);
10806    } else if (!radv_layout_is_htile_compressed(device, image, src_layout, src_queue_mask) &&
10807               radv_layout_is_htile_compressed(device, image, dst_layout, dst_queue_mask)) {
10808       radv_initialize_htile(cmd_buffer, image, range);
10809    } else if (radv_layout_is_htile_compressed(device, image, src_layout, src_queue_mask) &&
10810               !radv_layout_is_htile_compressed(device, image, dst_layout, dst_queue_mask)) {
10811       cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB | RADV_CMD_FLAG_FLUSH_AND_INV_DB_META;
10812 
10813       radv_expand_depth_stencil(cmd_buffer, image, range, sample_locs);
10814 
10815       cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB | RADV_CMD_FLAG_FLUSH_AND_INV_DB_META;
10816    }
10817 }
10818 
10819 static uint32_t
10820 radv_init_cmask(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image, const VkImageSubresourceRange *range,
10821                 uint32_t value)
10822 {
10823    struct radv_barrier_data barrier = {0};
10824 
10825    barrier.layout_transitions.init_mask_ram = 1;
10826    radv_describe_layout_transition(cmd_buffer, &barrier);
10827 
10828    return radv_clear_cmask(cmd_buffer, image, range, value);
10829 }
10830 
10831 uint32_t
10832 radv_init_fmask(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image, const VkImageSubresourceRange *range)
10833 {
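   /* Default FMASK values indexed by log2 of the sample count (1, 2, 4 and 8 samples). */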
10834    static const uint32_t fmask_clear_values[4] = {0x00000000, 0x02020202, 0xE4E4E4E4, 0x76543210};
10835    uint32_t log2_samples = util_logbase2(image->vk.samples);
10836    uint32_t value = fmask_clear_values[log2_samples];
10837    struct radv_barrier_data barrier = {0};
10838 
10839    barrier.layout_transitions.init_mask_ram = 1;
10840    radv_describe_layout_transition(cmd_buffer, &barrier);
10841 
10842    return radv_clear_fmask(cmd_buffer, image, range, value);
10843 }
10844 
10845 uint32_t
10846 radv_init_dcc(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image, const VkImageSubresourceRange *range,
10847               uint32_t value)
10848 {
10849    struct radv_barrier_data barrier = {0};
10850    uint32_t flush_bits = 0;
10851    unsigned size = 0;
10852 
10853    barrier.layout_transitions.init_mask_ram = 1;
10854    radv_describe_layout_transition(cmd_buffer, &barrier);
10855 
10856    flush_bits |= radv_clear_dcc(cmd_buffer, image, range, value);
10857 
10858    if (cmd_buffer->device->physical_device->rad_info.gfx_level == GFX8) {
10859       /* When DCC is enabled with mipmaps, some levels might not
10860        * support fast clears and we have to initialize them as "fully
10861        * expanded".
10862        */
10863       /* Compute the size of all fast clearable DCC levels. */
10864       for (unsigned i = 0; i < image->planes[0].surface.num_meta_levels; i++) {
10865          struct legacy_surf_dcc_level *dcc_level = &image->planes[0].surface.u.legacy.color.dcc_level[i];
10866          unsigned dcc_fast_clear_size = dcc_level->dcc_slice_fast_clear_size * image->vk.array_layers;
10867 
10868          if (!dcc_fast_clear_size)
10869             break;
10870 
10871          size = dcc_level->dcc_offset + dcc_fast_clear_size;
10872       }
10873 
10874       /* Initialize the mipmap levels without DCC. */
10875       if (size != image->planes[0].surface.meta_size) {
10876          flush_bits |= radv_fill_buffer(cmd_buffer, image, image->bindings[0].bo,
10877                                         radv_buffer_get_va(image->bindings[0].bo) + image->bindings[0].offset +
10878                                            image->planes[0].surface.meta_offset + size,
10879                                         image->planes[0].surface.meta_size - size, 0xffffffff);
10880       }
10881    }
10882 
10883    return flush_bits;
10884 }
10885 
10886 /**
10887  * Initialize DCC/FMASK/CMASK metadata for a color image.
10888  */
10889 static void
10890 radv_init_color_image_metadata(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image, VkImageLayout src_layout,
10891                                VkImageLayout dst_layout, unsigned src_queue_mask, unsigned dst_queue_mask,
10892                                const VkImageSubresourceRange *range)
10893 {
10894    uint32_t flush_bits = 0;
10895 
10896    /* When transitioning from the LAYOUT_UNDEFINED layout, not everyone is
10897     * consistent in considering previous rendering work for WAW hazards.
10898     */
10899    cmd_buffer->state.flush_bits |= radv_src_access_flush(cmd_buffer, VK_ACCESS_2_COLOR_ATTACHMENT_WRITE_BIT, image);
10900 
10901    if (radv_image_has_cmask(image)) {
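      /* CMASK clear values indexed by log2 of the sample count. */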
10902       static const uint32_t cmask_clear_values[4] = {0xffffffff, 0xdddddddd, 0xeeeeeeee, 0xffffffff};
10903       uint32_t log2_samples = util_logbase2(image->vk.samples);
10904 
10905       flush_bits |= radv_init_cmask(cmd_buffer, image, range, cmask_clear_values[log2_samples]);
10906    }
10907 
10908    if (radv_image_has_fmask(image)) {
10909       flush_bits |= radv_init_fmask(cmd_buffer, image, range);
10910    }
10911 
10912    if (radv_dcc_enabled(image, range->baseMipLevel)) {
10913       uint32_t value = 0xffffffffu; /* Fully expanded mode. */
10914 
10915       if (radv_layout_dcc_compressed(cmd_buffer->device, image, range->baseMipLevel, dst_layout, dst_queue_mask)) {
10916          value = 0u;
10917       }
10918 
10919       flush_bits |= radv_init_dcc(cmd_buffer, image, range, value);
10920    }
10921 
10922    if (radv_image_has_cmask(image) || radv_dcc_enabled(image, range->baseMipLevel)) {
10923       radv_update_fce_metadata(cmd_buffer, image, range, false);
10924 
10925       uint32_t color_values[2] = {0};
10926       radv_set_color_clear_metadata(cmd_buffer, image, range, color_values);
10927    }
10928 
10929    cmd_buffer->state.flush_bits |= flush_bits;
10930 }
10931 
10932 static void
10933 radv_retile_transition(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image, VkImageLayout src_layout,
10934                        VkImageLayout dst_layout, unsigned dst_queue_mask)
10935 {
10936    /* If the image is read-only, we don't have to retile DCC because it can't change. */
10937    if (!(image->vk.usage & RADV_IMAGE_USAGE_WRITE_BITS))
10938       return;
10939 
10940    if (src_layout != VK_IMAGE_LAYOUT_PRESENT_SRC_KHR &&
10941        (dst_layout == VK_IMAGE_LAYOUT_PRESENT_SRC_KHR || (dst_queue_mask & (1u << RADV_QUEUE_FOREIGN))))
10942       radv_retile_dcc(cmd_buffer, image);
10943 }
10944 
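/* DCC retiling is only needed when the image has a separate displayable DCC surface (its
 * display_dcc_offset differs from the main DCC metadata offset) and we are not on the transfer
 * queue. */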
10945 static bool
10946 radv_image_need_retile(const struct radv_cmd_buffer *cmd_buffer, const struct radv_image *image)
10947 {
10948    return cmd_buffer->qf != RADV_QUEUE_TRANSFER && image->planes[0].surface.display_dcc_offset &&
10949           image->planes[0].surface.display_dcc_offset != image->planes[0].surface.meta_offset;
10950 }
10951 
10952 /**
10953  * Handle color image transitions for DCC/FMASK/CMASK.
10954  */
10955 static void
10956 radv_handle_color_image_transition(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image,
10957                                    VkImageLayout src_layout, VkImageLayout dst_layout, unsigned src_queue_mask,
10958                                    unsigned dst_queue_mask, const VkImageSubresourceRange *range)
10959 {
10960    bool dcc_decompressed = false, fast_clear_flushed = false;
10961 
10962    if (!radv_image_has_cmask(image) && !radv_image_has_fmask(image) && !radv_dcc_enabled(image, range->baseMipLevel))
10963       return;
10964 
10965    if (src_layout == VK_IMAGE_LAYOUT_UNDEFINED) {
10966       radv_init_color_image_metadata(cmd_buffer, image, src_layout, dst_layout, src_queue_mask, dst_queue_mask, range);
10967 
10968       if (radv_image_need_retile(cmd_buffer, image))
10969          radv_retile_transition(cmd_buffer, image, src_layout, dst_layout, dst_queue_mask);
10970       return;
10971    }
10972 
10973    if (radv_dcc_enabled(image, range->baseMipLevel)) {
10974       if (src_layout == VK_IMAGE_LAYOUT_PREINITIALIZED) {
10975          cmd_buffer->state.flush_bits |= radv_init_dcc(cmd_buffer, image, range, 0xffffffffu);
10976       } else if (radv_layout_dcc_compressed(cmd_buffer->device, image, range->baseMipLevel, src_layout,
10977                                             src_queue_mask) &&
10978                  !radv_layout_dcc_compressed(cmd_buffer->device, image, range->baseMipLevel, dst_layout,
10979                                              dst_queue_mask)) {
10980          radv_decompress_dcc(cmd_buffer, image, range);
10981          dcc_decompressed = true;
10982       } else if (radv_layout_can_fast_clear(cmd_buffer->device, image, range->baseMipLevel, src_layout,
10983                                             src_queue_mask) &&
10984                  !radv_layout_can_fast_clear(cmd_buffer->device, image, range->baseMipLevel, dst_layout,
10985                                              dst_queue_mask)) {
10986          radv_fast_clear_flush_image_inplace(cmd_buffer, image, range);
10987          fast_clear_flushed = true;
10988       }
10989 
10990       if (radv_image_need_retile(cmd_buffer, image))
10991          radv_retile_transition(cmd_buffer, image, src_layout, dst_layout, dst_queue_mask);
10992    } else if (radv_image_has_cmask(image) || radv_image_has_fmask(image)) {
10993       if (radv_layout_can_fast_clear(cmd_buffer->device, image, range->baseMipLevel, src_layout, src_queue_mask) &&
10994           !radv_layout_can_fast_clear(cmd_buffer->device, image, range->baseMipLevel, dst_layout, dst_queue_mask)) {
10995          radv_fast_clear_flush_image_inplace(cmd_buffer, image, range);
10996          fast_clear_flushed = true;
10997       }
10998    }
10999 
11000    /* MSAA color decompress. */
11001    const enum radv_fmask_compression src_fmask_comp =
11002       radv_layout_fmask_compression(cmd_buffer->device, image, src_layout, src_queue_mask);
11003    const enum radv_fmask_compression dst_fmask_comp =
11004       radv_layout_fmask_compression(cmd_buffer->device, image, dst_layout, dst_queue_mask);
11005    if (src_fmask_comp <= dst_fmask_comp)
11006       return;
11007 
11008    if (src_fmask_comp == RADV_FMASK_COMPRESSION_FULL) {
11009       if (radv_dcc_enabled(image, range->baseMipLevel) && !radv_image_use_dcc_image_stores(cmd_buffer->device, image) &&
11010           !dcc_decompressed) {
11011          /* A DCC decompress is required before expanding FMASK
11012           * when DCC stores aren't supported to avoid being in
11013           * a state where DCC is compressed and the main
11014           * surface is uncompressed.
11015           */
11016          radv_decompress_dcc(cmd_buffer, image, range);
11017       } else if (!fast_clear_flushed) {
11018          /* A FMASK decompress is required before expanding
11019           * FMASK.
11020           */
11021          radv_fast_clear_flush_image_inplace(cmd_buffer, image, range);
11022       }
11023    }
11024 
11025    if (dst_fmask_comp == RADV_FMASK_COMPRESSION_NONE) {
11026       struct radv_barrier_data barrier = {0};
11027       barrier.layout_transitions.fmask_color_expand = 1;
11028       radv_describe_layout_transition(cmd_buffer, &barrier);
11029 
11030       radv_expand_fmask_image_inplace(cmd_buffer, image, range);
11031    }
11032 }
11033 
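/* Entry point for image layout transitions: resolve the queue family masks, skip transitions that
 * the other side of a queue-family ownership transfer should handle, then forward to the
 * depth/stencil or color path. */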
11034 static void
11035 radv_handle_image_transition(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image, VkImageLayout src_layout,
11036                              VkImageLayout dst_layout, uint32_t src_family_index, uint32_t dst_family_index,
11037                              const VkImageSubresourceRange *range, struct radv_sample_locations_state *sample_locs)
11038 {
11039    enum radv_queue_family src_qf = vk_queue_to_radv(cmd_buffer->device->physical_device, src_family_index);
11040    enum radv_queue_family dst_qf = vk_queue_to_radv(cmd_buffer->device->physical_device, dst_family_index);
11041    if (image->exclusive && src_family_index != dst_family_index) {
11042       /* This is an acquire or a release operation and there will be
11043        * a corresponding release/acquire. Do the transition in the
11044        * most flexible queue. */
11045 
11046       assert(src_qf == cmd_buffer->qf || dst_qf == cmd_buffer->qf);
11047 
11048       if (src_family_index == VK_QUEUE_FAMILY_EXTERNAL || src_family_index == VK_QUEUE_FAMILY_FOREIGN_EXT)
11049          return;
11050 
11051       if (cmd_buffer->qf == RADV_QUEUE_TRANSFER)
11052          return;
11053 
11054       if (cmd_buffer->qf == RADV_QUEUE_COMPUTE && (src_qf == RADV_QUEUE_GENERAL || dst_qf == RADV_QUEUE_GENERAL))
11055          return;
11056    }
11057 
11058    unsigned src_queue_mask = radv_image_queue_family_mask(image, src_qf, cmd_buffer->qf);
11059    unsigned dst_queue_mask = radv_image_queue_family_mask(image, dst_qf, cmd_buffer->qf);
11060 
11061    if (src_layout == dst_layout && src_queue_mask == dst_queue_mask)
11062       return;
11063 
11064    if (image->vk.aspects & VK_IMAGE_ASPECT_DEPTH_BIT) {
11065       radv_handle_depth_image_transition(cmd_buffer, image, src_layout, dst_layout, src_queue_mask, dst_queue_mask,
11066                                          range, sample_locs);
11067    } else {
11068       radv_handle_color_image_transition(cmd_buffer, image, src_layout, dst_layout, src_queue_mask, dst_queue_mask,
11069                                          range);
11070    }
11071 }
11072 
11073 static void
11074 radv_cp_dma_wait_for_stages(struct radv_cmd_buffer *cmd_buffer, VkPipelineStageFlags2 stage_mask)
11075 {
11076    /* Make sure CP DMA is idle because the driver might have performed a DMA operation for copying a
11077     * buffer (or an MSAA image using FMASK). Note that updating a buffer is considered a clear
11078     * operation but it might also use a CP DMA copy in some rare situations. Other operations using
11079     * a CP DMA clear are implicitly synchronized (see CP_DMA_SYNC).
11080     */
11081    if (stage_mask &
11082        (VK_PIPELINE_STAGE_2_COPY_BIT | VK_PIPELINE_STAGE_2_CLEAR_BIT | VK_PIPELINE_STAGE_2_ALL_TRANSFER_BIT |
11083         VK_PIPELINE_STAGE_2_BOTTOM_OF_PIPE_BIT | VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT))
11084       radv_cp_dma_wait_for_idle(cmd_buffer);
11085 }
11086 
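/* Common implementation of vkCmdPipelineBarrier2 and vkCmdWaitEvents2: accumulate the source and
 * destination stage masks and flush bits from all barrier structures, then perform the required
 * image layout transitions. */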
11087 static void
11088 radv_barrier(struct radv_cmd_buffer *cmd_buffer, const VkDependencyInfo *dep_info, enum rgp_barrier_reason reason)
11089 {
11090    enum radv_cmd_flush_bits src_flush_bits = 0;
11091    enum radv_cmd_flush_bits dst_flush_bits = 0;
11092    VkPipelineStageFlags2 src_stage_mask = 0;
11093    VkPipelineStageFlags2 dst_stage_mask = 0;
11094 
11095    if (cmd_buffer->state.render.active)
11096       radv_mark_noncoherent_rb(cmd_buffer);
11097 
11098    radv_describe_barrier_start(cmd_buffer, reason);
11099 
11100    for (uint32_t i = 0; i < dep_info->memoryBarrierCount; i++) {
11101       src_stage_mask |= dep_info->pMemoryBarriers[i].srcStageMask;
11102       src_flush_bits |= radv_src_access_flush(cmd_buffer, dep_info->pMemoryBarriers[i].srcAccessMask, NULL);
11103       dst_stage_mask |= dep_info->pMemoryBarriers[i].dstStageMask;
11104       dst_flush_bits |= radv_dst_access_flush(cmd_buffer, dep_info->pMemoryBarriers[i].dstAccessMask, NULL);
11105    }
11106 
11107    for (uint32_t i = 0; i < dep_info->bufferMemoryBarrierCount; i++) {
11108       src_stage_mask |= dep_info->pBufferMemoryBarriers[i].srcStageMask;
11109       src_flush_bits |= radv_src_access_flush(cmd_buffer, dep_info->pBufferMemoryBarriers[i].srcAccessMask, NULL);
11110       dst_stage_mask |= dep_info->pBufferMemoryBarriers[i].dstStageMask;
11111       dst_flush_bits |= radv_dst_access_flush(cmd_buffer, dep_info->pBufferMemoryBarriers[i].dstAccessMask, NULL);
11112    }
11113 
11114    for (uint32_t i = 0; i < dep_info->imageMemoryBarrierCount; i++) {
11115       RADV_FROM_HANDLE(radv_image, image, dep_info->pImageMemoryBarriers[i].image);
11116 
11117       src_stage_mask |= dep_info->pImageMemoryBarriers[i].srcStageMask;
11118       src_flush_bits |= radv_src_access_flush(cmd_buffer, dep_info->pImageMemoryBarriers[i].srcAccessMask, image);
11119       dst_stage_mask |= dep_info->pImageMemoryBarriers[i].dstStageMask;
11120       dst_flush_bits |= radv_dst_access_flush(cmd_buffer, dep_info->pImageMemoryBarriers[i].dstAccessMask, image);
11121    }
11122 
11123    /* The Vulkan spec 1.1.98 says:
11124     *
11125     * "An execution dependency with only
11126     *  VK_PIPELINE_STAGE_2_BOTTOM_OF_PIPE_BIT in the destination stage mask
11127     *  will only prevent that stage from executing in subsequently
11128     *  submitted commands. As this stage does not perform any actual
11129     *  execution, this is not observable - in effect, it does not delay
11130     *  processing of subsequent commands. Similarly an execution dependency
11131     *  with only VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT in the source stage mask
11132     *  will effectively not wait for any prior commands to complete."
11133     */
11134    if (dst_stage_mask != VK_PIPELINE_STAGE_2_BOTTOM_OF_PIPE_BIT)
11135       radv_stage_flush(cmd_buffer, src_stage_mask);
11136    cmd_buffer->state.flush_bits |= src_flush_bits;
11137 
11138    radv_gang_barrier(cmd_buffer, src_stage_mask, 0);
11139 
11140    for (uint32_t i = 0; i < dep_info->imageMemoryBarrierCount; i++) {
11141       RADV_FROM_HANDLE(radv_image, image, dep_info->pImageMemoryBarriers[i].image);
11142 
11143       const struct VkSampleLocationsInfoEXT *sample_locs_info =
11144          vk_find_struct_const(dep_info->pImageMemoryBarriers[i].pNext, SAMPLE_LOCATIONS_INFO_EXT);
11145       struct radv_sample_locations_state sample_locations;
11146 
11147       if (sample_locs_info) {
11148          assert(image->vk.create_flags & VK_IMAGE_CREATE_SAMPLE_LOCATIONS_COMPATIBLE_DEPTH_BIT_EXT);
11149          sample_locations.per_pixel = sample_locs_info->sampleLocationsPerPixel;
11150          sample_locations.grid_size = sample_locs_info->sampleLocationGridSize;
11151          sample_locations.count = sample_locs_info->sampleLocationsCount;
11152          typed_memcpy(&sample_locations.locations[0], sample_locs_info->pSampleLocations,
11153                       sample_locs_info->sampleLocationsCount);
11154       }
11155 
11156       radv_handle_image_transition(
11157          cmd_buffer, image, dep_info->pImageMemoryBarriers[i].oldLayout, dep_info->pImageMemoryBarriers[i].newLayout,
11158          dep_info->pImageMemoryBarriers[i].srcQueueFamilyIndex, dep_info->pImageMemoryBarriers[i].dstQueueFamilyIndex,
11159          &dep_info->pImageMemoryBarriers[i].subresourceRange, sample_locs_info ? &sample_locations : NULL);
11160    }
11161 
11162    radv_gang_barrier(cmd_buffer, 0, dst_stage_mask);
11163 
11164    if (cmd_buffer->qf == RADV_QUEUE_TRANSFER) {
11165       /* SDMA NOP packet waits for all pending SDMA operations to complete.
11166        * Note that GFX9+ is supposed to have RAW dependency tracking, but it's buggy
11167        * so we can't rely on it for now.
11168        */
11169       radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 1);
11170       radeon_emit(cmd_buffer->cs, SDMA_PACKET(SDMA_OPCODE_NOP, 0, 0));
11171    } else {
11172       const bool is_gfx_or_ace = cmd_buffer->qf == RADV_QUEUE_GENERAL || cmd_buffer->qf == RADV_QUEUE_COMPUTE;
11173       if (is_gfx_or_ace)
11174          radv_cp_dma_wait_for_stages(cmd_buffer, src_stage_mask);
11175    }
11176 
11177    cmd_buffer->state.flush_bits |= dst_flush_bits;
11178 
11179    radv_describe_barrier_end(cmd_buffer);
11180 }
11181 
11182 VKAPI_ATTR void VKAPI_CALL
11183 radv_CmdPipelineBarrier2(VkCommandBuffer commandBuffer, const VkDependencyInfo *pDependencyInfo)
11184 {
11185    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
11186    enum rgp_barrier_reason barrier_reason;
11187 
11188    if (cmd_buffer->vk.runtime_rp_barrier) {
11189       barrier_reason = RGP_BARRIER_EXTERNAL_RENDER_PASS_SYNC;
11190    } else {
11191       barrier_reason = RGP_BARRIER_EXTERNAL_CMD_PIPELINE_BARRIER;
11192    }
11193 
11194    radv_barrier(cmd_buffer, pDependencyInfo, barrier_reason);
11195 }
11196 
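/* Write `value` to the event BO once the work described by stageMask has completed, using the
 * cheapest mechanism available (a PFP/ME write for top-of-pipe-like stages, otherwise an EOP
 * event). */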
11197 static void
11198 write_event(struct radv_cmd_buffer *cmd_buffer, struct radv_event *event, VkPipelineStageFlags2 stageMask,
11199             unsigned value)
11200 {
11201    struct radeon_cmdbuf *cs = cmd_buffer->cs;
11202    uint64_t va = radv_buffer_get_va(event->bo);
11203 
11204    if (cmd_buffer->qf == RADV_QUEUE_VIDEO_DEC)
11205       return;
11206 
11207    radv_emit_cache_flush(cmd_buffer);
11208 
11209    radv_cs_add_buffer(cmd_buffer->device->ws, cs, event->bo);
11210 
11211    ASSERTED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cs, 28);
11212 
11213    if (stageMask & (VK_PIPELINE_STAGE_2_COPY_BIT | VK_PIPELINE_STAGE_2_RESOLVE_BIT | VK_PIPELINE_STAGE_2_BLIT_BIT |
11214                     VK_PIPELINE_STAGE_2_CLEAR_BIT)) {
11215       /* Be conservative for now. */
11216       stageMask |= VK_PIPELINE_STAGE_2_ALL_TRANSFER_BIT;
11217    }
11218 
11219    /* Flags that only require a top-of-pipe event. */
11220    VkPipelineStageFlags2 top_of_pipe_flags = VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT;
11221 
11222    /* Flags that only require a post-index-fetch event. */
11223    VkPipelineStageFlags2 post_index_fetch_flags =
11224       top_of_pipe_flags | VK_PIPELINE_STAGE_2_DRAW_INDIRECT_BIT | VK_PIPELINE_STAGE_2_VERTEX_INPUT_BIT;
11225 
11226    /* Flags that only require signaling post PS. */
11227    VkPipelineStageFlags2 post_ps_flags =
11228       post_index_fetch_flags | VK_PIPELINE_STAGE_2_VERTEX_SHADER_BIT |
11229       VK_PIPELINE_STAGE_2_TESSELLATION_CONTROL_SHADER_BIT | VK_PIPELINE_STAGE_2_TESSELLATION_EVALUATION_SHADER_BIT |
11230       VK_PIPELINE_STAGE_2_GEOMETRY_SHADER_BIT | VK_PIPELINE_STAGE_2_MESH_SHADER_BIT_EXT |
11231       VK_PIPELINE_STAGE_2_TRANSFORM_FEEDBACK_BIT_EXT | VK_PIPELINE_STAGE_2_PRE_RASTERIZATION_SHADERS_BIT |
11232       VK_PIPELINE_STAGE_2_FRAGMENT_SHADING_RATE_ATTACHMENT_BIT_KHR | VK_PIPELINE_STAGE_2_EARLY_FRAGMENT_TESTS_BIT |
11233       VK_PIPELINE_STAGE_2_FRAGMENT_SHADER_BIT;
11234 
11235    /* Flags that only require signaling post CS. */
11236    VkPipelineStageFlags2 post_cs_flags = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT;
11237 
11238    radv_cp_dma_wait_for_stages(cmd_buffer, stageMask);
11239 
11240    if (!(stageMask & ~top_of_pipe_flags)) {
11241       /* Just need to sync the PFP engine. */
11242       radv_write_data(cmd_buffer, V_370_PFP, va, 1, &value, false);
11243    } else if (!(stageMask & ~post_index_fetch_flags)) {
11244       /* Sync ME because PFP reads index and indirect buffers. */
11245       radv_write_data(cmd_buffer, V_370_ME, va, 1, &value, false);
11246    } else {
11247       unsigned event_type;
11248 
11249       if (!(stageMask & ~post_ps_flags)) {
11250          /* Sync previous fragment shaders. */
11251          event_type = V_028A90_PS_DONE;
11252       } else if (!(stageMask & ~post_cs_flags)) {
11253          /* Sync previous compute shaders. */
11254          event_type = V_028A90_CS_DONE;
11255       } else {
11256          /* Otherwise, sync all prior GPU work. */
11257          event_type = V_028A90_BOTTOM_OF_PIPE_TS;
11258       }
11259 
11260       radv_cs_emit_write_event_eop(cs, cmd_buffer->device->physical_device->rad_info.gfx_level, cmd_buffer->qf,
11261                                    event_type, 0, EOP_DST_SEL_MEM, EOP_DATA_SEL_VALUE_32BIT, va, value,
11262                                    cmd_buffer->gfx9_eop_bug_va);
11263    }
11264 
11265    assert(cmd_buffer->cs->cdw <= cdw_max);
11266 }
11267 
11268 VKAPI_ATTR void VKAPI_CALL
11269 radv_CmdSetEvent2(VkCommandBuffer commandBuffer, VkEvent _event, const VkDependencyInfo *pDependencyInfo)
11270 {
11271    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
11272    RADV_FROM_HANDLE(radv_event, event, _event);
11273    VkPipelineStageFlags2 src_stage_mask = 0;
11274 
11275    for (uint32_t i = 0; i < pDependencyInfo->memoryBarrierCount; i++)
11276       src_stage_mask |= pDependencyInfo->pMemoryBarriers[i].srcStageMask;
11277    for (uint32_t i = 0; i < pDependencyInfo->bufferMemoryBarrierCount; i++)
11278       src_stage_mask |= pDependencyInfo->pBufferMemoryBarriers[i].srcStageMask;
11279    for (uint32_t i = 0; i < pDependencyInfo->imageMemoryBarrierCount; i++)
11280       src_stage_mask |= pDependencyInfo->pImageMemoryBarriers[i].srcStageMask;
11281 
11282    write_event(cmd_buffer, event, src_stage_mask, 1);
11283 }
11284 
11285 VKAPI_ATTR void VKAPI_CALL
11286 radv_CmdResetEvent2(VkCommandBuffer commandBuffer, VkEvent _event, VkPipelineStageFlags2 stageMask)
11287 {
11288    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
11289    RADV_FROM_HANDLE(radv_event, event, _event);
11290 
11291    write_event(cmd_buffer, event, stageMask, 0);
11292 }
11293 
11294 VKAPI_ATTR void VKAPI_CALL
11295 radv_CmdWaitEvents2(VkCommandBuffer commandBuffer, uint32_t eventCount, const VkEvent *pEvents,
11296                     const VkDependencyInfo *pDependencyInfos)
11297 {
11298    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
11299    struct radeon_cmdbuf *cs = cmd_buffer->cs;
11300 
11301    if (cmd_buffer->qf == RADV_QUEUE_VIDEO_DEC)
11302       return;
11303 
11304    for (unsigned i = 0; i < eventCount; ++i) {
11305       RADV_FROM_HANDLE(radv_event, event, pEvents[i]);
11306       uint64_t va = radv_buffer_get_va(event->bo);
11307 
11308       radv_cs_add_buffer(cmd_buffer->device->ws, cs, event->bo);
11309 
11310       ASSERTED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cs, 7);
11311 
11312       radv_cp_wait_mem(cs, cmd_buffer->qf, WAIT_REG_MEM_EQUAL, va, 1, 0xffffffff);
11313       assert(cmd_buffer->cs->cdw <= cdw_max);
11314    }
11315 
11316    radv_barrier(cmd_buffer, pDependencyInfos, RGP_BARRIER_EXTERNAL_CMD_WAIT_EVENTS);
11317 }
11318 
11319 void
11320 radv_begin_conditional_rendering(struct radv_cmd_buffer *cmd_buffer, uint64_t va, bool draw_visible)
11321 {
11322    struct radeon_cmdbuf *cs = cmd_buffer->cs;
11323    unsigned pred_op = PREDICATION_OP_BOOL32;
11324 
11325    radv_emit_cache_flush(cmd_buffer);
11326 
11327    if (cmd_buffer->qf == RADV_QUEUE_GENERAL && !cmd_buffer->device->physical_device->rad_info.has_32bit_predication) {
11328       uint64_t pred_value = 0, pred_va;
11329       unsigned pred_offset;
11330 
11331       /* From the Vulkan spec 1.1.107:
11332        *
11333        * "If the 32-bit value at offset in buffer memory is zero,
11334        *  then the rendering commands are discarded, otherwise they
11335        *  are executed as normal. If the value of the predicate in
11336        *  buffer memory changes while conditional rendering is
11337        *  active, the rendering commands may be discarded in an
11338        *  implementation-dependent way. Some implementations may
11339        *  latch the value of the predicate upon beginning conditional
11340        *  rendering while others may read it before every rendering
11341        *  command."
11342        *
11343        * But, the AMD hardware treats the predicate as a 64-bit
11344        * value which means we need a workaround in the driver.
11345        * Luckily, we are not required to support the case where the
11346        * value changes while predication is active.
11347        *
11348        * The workaround is as follows:
11349        * 1) allocate a 64-bit value in the upload BO and initialize
11350        *    it to 0
11351        * 2) copy the 32-bit predicate value to the upload BO
11352        * 3) use the new allocated VA address for predication
11353        *
11354        * Based on the conditionalrender demo, it's faster to do the
11355        * COPY_DATA in ME (+ sync PFP) instead of PFP.
11356        */
11357       radv_cmd_buffer_upload_data(cmd_buffer, 8, &pred_value, &pred_offset);
11358 
11359       pred_va = radv_buffer_get_va(cmd_buffer->upload.upload_bo) + pred_offset;
11360 
11361       radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 8);
11362 
11363       radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
11364       radeon_emit(cs,
11365                   COPY_DATA_SRC_SEL(COPY_DATA_SRC_MEM) | COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) | COPY_DATA_WR_CONFIRM);
11366       radeon_emit(cs, va);
11367       radeon_emit(cs, va >> 32);
11368       radeon_emit(cs, pred_va);
11369       radeon_emit(cs, pred_va >> 32);
11370 
11371       radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0));
11372       radeon_emit(cs, 0);
11373 
11374       va = pred_va;
11375       pred_op = PREDICATION_OP_BOOL64;
11376    }
11377 
11378    /* MEC doesn't support predication, so we emulate it elsewhere. */
11379    if (!radv_cmd_buffer_uses_mec(cmd_buffer)) {
11380       radv_emit_set_predication_state(cmd_buffer, draw_visible, pred_op, va);
11381    }
11382 
11383    /* Store conditional rendering user info. */
11384    cmd_buffer->state.predicating = true;
11385    cmd_buffer->state.predication_type = draw_visible;
11386    cmd_buffer->state.predication_op = pred_op;
11387    cmd_buffer->state.predication_va = va;
11388    cmd_buffer->mec_inv_pred_emitted = false;
11389 }
11390 
11391 void
11392 radv_end_conditional_rendering(struct radv_cmd_buffer *cmd_buffer)
11393 {
11394    /* MEC doesn't support predication, so there is no need to emit anything here. */
11395    if (!radv_cmd_buffer_uses_mec(cmd_buffer)) {
11396       radv_emit_set_predication_state(cmd_buffer, false, 0, 0);
11397    }
11398 
11399    /* Reset conditional rendering user info. */
11400    cmd_buffer->state.predicating = false;
11401    cmd_buffer->state.predication_type = -1;
11402    cmd_buffer->state.predication_op = 0;
11403    cmd_buffer->state.predication_va = 0;
11404    cmd_buffer->mec_inv_pred_emitted = false;
11405 }
11406 
11407 /* VK_EXT_conditional_rendering */
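/* Illustrative usage sketch (not driver code; 'cmd_buf' and 'predicate_buffer'
 * are application-side names):
 *
 *    VkConditionalRenderingBeginInfoEXT begin_info = {
 *       .sType = VK_STRUCTURE_TYPE_CONDITIONAL_RENDERING_BEGIN_INFO_EXT,
 *       .buffer = predicate_buffer, // 32-bit predicate value at 'offset'
 *       .offset = 0,
 *       .flags = 0, // or VK_CONDITIONAL_RENDERING_INVERTED_BIT_EXT
 *    };
 *    vkCmdBeginConditionalRenderingEXT(cmd_buf, &begin_info);
 *    // draws/dispatches recorded here are skipped when the predicate is 0
 *    vkCmdEndConditionalRenderingEXT(cmd_buf);
 */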
11408 VKAPI_ATTR void VKAPI_CALL
11409 radv_CmdBeginConditionalRenderingEXT(VkCommandBuffer commandBuffer,
11410                                      const VkConditionalRenderingBeginInfoEXT *pConditionalRenderingBegin)
11411 {
11412    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
11413    RADV_FROM_HANDLE(radv_buffer, buffer, pConditionalRenderingBegin->buffer);
11414    bool draw_visible = true;
11415    uint64_t va;
11416 
11417    va = radv_buffer_get_va(buffer->bo) + buffer->offset + pConditionalRenderingBegin->offset;
11418 
11419    /* By default, if the 32-bit value at offset in buffer memory is zero,
11420     * then the rendering commands are discarded; otherwise they are
11421     * executed as normal. If the inverted flag is set, all commands are
11422     * discarded if the value is non-zero.
11423     */
11424    if (pConditionalRenderingBegin->flags & VK_CONDITIONAL_RENDERING_INVERTED_BIT_EXT) {
11425       draw_visible = false;
11426    }
11427 
11428    radv_begin_conditional_rendering(cmd_buffer, va, draw_visible);
11429 }
11430 
11431 VKAPI_ATTR void VKAPI_CALL
11432 radv_CmdEndConditionalRenderingEXT(VkCommandBuffer commandBuffer)
11433 {
11434    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
11435 
11436    radv_end_conditional_rendering(cmd_buffer);
11437 }
11438 
11439 /* VK_EXT_transform_feedback */
11440 VKAPI_ATTR void VKAPI_CALL
11441 radv_CmdBindTransformFeedbackBuffersEXT(VkCommandBuffer commandBuffer, uint32_t firstBinding, uint32_t bindingCount,
11442                                         const VkBuffer *pBuffers, const VkDeviceSize *pOffsets,
11443                                         const VkDeviceSize *pSizes)
11444 {
11445    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
11446    struct radv_streamout_binding *sb = cmd_buffer->streamout_bindings;
11447    uint8_t enabled_mask = 0;
11448 
11449    assert(firstBinding + bindingCount <= MAX_SO_BUFFERS);
11450    for (uint32_t i = 0; i < bindingCount; i++) {
11451       uint32_t idx = firstBinding + i;
11452 
11453       sb[idx].buffer = radv_buffer_from_handle(pBuffers[i]);
11454       sb[idx].offset = pOffsets[i];
11455 
11456       if (!pSizes || pSizes[i] == VK_WHOLE_SIZE) {
11457          sb[idx].size = sb[idx].buffer->vk.size - sb[idx].offset;
11458       } else {
11459          sb[idx].size = pSizes[i];
11460       }
11461 
11462       radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, sb[idx].buffer->bo);
11463 
11464       enabled_mask |= 1 << idx;
11465    }
11466 
11467    cmd_buffer->state.streamout.enabled_mask |= enabled_mask;
11468 
11469    cmd_buffer->state.dirty |= RADV_CMD_DIRTY_STREAMOUT_BUFFER;
11470 }
11471 
11472 static void
11473 radv_set_streamout_enable(struct radv_cmd_buffer *cmd_buffer, bool enable)
11474 {
11475    struct radv_streamout_state *so = &cmd_buffer->state.streamout;
11476    bool old_streamout_enabled = radv_is_streamout_enabled(cmd_buffer);
11477    uint32_t old_hw_enabled_mask = so->hw_enabled_mask;
11478 
11479    so->streamout_enabled = enable;
11480 
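   /* Replicate the 4-bit buffer-enable mask into all four vertex-stream slots;
    * this matches the per-stream STREAM_n_BUFFER_EN layout of the
    * VGT_STRMOUT_BUFFER_CONFIG register.
    */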
11481    so->hw_enabled_mask =
11482       so->enabled_mask | (so->enabled_mask << 4) | (so->enabled_mask << 8) | (so->enabled_mask << 12);
11483 
11484    if (!cmd_buffer->device->physical_device->use_ngg_streamout &&
11485        ((old_streamout_enabled != radv_is_streamout_enabled(cmd_buffer)) ||
11486         (old_hw_enabled_mask != so->hw_enabled_mask)))
11487       cmd_buffer->state.dirty |= RADV_CMD_DIRTY_STREAMOUT_ENABLE;
11488 
11489    if (cmd_buffer->device->physical_device->use_ngg_streamout) {
11490       /* Re-emit streamout descriptors because with NGG streamout, a buffer size of 0 acts as a
11491        * disable bit; this is needed when streamout must be ignored in shaders.
11492        */
11493       cmd_buffer->state.dirty |= RADV_CMD_DIRTY_SHADER_QUERY | RADV_CMD_DIRTY_STREAMOUT_BUFFER;
11494    }
11495 }
11496 
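/* Wait for all in-flight streamout offset updates to land: clear CP_STRMOUT_CNTL,
 * kick a VGT streamout flush event, then poll the register until the CP sets
 * OFFSET_UPDATE_DONE.
 */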
11497 static void
11498 radv_flush_vgt_streamout(struct radv_cmd_buffer *cmd_buffer)
11499 {
11500    struct radeon_cmdbuf *cs = cmd_buffer->cs;
11501    unsigned reg_strmout_cntl;
11502 
11503    ASSERTED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 14);
11504 
11505    /* The register is at different places on different ASICs. */
11506    if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX9) {
11507       reg_strmout_cntl = R_0300FC_CP_STRMOUT_CNTL;
11508       radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 3, 0));
11509       radeon_emit(cs, S_370_DST_SEL(V_370_MEM_MAPPED_REGISTER) | S_370_ENGINE_SEL(V_370_ME));
11510       radeon_emit(cs, R_0300FC_CP_STRMOUT_CNTL >> 2);
11511       radeon_emit(cs, 0);
11512       radeon_emit(cs, 0);
11513    } else if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX7) {
11514       reg_strmout_cntl = R_0300FC_CP_STRMOUT_CNTL;
11515       radeon_set_uconfig_reg(cs, reg_strmout_cntl, 0);
11516    } else {
11517       reg_strmout_cntl = R_0084FC_CP_STRMOUT_CNTL;
11518       radeon_set_config_reg(cs, reg_strmout_cntl, 0);
11519    }
11520 
11521    radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
11522    radeon_emit(cs, EVENT_TYPE(V_028A90_SO_VGTSTREAMOUT_FLUSH) | EVENT_INDEX(0));
11523 
11524    radeon_emit(cs, PKT3(PKT3_WAIT_REG_MEM, 5, 0));
11525    radeon_emit(cs, WAIT_REG_MEM_EQUAL);    /* wait until the register is equal to the reference value */
11526    radeon_emit(cs, reg_strmout_cntl >> 2); /* register */
11527    radeon_emit(cs, 0);
11528    radeon_emit(cs, S_0084FC_OFFSET_UPDATE_DONE(1)); /* reference value */
11529    radeon_emit(cs, S_0084FC_OFFSET_UPDATE_DONE(1)); /* mask */
11530    radeon_emit(cs, 4);                              /* poll interval */
11531 
11532    assert(cs->cdw <= cdw_max);
11533 }
11534 
11535 VKAPI_ATTR void VKAPI_CALL
11536 radv_CmdBeginTransformFeedbackEXT(VkCommandBuffer commandBuffer, uint32_t firstCounterBuffer,
11537                                   uint32_t counterBufferCount, const VkBuffer *pCounterBuffers,
11538                                   const VkDeviceSize *pCounterBufferOffsets)
11539 {
11540    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
11541    struct radv_streamout_binding *sb = cmd_buffer->streamout_bindings;
11542    struct radv_streamout_state *so = &cmd_buffer->state.streamout;
11543    struct radeon_cmdbuf *cs = cmd_buffer->cs;
11544 
11545    assert(firstCounterBuffer + counterBufferCount <= MAX_SO_BUFFERS);
11546    if (!cmd_buffer->device->physical_device->use_ngg_streamout)
11547       radv_flush_vgt_streamout(cmd_buffer);
11548 
11549    ASSERTED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, MAX_SO_BUFFERS * 10);
11550 
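   /* For each enabled streamout binding: if the application supplied a counter
    * buffer, resume writing at the byte offset saved by a previous
    * vkCmdEndTransformFeedbackEXT; otherwise start writing at offset 0.
    */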
11551    u_foreach_bit (i, so->enabled_mask) {
11552       int32_t counter_buffer_idx = i - firstCounterBuffer;
11553       if (counter_buffer_idx >= 0 && counter_buffer_idx >= counterBufferCount)
11554          counter_buffer_idx = -1;
11555 
11556       bool append = counter_buffer_idx >= 0 && pCounterBuffers && pCounterBuffers[counter_buffer_idx];
11557       uint64_t va = 0;
11558 
11559       if (append) {
11560          RADV_FROM_HANDLE(radv_buffer, buffer, pCounterBuffers[counter_buffer_idx]);
11561          uint64_t counter_buffer_offset = 0;
11562 
11563          if (pCounterBufferOffsets)
11564             counter_buffer_offset = pCounterBufferOffsets[counter_buffer_idx];
11565 
11566          va += radv_buffer_get_va(buffer->bo);
11567          va += buffer->offset + counter_buffer_offset;
11568 
11569          radv_cs_add_buffer(cmd_buffer->device->ws, cs, buffer->bo);
11570       }
11571 
11572       if (cmd_buffer->device->physical_device->use_ngg_streamout) {
11573          if (append) {
11574             radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
11575             radeon_emit(cs,
11576                         COPY_DATA_SRC_SEL(COPY_DATA_SRC_MEM) | COPY_DATA_DST_SEL(COPY_DATA_REG) | COPY_DATA_WR_CONFIRM);
11577             radeon_emit(cs, va);
11578             radeon_emit(cs, va >> 32);
11579             radeon_emit(cs, (R_031088_GDS_STRMOUT_DWORDS_WRITTEN_0 >> 2) + i);
11580             radeon_emit(cs, 0);
11581          } else {
11582             /* The PKT3 CAM bit workaround seems to be needed to initialize this GDS register to zero. */
11583             radeon_set_perfctr_reg(cmd_buffer->device->physical_device->rad_info.gfx_level, cmd_buffer->qf, cs,
11584                                    R_031088_GDS_STRMOUT_DWORDS_WRITTEN_0 + i * 4, 0);
11585          }
11586       } else {
11587          /* AMD GCN binds streamout buffers as shader resources.
11588           * VGT only counts primitives and tells the shader through
11589           * SGPRs what to do.
11590           */
11591          radeon_set_context_reg(cs, R_028AD0_VGT_STRMOUT_BUFFER_SIZE_0 + 16 * i, sb[i].size >> 2);
11592 
11593          cmd_buffer->state.context_roll_without_scissor_emitted = true;
11594 
11595          if (append) {
11596             radeon_emit(cs, PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0));
11597             radeon_emit(cs, STRMOUT_SELECT_BUFFER(i) | STRMOUT_DATA_TYPE(1) |   /* offset in bytes */
11598                                STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_FROM_MEM)); /* control */
11599             radeon_emit(cs, 0);                                                 /* unused */
11600             radeon_emit(cs, 0);                                                 /* unused */
11601             radeon_emit(cs, va);                                                /* src address lo */
11602             radeon_emit(cs, va >> 32);                                          /* src address hi */
11603          } else {
11604             /* Start from the beginning. */
11605             radeon_emit(cs, PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0));
11606             radeon_emit(cs, STRMOUT_SELECT_BUFFER(i) | STRMOUT_DATA_TYPE(1) |      /* offset in bytes */
11607                                STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_FROM_PACKET)); /* control */
11608             radeon_emit(cs, 0);                                                    /* unused */
11609             radeon_emit(cs, 0);                                                    /* unused */
11610             radeon_emit(cs, 0);                                                    /* unused */
11611             radeon_emit(cs, 0);                                                    /* unused */
11612          }
11613       }
11614    }
11615 
11616    assert(cs->cdw <= cdw_max);
11617 
11618    radv_set_streamout_enable(cmd_buffer, true);
11619 
11620    cmd_buffer->state.dirty |= RADV_CMD_DIRTY_STREAMOUT_ENABLE;
11621 }
11622 
11623 VKAPI_ATTR void VKAPI_CALL
11624 radv_CmdEndTransformFeedbackEXT(VkCommandBuffer commandBuffer, uint32_t firstCounterBuffer, uint32_t counterBufferCount,
11625                                 const VkBuffer *pCounterBuffers, const VkDeviceSize *pCounterBufferOffsets)
11626 {
11627    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
11628    struct radv_streamout_state *so = &cmd_buffer->state.streamout;
11629    struct radeon_cmdbuf *cs = cmd_buffer->cs;
11630 
11631    assert(firstCounterBuffer + counterBufferCount <= MAX_SO_BUFFERS);
11632 
11633    if (cmd_buffer->device->physical_device->use_ngg_streamout) {
11634       /* Wait for streamout to finish before reading GDS_STRMOUT registers. */
11635       cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_VS_PARTIAL_FLUSH;
11636       radv_emit_cache_flush(cmd_buffer);
11637    } else {
11638       radv_flush_vgt_streamout(cmd_buffer);
11639    }
11640 
11641    ASSERTED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, MAX_SO_BUFFERS * 12);
11642 
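   /* Save the current write offset of each enabled binding into its counter
    * buffer (when one is provided) so a later vkCmdBeginTransformFeedbackEXT can
    * resume, then, on the legacy (non-NGG) path, deactivate the binding by
    * zeroing its buffer size.
    */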
11643    u_foreach_bit (i, so->enabled_mask) {
11644       int32_t counter_buffer_idx = i - firstCounterBuffer;
11645       if (counter_buffer_idx >= 0 && counter_buffer_idx >= counterBufferCount)
11646          counter_buffer_idx = -1;
11647 
11648       bool append = counter_buffer_idx >= 0 && pCounterBuffers && pCounterBuffers[counter_buffer_idx];
11649       uint64_t va = 0;
11650 
11651       if (append) {
11652          RADV_FROM_HANDLE(radv_buffer, buffer, pCounterBuffers[counter_buffer_idx]);
11653          uint64_t counter_buffer_offset = 0;
11654 
11655          if (pCounterBufferOffsets)
11656             counter_buffer_offset = pCounterBufferOffsets[counter_buffer_idx];
11657 
11658          va += radv_buffer_get_va(buffer->bo);
11659          va += buffer->offset + counter_buffer_offset;
11660 
11661          radv_cs_add_buffer(cmd_buffer->device->ws, cs, buffer->bo);
11662       }
11663 
11664       if (cmd_buffer->device->physical_device->use_ngg_streamout) {
11665          if (append) {
11666             radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
11667             radeon_emit(cs,
11668                         COPY_DATA_SRC_SEL(COPY_DATA_REG) | COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) | COPY_DATA_WR_CONFIRM);
11669             radeon_emit(cs, (R_031088_GDS_STRMOUT_DWORDS_WRITTEN_0 >> 2) + i);
11670             radeon_emit(cs, 0);
11671             radeon_emit(cs, va);
11672             radeon_emit(cs, va >> 32);
11673          }
11674       } else {
11675          if (append) {
11676             radeon_emit(cs, PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0));
11677             radeon_emit(cs, STRMOUT_SELECT_BUFFER(i) | STRMOUT_DATA_TYPE(1) | /* offset in bytes */
11678                                STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_NONE) |
11679                                STRMOUT_STORE_BUFFER_FILLED_SIZE); /* control */
11680             radeon_emit(cs, va);                                  /* dst address lo */
11681             radeon_emit(cs, va >> 32);                            /* dst address hi */
11682             radeon_emit(cs, 0);                                   /* unused */
11683             radeon_emit(cs, 0);                                   /* unused */
11684          }
11685 
11686          /* Deactivate transform feedback by zeroing the buffer size.
11687           * The counters (primitives generated, primitives emitted) may
11688           * be enabled even if there is no buffer bound. This ensures
11689           * that the primitives-emitted query won't increment.
11690           */
11691          radeon_set_context_reg(cs, R_028AD0_VGT_STRMOUT_BUFFER_SIZE_0 + 16 * i, 0);
11692 
11693          cmd_buffer->state.context_roll_without_scissor_emitted = true;
11694       }
11695    }
11696 
11697    assert(cmd_buffer->cs->cdw <= cdw_max);
11698 
11699    radv_set_streamout_enable(cmd_buffer, false);
11700 }
11701 
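/* Program the VGT_STRMOUT_DRAW_OPAQUE_* state so that a draw submitted with
 * USE_OPAQUE derives its vertex count from the streamout "buffer filled size"
 * counter and the vertex stride provided by the application.
 */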
11702 static void
11703 radv_emit_strmout_buffer(struct radv_cmd_buffer *cmd_buffer, const struct radv_draw_info *draw_info)
11704 {
11705    const enum amd_gfx_level gfx_level = cmd_buffer->device->physical_device->rad_info.gfx_level;
11706    uint64_t va = radv_buffer_get_va(draw_info->strmout_buffer->bo);
11707    struct radeon_cmdbuf *cs = cmd_buffer->cs;
11708 
11709    va += draw_info->strmout_buffer->offset + draw_info->strmout_buffer_offset;
11710 
11711    radeon_set_context_reg(cs, R_028B30_VGT_STRMOUT_DRAW_OPAQUE_VERTEX_STRIDE, draw_info->stride);
11712 
11713    if (gfx_level >= GFX10) {
11714       /* Emitting a COPY_DATA packet should be enough because RADV doesn't support preemption
11715        * (shadow memory), but for unknown reasons it can lead to GPU hangs on GFX10+, so use
11716        * LOAD_CONTEXT_REG_INDEX (preceded by a PFP_SYNC_ME) instead. */
11717       radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0));
11718       radeon_emit(cs, 0);
11719 
11720       radeon_emit(cs, PKT3(PKT3_LOAD_CONTEXT_REG_INDEX, 3, 0));
11721       radeon_emit(cs, va);
11722       radeon_emit(cs, va >> 32);
11723       radeon_emit(cs, (R_028B2C_VGT_STRMOUT_DRAW_OPAQUE_BUFFER_FILLED_SIZE - SI_CONTEXT_REG_OFFSET) >> 2);
11724       radeon_emit(cs, 1); /* 1 DWORD */
11725    } else {
11726       radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
11727       radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_SRC_MEM) | COPY_DATA_DST_SEL(COPY_DATA_REG) | COPY_DATA_WR_CONFIRM);
11728       radeon_emit(cs, va);
11729       radeon_emit(cs, va >> 32);
11730       radeon_emit(cs, R_028B2C_VGT_STRMOUT_DRAW_OPAQUE_BUFFER_FILLED_SIZE >> 2);
11731       radeon_emit(cs, 0); /* unused */
11732    }
11733 
11734    radv_cs_add_buffer(cmd_buffer->device->ws, cs, draw_info->strmout_buffer->bo);
11735 }
11736 
11737 VKAPI_ATTR void VKAPI_CALL
11738 radv_CmdDrawIndirectByteCountEXT(VkCommandBuffer commandBuffer, uint32_t instanceCount, uint32_t firstInstance,
11739                                  VkBuffer _counterBuffer, VkDeviceSize counterBufferOffset, uint32_t counterOffset,
11740                                  uint32_t vertexStride)
11741 {
11742    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
11743    RADV_FROM_HANDLE(radv_buffer, counterBuffer, _counterBuffer);
11744    struct radv_draw_info info;
11745 
11746    info.count = 0;
11747    info.instance_count = instanceCount;
11748    info.first_instance = firstInstance;
11749    info.strmout_buffer = counterBuffer;
11750    info.strmout_buffer_offset = counterBufferOffset;
11751    info.stride = vertexStride;
11752    info.indexed = false;
11753    info.indirect = NULL;
11754 
11755    if (!radv_before_draw(cmd_buffer, &info, 1, false))
11756       return;
11757    struct VkMultiDrawInfoEXT minfo = {0, 0};
11758    radv_emit_strmout_buffer(cmd_buffer, &info);
11759    radv_emit_direct_draw_packets(cmd_buffer, &info, 1, &minfo, S_0287F0_USE_OPAQUE(1), 0);
11760    radv_after_draw(cmd_buffer, false);
11761 }
11762 
11763 /* VK_AMD_buffer_marker */
11764 VKAPI_ATTR void VKAPI_CALL
11765 radv_CmdWriteBufferMarker2AMD(VkCommandBuffer commandBuffer, VkPipelineStageFlags2 stage, VkBuffer dstBuffer,
11766                               VkDeviceSize dstOffset, uint32_t marker)
11767 {
11768    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
11769    RADV_FROM_HANDLE(radv_buffer, buffer, dstBuffer);
11770    struct radeon_cmdbuf *cs = cmd_buffer->cs;
11771    const uint64_t va = radv_buffer_get_va(buffer->bo) + buffer->offset + dstOffset;
11772 
11773    if (cmd_buffer->qf == RADV_QUEUE_TRANSFER) {
11774       radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 4);
11775       radeon_emit(cmd_buffer->cs, SDMA_PACKET(SDMA_OPCODE_FENCE, 0, SDMA_FENCE_MTYPE_UC));
11776       radeon_emit(cs, va);
11777       radeon_emit(cs, va >> 32);
11778       radeon_emit(cs, marker);
11779       return;
11780    }
11781 
11782    radv_emit_cache_flush(cmd_buffer);
11783 
11784    ASSERTED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 12);
11785 
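   /* TOP_OF_PIPE-only stages can write the marker immediately with a CP
    * COPY_DATA; any later stage needs a bottom-of-pipe EOP event so the write
    * happens after the preceding work has finished.
    */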
11786    if (!(stage & ~VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT)) {
11787       radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
11788       radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_IMM) | COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) | COPY_DATA_WR_CONFIRM);
11789       radeon_emit(cs, marker);
11790       radeon_emit(cs, 0);
11791       radeon_emit(cs, va);
11792       radeon_emit(cs, va >> 32);
11793    } else {
11794       radv_cs_emit_write_event_eop(cs, cmd_buffer->device->physical_device->rad_info.gfx_level, cmd_buffer->qf,
11795                                    V_028A90_BOTTOM_OF_PIPE_TS, 0, EOP_DST_SEL_MEM, EOP_DATA_SEL_VALUE_32BIT, va, marker,
11796                                    cmd_buffer->gfx9_eop_bug_va);
11797    }
11798 
11799    assert(cmd_buffer->cs->cdw <= cdw_max);
11800 }
11801 
11802 VKAPI_ATTR void VKAPI_CALL
11803 radv_CmdBindPipelineShaderGroupNV(VkCommandBuffer commandBuffer, VkPipelineBindPoint pipelineBindPoint,
11804                                   VkPipeline pipeline, uint32_t groupIndex)
11805 {
11806    fprintf(stderr, "radv: unimplemented vkCmdBindPipelineShaderGroupNV\n");
11807    abort();
11808 }
11809 
11810 /* VK_NV_device_generated_commands_compute */
11811 VKAPI_ATTR void VKAPI_CALL
11812 radv_CmdUpdatePipelineIndirectBufferNV(VkCommandBuffer commandBuffer, VkPipelineBindPoint pipelineBindPoint,
11813                                        VkPipeline pipeline)
11814 {
11815    unreachable("radv: unimplemented vkCmdUpdatePipelineIndirectBufferNV");
11816 }
11817 
11818 /* VK_EXT_descriptor_buffer */
11819 VKAPI_ATTR void VKAPI_CALL
11820 radv_CmdBindDescriptorBuffersEXT(VkCommandBuffer commandBuffer, uint32_t bufferCount,
11821                                  const VkDescriptorBufferBindingInfoEXT *pBindingInfos)
11822 {
11823    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
11824 
11825    for (uint32_t i = 0; i < bufferCount; i++) {
11826       cmd_buffer->descriptor_buffers[i] = pBindingInfos[i].address;
11827    }
11828 }
11829 
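/* With descriptor buffers there is no descriptor set object to bind: record the
 * GPU VA of each set (binding base address + offset) and drop any regular
 * descriptor set previously bound at that index.
 */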
11830 static void
11831 radv_set_descriptor_buffer_offsets(struct radv_cmd_buffer *cmd_buffer,
11832                                    const VkSetDescriptorBufferOffsetsInfoEXT *pSetDescriptorBufferOffsetsInfo,
11833                                    VkPipelineBindPoint bind_point)
11834 {
11835    struct radv_descriptor_state *descriptors_state = radv_get_descriptors_state(cmd_buffer, bind_point);
11836 
11837    for (unsigned i = 0; i < pSetDescriptorBufferOffsetsInfo->setCount; i++) {
11838       const uint32_t buffer_idx = pSetDescriptorBufferOffsetsInfo->pBufferIndices[i];
11839       const uint64_t offset = pSetDescriptorBufferOffsetsInfo->pOffsets[i];
11840       unsigned idx = i + pSetDescriptorBufferOffsetsInfo->firstSet;
11841 
11842       descriptors_state->descriptor_buffers[idx] = cmd_buffer->descriptor_buffers[buffer_idx] + offset;
11843 
11844       radv_set_descriptor_set(cmd_buffer, bind_point, NULL, idx);
11845    }
11846 }
11847 
11848 VKAPI_ATTR void VKAPI_CALL
11849 radv_CmdSetDescriptorBufferOffsets2EXT(VkCommandBuffer commandBuffer,
11850                                        const VkSetDescriptorBufferOffsetsInfoEXT *pSetDescriptorBufferOffsetsInfo)
11851 {
11852    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
11853 
11854    if (pSetDescriptorBufferOffsetsInfo->stageFlags & VK_SHADER_STAGE_COMPUTE_BIT) {
11855       radv_set_descriptor_buffer_offsets(cmd_buffer, pSetDescriptorBufferOffsetsInfo, VK_PIPELINE_BIND_POINT_COMPUTE);
11856    }
11857 
11858    if (pSetDescriptorBufferOffsetsInfo->stageFlags & RADV_GRAPHICS_STAGE_BITS) {
11859       radv_set_descriptor_buffer_offsets(cmd_buffer, pSetDescriptorBufferOffsetsInfo, VK_PIPELINE_BIND_POINT_GRAPHICS);
11860    }
11861 
11862    if (pSetDescriptorBufferOffsetsInfo->stageFlags & RADV_RT_STAGE_BITS) {
11863       radv_set_descriptor_buffer_offsets(cmd_buffer, pSetDescriptorBufferOffsetsInfo,
11864                                          VK_PIPELINE_BIND_POINT_RAY_TRACING_KHR);
11865    }
11866 }
11867 
11868 VKAPI_ATTR void VKAPI_CALL
11869 radv_CmdBindDescriptorBufferEmbeddedSamplers2EXT(
11870    VkCommandBuffer commandBuffer,
11871    const VkBindDescriptorBufferEmbeddedSamplersInfoEXT *pBindDescriptorBufferEmbeddedSamplersInfo)
11872 {
11873    /* This is a no-op because embedded samplers are inlined at compile time. */
11874 }
11875 
11876 /* VK_EXT_shader_object */
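/* Unbind the pipeline currently bound at the given bind point and clear the
 * state derived from it, so that shader objects bound afterwards start from a
 * clean slate.
 */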
11877 static void
11878 radv_reset_pipeline_state(struct radv_cmd_buffer *cmd_buffer, VkPipelineBindPoint pipelineBindPoint)
11879 {
11880    switch (pipelineBindPoint) {
11881    case VK_PIPELINE_BIND_POINT_COMPUTE:
11882       if (cmd_buffer->state.compute_pipeline) {
11883          radv_bind_shader(cmd_buffer, NULL, MESA_SHADER_COMPUTE);
11884          cmd_buffer->state.compute_pipeline = NULL;
11885       }
11886       if (cmd_buffer->state.emitted_compute_pipeline) {
11887          cmd_buffer->state.emitted_compute_pipeline = NULL;
11888       }
11889       break;
11890    case VK_PIPELINE_BIND_POINT_GRAPHICS:
11891       if (cmd_buffer->state.graphics_pipeline) {
11892          radv_foreach_stage(s, cmd_buffer->state.graphics_pipeline->active_stages)
11893          {
11894             radv_bind_shader(cmd_buffer, NULL, s);
11895          }
11896          cmd_buffer->state.graphics_pipeline = NULL;
11897 
11898          cmd_buffer->state.gs_copy_shader = NULL;
11899          cmd_buffer->state.last_vgt_shader = NULL;
11900          cmd_buffer->state.has_nggc = false;
11901          cmd_buffer->state.emitted_vs_prolog = NULL;
11902          cmd_buffer->state.col_format_non_compacted = 0;
11903          cmd_buffer->state.ms.sample_shading_enable = false;
11904          cmd_buffer->state.ms.min_sample_shading = 1.0f;
11905          cmd_buffer->state.rast_prim = 0;
11906          cmd_buffer->state.uses_out_of_order_rast = false;
11907          cmd_buffer->state.uses_vrs_attachment = false;
11908          cmd_buffer->state.uses_dynamic_vertex_binding_stride = false;
11909       }
11910       if (cmd_buffer->state.emitted_graphics_pipeline) {
11911          radv_bind_custom_blend_mode(cmd_buffer, 0);
11912 
11913          if (cmd_buffer->state.db_render_control) {
11914             cmd_buffer->state.db_render_control = 0;
11915             cmd_buffer->state.dirty |= RADV_CMD_DIRTY_FRAMEBUFFER;
11916          }
11917 
11918          cmd_buffer->state.emitted_graphics_pipeline = NULL;
11919       }
11920       break;
11921    default:
11922       break;
11923    }
11924 
11925    cmd_buffer->state.dirty &= ~RADV_CMD_DIRTY_PIPELINE;
11926 }
11927 
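/* Compute shader objects can be emitted immediately: bind the shader, emit its
 * compute registers and refresh the push-constant/indirect-descriptor
 * bookkeeping (a NULL shader object just unbinds the compute stage). Graphics
 * shader objects are handled at draw time instead.
 */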
11928 static void
11929 radv_bind_compute_shader(struct radv_cmd_buffer *cmd_buffer, struct radv_shader_object *shader_obj)
11930 {
11931    struct radv_shader *shader = shader_obj ? shader_obj->shader : NULL;
11932    const struct radv_device *device = cmd_buffer->device;
11933    struct radeon_cmdbuf *cs = cmd_buffer->cs;
11934 
11935    radv_bind_shader(cmd_buffer, shader, MESA_SHADER_COMPUTE);
11936 
11937    if (!shader_obj)
11938       return;
11939 
11940    ASSERTED const unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 128);
11941 
11942    radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, shader->bo);
11943 
11944    radv_emit_compute_shader(device->physical_device, cs, shader);
11945 
11946    /* Update push constants/indirect descriptors state. */
11947    struct radv_descriptor_state *descriptors_state =
11948       radv_get_descriptors_state(cmd_buffer, VK_PIPELINE_BIND_POINT_COMPUTE);
11949    struct radv_push_constant_state *pc_state = &cmd_buffer->push_constant_state[VK_PIPELINE_BIND_POINT_COMPUTE];
11950 
11951    descriptors_state->need_indirect_descriptor_sets =
11952       radv_get_user_sgpr(shader, AC_UD_INDIRECT_DESCRIPTOR_SETS)->sgpr_idx != -1;
11953    pc_state->size = shader_obj->push_constant_size;
11954    pc_state->dynamic_offset_count = shader_obj->dynamic_offset_count;
11955 
11956    assert(cmd_buffer->cs->cdw <= cdw_max);
11957 }
11958 
11959 VKAPI_ATTR void VKAPI_CALL
11960 radv_CmdBindShadersEXT(VkCommandBuffer commandBuffer, uint32_t stageCount, const VkShaderStageFlagBits *pStages,
11961                        const VkShaderEXT *pShaders)
11962 {
11963    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
11964    VkShaderStageFlagBits bound_stages = 0;
11965 
11966    for (uint32_t i = 0; i < stageCount; i++) {
11967       const gl_shader_stage stage = vk_to_mesa_shader_stage(pStages[i]);
11968 
11969       if (!pShaders) {
11970          cmd_buffer->state.shader_objs[stage] = NULL;
11971          continue;
11972       }
11973 
11974       RADV_FROM_HANDLE(radv_shader_object, shader_obj, pShaders[i]);
11975 
11976       cmd_buffer->state.shader_objs[stage] = shader_obj;
11977 
11978       bound_stages |= pStages[i];
11979    }
11980 
11981    if (bound_stages & VK_SHADER_STAGE_COMPUTE_BIT) {
11982       radv_reset_pipeline_state(cmd_buffer, VK_PIPELINE_BIND_POINT_COMPUTE);
11983       radv_mark_descriptor_sets_dirty(cmd_buffer, VK_PIPELINE_BIND_POINT_COMPUTE);
11984 
11985       radv_bind_compute_shader(cmd_buffer, cmd_buffer->state.shader_objs[MESA_SHADER_COMPUTE]);
11986    }
11987 
11988    if (bound_stages & RADV_GRAPHICS_STAGE_BITS) {
11989       radv_reset_pipeline_state(cmd_buffer, VK_PIPELINE_BIND_POINT_GRAPHICS);
11990       radv_mark_descriptor_sets_dirty(cmd_buffer, VK_PIPELINE_BIND_POINT_GRAPHICS);
11991 
11992       /* Graphics shaders are handled at draw time because of shader variants. */
11993    }
11994 
11995    cmd_buffer->state.dirty |= RADV_CMD_DIRTY_GRAPHICS_SHADERS;
11996 }
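
/* Illustrative usage sketch (not driver code; 'cmd_buf' and 'vs_shader_object'
 * are application-side names): a stage can be unbound by passing VK_NULL_HANDLE
 * for its VkShaderEXT, e.g.
 *
 *    const VkShaderStageFlagBits stages[] = {VK_SHADER_STAGE_VERTEX_BIT, VK_SHADER_STAGE_FRAGMENT_BIT};
 *    const VkShaderEXT shaders[] = {vs_shader_object, VK_NULL_HANDLE};
 *    vkCmdBindShadersEXT(cmd_buf, 2, stages, shaders);
 */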
11997 
11998 VKAPI_ATTR void VKAPI_CALL
11999 radv_CmdSetCoverageModulationModeNV(VkCommandBuffer commandBuffer, VkCoverageModulationModeNV coverageModulationMode)
12000 {
12001    unreachable("Not supported by RADV.");
12002 }
12003 
12004 VKAPI_ATTR void VKAPI_CALL
12005 radv_CmdSetCoverageModulationTableEnableNV(VkCommandBuffer commandBuffer, VkBool32 coverageModulationTableEnable)
12006 {
12007    unreachable("Not supported by RADV.");
12008 }
12009 
12010 VKAPI_ATTR void VKAPI_CALL
12011 radv_CmdSetCoverageModulationTableNV(VkCommandBuffer commandBuffer, uint32_t coverageModulationTableCount,
12012                                      const float *pCoverageModulationTable)
12013 {
12014    unreachable("Not supported by RADV.");
12015 }
12016 
12017 VKAPI_ATTR void VKAPI_CALL
12018 radv_CmdSetCoverageReductionModeNV(VkCommandBuffer commandBuffer, VkCoverageReductionModeNV coverageReductionMode)
12019 {
12020    unreachable("Not supported by RADV.");
12021 }
12022 
12023 VKAPI_ATTR void VKAPI_CALL
12024 radv_CmdSetCoverageToColorEnableNV(VkCommandBuffer commandBuffer, VkBool32 coverageToColorEnable)
12025 {
12026    unreachable("Not supported by RADV.");
12027 }
12028 
12029 VKAPI_ATTR void VKAPI_CALL
12030 radv_CmdSetCoverageToColorLocationNV(VkCommandBuffer commandBuffer, uint32_t coverageToColorLocation)
12031 {
12032    unreachable("Not supported by RADV.");
12033 }
12034 
12035 VKAPI_ATTR void VKAPI_CALL
12036 radv_CmdSetRepresentativeFragmentTestEnableNV(VkCommandBuffer commandBuffer, VkBool32 representativeFragmentTestEnable)
12037 {
12038    unreachable("Not supported by RADV.");
12039 }
12040 
12041 VKAPI_ATTR void VKAPI_CALL
12042 radv_CmdSetShadingRateImageEnableNV(VkCommandBuffer commandBuffer, VkBool32 shadingRateImageEnable)
12043 {
12044    unreachable("Not supported by RADV.");
12045 }
12046 
12047 VKAPI_ATTR void VKAPI_CALL
12048 radv_CmdSetViewportSwizzleNV(VkCommandBuffer commandBuffer, uint32_t firstViewport, uint32_t viewportCount,
12049                              const VkViewportSwizzleNV *pViewportSwizzles)
12050 {
12051    unreachable("Not supported by RADV.");
12052 }
12053 
12054 VKAPI_ATTR void VKAPI_CALL
12055 radv_CmdSetViewportWScalingEnableNV(VkCommandBuffer commandBuffer, VkBool32 viewportWScalingEnable)
12056 {
12057    unreachable("Not supported by RADV.");
12058 }
12059