1 /*
2 * Copyright © 2016 Red Hat.
3 * Copyright © 2016 Bas Nieuwenhuizen
4 *
5 * based in part on anv driver which is:
6 * Copyright © 2015 Intel Corporation
7 *
8 * Permission is hereby granted, free of charge, to any person obtaining a
9 * copy of this software and associated documentation files (the "Software"),
10 * to deal in the Software without restriction, including without limitation
11 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
12 * and/or sell copies of the Software, and to permit persons to whom the
13 * Software is furnished to do so, subject to the following conditions:
14 *
15 * The above copyright notice and this permission notice (including the next
16 * paragraph) shall be included in all copies or substantial portions of the
17 * Software.
18 *
19 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
20 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
22 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
24 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
25 * IN THE SOFTWARE.
26 */
27
28 #include "meta/radv_meta.h"
29 #include "radv_cs.h"
30 #include "radv_debug.h"
31 #include "radv_private.h"
32 #include "radv_radeon_winsys.h"
33 #include "radv_shader.h"
34 #include "sid.h"
35 #include "vk_common_entrypoints.h"
36 #include "vk_enum_defines.h"
37 #include "vk_format.h"
38 #include "vk_framebuffer.h"
39 #include "vk_render_pass.h"
40 #include "vk_util.h"
41
42 #include "ac_debug.h"
43 #include "ac_shader_args.h"
44
45 #include "aco_interface.h"
46
47 #include "util/fast_idiv_by_const.h"
48
49 enum {
50 RADV_PREFETCH_VBO_DESCRIPTORS = (1 << 0),
51 RADV_PREFETCH_VS = (1 << 1),
52 RADV_PREFETCH_TCS = (1 << 2),
53 RADV_PREFETCH_TES = (1 << 3),
54 RADV_PREFETCH_GS = (1 << 4),
55 RADV_PREFETCH_PS = (1 << 5),
56 RADV_PREFETCH_MS = (1 << 6),
57 RADV_PREFETCH_SHADERS = (RADV_PREFETCH_VS | RADV_PREFETCH_TCS | RADV_PREFETCH_TES | RADV_PREFETCH_GS |
58 RADV_PREFETCH_PS | RADV_PREFETCH_MS)
59 };
60
61 static void radv_handle_image_transition(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image,
62 VkImageLayout src_layout, VkImageLayout dst_layout, uint32_t src_family_index,
63 uint32_t dst_family_index, const VkImageSubresourceRange *range,
64 struct radv_sample_locations_state *sample_locs);
65
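/* Copy the dynamic state in "src" into the command buffer state and mark only
 * the fields whose values actually changed as dirty, so that unchanged state
 * is not re-emitted.
 */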
66 static void
67 radv_bind_dynamic_state(struct radv_cmd_buffer *cmd_buffer, const struct radv_dynamic_state *src)
68 {
69 struct radv_dynamic_state *dest = &cmd_buffer->state.dynamic;
70 uint64_t copy_mask = src->mask;
71 uint64_t dest_mask = 0;
72
73 dest->vk.dr.rectangle_count = src->vk.dr.rectangle_count;
74 dest->sample_location.count = src->sample_location.count;
75
76 if (copy_mask & RADV_DYNAMIC_VIEWPORT) {
77 if (dest->vk.vp.viewport_count != src->vk.vp.viewport_count) {
78 dest->vk.vp.viewport_count = src->vk.vp.viewport_count;
79 dest_mask |= RADV_DYNAMIC_VIEWPORT;
80 }
81
82 if (memcmp(&dest->vk.vp.viewports, &src->vk.vp.viewports, src->vk.vp.viewport_count * sizeof(VkViewport))) {
83 typed_memcpy(dest->vk.vp.viewports, src->vk.vp.viewports, src->vk.vp.viewport_count);
84 typed_memcpy(dest->hw_vp.xform, src->hw_vp.xform, src->vk.vp.viewport_count);
85 dest_mask |= RADV_DYNAMIC_VIEWPORT;
86 }
87 }
88
89 if (copy_mask & RADV_DYNAMIC_SCISSOR) {
90 if (dest->vk.vp.scissor_count != src->vk.vp.scissor_count) {
91 dest->vk.vp.scissor_count = src->vk.vp.scissor_count;
92 dest_mask |= RADV_DYNAMIC_SCISSOR;
93 }
94
95 if (memcmp(&dest->vk.vp.scissors, &src->vk.vp.scissors, src->vk.vp.scissor_count * sizeof(VkRect2D))) {
96 typed_memcpy(dest->vk.vp.scissors, src->vk.vp.scissors, src->vk.vp.scissor_count);
97 dest_mask |= RADV_DYNAMIC_SCISSOR;
98 }
99 }
100
101 if (copy_mask & RADV_DYNAMIC_BLEND_CONSTANTS) {
102 if (memcmp(&dest->vk.cb.blend_constants, &src->vk.cb.blend_constants, sizeof(src->vk.cb.blend_constants))) {
103 typed_memcpy(dest->vk.cb.blend_constants, src->vk.cb.blend_constants, 4);
104 dest_mask |= RADV_DYNAMIC_BLEND_CONSTANTS;
105 }
106 }
107
108 if (copy_mask & RADV_DYNAMIC_DISCARD_RECTANGLE) {
109 if (memcmp(&dest->vk.dr.rectangles, &src->vk.dr.rectangles, src->vk.dr.rectangle_count * sizeof(VkRect2D))) {
110 typed_memcpy(dest->vk.dr.rectangles, src->vk.dr.rectangles, src->vk.dr.rectangle_count);
111 dest_mask |= RADV_DYNAMIC_DISCARD_RECTANGLE;
112 }
113 }
114
115 if (copy_mask & RADV_DYNAMIC_SAMPLE_LOCATIONS) {
116 if (dest->sample_location.per_pixel != src->sample_location.per_pixel ||
117 dest->sample_location.grid_size.width != src->sample_location.grid_size.width ||
118 dest->sample_location.grid_size.height != src->sample_location.grid_size.height ||
119 memcmp(&dest->sample_location.locations, &src->sample_location.locations,
120 src->sample_location.count * sizeof(VkSampleLocationEXT))) {
121 dest->sample_location.per_pixel = src->sample_location.per_pixel;
122 dest->sample_location.grid_size = src->sample_location.grid_size;
123 typed_memcpy(dest->sample_location.locations, src->sample_location.locations, src->sample_location.count);
124 dest_mask |= RADV_DYNAMIC_SAMPLE_LOCATIONS;
125 }
126 }
127
128 if (copy_mask & RADV_DYNAMIC_COLOR_WRITE_MASK) {
129 for (uint32_t i = 0; i < MAX_RTS; i++) {
130 if (dest->vk.cb.attachments[i].write_mask != src->vk.cb.attachments[i].write_mask) {
131 dest->vk.cb.attachments[i].write_mask = src->vk.cb.attachments[i].write_mask;
132 dest_mask |= RADV_DYNAMIC_COLOR_WRITE_MASK;
133 }
134 }
135 }
136
137 if (copy_mask & RADV_DYNAMIC_COLOR_BLEND_ENABLE) {
138 for (uint32_t i = 0; i < MAX_RTS; i++) {
139 if (dest->vk.cb.attachments[i].blend_enable != src->vk.cb.attachments[i].blend_enable) {
140 dest->vk.cb.attachments[i].blend_enable = src->vk.cb.attachments[i].blend_enable;
141 dest_mask |= RADV_DYNAMIC_COLOR_BLEND_ENABLE;
142 }
143 }
144 }
145
146 if (copy_mask & RADV_DYNAMIC_COLOR_BLEND_EQUATION) {
147 for (uint32_t i = 0; i < MAX_RTS; i++) {
148 if (dest->vk.cb.attachments[i].src_color_blend_factor != src->vk.cb.attachments[i].src_color_blend_factor ||
149 dest->vk.cb.attachments[i].dst_color_blend_factor != src->vk.cb.attachments[i].dst_color_blend_factor ||
150 dest->vk.cb.attachments[i].color_blend_op != src->vk.cb.attachments[i].color_blend_op ||
151 dest->vk.cb.attachments[i].src_alpha_blend_factor != src->vk.cb.attachments[i].src_alpha_blend_factor ||
152 dest->vk.cb.attachments[i].dst_alpha_blend_factor != src->vk.cb.attachments[i].dst_alpha_blend_factor ||
153 dest->vk.cb.attachments[i].alpha_blend_op != src->vk.cb.attachments[i].alpha_blend_op) {
154 dest->vk.cb.attachments[i].src_color_blend_factor = src->vk.cb.attachments[i].src_color_blend_factor;
155 dest->vk.cb.attachments[i].dst_color_blend_factor = src->vk.cb.attachments[i].dst_color_blend_factor;
156 dest->vk.cb.attachments[i].color_blend_op = src->vk.cb.attachments[i].color_blend_op;
157 dest->vk.cb.attachments[i].src_alpha_blend_factor = src->vk.cb.attachments[i].src_alpha_blend_factor;
158 dest->vk.cb.attachments[i].dst_alpha_blend_factor = src->vk.cb.attachments[i].dst_alpha_blend_factor;
159 dest->vk.cb.attachments[i].alpha_blend_op = src->vk.cb.attachments[i].alpha_blend_op;
160 dest_mask |= RADV_DYNAMIC_COLOR_BLEND_EQUATION;
161 }
162 }
163 }
164
165 #define RADV_CMP_COPY(field, flag) \
166 if (copy_mask & flag) { \
167 if (dest->field != src->field) { \
168 dest->field = src->field; \
169 dest_mask |= flag; \
170 } \
171 }
172
173 RADV_CMP_COPY(vk.ia.primitive_topology, RADV_DYNAMIC_PRIMITIVE_TOPOLOGY);
174 RADV_CMP_COPY(vk.ia.primitive_restart_enable, RADV_DYNAMIC_PRIMITIVE_RESTART_ENABLE);
175
176 RADV_CMP_COPY(vk.vp.depth_clip_negative_one_to_one, RADV_DYNAMIC_DEPTH_CLIP_NEGATIVE_ONE_TO_ONE);
177
178 RADV_CMP_COPY(vk.ts.patch_control_points, RADV_DYNAMIC_PATCH_CONTROL_POINTS);
179 RADV_CMP_COPY(vk.ts.domain_origin, RADV_DYNAMIC_TESS_DOMAIN_ORIGIN);
180
181 RADV_CMP_COPY(vk.rs.line.width, RADV_DYNAMIC_LINE_WIDTH);
182 RADV_CMP_COPY(vk.rs.depth_bias.constant, RADV_DYNAMIC_DEPTH_BIAS);
183 RADV_CMP_COPY(vk.rs.depth_bias.clamp, RADV_DYNAMIC_DEPTH_BIAS);
184 RADV_CMP_COPY(vk.rs.depth_bias.slope, RADV_DYNAMIC_DEPTH_BIAS);
185 RADV_CMP_COPY(vk.rs.depth_bias.representation, RADV_DYNAMIC_DEPTH_BIAS);
186 RADV_CMP_COPY(vk.rs.line.stipple.factor, RADV_DYNAMIC_LINE_STIPPLE);
187 RADV_CMP_COPY(vk.rs.line.stipple.pattern, RADV_DYNAMIC_LINE_STIPPLE);
188 RADV_CMP_COPY(vk.rs.cull_mode, RADV_DYNAMIC_CULL_MODE);
189 RADV_CMP_COPY(vk.rs.front_face, RADV_DYNAMIC_FRONT_FACE);
190 RADV_CMP_COPY(vk.rs.depth_bias.enable, RADV_DYNAMIC_DEPTH_BIAS_ENABLE);
191 RADV_CMP_COPY(vk.rs.rasterizer_discard_enable, RADV_DYNAMIC_RASTERIZER_DISCARD_ENABLE);
192 RADV_CMP_COPY(vk.rs.polygon_mode, RADV_DYNAMIC_POLYGON_MODE);
193 RADV_CMP_COPY(vk.rs.line.stipple.enable, RADV_DYNAMIC_LINE_STIPPLE_ENABLE);
194 RADV_CMP_COPY(vk.rs.depth_clip_enable, RADV_DYNAMIC_DEPTH_CLIP_ENABLE);
195 RADV_CMP_COPY(vk.rs.conservative_mode, RADV_DYNAMIC_CONSERVATIVE_RAST_MODE);
196 RADV_CMP_COPY(vk.rs.provoking_vertex, RADV_DYNAMIC_PROVOKING_VERTEX_MODE);
197 RADV_CMP_COPY(vk.rs.depth_clamp_enable, RADV_DYNAMIC_DEPTH_CLAMP_ENABLE);
198 RADV_CMP_COPY(vk.rs.line.mode, RADV_DYNAMIC_LINE_RASTERIZATION_MODE);
199
200 RADV_CMP_COPY(vk.ms.alpha_to_coverage_enable, RADV_DYNAMIC_ALPHA_TO_COVERAGE_ENABLE);
201 RADV_CMP_COPY(vk.ms.sample_mask, RADV_DYNAMIC_SAMPLE_MASK);
202 RADV_CMP_COPY(vk.ms.rasterization_samples, RADV_DYNAMIC_RASTERIZATION_SAMPLES);
203 RADV_CMP_COPY(vk.ms.sample_locations_enable, RADV_DYNAMIC_SAMPLE_LOCATIONS_ENABLE);
204
205 RADV_CMP_COPY(vk.ds.depth.bounds_test.min, RADV_DYNAMIC_DEPTH_BOUNDS);
206 RADV_CMP_COPY(vk.ds.depth.bounds_test.max, RADV_DYNAMIC_DEPTH_BOUNDS);
207 RADV_CMP_COPY(vk.ds.stencil.front.compare_mask, RADV_DYNAMIC_STENCIL_COMPARE_MASK);
208 RADV_CMP_COPY(vk.ds.stencil.back.compare_mask, RADV_DYNAMIC_STENCIL_COMPARE_MASK);
209 RADV_CMP_COPY(vk.ds.stencil.front.write_mask, RADV_DYNAMIC_STENCIL_WRITE_MASK);
210 RADV_CMP_COPY(vk.ds.stencil.back.write_mask, RADV_DYNAMIC_STENCIL_WRITE_MASK);
211 RADV_CMP_COPY(vk.ds.stencil.front.reference, RADV_DYNAMIC_STENCIL_REFERENCE);
212 RADV_CMP_COPY(vk.ds.stencil.back.reference, RADV_DYNAMIC_STENCIL_REFERENCE);
213 RADV_CMP_COPY(vk.ds.depth.test_enable, RADV_DYNAMIC_DEPTH_TEST_ENABLE);
214 RADV_CMP_COPY(vk.ds.depth.write_enable, RADV_DYNAMIC_DEPTH_WRITE_ENABLE);
215 RADV_CMP_COPY(vk.ds.depth.compare_op, RADV_DYNAMIC_DEPTH_COMPARE_OP);
216 RADV_CMP_COPY(vk.ds.depth.bounds_test.enable, RADV_DYNAMIC_DEPTH_BOUNDS_TEST_ENABLE);
217 RADV_CMP_COPY(vk.ds.stencil.test_enable, RADV_DYNAMIC_STENCIL_TEST_ENABLE);
218 RADV_CMP_COPY(vk.ds.stencil.front.op.fail, RADV_DYNAMIC_STENCIL_OP);
219 RADV_CMP_COPY(vk.ds.stencil.front.op.pass, RADV_DYNAMIC_STENCIL_OP);
220 RADV_CMP_COPY(vk.ds.stencil.front.op.depth_fail, RADV_DYNAMIC_STENCIL_OP);
221 RADV_CMP_COPY(vk.ds.stencil.front.op.compare, RADV_DYNAMIC_STENCIL_OP);
222 RADV_CMP_COPY(vk.ds.stencil.back.op.fail, RADV_DYNAMIC_STENCIL_OP);
223 RADV_CMP_COPY(vk.ds.stencil.back.op.pass, RADV_DYNAMIC_STENCIL_OP);
224 RADV_CMP_COPY(vk.ds.stencil.back.op.depth_fail, RADV_DYNAMIC_STENCIL_OP);
225 RADV_CMP_COPY(vk.ds.stencil.back.op.compare, RADV_DYNAMIC_STENCIL_OP);
226
227 RADV_CMP_COPY(vk.cb.logic_op, RADV_DYNAMIC_LOGIC_OP);
228 RADV_CMP_COPY(vk.cb.color_write_enables, RADV_DYNAMIC_COLOR_WRITE_ENABLE);
229 RADV_CMP_COPY(vk.cb.logic_op_enable, RADV_DYNAMIC_LOGIC_OP_ENABLE);
230
231 RADV_CMP_COPY(vk.fsr.fragment_size.width, RADV_DYNAMIC_FRAGMENT_SHADING_RATE);
232 RADV_CMP_COPY(vk.fsr.fragment_size.height, RADV_DYNAMIC_FRAGMENT_SHADING_RATE);
233 RADV_CMP_COPY(vk.fsr.combiner_ops[0], RADV_DYNAMIC_FRAGMENT_SHADING_RATE);
234 RADV_CMP_COPY(vk.fsr.combiner_ops[1], RADV_DYNAMIC_FRAGMENT_SHADING_RATE);
235
236 RADV_CMP_COPY(vk.dr.enable, RADV_DYNAMIC_DISCARD_RECTANGLE_ENABLE);
237 RADV_CMP_COPY(vk.dr.mode, RADV_DYNAMIC_DISCARD_RECTANGLE_MODE);
238
239 RADV_CMP_COPY(feedback_loop_aspects, RADV_DYNAMIC_ATTACHMENT_FEEDBACK_LOOP_ENABLE);
240
241 #undef RADV_CMP_COPY
242
243 cmd_buffer->state.dirty |= dest_mask;
244
245 /* Handle driver-specific state that needs to be re-emitted when a PSO is bound. */
246 if (dest_mask & (RADV_DYNAMIC_VIEWPORT | RADV_DYNAMIC_POLYGON_MODE | RADV_DYNAMIC_LINE_WIDTH |
247 RADV_DYNAMIC_PRIMITIVE_TOPOLOGY)) {
248 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_GUARDBAND;
249 }
250
251 if (cmd_buffer->device->physical_device->rad_info.rbplus_allowed && (dest_mask & RADV_DYNAMIC_COLOR_WRITE_MASK)) {
252 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_RBPLUS;
253 }
254 }
255
256 bool
257 radv_cmd_buffer_uses_mec(struct radv_cmd_buffer *cmd_buffer)
258 {
259 return cmd_buffer->qf == RADV_QUEUE_COMPUTE && cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX7;
260 }
261
262 enum amd_ip_type
263 radv_queue_family_to_ring(const struct radv_physical_device *physical_device, enum radv_queue_family f)
264 {
265 switch (f) {
266 case RADV_QUEUE_GENERAL:
267 return AMD_IP_GFX;
268 case RADV_QUEUE_COMPUTE:
269 return AMD_IP_COMPUTE;
270 case RADV_QUEUE_TRANSFER:
271 return AMD_IP_SDMA;
272 case RADV_QUEUE_VIDEO_DEC:
273 return physical_device->vid_decode_ip;
274 case RADV_QUEUE_VIDEO_ENC:
275 return AMD_IP_VCN_ENC;
276 default:
277 unreachable("Unknown queue family");
278 }
279 }
280
281 static void
282 radv_write_data(struct radv_cmd_buffer *cmd_buffer, const unsigned engine_sel, const uint64_t va, const unsigned count,
283 const uint32_t *data, const bool predicating)
284 {
285 radv_cs_write_data(cmd_buffer->device, cmd_buffer->cs, cmd_buffer->qf, engine_sel, va, count, data, predicating);
286 }
287
288 static void
289 radv_emit_clear_data(struct radv_cmd_buffer *cmd_buffer, unsigned engine_sel, uint64_t va, unsigned size)
290 {
291 uint32_t *zeroes = alloca(size);
292 memset(zeroes, 0, size);
293 radv_write_data(cmd_buffer, engine_sel, va, size / 4, zeroes, false);
294 }
295
296 static void
297 radv_cmd_buffer_finish_shader_part_cache(struct radv_cmd_buffer *cmd_buffer)
298 {
299 ralloc_free(cmd_buffer->vs_prologs.table);
300 ralloc_free(cmd_buffer->ps_epilogs.table);
301 ralloc_free(cmd_buffer->tcs_epilogs.table);
302 }
303
304 static bool
305 radv_cmd_buffer_init_shader_part_cache(struct radv_device *device, struct radv_cmd_buffer *cmd_buffer)
306 {
307 if (device->vs_prologs.ops) {
308 if (!_mesa_set_init(&cmd_buffer->vs_prologs, NULL, device->vs_prologs.ops->hash, device->vs_prologs.ops->equals))
309 return false;
310 }
311 if (device->tcs_epilogs.ops) {
312 if (!_mesa_set_init(&cmd_buffer->tcs_epilogs, NULL, device->tcs_epilogs.ops->hash,
313 device->tcs_epilogs.ops->equals))
314 return false;
315 }
316 if (device->ps_epilogs.ops) {
317 if (!_mesa_set_init(&cmd_buffer->ps_epilogs, NULL, device->ps_epilogs.ops->hash, device->ps_epilogs.ops->equals))
318 return false;
319 }
320 return true;
321 }
322
323 static void
324 radv_destroy_cmd_buffer(struct vk_command_buffer *vk_cmd_buffer)
325 {
326 struct radv_cmd_buffer *cmd_buffer = container_of(vk_cmd_buffer, struct radv_cmd_buffer, vk);
327
328 if (cmd_buffer->qf != RADV_QUEUE_SPARSE) {
329 util_dynarray_fini(&cmd_buffer->ray_history);
330
331 list_for_each_entry_safe (struct radv_cmd_buffer_upload, up, &cmd_buffer->upload.list, list) {
332 radv_rmv_log_command_buffer_bo_destroy(cmd_buffer->device, up->upload_bo);
333 cmd_buffer->device->ws->buffer_destroy(cmd_buffer->device->ws, up->upload_bo);
334 list_del(&up->list);
335 free(up);
336 }
337
338 if (cmd_buffer->upload.upload_bo) {
339 radv_rmv_log_command_buffer_bo_destroy(cmd_buffer->device, cmd_buffer->upload.upload_bo);
340 cmd_buffer->device->ws->buffer_destroy(cmd_buffer->device->ws, cmd_buffer->upload.upload_bo);
341 }
342
343 if (cmd_buffer->cs)
344 cmd_buffer->device->ws->cs_destroy(cmd_buffer->cs);
345 if (cmd_buffer->gang.cs)
346 cmd_buffer->device->ws->cs_destroy(cmd_buffer->gang.cs);
347 if (cmd_buffer->transfer.copy_temp)
348 cmd_buffer->device->ws->buffer_destroy(cmd_buffer->device->ws, cmd_buffer->transfer.copy_temp);
349
350 radv_cmd_buffer_finish_shader_part_cache(cmd_buffer);
351
352 for (unsigned i = 0; i < MAX_BIND_POINTS; i++) {
353 struct radv_descriptor_set_header *set = &cmd_buffer->descriptors[i].push_set.set;
354 free(set->mapped_ptr);
355 if (set->layout)
356 vk_descriptor_set_layout_unref(&cmd_buffer->device->vk, &set->layout->vk);
357 vk_object_base_finish(&set->base);
358 }
359
360 vk_object_base_finish(&cmd_buffer->meta_push_descriptors.base);
361 }
362
363 vk_command_buffer_finish(&cmd_buffer->vk);
364 vk_free(&cmd_buffer->vk.pool->alloc, cmd_buffer);
365 }
366
367 static VkResult
368 radv_create_cmd_buffer(struct vk_command_pool *pool, struct vk_command_buffer **cmd_buffer_out)
369 {
370 struct radv_device *device = container_of(pool->base.device, struct radv_device, vk);
371
372 struct radv_cmd_buffer *cmd_buffer;
373 unsigned ring;
374 cmd_buffer = vk_zalloc(&pool->alloc, sizeof(*cmd_buffer), 8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
375 if (cmd_buffer == NULL)
376 return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
377
378 VkResult result = vk_command_buffer_init(pool, &cmd_buffer->vk, &radv_cmd_buffer_ops, 0);
379 if (result != VK_SUCCESS) {
380 vk_free(&cmd_buffer->vk.pool->alloc, cmd_buffer);
381 return result;
382 }
383
384 cmd_buffer->device = device;
385
386 cmd_buffer->qf = vk_queue_to_radv(device->physical_device, pool->queue_family_index);
387
388 if (cmd_buffer->qf != RADV_QUEUE_SPARSE) {
389 list_inithead(&cmd_buffer->upload.list);
390
391 if (!radv_cmd_buffer_init_shader_part_cache(device, cmd_buffer)) {
392 radv_destroy_cmd_buffer(&cmd_buffer->vk);
393 return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
394 }
395
396 ring = radv_queue_family_to_ring(device->physical_device, cmd_buffer->qf);
397
398 cmd_buffer->cs =
399 device->ws->cs_create(device->ws, ring, cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY);
400 if (!cmd_buffer->cs) {
401 radv_destroy_cmd_buffer(&cmd_buffer->vk);
402 return vk_error(device, VK_ERROR_OUT_OF_DEVICE_MEMORY);
403 }
404
405 vk_object_base_init(&device->vk, &cmd_buffer->meta_push_descriptors.base, VK_OBJECT_TYPE_DESCRIPTOR_SET);
406
407 for (unsigned i = 0; i < MAX_BIND_POINTS; i++)
408 vk_object_base_init(&device->vk, &cmd_buffer->descriptors[i].push_set.set.base, VK_OBJECT_TYPE_DESCRIPTOR_SET);
409
410 util_dynarray_init(&cmd_buffer->ray_history, NULL);
411 }
412
413 *cmd_buffer_out = &cmd_buffer->vk;
414
415 return VK_SUCCESS;
416 }
417
418 void
419 radv_cmd_buffer_reset_rendering(struct radv_cmd_buffer *cmd_buffer)
420 {
421 memset(&cmd_buffer->state.render, 0, sizeof(cmd_buffer->state.render));
422 }
423
424 static void
425 radv_reset_cmd_buffer(struct vk_command_buffer *vk_cmd_buffer, UNUSED VkCommandBufferResetFlags flags)
426 {
427 struct radv_cmd_buffer *cmd_buffer = container_of(vk_cmd_buffer, struct radv_cmd_buffer, vk);
428
429 vk_command_buffer_reset(&cmd_buffer->vk);
430
431 if (cmd_buffer->qf == RADV_QUEUE_SPARSE)
432 return;
433
434 cmd_buffer->device->ws->cs_reset(cmd_buffer->cs);
435 if (cmd_buffer->gang.cs)
436 cmd_buffer->device->ws->cs_reset(cmd_buffer->gang.cs);
437
438 list_for_each_entry_safe (struct radv_cmd_buffer_upload, up, &cmd_buffer->upload.list, list) {
439 radv_rmv_log_command_buffer_bo_destroy(cmd_buffer->device, up->upload_bo);
440 cmd_buffer->device->ws->buffer_destroy(cmd_buffer->device->ws, up->upload_bo);
441 list_del(&up->list);
442 free(up);
443 }
444
445 util_dynarray_clear(&cmd_buffer->ray_history);
446
447 cmd_buffer->push_constant_stages = 0;
448 cmd_buffer->scratch_size_per_wave_needed = 0;
449 cmd_buffer->scratch_waves_wanted = 0;
450 cmd_buffer->compute_scratch_size_per_wave_needed = 0;
451 cmd_buffer->compute_scratch_waves_wanted = 0;
452 cmd_buffer->esgs_ring_size_needed = 0;
453 cmd_buffer->gsvs_ring_size_needed = 0;
454 cmd_buffer->tess_rings_needed = false;
455 cmd_buffer->task_rings_needed = false;
456 cmd_buffer->mesh_scratch_ring_needed = false;
457 cmd_buffer->gds_needed = false;
458 cmd_buffer->gds_oa_needed = false;
459 cmd_buffer->sample_positions_needed = false;
460 cmd_buffer->gang.sem.leader_value = 0;
461 cmd_buffer->gang.sem.emitted_leader_value = 0;
462 cmd_buffer->gang.sem.va = 0;
463 cmd_buffer->shader_upload_seq = 0;
464
465 if (cmd_buffer->upload.upload_bo)
466 radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, cmd_buffer->upload.upload_bo);
467 cmd_buffer->upload.offset = 0;
468
469 memset(cmd_buffer->vertex_binding_buffers, 0, sizeof(struct radv_buffer *) * cmd_buffer->used_vertex_bindings);
470 cmd_buffer->used_vertex_bindings = 0;
471
472 for (unsigned i = 0; i < MAX_BIND_POINTS; i++) {
473 cmd_buffer->descriptors[i].dirty = 0;
474 cmd_buffer->descriptors[i].valid = 0;
475 }
476
477 radv_cmd_buffer_reset_rendering(cmd_buffer);
478 }
479
480 const struct vk_command_buffer_ops radv_cmd_buffer_ops = {
481 .create = radv_create_cmd_buffer,
482 .reset = radv_reset_cmd_buffer,
483 .destroy = radv_destroy_cmd_buffer,
484 };
485
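/* Allocate a new upload BO of at least "min_needed" bytes (and at least twice
 * the current size, with a 16 KiB floor). The previous BO, if any, is kept on
 * the upload list so it stays alive until the command buffer is reset or
 * destroyed.
 */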
486 static bool
487 radv_cmd_buffer_resize_upload_buf(struct radv_cmd_buffer *cmd_buffer, uint64_t min_needed)
488 {
489 uint64_t new_size;
490 struct radeon_winsys_bo *bo = NULL;
491 struct radv_cmd_buffer_upload *upload;
492 struct radv_device *device = cmd_buffer->device;
493
494 new_size = MAX2(min_needed, 16 * 1024);
495 new_size = MAX2(new_size, 2 * cmd_buffer->upload.size);
496
497 VkResult result = device->ws->buffer_create(
498 device->ws, new_size, 4096, device->ws->cs_domain(device->ws),
499 RADEON_FLAG_CPU_ACCESS | RADEON_FLAG_NO_INTERPROCESS_SHARING | RADEON_FLAG_32BIT | RADEON_FLAG_GTT_WC,
500 RADV_BO_PRIORITY_UPLOAD_BUFFER, 0, &bo);
501
502 if (result != VK_SUCCESS) {
503 vk_command_buffer_set_error(&cmd_buffer->vk, result);
504 return false;
505 }
506
507 radv_cs_add_buffer(device->ws, cmd_buffer->cs, bo);
508 if (cmd_buffer->upload.upload_bo) {
509 upload = malloc(sizeof(*upload));
510
511 if (!upload) {
512 vk_command_buffer_set_error(&cmd_buffer->vk, VK_ERROR_OUT_OF_HOST_MEMORY);
513 device->ws->buffer_destroy(device->ws, bo);
514 return false;
515 }
516
517 memcpy(upload, &cmd_buffer->upload, sizeof(*upload));
518 list_add(&upload->list, &cmd_buffer->upload.list);
519 }
520
521 cmd_buffer->upload.upload_bo = bo;
522 cmd_buffer->upload.size = new_size;
523 cmd_buffer->upload.offset = 0;
524 cmd_buffer->upload.map = device->ws->buffer_map(cmd_buffer->upload.upload_bo);
525
526 if (!cmd_buffer->upload.map) {
527 vk_command_buffer_set_error(&cmd_buffer->vk, VK_ERROR_OUT_OF_DEVICE_MEMORY);
528 return false;
529 }
530 radv_rmv_log_command_buffer_bo_create(device, cmd_buffer->upload.upload_bo, 0, cmd_buffer->upload.size, 0);
531
532 return true;
533 }
534
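/* Sub-allocate "size" bytes (optionally aligned to "alignment") from the
 * command buffer upload BO, growing the BO if needed, and return both the
 * offset inside the BO and a CPU pointer to the allocation.
 */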
535 bool
536 radv_cmd_buffer_upload_alloc_aligned(struct radv_cmd_buffer *cmd_buffer, unsigned size, unsigned alignment,
537 unsigned *out_offset, void **ptr)
538 {
539 assert(size % 4 == 0);
540
541 const struct radeon_info *rad_info = &cmd_buffer->device->physical_device->rad_info;
542
543 /* Align to the scalar cache line size if doing so makes this allocation
544 * span fewer cache lines.
545 */
546 unsigned offset = cmd_buffer->upload.offset;
547 unsigned line_size = rad_info->gfx_level >= GFX10 ? 64 : 32;
548 unsigned gap = align(offset, line_size) - offset;
549 if ((size & (line_size - 1)) > gap)
550 offset = align(offset, line_size);
551
552 if (alignment)
553 offset = align(offset, alignment);
554 if (offset + size > cmd_buffer->upload.size) {
555 if (!radv_cmd_buffer_resize_upload_buf(cmd_buffer, size))
556 return false;
557 offset = 0;
558 }
559
560 *out_offset = offset;
561 *ptr = cmd_buffer->upload.map + offset;
562
563 cmd_buffer->upload.offset = offset + size;
564 return true;
565 }
566
567 bool
568 radv_cmd_buffer_upload_alloc(struct radv_cmd_buffer *cmd_buffer, unsigned size, unsigned *out_offset, void **ptr)
569 {
570 return radv_cmd_buffer_upload_alloc_aligned(cmd_buffer, size, 0, out_offset, ptr);
571 }
572
573 bool
574 radv_cmd_buffer_upload_data(struct radv_cmd_buffer *cmd_buffer, unsigned size, const void *data, unsigned *out_offset)
575 {
576 uint8_t *ptr;
577
578 if (!radv_cmd_buffer_upload_alloc(cmd_buffer, size, out_offset, (void **)&ptr))
579 return false;
580 assert(ptr);
581
582 memcpy(ptr, data, size);
583 return true;
584 }
585
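/* Write an incrementing trace ID into the trace BO and emit a matching
 * trace-point NOP packet, which makes it possible to tell how far the GPU got
 * when debugging hangs.
 */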
586 void
587 radv_cmd_buffer_trace_emit(struct radv_cmd_buffer *cmd_buffer)
588 {
589 struct radv_device *device = cmd_buffer->device;
590 struct radeon_cmdbuf *cs = cmd_buffer->cs;
591 uint64_t va;
592
593 if (cmd_buffer->qf != RADV_QUEUE_GENERAL && cmd_buffer->qf != RADV_QUEUE_COMPUTE)
594 return;
595
596 va = radv_buffer_get_va(device->trace_bo);
597 if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY)
598 va += 4;
599
600 ++cmd_buffer->state.trace_id;
601 radv_write_data(cmd_buffer, V_370_ME, va, 1, &cmd_buffer->state.trace_id, false);
602
603 radeon_check_space(cmd_buffer->device->ws, cs, 2);
604
605 radeon_emit(cs, PKT3(PKT3_NOP, 0, 0));
606 radeon_emit(cs, AC_ENCODE_TRACE_POINT(cmd_buffer->state.trace_id));
607 }
608
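/* Accumulate the cache flushes and stage waits that the gang follower (ACE)
 * cmdbuf needs for a barrier, and bump the leader->follower semaphore when
 * task shaders have to be blocked.
 */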
609 static void
610 radv_gang_barrier(struct radv_cmd_buffer *cmd_buffer, VkPipelineStageFlags2 src_stage_mask,
611 VkPipelineStageFlags2 dst_stage_mask)
612 {
613 /* Update flush bits from the main cmdbuf, except the stage flush. */
614 cmd_buffer->gang.flush_bits |=
615 cmd_buffer->state.flush_bits & RADV_CMD_FLUSH_ALL_COMPUTE & ~RADV_CMD_FLAG_CS_PARTIAL_FLUSH;
616
617 /* Add stage flush only when necessary. */
618 if (src_stage_mask & (VK_PIPELINE_STAGE_2_TASK_SHADER_BIT_EXT | VK_PIPELINE_STAGE_2_ALL_TRANSFER_BIT |
619 VK_PIPELINE_STAGE_2_BOTTOM_OF_PIPE_BIT | VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT))
620 cmd_buffer->gang.flush_bits |= RADV_CMD_FLAG_CS_PARTIAL_FLUSH;
621
622 /* Block task shaders when we have to wait for CP DMA on the GFX cmdbuf. */
623 if (src_stage_mask &
624 (VK_PIPELINE_STAGE_2_COPY_BIT | VK_PIPELINE_STAGE_2_CLEAR_BIT | VK_PIPELINE_STAGE_2_ALL_TRANSFER_BIT |
625 VK_PIPELINE_STAGE_2_BOTTOM_OF_PIPE_BIT | VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT))
626 dst_stage_mask |= cmd_buffer->state.dma_is_busy ? VK_PIPELINE_STAGE_2_TASK_SHADER_BIT_EXT : 0;
627
628 /* Increment the GFX/ACE semaphore when task shaders are blocked. */
629 if (dst_stage_mask & (VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT | VK_PIPELINE_STAGE_2_DRAW_INDIRECT_BIT |
630 VK_PIPELINE_STAGE_2_TASK_SHADER_BIT_EXT))
631 cmd_buffer->gang.sem.leader_value++;
632 }
633
634 void
635 radv_gang_cache_flush(struct radv_cmd_buffer *cmd_buffer)
636 {
637 struct radeon_cmdbuf *ace_cs = cmd_buffer->gang.cs;
638 const uint32_t flush_bits = cmd_buffer->gang.flush_bits;
639 enum rgp_flush_bits sqtt_flush_bits = 0;
640
641 radv_cs_emit_cache_flush(cmd_buffer->device->ws, ace_cs, cmd_buffer->device->physical_device->rad_info.gfx_level,
642 NULL, 0, RADV_QUEUE_COMPUTE, flush_bits, &sqtt_flush_bits, 0);
643
644 cmd_buffer->gang.flush_bits = 0;
645 }
646
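/* Lazily allocate the pair of 32-bit leader<->follower semaphores in the
 * upload BO (layout described below).
 */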
647 static bool
648 radv_gang_sem_init(struct radv_cmd_buffer *cmd_buffer)
649 {
650 if (cmd_buffer->gang.sem.va)
651 return true;
652
653 /* DWORD 0: GFX->ACE semaphore (GFX blocks ACE, i.e. ACE waits for GFX)
654 * DWORD 1: ACE->GFX semaphore
655 */
656 uint64_t sem_init = 0;
657 uint32_t va_off = 0;
658 if (!radv_cmd_buffer_upload_data(cmd_buffer, sizeof(uint64_t), &sem_init, &va_off)) {
659 vk_command_buffer_set_error(&cmd_buffer->vk, VK_ERROR_OUT_OF_HOST_MEMORY);
660 return false;
661 }
662
663 cmd_buffer->gang.sem.va = radv_buffer_get_va(cmd_buffer->upload.upload_bo) + va_off;
664 return true;
665 }
666
667 static bool
668 radv_gang_leader_sem_dirty(const struct radv_cmd_buffer *cmd_buffer)
669 {
670 return cmd_buffer->gang.sem.leader_value != cmd_buffer->gang.sem.emitted_leader_value;
671 }
672
673 static bool
674 radv_gang_follower_sem_dirty(const struct radv_cmd_buffer *cmd_buffer)
675 {
676 return cmd_buffer->gang.sem.follower_value != cmd_buffer->gang.sem.emitted_follower_value;
677 }
678
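/* Emit a bottom-of-pipe EOP event on "cs" that writes "value" into the gang
 * semaphore at "va_off". Returns false if the semaphore could not be
 * initialized.
 */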
679 ALWAYS_INLINE static bool
680 radv_flush_gang_semaphore(struct radv_cmd_buffer *cmd_buffer, struct radeon_cmdbuf *cs, const enum radv_queue_family qf,
681 const uint32_t va_off, const uint32_t value)
682 {
683 if (!radv_gang_sem_init(cmd_buffer))
684 return false;
685
686 ASSERTED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cs, 12);
687
688 radv_cs_emit_write_event_eop(cs, cmd_buffer->device->physical_device->rad_info.gfx_level, qf,
689 V_028A90_BOTTOM_OF_PIPE_TS, 0, EOP_DST_SEL_MEM, EOP_DATA_SEL_VALUE_32BIT,
690 cmd_buffer->gang.sem.va + va_off, value, cmd_buffer->gfx9_eop_bug_va);
691
692 assert(cmd_buffer->cs->cdw <= cdw_max);
693 return true;
694 }
695
696 ALWAYS_INLINE static bool
697 radv_flush_gang_leader_semaphore(struct radv_cmd_buffer *cmd_buffer)
698 {
699 if (!radv_gang_leader_sem_dirty(cmd_buffer))
700 return false;
701
702 /* Gang leader writes a value to the semaphore which the follower can wait for. */
703 cmd_buffer->gang.sem.emitted_leader_value = cmd_buffer->gang.sem.leader_value;
704 return radv_flush_gang_semaphore(cmd_buffer, cmd_buffer->cs, cmd_buffer->qf, 0, cmd_buffer->gang.sem.leader_value);
705 }
706
707 ALWAYS_INLINE static bool
708 radv_flush_gang_follower_semaphore(struct radv_cmd_buffer *cmd_buffer)
709 {
710 if (!radv_gang_follower_sem_dirty(cmd_buffer))
711 return false;
712
713 /* Follower writes a value to the semaphore which the gang leader can wait for. */
714 cmd_buffer->gang.sem.emitted_follower_value = cmd_buffer->gang.sem.follower_value;
715 return radv_flush_gang_semaphore(cmd_buffer, cmd_buffer->gang.cs, RADV_QUEUE_COMPUTE, 4,
716 cmd_buffer->gang.sem.follower_value);
717 }
718
719 ALWAYS_INLINE static void
720 radv_wait_gang_semaphore(struct radv_cmd_buffer *cmd_buffer, struct radeon_cmdbuf *cs, const enum radv_queue_family qf,
721 const uint32_t va_off, const uint32_t value)
722 {
723 assert(cmd_buffer->gang.sem.va);
724 radeon_check_space(cmd_buffer->device->ws, cs, 7);
725 radv_cp_wait_mem(cs, qf, WAIT_REG_MEM_GREATER_OR_EQUAL, cmd_buffer->gang.sem.va + va_off, value, 0xffffffff);
726 }
727
728 ALWAYS_INLINE static void
729 radv_wait_gang_leader(struct radv_cmd_buffer *cmd_buffer)
730 {
731 /* Follower waits for the semaphore which the gang leader wrote. */
732 radv_wait_gang_semaphore(cmd_buffer, cmd_buffer->gang.cs, RADV_QUEUE_COMPUTE, 0, cmd_buffer->gang.sem.leader_value);
733 }
734
735 ALWAYS_INLINE static void
736 radv_wait_gang_follower(struct radv_cmd_buffer *cmd_buffer)
737 {
738 /* Gang leader waits for the semaphore which the follower wrote. */
739 radv_wait_gang_semaphore(cmd_buffer, cmd_buffer->cs, cmd_buffer->qf, 4, cmd_buffer->gang.sem.follower_value);
740 }
741
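/* Create the internal compute (ACE) cmdbuf that acts as the gang follower,
 * e.g. for task shaders. No-op if it already exists.
 */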
742 bool
743 radv_gang_init(struct radv_cmd_buffer *cmd_buffer)
744 {
745 if (cmd_buffer->gang.cs)
746 return true;
747
748 struct radv_device *device = cmd_buffer->device;
749 struct radeon_cmdbuf *ace_cs =
750 device->ws->cs_create(device->ws, AMD_IP_COMPUTE, cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY);
751
752 if (!ace_cs) {
753 vk_command_buffer_set_error(&cmd_buffer->vk, VK_ERROR_OUT_OF_DEVICE_MEMORY);
754 return false;
755 }
756
757 cmd_buffer->gang.cs = ace_cs;
758 return true;
759 }
760
761 static VkResult
762 radv_gang_finalize(struct radv_cmd_buffer *cmd_buffer)
763 {
764 assert(cmd_buffer->gang.cs);
765 struct radv_device *device = cmd_buffer->device;
766 struct radeon_cmdbuf *ace_cs = cmd_buffer->gang.cs;
767
768 /* Emit pending cache flush. */
769 radv_gang_cache_flush(cmd_buffer);
770
771 /* Clear the leader<->follower semaphores if they exist.
772 * This is necessary in case the same cmd buffer is submitted again in the future.
773 */
774 if (cmd_buffer->gang.sem.va) {
775 uint64_t leader2follower_va = cmd_buffer->gang.sem.va;
776 uint64_t follower2leader_va = cmd_buffer->gang.sem.va + 4;
777 const uint32_t zero = 0;
778
779 /* Follower: write 0 to the leader->follower semaphore. */
780 radv_cs_write_data(device, ace_cs, RADV_QUEUE_COMPUTE, V_370_ME, leader2follower_va, 1, &zero, false);
781
782 /* Leader: write 0 to the follower->leader semaphore. */
783 radv_write_data(cmd_buffer, V_370_ME, follower2leader_va, 1, &zero, false);
784 }
785
786 return device->ws->cs_finalize(ace_cs);
787 }
788
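/* Per-draw bookkeeping: emit an SQTT thread trace marker, force the pipeline
 * idle when RADV_DEBUG_SYNC_SHADERS is set, and write a trace point when fault
 * detection is enabled.
 */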
789 static void
790 radv_cmd_buffer_after_draw(struct radv_cmd_buffer *cmd_buffer, enum radv_cmd_flush_bits flags, bool dgc)
791 {
792 const struct radv_device *device = cmd_buffer->device;
793 if (unlikely(device->sqtt.bo) && !dgc) {
794 radeon_check_space(device->ws, cmd_buffer->cs, 2);
795
796 radeon_emit(cmd_buffer->cs, PKT3(PKT3_EVENT_WRITE, 0, cmd_buffer->state.predicating));
797 radeon_emit(cmd_buffer->cs, EVENT_TYPE(V_028A90_THREAD_TRACE_MARKER) | EVENT_INDEX(0));
798 }
799
800 if (device->instance->debug_flags & RADV_DEBUG_SYNC_SHADERS) {
801 enum rgp_flush_bits sqtt_flush_bits = 0;
802 assert(flags & (RADV_CMD_FLAG_PS_PARTIAL_FLUSH | RADV_CMD_FLAG_CS_PARTIAL_FLUSH));
803
804 /* Force wait for graphics or compute engines to be idle. */
805 radv_cs_emit_cache_flush(device->ws, cmd_buffer->cs, device->physical_device->rad_info.gfx_level,
806 &cmd_buffer->gfx9_fence_idx, cmd_buffer->gfx9_fence_va, cmd_buffer->qf, flags,
807 &sqtt_flush_bits, cmd_buffer->gfx9_eop_bug_va);
808
809 if ((flags & RADV_CMD_FLAG_PS_PARTIAL_FLUSH) && radv_cmdbuf_has_stage(cmd_buffer, MESA_SHADER_TASK)) {
810 /* Force wait for compute engines to be idle on the internal cmdbuf. */
811 radv_cs_emit_cache_flush(device->ws, cmd_buffer->gang.cs, device->physical_device->rad_info.gfx_level, NULL, 0,
812 RADV_QUEUE_COMPUTE, RADV_CMD_FLAG_CS_PARTIAL_FLUSH, &sqtt_flush_bits, 0);
813 }
814 }
815
816 if (radv_device_fault_detection_enabled(device))
817 radv_cmd_buffer_trace_emit(cmd_buffer);
818 }
819
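/* Record the CPU pointer of the bound pipeline in the trace BO so it can be
 * identified when a GPU hang is reported.
 */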
820 static void
821 radv_save_pipeline(struct radv_cmd_buffer *cmd_buffer, struct radv_pipeline *pipeline)
822 {
823 struct radv_device *device = cmd_buffer->device;
824 enum amd_ip_type ring;
825 uint32_t data[2];
826 uint64_t va;
827
828 va = radv_buffer_get_va(device->trace_bo);
829
830 ring = radv_queue_family_to_ring(device->physical_device, cmd_buffer->qf);
831
832 switch (ring) {
833 case AMD_IP_GFX:
834 va += 8;
835 break;
836 case AMD_IP_COMPUTE:
837 va += 16;
838 break;
839 default:
840 assert(!"invalid IP type");
841 }
842
843 uint64_t pipeline_address = (uintptr_t)pipeline;
844 data[0] = pipeline_address;
845 data[1] = pipeline_address >> 32;
846
847 radv_write_data(cmd_buffer, V_370_ME, va, 2, data, false);
848 }
849
850 static void
851 radv_save_vertex_descriptors(struct radv_cmd_buffer *cmd_buffer, uint64_t vb_ptr)
852 {
853 struct radv_device *device = cmd_buffer->device;
854 uint32_t data[2];
855 uint64_t va;
856
857 va = radv_buffer_get_va(device->trace_bo);
858 va += 24;
859
860 data[0] = vb_ptr;
861 data[1] = vb_ptr >> 32;
862
863 radv_write_data(cmd_buffer, V_370_ME, va, 2, data, false);
864 }
865
866 static void
867 radv_save_vs_prolog(struct radv_cmd_buffer *cmd_buffer, const struct radv_shader_part *prolog)
868 {
869 struct radv_device *device = cmd_buffer->device;
870 uint32_t data[2];
871 uint64_t va;
872
873 va = radv_buffer_get_va(device->trace_bo);
874 va += 32;
875
876 uint64_t prolog_address = (uintptr_t)prolog;
877 data[0] = prolog_address;
878 data[1] = prolog_address >> 32;
879
880 radv_write_data(cmd_buffer, V_370_ME, va, 2, data, false);
881 }
882
883 void
884 radv_set_descriptor_set(struct radv_cmd_buffer *cmd_buffer, VkPipelineBindPoint bind_point,
885 struct radv_descriptor_set *set, unsigned idx)
886 {
887 struct radv_descriptor_state *descriptors_state = radv_get_descriptors_state(cmd_buffer, bind_point);
888
889 descriptors_state->sets[idx] = set;
890
891 descriptors_state->valid |= (1u << idx); /* active descriptors */
892 descriptors_state->dirty |= (1u << idx);
893 }
894
895 static void
896 radv_save_descriptors(struct radv_cmd_buffer *cmd_buffer, VkPipelineBindPoint bind_point)
897 {
898 struct radv_descriptor_state *descriptors_state = radv_get_descriptors_state(cmd_buffer, bind_point);
899 struct radv_device *device = cmd_buffer->device;
900 uint32_t data[MAX_SETS * 2] = {0};
901 uint64_t va;
902 va = radv_buffer_get_va(device->trace_bo) + 40;
903
904 u_foreach_bit (i, descriptors_state->valid) {
905 struct radv_descriptor_set *set = descriptors_state->sets[i];
906 data[i * 2] = (uint64_t)(uintptr_t)set;
907 data[i * 2 + 1] = (uint64_t)(uintptr_t)set >> 32;
908 }
909
910 radv_write_data(cmd_buffer, V_370_ME, va, MAX_SETS * 2, data, false);
911 }
912
913 const struct radv_userdata_info *
914 radv_get_user_sgpr(const struct radv_shader *shader, int idx)
915 {
916 return &shader->info.user_sgprs_locs.shader_data[idx];
917 }
918
919 static void
920 radv_emit_userdata_address(struct radv_device *device, struct radeon_cmdbuf *cs, struct radv_shader *shader,
921 uint32_t base_reg, int idx, uint64_t va)
922 {
923 const struct radv_userdata_info *loc = &shader->info.user_sgprs_locs.shader_data[idx];
924
925 if (loc->sgpr_idx == -1)
926 return;
927
928 assert(loc->num_sgprs == 1);
929
930 radv_emit_shader_pointer(device, cs, base_reg + loc->sgpr_idx * 4, va, false);
931 }
932
933 static uint64_t
934 radv_descriptor_get_va(const struct radv_descriptor_state *descriptors_state, unsigned set_idx)
935 {
936 struct radv_descriptor_set *set = descriptors_state->sets[set_idx];
937 uint64_t va;
938
939 if (set) {
940 va = set->header.va;
941 } else {
942 va = descriptors_state->descriptor_buffers[set_idx];
943 }
944
945 return va;
946 }
947
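/* Emit the user SGPR pointers for all descriptor sets that are both dirty and
 * used by the shader, batching consecutive sets into a single packet.
 */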
948 static void
949 radv_emit_descriptor_pointers(struct radv_device *device, struct radeon_cmdbuf *cs, struct radv_shader *shader,
950 uint32_t sh_base, struct radv_descriptor_state *descriptors_state)
951 {
952 struct radv_userdata_locations *locs = &shader->info.user_sgprs_locs;
953 unsigned mask = locs->descriptor_sets_enabled;
954
955 mask &= descriptors_state->dirty & descriptors_state->valid;
956
957 while (mask) {
958 int start, count;
959
960 u_bit_scan_consecutive_range(&mask, &start, &count);
961
962 struct radv_userdata_info *loc = &locs->descriptor_sets[start];
963 unsigned sh_offset = sh_base + loc->sgpr_idx * 4;
964
965 radv_emit_shader_pointer_head(cs, sh_offset, count, true);
966 for (int i = 0; i < count; i++) {
967 uint64_t va = radv_descriptor_get_va(descriptors_state, start + i);
968
969 radv_emit_shader_pointer_body(device, cs, va, true);
970 }
971 }
972 }
973
974 static unsigned
975 radv_get_rasterization_prim(struct radv_cmd_buffer *cmd_buffer)
976 {
977 const struct radv_shader *last_vgt_shader = cmd_buffer->state.last_vgt_shader;
978 const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
979
980 if (cmd_buffer->state.active_stages &
981 (VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT | VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT |
982 VK_SHADER_STAGE_GEOMETRY_BIT | VK_SHADER_STAGE_MESH_BIT_EXT)) {
983 /* Ignore dynamic primitive topology for TES/GS/MS stages. */
984 return cmd_buffer->state.rast_prim;
985 }
986
987 return radv_conv_prim_to_gs_out(d->vk.ia.primitive_topology, last_vgt_shader->info.is_ngg);
988 }
989
990 static ALWAYS_INLINE unsigned
991 radv_get_rasterization_samples(struct radv_cmd_buffer *cmd_buffer)
992 {
993 const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
994
995 if (d->vk.rs.line.mode == VK_LINE_RASTERIZATION_MODE_BRESENHAM_KHR &&
996 radv_rast_prim_is_line(radv_get_rasterization_prim(cmd_buffer))) {
997 /* From the Vulkan spec 1.3.221:
998 *
999 * "When Bresenham lines are being rasterized, sample locations may all be treated as being at
1000 * the pixel center (this may affect attribute and depth interpolation)."
1001 *
1002 * "One consequence of this is that Bresenham lines cover the same pixels regardless of the
1003 * number of rasterization samples, and cover all samples in those pixels (unless masked out
1004 * or killed)."
1005 */
1006 return 1;
1007 }
1008
1009 if (d->vk.rs.line.mode == VK_LINE_RASTERIZATION_MODE_RECTANGULAR_SMOOTH_KHR &&
1010 radv_rast_prim_is_line(radv_get_rasterization_prim(cmd_buffer))) {
1011 return RADV_NUM_SMOOTH_AA_SAMPLES;
1012 }
1013
1014 return MAX2(1, d->vk.ms.rasterization_samples);
1015 }
1016
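/* Return the number of samples the fragment shader is invoked per pixel,
 * derived from the min sample shading fraction and the current color and
 * rasterization sample counts.
 */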
1017 static ALWAYS_INLINE unsigned
1018 radv_get_ps_iter_samples(struct radv_cmd_buffer *cmd_buffer)
1019 {
1020 const struct radv_rendering_state *render = &cmd_buffer->state.render;
1021 unsigned ps_iter_samples = 1;
1022
1023 if (cmd_buffer->state.ms.sample_shading_enable) {
1024 unsigned rasterization_samples = radv_get_rasterization_samples(cmd_buffer);
1025 unsigned color_samples = MAX2(render->color_samples, rasterization_samples);
1026
1027 ps_iter_samples = ceilf(cmd_buffer->state.ms.min_sample_shading * color_samples);
1028 ps_iter_samples = util_next_power_of_two(ps_iter_samples);
1029 }
1030
1031 return ps_iter_samples;
1032 }
1033
1034 /**
1035 * Convert the user sample locations to hardware sample locations (the values
1036 * that will be emitted by PA_SC_AA_SAMPLE_LOCS_PIXEL_*).
1037 */
1038 static void
1039 radv_convert_user_sample_locs(const struct radv_sample_locations_state *state, uint32_t x, uint32_t y,
1040 VkOffset2D *sample_locs)
1041 {
1042 uint32_t x_offset = x % state->grid_size.width;
1043 uint32_t y_offset = y % state->grid_size.height;
1044 uint32_t num_samples = (uint32_t)state->per_pixel;
1045 uint32_t pixel_offset;
1046
1047 pixel_offset = (x_offset + y_offset * state->grid_size.width) * num_samples;
1048
1049 assert(pixel_offset <= MAX_SAMPLE_LOCATIONS);
1050 const VkSampleLocationEXT *user_locs = &state->locations[pixel_offset];
1051
1052 for (uint32_t i = 0; i < num_samples; i++) {
1053 float shifted_pos_x = user_locs[i].x - 0.5;
1054 float shifted_pos_y = user_locs[i].y - 0.5;
1055
1056 int32_t scaled_pos_x = floorf(shifted_pos_x * 16);
1057 int32_t scaled_pos_y = floorf(shifted_pos_y * 16);
1058
1059 sample_locs[i].x = CLAMP(scaled_pos_x, -8, 7);
1060 sample_locs[i].y = CLAMP(scaled_pos_y, -8, 7);
1061 }
1062 }
1063
1064 /**
1065 * Compute the PA_SC_AA_SAMPLE_LOCS_PIXEL_* mask based on hardware sample
1066 * locations.
1067 */
1068 static void
1069 radv_compute_sample_locs_pixel(uint32_t num_samples, VkOffset2D *sample_locs, uint32_t *sample_locs_pixel)
1070 {
1071 for (uint32_t i = 0; i < num_samples; i++) {
1072 uint32_t sample_reg_idx = i / 4;
1073 uint32_t sample_loc_idx = i % 4;
1074 int32_t pos_x = sample_locs[i].x;
1075 int32_t pos_y = sample_locs[i].y;
1076
1077 uint32_t shift_x = 8 * sample_loc_idx;
1078 uint32_t shift_y = shift_x + 4;
1079
1080 sample_locs_pixel[sample_reg_idx] |= (pos_x & 0xf) << shift_x;
1081 sample_locs_pixel[sample_reg_idx] |= (pos_y & 0xf) << shift_y;
1082 }
1083 }
1084
1085 /**
1086 * Compute the PA_SC_CENTROID_PRIORITY_* mask based on the top left hardware
1087 * sample locations.
1088 */
1089 static uint64_t
1090 radv_compute_centroid_priority(struct radv_cmd_buffer *cmd_buffer, VkOffset2D *sample_locs, uint32_t num_samples)
1091 {
1092 uint32_t *centroid_priorities = alloca(num_samples * sizeof(*centroid_priorities));
1093 uint32_t sample_mask = num_samples - 1;
1094 uint32_t *distances = alloca(num_samples * sizeof(*distances));
1095 uint64_t centroid_priority = 0;
1096
1097 /* Compute the distance from the pixel center for each sample. */
1098 for (int i = 0; i < num_samples; i++) {
1099 distances[i] = (sample_locs[i].x * sample_locs[i].x) + (sample_locs[i].y * sample_locs[i].y);
1100 }
1101
1102 /* Compute the centroid priorities by looking at the distances array. */
1103 for (int i = 0; i < num_samples; i++) {
1104 uint32_t min_idx = 0;
1105
1106 for (int j = 1; j < num_samples; j++) {
1107 if (distances[j] < distances[min_idx])
1108 min_idx = j;
1109 }
1110
1111 centroid_priorities[i] = min_idx;
1112 distances[min_idx] = 0xffffffff;
1113 }
1114
1115 /* Compute the final centroid priority. */
1116 for (int i = 0; i < 8; i++) {
1117 centroid_priority |= centroid_priorities[i & sample_mask] << (i * 4);
1118 }
1119
1120 return centroid_priority << 32 | centroid_priority;
1121 }
1122
1123 /**
1124 * Emit the sample locations that are specified with VK_EXT_sample_locations.
1125 */
1126 static void
1127 radv_emit_sample_locations(struct radv_cmd_buffer *cmd_buffer)
1128 {
1129 const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
1130 uint32_t num_samples = (uint32_t)d->sample_location.per_pixel;
1131 struct radeon_cmdbuf *cs = cmd_buffer->cs;
1132 uint32_t sample_locs_pixel[4][2] = {0};
1133 VkOffset2D sample_locs[4][8]; /* 8 is the max. sample count supported */
1134 uint64_t centroid_priority;
1135
1136 if (!d->sample_location.count || !d->vk.ms.sample_locations_enable)
1137 return;
1138
1139 /* Convert the user sample locations to hardware sample locations. */
1140 radv_convert_user_sample_locs(&d->sample_location, 0, 0, sample_locs[0]);
1141 radv_convert_user_sample_locs(&d->sample_location, 1, 0, sample_locs[1]);
1142 radv_convert_user_sample_locs(&d->sample_location, 0, 1, sample_locs[2]);
1143 radv_convert_user_sample_locs(&d->sample_location, 1, 1, sample_locs[3]);
1144
1145 /* Compute the PA_SC_AA_SAMPLE_LOCS_PIXEL_* mask. */
1146 for (uint32_t i = 0; i < 4; i++) {
1147 radv_compute_sample_locs_pixel(num_samples, sample_locs[i], sample_locs_pixel[i]);
1148 }
1149
1150 /* Compute the PA_SC_CENTROID_PRIORITY_* mask. */
1151 centroid_priority = radv_compute_centroid_priority(cmd_buffer, sample_locs[0], num_samples);
1152
1153 /* Emit the specified user sample locations. */
1154 switch (num_samples) {
1155 case 2:
1156 case 4:
1157 radeon_set_context_reg(cs, R_028BF8_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_0, sample_locs_pixel[0][0]);
1158 radeon_set_context_reg(cs, R_028C08_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_0, sample_locs_pixel[1][0]);
1159 radeon_set_context_reg(cs, R_028C18_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_0, sample_locs_pixel[2][0]);
1160 radeon_set_context_reg(cs, R_028C28_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_0, sample_locs_pixel[3][0]);
1161 break;
1162 case 8:
1163 radeon_set_context_reg(cs, R_028BF8_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_0, sample_locs_pixel[0][0]);
1164 radeon_set_context_reg(cs, R_028C08_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_0, sample_locs_pixel[1][0]);
1165 radeon_set_context_reg(cs, R_028C18_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_0, sample_locs_pixel[2][0]);
1166 radeon_set_context_reg(cs, R_028C28_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_0, sample_locs_pixel[3][0]);
1167 radeon_set_context_reg(cs, R_028BFC_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_1, sample_locs_pixel[0][1]);
1168 radeon_set_context_reg(cs, R_028C0C_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_1, sample_locs_pixel[1][1]);
1169 radeon_set_context_reg(cs, R_028C1C_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_1, sample_locs_pixel[2][1]);
1170 radeon_set_context_reg(cs, R_028C2C_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_1, sample_locs_pixel[3][1]);
1171 break;
1172 default:
1173 unreachable("invalid number of samples");
1174 }
1175
1176 radeon_set_context_reg_seq(cs, R_028BD4_PA_SC_CENTROID_PRIORITY_0, 2);
1177 radeon_emit(cs, centroid_priority);
1178 radeon_emit(cs, centroid_priority >> 32);
1179 }
1180
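/* Load push constants that fit into user SGPRs directly with SET_SH_REG
 * instead of going through memory.
 */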
1181 static void
1182 radv_emit_inline_push_consts(struct radv_device *device, struct radeon_cmdbuf *cs, const struct radv_shader *shader,
1183 uint32_t base_reg, int idx, uint32_t *values)
1184 {
1185 const struct radv_userdata_info *loc = &shader->info.user_sgprs_locs.shader_data[idx];
1186
1187 if (loc->sgpr_idx == -1)
1188 return;
1189
1190 radeon_check_space(device->ws, cs, 2 + loc->num_sgprs);
1191
1192 radeon_set_sh_reg_seq(cs, base_reg + loc->sgpr_idx * 4, loc->num_sgprs);
1193 radeon_emit_array(cs, values, loc->num_sgprs);
1194 }
1195
1196 struct radv_bin_size_entry {
1197 unsigned bpp;
1198 VkExtent2D extent;
1199 };
1200
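/* Compute the primitive binning bin size on GFX10+ from the color, FMASK and
 * depth/stencil cache-tag budgets of the currently bound attachments.
 */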
1201 static VkExtent2D
1202 radv_gfx10_compute_bin_size(struct radv_cmd_buffer *cmd_buffer)
1203 {
1204 const struct radv_physical_device *pdevice = cmd_buffer->device->physical_device;
1205 const struct radv_rendering_state *render = &cmd_buffer->state.render;
1206 const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
1207 VkExtent2D extent = {512, 512};
1208
1209 const unsigned db_tag_size = 64;
1210 const unsigned db_tag_count = 312;
1211 const unsigned color_tag_size = 1024;
1212 const unsigned color_tag_count = 31;
1213 const unsigned fmask_tag_size = 256;
1214 const unsigned fmask_tag_count = 44;
1215
1216 const unsigned rb_count = pdevice->rad_info.max_render_backends;
1217 const unsigned pipe_count = MAX2(rb_count, pdevice->rad_info.num_tcc_blocks);
1218
1219 const unsigned db_tag_part = (db_tag_count * rb_count / pipe_count) * db_tag_size * pipe_count;
1220 const unsigned color_tag_part = (color_tag_count * rb_count / pipe_count) * color_tag_size * pipe_count;
1221 const unsigned fmask_tag_part = (fmask_tag_count * rb_count / pipe_count) * fmask_tag_size * pipe_count;
1222
1223 const unsigned total_samples = radv_get_rasterization_samples(cmd_buffer);
1224 const unsigned samples_log = util_logbase2_ceil(total_samples);
1225
1226 unsigned color_bytes_per_pixel = 0;
1227 unsigned fmask_bytes_per_pixel = 0;
1228
1229 for (unsigned i = 0; i < render->color_att_count; ++i) {
1230 struct radv_image_view *iview = render->color_att[i].iview;
1231
1232 if (!iview)
1233 continue;
1234
1235 if (!d->vk.cb.attachments[i].write_mask)
1236 continue;
1237
1238 color_bytes_per_pixel += vk_format_get_blocksize(render->color_att[i].format);
1239
1240 if (total_samples > 1) {
1241 assert(samples_log <= 3);
1242 const unsigned fmask_array[] = {0, 1, 1, 4};
1243 fmask_bytes_per_pixel += fmask_array[samples_log];
1244 }
1245 }
1246
1247 color_bytes_per_pixel *= total_samples;
1248 color_bytes_per_pixel = MAX2(color_bytes_per_pixel, 1);
1249
1250 const unsigned color_pixel_count_log = util_logbase2(color_tag_part / color_bytes_per_pixel);
1251 extent.width = 1ull << ((color_pixel_count_log + 1) / 2);
1252 extent.height = 1ull << (color_pixel_count_log / 2);
1253
1254 if (fmask_bytes_per_pixel) {
1255 const unsigned fmask_pixel_count_log = util_logbase2(fmask_tag_part / fmask_bytes_per_pixel);
1256
1257 const VkExtent2D fmask_extent = (VkExtent2D){.width = 1ull << ((fmask_pixel_count_log + 1) / 2),
1258 .height = 1ull << (color_pixel_count_log / 2)};
1259
1260 if (fmask_extent.width * fmask_extent.height < extent.width * extent.height)
1261 extent = fmask_extent;
1262 }
1263
1264 if (render->ds_att.iview) {
1265 /* Coefficients taken from AMDVLK */
1266 unsigned depth_coeff = vk_format_has_depth(render->ds_att.format) ? 5 : 0;
1267 unsigned stencil_coeff = vk_format_has_stencil(render->ds_att.format) ? 1 : 0;
1268 unsigned db_bytes_per_pixel = (depth_coeff + stencil_coeff) * total_samples;
1269
1270 const unsigned db_pixel_count_log = util_logbase2(db_tag_part / db_bytes_per_pixel);
1271
1272 const VkExtent2D db_extent =
1273 (VkExtent2D){.width = 1ull << ((db_pixel_count_log + 1) / 2), .height = 1ull << (color_pixel_count_log / 2)};
1274
1275 if (db_extent.width * db_extent.height < extent.width * extent.height)
1276 extent = db_extent;
1277 }
1278
1279 extent.width = MAX2(extent.width, 128);
1280 extent.height = MAX2(extent.width, 64);
1281
1282 return extent;
1283 }
1284
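/* Compute the primitive binning bin size on GFX9 using the lookup tables
 * below, indexed by the number of RBs per SE, the number of SEs and the
 * bytes-per-pixel cost of the bound attachments.
 */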
1285 static VkExtent2D
1286 radv_gfx9_compute_bin_size(struct radv_cmd_buffer *cmd_buffer)
1287 {
1288 const struct radv_physical_device *pdevice = cmd_buffer->device->physical_device;
1289 const struct radv_rendering_state *render = &cmd_buffer->state.render;
1290 const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
1291 static const struct radv_bin_size_entry color_size_table[][3][9] = {
1292 {
1293 /* One RB / SE */
1294 {
1295 /* One shader engine */
1296 {0, {128, 128}},
1297 {1, {64, 128}},
1298 {2, {32, 128}},
1299 {3, {16, 128}},
1300 {17, {0, 0}},
1301 {UINT_MAX, {0, 0}},
1302 },
1303 {
1304 /* Two shader engines */
1305 {0, {128, 128}},
1306 {2, {64, 128}},
1307 {3, {32, 128}},
1308 {5, {16, 128}},
1309 {17, {0, 0}},
1310 {UINT_MAX, {0, 0}},
1311 },
1312 {
1313 /* Four shader engines */
1314 {0, {128, 128}},
1315 {3, {64, 128}},
1316 {5, {16, 128}},
1317 {17, {0, 0}},
1318 {UINT_MAX, {0, 0}},
1319 },
1320 },
1321 {
1322 /* Two RB / SE */
1323 {
1324 /* One shader engine */
1325 {0, {128, 128}},
1326 {2, {64, 128}},
1327 {3, {32, 128}},
1328 {5, {16, 128}},
1329 {33, {0, 0}},
1330 {UINT_MAX, {0, 0}},
1331 },
1332 {
1333 /* Two shader engines */
1334 {0, {128, 128}},
1335 {3, {64, 128}},
1336 {5, {32, 128}},
1337 {9, {16, 128}},
1338 {33, {0, 0}},
1339 {UINT_MAX, {0, 0}},
1340 },
1341 {
1342 /* Four shader engines */
1343 {0, {256, 256}},
1344 {2, {128, 256}},
1345 {3, {128, 128}},
1346 {5, {64, 128}},
1347 {9, {16, 128}},
1348 {33, {0, 0}},
1349 {UINT_MAX, {0, 0}},
1350 },
1351 },
1352 {
1353 /* Four RB / SE */
1354 {
1355 /* One shader engine */
1356 {0, {128, 256}},
1357 {2, {128, 128}},
1358 {3, {64, 128}},
1359 {5, {32, 128}},
1360 {9, {16, 128}},
1361 {33, {0, 0}},
1362 {UINT_MAX, {0, 0}},
1363 },
1364 {
1365 /* Two shader engines */
1366 {0, {256, 256}},
1367 {2, {128, 256}},
1368 {3, {128, 128}},
1369 {5, {64, 128}},
1370 {9, {32, 128}},
1371 {17, {16, 128}},
1372 {33, {0, 0}},
1373 {UINT_MAX, {0, 0}},
1374 },
1375 {
1376 /* Four shader engines */
1377 {0, {256, 512}},
1378 {2, {256, 256}},
1379 {3, {128, 256}},
1380 {5, {128, 128}},
1381 {9, {64, 128}},
1382 {17, {16, 128}},
1383 {33, {0, 0}},
1384 {UINT_MAX, {0, 0}},
1385 },
1386 },
1387 };
1388 static const struct radv_bin_size_entry ds_size_table[][3][9] = {
1389 {
1390 // One RB / SE
1391 {
1392 // One shader engine
1393 {0, {128, 256}},
1394 {2, {128, 128}},
1395 {4, {64, 128}},
1396 {7, {32, 128}},
1397 {13, {16, 128}},
1398 {49, {0, 0}},
1399 {UINT_MAX, {0, 0}},
1400 },
1401 {
1402 // Two shader engines
1403 {0, {256, 256}},
1404 {2, {128, 256}},
1405 {4, {128, 128}},
1406 {7, {64, 128}},
1407 {13, {32, 128}},
1408 {25, {16, 128}},
1409 {49, {0, 0}},
1410 {UINT_MAX, {0, 0}},
1411 },
1412 {
1413 // Four shader engines
1414 {0, {256, 512}},
1415 {2, {256, 256}},
1416 {4, {128, 256}},
1417 {7, {128, 128}},
1418 {13, {64, 128}},
1419 {25, {16, 128}},
1420 {49, {0, 0}},
1421 {UINT_MAX, {0, 0}},
1422 },
1423 },
1424 {
1425 // Two RB / SE
1426 {
1427 // One shader engine
1428 {0, {256, 256}},
1429 {2, {128, 256}},
1430 {4, {128, 128}},
1431 {7, {64, 128}},
1432 {13, {32, 128}},
1433 {25, {16, 128}},
1434 {97, {0, 0}},
1435 {UINT_MAX, {0, 0}},
1436 },
1437 {
1438 // Two shader engines
1439 {0, {256, 512}},
1440 {2, {256, 256}},
1441 {4, {128, 256}},
1442 {7, {128, 128}},
1443 {13, {64, 128}},
1444 {25, {32, 128}},
1445 {49, {16, 128}},
1446 {97, {0, 0}},
1447 {UINT_MAX, {0, 0}},
1448 },
1449 {
1450 // Four shader engines
1451 {0, {512, 512}},
1452 {2, {256, 512}},
1453 {4, {256, 256}},
1454 {7, {128, 256}},
1455 {13, {128, 128}},
1456 {25, {64, 128}},
1457 {49, {16, 128}},
1458 {97, {0, 0}},
1459 {UINT_MAX, {0, 0}},
1460 },
1461 },
1462 {
1463 // Four RB / SE
1464 {
1465 // One shader engine
1466 {0, {256, 512}},
1467 {2, {256, 256}},
1468 {4, {128, 256}},
1469 {7, {128, 128}},
1470 {13, {64, 128}},
1471 {25, {32, 128}},
1472 {49, {16, 128}},
1473 {UINT_MAX, {0, 0}},
1474 },
1475 {
1476 // Two shader engines
1477 {0, {512, 512}},
1478 {2, {256, 512}},
1479 {4, {256, 256}},
1480 {7, {128, 256}},
1481 {13, {128, 128}},
1482 {25, {64, 128}},
1483 {49, {32, 128}},
1484 {97, {16, 128}},
1485 {UINT_MAX, {0, 0}},
1486 },
1487 {
1488 // Four shader engines
1489 {0, {512, 512}},
1490 {4, {256, 512}},
1491 {7, {256, 256}},
1492 {13, {128, 256}},
1493 {25, {128, 128}},
1494 {49, {64, 128}},
1495 {97, {16, 128}},
1496 {UINT_MAX, {0, 0}},
1497 },
1498 },
1499 };
1500
1501 VkExtent2D extent = {512, 512};
1502
1503 unsigned log_num_rb_per_se = util_logbase2_ceil(pdevice->rad_info.max_render_backends / pdevice->rad_info.max_se);
1504 unsigned log_num_se = util_logbase2_ceil(pdevice->rad_info.max_se);
1505
1506 unsigned total_samples = radv_get_rasterization_samples(cmd_buffer);
1507 unsigned ps_iter_samples = radv_get_ps_iter_samples(cmd_buffer);
1508 unsigned effective_samples = total_samples;
1509 unsigned color_bytes_per_pixel = 0;
1510
1511 for (unsigned i = 0; i < render->color_att_count; ++i) {
1512 struct radv_image_view *iview = render->color_att[i].iview;
1513
1514 if (!iview)
1515 continue;
1516
1517 if (!d->vk.cb.attachments[i].write_mask)
1518 continue;
1519
1520 color_bytes_per_pixel += vk_format_get_blocksize(render->color_att[i].format);
1521 }
1522
1523 /* MSAA images typically don't use all samples all the time. */
1524 if (effective_samples >= 2 && ps_iter_samples <= 1)
1525 effective_samples = 2;
1526 color_bytes_per_pixel *= effective_samples;
1527
1528 const struct radv_bin_size_entry *color_entry = color_size_table[log_num_rb_per_se][log_num_se];
1529 while (color_entry[1].bpp <= color_bytes_per_pixel)
1530 ++color_entry;
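/* Each table row is sorted by increasing bpp threshold and the walk above
 * stops at the last entry whose threshold is <= the computed bpp.
 * Illustrative example: with 2 RB/SE and 2 shader engines, a
 * color_bytes_per_pixel of 8 falls between the {5, {32, 128}} and
 * {9, {16, 128}} entries, so a 32x128 bin is selected.
 */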
1531
1532 extent = color_entry->extent;
1533
1534 if (render->ds_att.iview) {
1535 /* Coefficients taken from AMDVLK */
1536 unsigned depth_coeff = vk_format_has_depth(render->ds_att.format) ? 5 : 0;
1537 unsigned stencil_coeff = vk_format_has_stencil(render->ds_att.format) ? 1 : 0;
1538 unsigned ds_bytes_per_pixel = 4 * (depth_coeff + stencil_coeff) * total_samples;
1539
1540 const struct radv_bin_size_entry *ds_entry = ds_size_table[log_num_rb_per_se][log_num_se];
1541 while (ds_entry[1].bpp <= ds_bytes_per_pixel)
1542 ++ds_entry;
1543
1544 if (ds_entry->extent.width * ds_entry->extent.height < extent.width * extent.height)
1545 extent = ds_entry->extent;
1546 }
1547
1548 return extent;
1549 }
1550
1551 static unsigned
1552 radv_get_disabled_binning_state(struct radv_cmd_buffer *cmd_buffer)
1553 {
1554 const struct radv_physical_device *pdevice = cmd_buffer->device->physical_device;
1555 const struct radv_rendering_state *render = &cmd_buffer->state.render;
1556 const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
1557 uint32_t pa_sc_binner_cntl_0;
1558
1559 if (pdevice->rad_info.gfx_level >= GFX10) {
1560 unsigned min_bytes_per_pixel = 0;
1561
1562 for (unsigned i = 0; i < render->color_att_count; ++i) {
1563 struct radv_image_view *iview = render->color_att[i].iview;
1564
1565 if (!iview)
1566 continue;
1567
1568 if (!d->vk.cb.attachments[i].write_mask)
1569 continue;
1570
1571 unsigned bytes = vk_format_get_blocksize(render->color_att[i].format);
1572 if (!min_bytes_per_pixel || bytes < min_bytes_per_pixel)
1573 min_bytes_per_pixel = bytes;
1574 }
1575
1576 pa_sc_binner_cntl_0 = S_028C44_BINNING_MODE(V_028C44_DISABLE_BINNING_USE_NEW_SC) | S_028C44_BIN_SIZE_X(0) |
1577 S_028C44_BIN_SIZE_Y(0) | S_028C44_BIN_SIZE_X_EXTEND(2) | /* 128 */
1578 S_028C44_BIN_SIZE_Y_EXTEND(min_bytes_per_pixel <= 4 ? 2 : 1) | /* 128 or 64 */
1579 S_028C44_DISABLE_START_OF_PRIM(1) | S_028C44_FLUSH_ON_BINNING_TRANSITION(1);
1580 } else {
1581 pa_sc_binner_cntl_0 = S_028C44_BINNING_MODE(V_028C44_DISABLE_BINNING_USE_LEGACY_SC) |
1582 S_028C44_DISABLE_START_OF_PRIM(1) |
1583 S_028C44_FLUSH_ON_BINNING_TRANSITION(pdevice->rad_info.family == CHIP_VEGA12 ||
1584 pdevice->rad_info.family == CHIP_VEGA20 ||
1585 pdevice->rad_info.family >= CHIP_RAVEN2);
1586 }
1587
1588 return pa_sc_binner_cntl_0;
1589 }
1590
1591 static unsigned
1592 radv_get_binning_state(struct radv_cmd_buffer *cmd_buffer)
1593 {
1594 const struct radv_device *device = cmd_buffer->device;
1595 unsigned pa_sc_binner_cntl_0;
1596 VkExtent2D bin_size;
1597
1598 if (device->physical_device->rad_info.gfx_level >= GFX10) {
1599 bin_size = radv_gfx10_compute_bin_size(cmd_buffer);
1600 } else {
1601 assert(device->physical_device->rad_info.gfx_level == GFX9);
1602 bin_size = radv_gfx9_compute_bin_size(cmd_buffer);
1603 }
1604
1605 if (device->pbb_allowed && bin_size.width && bin_size.height) {
1606 struct radv_binning_settings *settings = &device->physical_device->binning_settings;
1607
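/* Bin size encoding: BIN_SIZE_X/Y set means the special 16-pixel size;
 * otherwise BIN_SIZE_X/Y_EXTEND encodes the dimension as 32 << EXTEND
 * (e.g. EXTEND = 2 -> 128, EXTEND = 4 -> 512), which is why the code below
 * programs log2(MAX2(size, 32)) - 5.
 */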
1608 pa_sc_binner_cntl_0 =
1609 S_028C44_BINNING_MODE(V_028C44_BINNING_ALLOWED) | S_028C44_BIN_SIZE_X(bin_size.width == 16) |
1610 S_028C44_BIN_SIZE_Y(bin_size.height == 16) |
1611 S_028C44_BIN_SIZE_X_EXTEND(util_logbase2(MAX2(bin_size.width, 32)) - 5) |
1612 S_028C44_BIN_SIZE_Y_EXTEND(util_logbase2(MAX2(bin_size.height, 32)) - 5) |
1613 S_028C44_CONTEXT_STATES_PER_BIN(settings->context_states_per_bin - 1) |
1614 S_028C44_PERSISTENT_STATES_PER_BIN(settings->persistent_states_per_bin - 1) |
1615 S_028C44_DISABLE_START_OF_PRIM(1) | S_028C44_FPOVS_PER_BATCH(settings->fpovs_per_batch) |
1616 S_028C44_OPTIMAL_BIN_SELECTION(1) |
1617 S_028C44_FLUSH_ON_BINNING_TRANSITION(device->physical_device->rad_info.family == CHIP_VEGA12 ||
1618 device->physical_device->rad_info.family == CHIP_VEGA20 ||
1619 device->physical_device->rad_info.family >= CHIP_RAVEN2);
1620 } else {
1621 pa_sc_binner_cntl_0 = radv_get_disabled_binning_state(cmd_buffer);
1622 }
1623
1624 return pa_sc_binner_cntl_0;
1625 }
1626
1627 static void
1628 radv_emit_binning_state(struct radv_cmd_buffer *cmd_buffer)
1629 {
1630 unsigned pa_sc_binner_cntl_0;
1631
1632 if (cmd_buffer->device->physical_device->rad_info.gfx_level < GFX9)
1633 return;
1634
1635 pa_sc_binner_cntl_0 = radv_get_binning_state(cmd_buffer);
1636
1637 if (pa_sc_binner_cntl_0 == cmd_buffer->state.last_pa_sc_binner_cntl_0)
1638 return;
1639
1640 radeon_set_context_reg(cmd_buffer->cs, R_028C44_PA_SC_BINNER_CNTL_0, pa_sc_binner_cntl_0);
1641
1642 cmd_buffer->state.last_pa_sc_binner_cntl_0 = pa_sc_binner_cntl_0;
1643 }
1644
1645 static void
1646 radv_emit_shader_prefetch(struct radv_cmd_buffer *cmd_buffer, struct radv_shader *shader)
1647 {
1648 uint64_t va;
1649
1650 if (!shader)
1651 return;
1652
1653 va = radv_shader_get_va(shader);
1654
1655 radv_cp_dma_prefetch(cmd_buffer, va, shader->code_size);
1656 }
1657
1658 ALWAYS_INLINE static void
1659 radv_emit_prefetch_L2(struct radv_cmd_buffer *cmd_buffer, bool first_stage_only)
1660 {
1661 struct radv_cmd_state *state = &cmd_buffer->state;
1662 uint32_t mask = state->prefetch_L2_mask;
1663
1664 /* Fast prefetch path for starting draws as soon as possible. */
1665 if (first_stage_only)
1666 mask &= RADV_PREFETCH_VS | RADV_PREFETCH_VBO_DESCRIPTORS | RADV_PREFETCH_MS;
1667
1668 if (mask & RADV_PREFETCH_VS)
1669 radv_emit_shader_prefetch(cmd_buffer, cmd_buffer->state.shaders[MESA_SHADER_VERTEX]);
1670
1671 if (mask & RADV_PREFETCH_MS)
1672 radv_emit_shader_prefetch(cmd_buffer, cmd_buffer->state.shaders[MESA_SHADER_MESH]);
1673
1674 if (mask & RADV_PREFETCH_VBO_DESCRIPTORS)
1675 radv_cp_dma_prefetch(cmd_buffer, state->vb_va, state->vb_size);
1676
1677 if (mask & RADV_PREFETCH_TCS)
1678 radv_emit_shader_prefetch(cmd_buffer, cmd_buffer->state.shaders[MESA_SHADER_TESS_CTRL]);
1679
1680 if (mask & RADV_PREFETCH_TES)
1681 radv_emit_shader_prefetch(cmd_buffer, cmd_buffer->state.shaders[MESA_SHADER_TESS_EVAL]);
1682
1683 if (mask & RADV_PREFETCH_GS) {
1684 radv_emit_shader_prefetch(cmd_buffer, cmd_buffer->state.shaders[MESA_SHADER_GEOMETRY]);
1685 if (cmd_buffer->state.gs_copy_shader)
1686 radv_emit_shader_prefetch(cmd_buffer, cmd_buffer->state.gs_copy_shader);
1687 }
1688
1689 if (mask & RADV_PREFETCH_PS) {
1690 radv_emit_shader_prefetch(cmd_buffer, cmd_buffer->state.shaders[MESA_SHADER_FRAGMENT]);
1691 }
1692
1693 state->prefetch_L2_mask &= ~mask;
1694 }
1695
1696 static void
1697 radv_emit_rbplus_state(struct radv_cmd_buffer *cmd_buffer)
1698 {
1699 assert(cmd_buffer->device->physical_device->rad_info.rbplus_allowed);
1700
1701 const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
1702 struct radv_rendering_state *render = &cmd_buffer->state.render;
1703
1704 unsigned sx_ps_downconvert = 0;
1705 unsigned sx_blend_opt_epsilon = 0;
1706 unsigned sx_blend_opt_control = 0;
1707
1708 for (unsigned i = 0; i < render->color_att_count; i++) {
1709 unsigned format, swap;
1710 bool has_alpha, has_rgb;
1711 if (render->color_att[i].iview == NULL) {
1712 /* We don't set the DISABLE bits, because the HW can't have holes,
1713 * so the SPI color format is set to 32-bit 1-component. */
1714 sx_ps_downconvert |= V_028754_SX_RT_EXPORT_32_R << (i * 4);
1715 continue;
1716 }
1717
1718 struct radv_color_buffer_info *cb = &render->color_att[i].cb;
1719
1720 format = cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX11
1721 ? G_028C70_FORMAT_GFX11(cb->cb_color_info)
1722 : G_028C70_FORMAT_GFX6(cb->cb_color_info);
1723 swap = G_028C70_COMP_SWAP(cb->cb_color_info);
1724 has_alpha = cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX11
1725 ? !G_028C74_FORCE_DST_ALPHA_1_GFX11(cb->cb_color_attrib)
1726 : !G_028C74_FORCE_DST_ALPHA_1_GFX6(cb->cb_color_attrib);
1727
1728 uint32_t spi_format = (cmd_buffer->state.col_format_non_compacted >> (i * 4)) & 0xf;
1729 uint32_t colormask = d->vk.cb.attachments[i].write_mask;
1730
1731 if (format == V_028C70_COLOR_8 || format == V_028C70_COLOR_16 || format == V_028C70_COLOR_32)
1732 has_rgb = !has_alpha;
1733 else
1734 has_rgb = true;
1735
1736 /* Check the colormask and export format. */
1737 if (!(colormask & 0x7))
1738 has_rgb = false;
1739 if (!(colormask & 0x8))
1740 has_alpha = false;
1741
1742 if (spi_format == V_028714_SPI_SHADER_ZERO) {
1743 has_rgb = false;
1744 has_alpha = false;
1745 }
1746
1747 /* The HW doesn't quite blend correctly with rgb9e5 if we disable the alpha
1748 * optimization, even though it has no alpha. */
1749 if (has_rgb && format == V_028C70_COLOR_5_9_9_9)
1750 has_alpha = true;
1751
1752 /* Disable value checking for disabled channels. */
1753 if (!has_rgb)
1754 sx_blend_opt_control |= S_02875C_MRT0_COLOR_OPT_DISABLE(1) << (i * 4);
1755 if (!has_alpha)
1756 sx_blend_opt_control |= S_02875C_MRT0_ALPHA_OPT_DISABLE(1) << (i * 4);
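/* Each MRT owns a 4-bit group in SX_PS_DOWNCONVERT, SX_BLEND_OPT_EPSILON and
 * SX_BLEND_OPT_CONTROL, hence the (i * 4) shifts: e.g. for MRT2 the
 * COLOR/ALPHA_OPT_DISABLE bits land in the bits 8-11 group of
 * SX_BLEND_OPT_CONTROL.
 */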
1757
1758 /* Enable down-conversion for 32bpp and smaller formats. */
1759 switch (format) {
1760 case V_028C70_COLOR_8:
1761 case V_028C70_COLOR_8_8:
1762 case V_028C70_COLOR_8_8_8_8:
1763 /* For 1 and 2-channel formats, use the superset thereof. */
1764 if (spi_format == V_028714_SPI_SHADER_FP16_ABGR || spi_format == V_028714_SPI_SHADER_UINT16_ABGR ||
1765 spi_format == V_028714_SPI_SHADER_SINT16_ABGR) {
1766 sx_ps_downconvert |= V_028754_SX_RT_EXPORT_8_8_8_8 << (i * 4);
1767
1768 if (G_028C70_NUMBER_TYPE(cb->cb_color_info) != V_028C70_NUMBER_SRGB)
1769 sx_blend_opt_epsilon |= V_028758_8BIT_FORMAT_0_5 << (i * 4);
1770 }
1771 break;
1772
1773 case V_028C70_COLOR_5_6_5:
1774 if (spi_format == V_028714_SPI_SHADER_FP16_ABGR) {
1775 sx_ps_downconvert |= V_028754_SX_RT_EXPORT_5_6_5 << (i * 4);
1776 sx_blend_opt_epsilon |= V_028758_6BIT_FORMAT_0_5 << (i * 4);
1777 }
1778 break;
1779
1780 case V_028C70_COLOR_1_5_5_5:
1781 if (spi_format == V_028714_SPI_SHADER_FP16_ABGR) {
1782 sx_ps_downconvert |= V_028754_SX_RT_EXPORT_1_5_5_5 << (i * 4);
1783 sx_blend_opt_epsilon |= V_028758_5BIT_FORMAT_0_5 << (i * 4);
1784 }
1785 break;
1786
1787 case V_028C70_COLOR_4_4_4_4:
1788 if (spi_format == V_028714_SPI_SHADER_FP16_ABGR) {
1789 sx_ps_downconvert |= V_028754_SX_RT_EXPORT_4_4_4_4 << (i * 4);
1790 sx_blend_opt_epsilon |= V_028758_4BIT_FORMAT_0_5 << (i * 4);
1791 }
1792 break;
1793
1794 case V_028C70_COLOR_32:
1795 if (swap == V_028C70_SWAP_STD && spi_format == V_028714_SPI_SHADER_32_R)
1796 sx_ps_downconvert |= V_028754_SX_RT_EXPORT_32_R << (i * 4);
1797 else if (swap == V_028C70_SWAP_ALT_REV && spi_format == V_028714_SPI_SHADER_32_AR)
1798 sx_ps_downconvert |= V_028754_SX_RT_EXPORT_32_A << (i * 4);
1799 break;
1800
1801 case V_028C70_COLOR_16:
1802 case V_028C70_COLOR_16_16:
1803 /* For 1-channel formats, use the superset thereof. */
1804 if (spi_format == V_028714_SPI_SHADER_UNORM16_ABGR || spi_format == V_028714_SPI_SHADER_SNORM16_ABGR ||
1805 spi_format == V_028714_SPI_SHADER_UINT16_ABGR || spi_format == V_028714_SPI_SHADER_SINT16_ABGR) {
1806 if (swap == V_028C70_SWAP_STD || swap == V_028C70_SWAP_STD_REV)
1807 sx_ps_downconvert |= V_028754_SX_RT_EXPORT_16_16_GR << (i * 4);
1808 else
1809 sx_ps_downconvert |= V_028754_SX_RT_EXPORT_16_16_AR << (i * 4);
1810 }
1811 break;
1812
1813 case V_028C70_COLOR_10_11_11:
1814 if (spi_format == V_028714_SPI_SHADER_FP16_ABGR)
1815 sx_ps_downconvert |= V_028754_SX_RT_EXPORT_10_11_11 << (i * 4);
1816 break;
1817
1818 case V_028C70_COLOR_2_10_10_10:
1819 if (spi_format == V_028714_SPI_SHADER_FP16_ABGR) {
1820 sx_ps_downconvert |= V_028754_SX_RT_EXPORT_2_10_10_10 << (i * 4);
1821 sx_blend_opt_epsilon |= V_028758_10BIT_FORMAT_0_5 << (i * 4);
1822 }
1823 break;
1824 case V_028C70_COLOR_5_9_9_9:
1825 if (spi_format == V_028714_SPI_SHADER_FP16_ABGR)
1826 sx_ps_downconvert |= V_028754_SX_RT_EXPORT_9_9_9_E5 << (i * 4);
1827 break;
1828 }
1829 }
1830
1831 /* Do not set the DISABLE bits for the unused attachments, as that
1832 * breaks dual source blending in SkQP and does not seem to improve
1833 * performance. */
1834
1835 if (sx_ps_downconvert != cmd_buffer->state.last_sx_ps_downconvert ||
1836 sx_blend_opt_epsilon != cmd_buffer->state.last_sx_blend_opt_epsilon ||
1837 sx_blend_opt_control != cmd_buffer->state.last_sx_blend_opt_control) {
1838 radeon_set_context_reg_seq(cmd_buffer->cs, R_028754_SX_PS_DOWNCONVERT, 3);
1839 radeon_emit(cmd_buffer->cs, sx_ps_downconvert);
1840 radeon_emit(cmd_buffer->cs, sx_blend_opt_epsilon);
1841 radeon_emit(cmd_buffer->cs, sx_blend_opt_control);
1842
1843 cmd_buffer->state.last_sx_ps_downconvert = sx_ps_downconvert;
1844 cmd_buffer->state.last_sx_blend_opt_epsilon = sx_blend_opt_epsilon;
1845 cmd_buffer->state.last_sx_blend_opt_control = sx_blend_opt_control;
1846 }
1847
1848 cmd_buffer->state.dirty &= ~RADV_CMD_DIRTY_RBPLUS;
1849 }
1850
1851 static void
1852 radv_emit_ps_epilog_state(struct radv_cmd_buffer *cmd_buffer, struct radv_shader_part *ps_epilog)
1853 {
1854 struct radv_shader *ps_shader = cmd_buffer->state.shaders[MESA_SHADER_FRAGMENT];
1855 const struct radv_device *device = cmd_buffer->device;
1856
1857 if (cmd_buffer->state.emitted_ps_epilog == ps_epilog)
1858 return;
1859
1860 uint32_t col_format = radv_compact_spi_shader_col_format(ps_shader, ps_epilog->spi_shader_col_format);
1861
1862 bool need_null_export_workaround =
1863 radv_needs_null_export_workaround(device, ps_shader, cmd_buffer->state.custom_blend_mode);
1864 if (need_null_export_workaround && !col_format)
1865 col_format = V_028714_SPI_SHADER_32_R;
1866 radeon_set_context_reg(cmd_buffer->cs, R_028714_SPI_SHADER_COL_FORMAT, col_format);
1867 radeon_set_context_reg(cmd_buffer->cs, R_02823C_CB_SHADER_MASK,
1868 ac_get_cb_shader_mask(ps_epilog->spi_shader_col_format));
1869
1870 if (ps_epilog->spi_shader_z_format)
1871 radeon_set_context_reg(cmd_buffer->cs, R_028710_SPI_SHADER_Z_FORMAT, ps_epilog->spi_shader_z_format);
1872
1873 assert(ps_shader->config.num_shared_vgprs == 0);
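/* If the epilog was compiled with a higher VGPR count than the main fragment
 * shader, patch the VGPR field of RSRC1: C_00B848_VGPRS is the inverted field
 * mask, so the expression below clears the field in the shader's rsrc1 and
 * copies it from the epilog's rsrc1.
 */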
1874 if (G_00B848_VGPRS(ps_epilog->rsrc1) > G_00B848_VGPRS(ps_shader->config.rsrc1)) {
1875 uint32_t rsrc1 = ps_shader->config.rsrc1;
1876 rsrc1 = (rsrc1 & C_00B848_VGPRS) | (ps_epilog->rsrc1 & ~C_00B848_VGPRS);
1877 radeon_set_sh_reg(cmd_buffer->cs, R_00B028_SPI_SHADER_PGM_RSRC1_PS, rsrc1);
1878 }
1879
1880 radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, ps_epilog->bo);
1881
1882 assert((ps_epilog->va >> 32) == cmd_buffer->device->physical_device->rad_info.address32_hi);
1883
1884 struct radv_userdata_info *loc = &ps_shader->info.user_sgprs_locs.shader_data[AC_UD_PS_EPILOG_PC];
1885 uint32_t base_reg = ps_shader->info.user_data_0;
1886 assert(loc->sgpr_idx != -1);
1887 assert(loc->num_sgprs == 1);
1888 radv_emit_shader_pointer(cmd_buffer->device, cmd_buffer->cs, base_reg + loc->sgpr_idx * 4, ps_epilog->va, false);
1889
1890 cmd_buffer->shader_upload_seq = MAX2(cmd_buffer->shader_upload_seq, ps_epilog->upload_seq);
1891
1892 cmd_buffer->state.emitted_ps_epilog = ps_epilog;
1893 }
1894
1895 static void
1896 radv_emit_tcs_epilog_state(struct radv_cmd_buffer *cmd_buffer, struct radv_shader_part *tcs_epilog)
1897 {
1898 const enum amd_gfx_level gfx_level = cmd_buffer->device->physical_device->rad_info.gfx_level;
1899 struct radv_shader *tcs = cmd_buffer->state.shaders[MESA_SHADER_TESS_CTRL];
1900 uint32_t rsrc1;
1901
1902 if (cmd_buffer->state.emitted_tcs_epilog == tcs_epilog)
1903 return;
1904
1905 if (tcs->info.merged_shader_compiled_separately) {
1906 radv_shader_combine_cfg_vs_tcs(cmd_buffer->state.shaders[MESA_SHADER_VERTEX], tcs, &rsrc1, NULL);
1907 } else {
1908 rsrc1 = tcs->config.rsrc1;
1909 }
1910
1911 assert(tcs->config.num_shared_vgprs == 0);
1912 if (G_00B848_VGPRS(tcs_epilog->rsrc1) > G_00B848_VGPRS(rsrc1))
1913 rsrc1 = (rsrc1 & C_00B848_VGPRS) | (tcs_epilog->rsrc1 & ~C_00B848_VGPRS);
1914 if (gfx_level < GFX10 && G_00B228_SGPRS(tcs_epilog->rsrc1) > G_00B228_SGPRS(rsrc1))
1915 rsrc1 = (rsrc1 & C_00B228_SGPRS) | (tcs_epilog->rsrc1 & ~C_00B228_SGPRS);
1916
1917 radeon_set_sh_reg(cmd_buffer->cs, R_00B428_SPI_SHADER_PGM_RSRC1_HS, rsrc1);
1918
1919 radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, tcs_epilog->bo);
1920
1921 assert((tcs_epilog->va >> 32) == cmd_buffer->device->physical_device->rad_info.address32_hi);
1922
1923 struct radv_userdata_info *loc = &tcs->info.user_sgprs_locs.shader_data[AC_UD_TCS_EPILOG_PC];
1924 uint32_t base_reg = tcs->info.user_data_0;
1925 assert(loc->sgpr_idx != -1);
1926 assert(loc->num_sgprs == 1);
1927 radv_emit_shader_pointer(cmd_buffer->device, cmd_buffer->cs, base_reg + loc->sgpr_idx * 4, tcs_epilog->va, false);
1928
1929 cmd_buffer->shader_upload_seq = MAX2(cmd_buffer->shader_upload_seq, tcs_epilog->upload_seq);
1930
1931 cmd_buffer->state.emitted_tcs_epilog = tcs_epilog;
1932 }
1933
1934 static void
1935 radv_emit_graphics_pipeline(struct radv_cmd_buffer *cmd_buffer)
1936 {
1937 struct radv_graphics_pipeline *pipeline = cmd_buffer->state.graphics_pipeline;
1938 const struct radv_device *device = cmd_buffer->device;
1939
1940 if (cmd_buffer->state.emitted_graphics_pipeline == pipeline)
1941 return;
1942
1943 if (cmd_buffer->state.emitted_graphics_pipeline) {
1944 if (radv_rast_prim_is_points_or_lines(cmd_buffer->state.emitted_graphics_pipeline->rast_prim) !=
1945 radv_rast_prim_is_points_or_lines(pipeline->rast_prim))
1946 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_GUARDBAND;
1947
1948 if (cmd_buffer->state.emitted_graphics_pipeline->ms.min_sample_shading != pipeline->ms.min_sample_shading ||
1949 cmd_buffer->state.emitted_graphics_pipeline->uses_out_of_order_rast != pipeline->uses_out_of_order_rast ||
1950 cmd_buffer->state.emitted_graphics_pipeline->uses_vrs_attachment != pipeline->uses_vrs_attachment ||
1951 cmd_buffer->state.emitted_graphics_pipeline->rast_prim != pipeline->rast_prim)
1952
1953 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_RASTERIZATION_SAMPLES;
1954
1955 if (cmd_buffer->state.emitted_graphics_pipeline->ms.sample_shading_enable != pipeline->ms.sample_shading_enable) {
1956 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_RASTERIZATION_SAMPLES;
1957 if (device->physical_device->rad_info.gfx_level >= GFX10_3)
1958 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_FRAGMENT_SHADING_RATE;
1959 }
1960
1961 if (cmd_buffer->state.emitted_graphics_pipeline->db_render_control != pipeline->db_render_control)
1962 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_FRAMEBUFFER;
1963 }
1964
1965 radeon_emit_array(cmd_buffer->cs, pipeline->base.cs.buf, pipeline->base.cs.cdw);
1966
1967 if (!cmd_buffer->state.emitted_graphics_pipeline ||
1968 cmd_buffer->state.emitted_graphics_pipeline->base.ctx_cs.cdw != pipeline->base.ctx_cs.cdw ||
1969 cmd_buffer->state.emitted_graphics_pipeline->base.ctx_cs_hash != pipeline->base.ctx_cs_hash ||
1970 memcmp(cmd_buffer->state.emitted_graphics_pipeline->base.ctx_cs.buf, pipeline->base.ctx_cs.buf,
1971 pipeline->base.ctx_cs.cdw * 4)) {
1972 radeon_emit_array(cmd_buffer->cs, pipeline->base.ctx_cs.buf, pipeline->base.ctx_cs.cdw);
1973 }
1974
1975 if (device->pbb_allowed) {
1976 struct radv_binning_settings *settings = &device->physical_device->binning_settings;
1977
1978 if ((!cmd_buffer->state.emitted_graphics_pipeline ||
1979 cmd_buffer->state.emitted_graphics_pipeline->base.shaders[MESA_SHADER_FRAGMENT] !=
1980 cmd_buffer->state.graphics_pipeline->base.shaders[MESA_SHADER_FRAGMENT]) &&
1981 (settings->context_states_per_bin > 1 || settings->persistent_states_per_bin > 1)) {
1982 /* Break the batch on PS changes. */
1983 radeon_emit(cmd_buffer->cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
1984 radeon_emit(cmd_buffer->cs, EVENT_TYPE(V_028A90_BREAK_BATCH) | EVENT_INDEX(0));
1985 }
1986 }
1987
1988 if (pipeline->sqtt_shaders_reloc) {
1989 /* Emit the shader relocations because RGP requires shaders to be contiguous in memory. */
1990 radv_sqtt_emit_relocated_shaders(cmd_buffer, pipeline);
1991 }
1992
1993 for (unsigned s = 0; s < MESA_VULKAN_SHADER_STAGES; s++) {
1994 struct radv_shader *shader = cmd_buffer->state.shaders[s];
1995
1996 if (!shader)
1997 continue;
1998
1999 radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, shader->bo);
2000 }
2001
2002 if (cmd_buffer->state.gs_copy_shader) {
2003 radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, cmd_buffer->state.gs_copy_shader->bo);
2004 }
2005
2006 struct radv_shader *task_shader = cmd_buffer->state.shaders[MESA_SHADER_TASK];
2007 if (task_shader) {
2008 radv_emit_compute_shader(device->physical_device, cmd_buffer->gang.cs, task_shader);
2009
2010 /* Relocate the task shader because RGP requires shaders to be contiguous in memory. */
2011 if (pipeline->sqtt_shaders_reloc) {
2012 const struct radv_sqtt_shaders_reloc *reloc = pipeline->sqtt_shaders_reloc;
2013 const uint64_t va = reloc->va[MESA_SHADER_TASK];
2014
2015 radeon_set_sh_reg(cmd_buffer->gang.cs, R_00B830_COMPUTE_PGM_LO, va >> 8);
2016 }
2017 }
2018
2019 if (radv_device_fault_detection_enabled(cmd_buffer->device))
2020 radv_save_pipeline(cmd_buffer, &pipeline->base);
2021
2022 cmd_buffer->state.emitted_graphics_pipeline = pipeline;
2023
2024 cmd_buffer->state.dirty &= ~RADV_CMD_DIRTY_PIPELINE;
2025 }
2026
2027 static bool
2028 radv_get_depth_clip_enable(struct radv_cmd_buffer *cmd_buffer)
2029 {
2030 const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
2031
2032 return d->vk.rs.depth_clip_enable == VK_MESA_DEPTH_CLIP_ENABLE_TRUE ||
2033 (d->vk.rs.depth_clip_enable == VK_MESA_DEPTH_CLIP_ENABLE_NOT_CLAMP && !d->vk.rs.depth_clamp_enable);
2034 }
2035
2036 enum radv_depth_clamp_mode {
2037 RADV_DEPTH_CLAMP_MODE_VIEWPORT = 0, /* Clamp to the viewport min/max depth bounds */
2038 RADV_DEPTH_CLAMP_MODE_ZERO_TO_ONE = 1, /* Clamp between 0.0f and 1.0f */
2039 RADV_DEPTH_CLAMP_MODE_DISABLED = 2, /* Disable depth clamping */
2040 };
2041
2042 static enum radv_depth_clamp_mode
2043 radv_get_depth_clamp_mode(struct radv_cmd_buffer *cmd_buffer)
2044 {
2045 const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
2046 bool depth_clip_enable = radv_get_depth_clip_enable(cmd_buffer);
2047 const struct radv_device *device = cmd_buffer->device;
2048 enum radv_depth_clamp_mode mode;
2049
2050 mode = RADV_DEPTH_CLAMP_MODE_VIEWPORT;
2051 if (!d->vk.rs.depth_clamp_enable) {
2052 /* For optimal performance, depth clamping should always be enabled except if the application
2053 * disables clamping explicitly or uses depth values outside of the [0.0, 1.0] range.
2054 */
2055 if (!depth_clip_enable || device->vk.enabled_extensions.EXT_depth_range_unrestricted) {
2056 mode = RADV_DEPTH_CLAMP_MODE_DISABLED;
2057 } else {
2058 mode = RADV_DEPTH_CLAMP_MODE_ZERO_TO_ONE;
2059 }
2060 }
2061
2062 return mode;
2063 }
2064
2065 static void
2066 radv_emit_viewport(struct radv_cmd_buffer *cmd_buffer)
2067 {
2068 const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
2069 enum radv_depth_clamp_mode depth_clamp_mode = radv_get_depth_clamp_mode(cmd_buffer);
2070
2071 assert(d->vk.vp.viewport_count);
2072 radeon_set_context_reg_seq(cmd_buffer->cs, R_02843C_PA_CL_VPORT_XSCALE, d->vk.vp.viewport_count * 6);
2073
2074 for (unsigned i = 0; i < d->vk.vp.viewport_count; i++) {
2075 radeon_emit(cmd_buffer->cs, fui(d->hw_vp.xform[i].scale[0]));
2076 radeon_emit(cmd_buffer->cs, fui(d->hw_vp.xform[i].translate[0]));
2077 radeon_emit(cmd_buffer->cs, fui(d->hw_vp.xform[i].scale[1]));
2078 radeon_emit(cmd_buffer->cs, fui(d->hw_vp.xform[i].translate[1]));
2079
2080 double scale_z, translate_z;
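/* With VK_EXT_depth_clip_control the NDC z range is [-1, 1] instead of
 * [0, 1]. Since z_vp = translate + scale * z_ndc and translate/scale were
 * derived for the [0, 1] convention (translate = minDepth,
 * scale = maxDepth - minDepth), remapping to [-1, 1] halves the scale and
 * recenters the translate to (minDepth + maxDepth) / 2, which is what the
 * branch below computes.
 */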
2081 if (d->vk.vp.depth_clip_negative_one_to_one) {
2082 scale_z = d->hw_vp.xform[i].scale[2] * 0.5f;
2083 translate_z = (d->hw_vp.xform[i].translate[2] + d->vk.vp.viewports[i].maxDepth) * 0.5f;
2084 } else {
2085 scale_z = d->hw_vp.xform[i].scale[2];
2086 translate_z = d->hw_vp.xform[i].translate[2];
2087 }
2088 radeon_emit(cmd_buffer->cs, fui(scale_z));
2089 radeon_emit(cmd_buffer->cs, fui(translate_z));
2090 }
2091
2092 radeon_set_context_reg_seq(cmd_buffer->cs, R_0282D0_PA_SC_VPORT_ZMIN_0, d->vk.vp.viewport_count * 2);
2093 for (unsigned i = 0; i < d->vk.vp.viewport_count; i++) {
2094 float zmin, zmax;
2095
2096 if (depth_clamp_mode == RADV_DEPTH_CLAMP_MODE_ZERO_TO_ONE) {
2097 zmin = 0.0f;
2098 zmax = 1.0f;
2099 } else {
2100 zmin = MIN2(d->vk.vp.viewports[i].minDepth, d->vk.vp.viewports[i].maxDepth);
2101 zmax = MAX2(d->vk.vp.viewports[i].minDepth, d->vk.vp.viewports[i].maxDepth);
2102 }
2103
2104 radeon_emit(cmd_buffer->cs, fui(zmin));
2105 radeon_emit(cmd_buffer->cs, fui(zmax));
2106 }
2107 }
2108
2109 static void
2110 radv_emit_scissor(struct radv_cmd_buffer *cmd_buffer)
2111 {
2112 const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
2113
2114 radv_write_scissors(cmd_buffer->cs, d->vk.vp.scissor_count, d->vk.vp.scissors, d->vk.vp.viewports);
2115 }
2116
2117 static void
2118 radv_emit_discard_rectangle(struct radv_cmd_buffer *cmd_buffer)
2119 {
2120 const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
2121 uint32_t cliprect_rule = 0;
2122
2123 if (!d->vk.dr.enable) {
2124 cliprect_rule = 0xffff;
2125 } else {
2126 for (unsigned i = 0; i < (1u << MAX_DISCARD_RECTANGLES); ++i) {
2127 /* Interpret i as a bitmask of the rectangles that
2128 * contain the pixel, and set bit i in the rule if that
2129 * combination of covering rectangles should pass the
2130 * cliprect test.
2131 */
2132 unsigned relevant_subset = i & ((1u << d->vk.dr.rectangle_count) - 1);
2133
2134 if (d->vk.dr.mode == VK_DISCARD_RECTANGLE_MODE_INCLUSIVE_EXT && !relevant_subset)
2135 continue;
2136
2137 if (d->vk.dr.mode == VK_DISCARD_RECTANGLE_MODE_EXCLUSIVE_EXT && relevant_subset)
2138 continue;
2139
2140 cliprect_rule |= 1u << i;
2141 }
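/* Illustrative example (assuming a limit of 4 discard rectangles): with two
 * rectangles in INCLUSIVE mode, only the combinations where neither
 * rectangle contains the pixel fail, i.e. bits 0, 4, 8 and 12 stay clear
 * and cliprect_rule ends up as 0xeeee.
 */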
2142
2143 radeon_set_context_reg_seq(cmd_buffer->cs, R_028210_PA_SC_CLIPRECT_0_TL, d->vk.dr.rectangle_count * 2);
2144 for (unsigned i = 0; i < d->vk.dr.rectangle_count; ++i) {
2145 VkRect2D rect = d->vk.dr.rectangles[i];
2146 radeon_emit(cmd_buffer->cs, S_028210_TL_X(rect.offset.x) | S_028210_TL_Y(rect.offset.y));
2147 radeon_emit(cmd_buffer->cs, S_028214_BR_X(rect.offset.x + rect.extent.width) |
2148 S_028214_BR_Y(rect.offset.y + rect.extent.height));
2149 }
2150 }
2151
2152 radeon_set_context_reg(cmd_buffer->cs, R_02820C_PA_SC_CLIPRECT_RULE, cliprect_rule);
2153 }
2154
2155 static void
2156 radv_emit_line_width(struct radv_cmd_buffer *cmd_buffer)
2157 {
2158 const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
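/* PA_SU_LINE_CNTL.WIDTH is a fixed-point value in 1/8th-pixel units, hence
 * the scale by 8 below (e.g. a 1.5-pixel line is programmed as 12).
 */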
2159
2160 radeon_set_context_reg(cmd_buffer->cs, R_028A08_PA_SU_LINE_CNTL,
2161 S_028A08_WIDTH(CLAMP(d->vk.rs.line.width * 8, 0, 0xFFFF)));
2162 }
2163
2164 static void
2165 radv_emit_blend_constants(struct radv_cmd_buffer *cmd_buffer)
2166 {
2167 const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
2168
2169 radeon_set_context_reg_seq(cmd_buffer->cs, R_028414_CB_BLEND_RED, 4);
2170 radeon_emit_array(cmd_buffer->cs, (uint32_t *)d->vk.cb.blend_constants, 4);
2171 }
2172
2173 static void
2174 radv_emit_stencil(struct radv_cmd_buffer *cmd_buffer)
2175 {
2176 const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
2177
2178 radeon_set_context_reg_seq(cmd_buffer->cs, R_028430_DB_STENCILREFMASK, 2);
2179 radeon_emit(cmd_buffer->cs, S_028430_STENCILTESTVAL(d->vk.ds.stencil.front.reference) |
2180 S_028430_STENCILMASK(d->vk.ds.stencil.front.compare_mask) |
2181 S_028430_STENCILWRITEMASK(d->vk.ds.stencil.front.write_mask) |
2182 S_028430_STENCILOPVAL(1));
2183 radeon_emit(cmd_buffer->cs, S_028434_STENCILTESTVAL_BF(d->vk.ds.stencil.back.reference) |
2184 S_028434_STENCILMASK_BF(d->vk.ds.stencil.back.compare_mask) |
2185 S_028434_STENCILWRITEMASK_BF(d->vk.ds.stencil.back.write_mask) |
2186 S_028434_STENCILOPVAL_BF(1));
2187 }
2188
2189 static void
2190 radv_emit_depth_bounds(struct radv_cmd_buffer *cmd_buffer)
2191 {
2192 const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
2193
2194 radeon_set_context_reg_seq(cmd_buffer->cs, R_028020_DB_DEPTH_BOUNDS_MIN, 2);
2195 radeon_emit(cmd_buffer->cs, fui(d->vk.ds.depth.bounds_test.min));
2196 radeon_emit(cmd_buffer->cs, fui(d->vk.ds.depth.bounds_test.max));
2197 }
2198
2199 static void
2200 radv_emit_depth_bias(struct radv_cmd_buffer *cmd_buffer)
2201 {
2202 const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
2203 struct radv_rendering_state *render = &cmd_buffer->state.render;
2204 unsigned slope = fui(d->vk.rs.depth_bias.slope * 16.0f);
2205 unsigned pa_su_poly_offset_db_fmt_cntl = 0;
2206
2207 if (vk_format_has_depth(render->ds_att.format) &&
2208 d->vk.rs.depth_bias.representation != VK_DEPTH_BIAS_REPRESENTATION_FLOAT_EXT) {
2209 VkFormat format = vk_format_depth_only(render->ds_att.format);
2210
2211 if (format == VK_FORMAT_D16_UNORM) {
2212 pa_su_poly_offset_db_fmt_cntl = S_028B78_POLY_OFFSET_NEG_NUM_DB_BITS(-16);
2213 } else {
2214 assert(format == VK_FORMAT_D32_SFLOAT);
2215 if (d->vk.rs.depth_bias.representation ==
2216 VK_DEPTH_BIAS_REPRESENTATION_LEAST_REPRESENTABLE_VALUE_FORCE_UNORM_EXT) {
2217 pa_su_poly_offset_db_fmt_cntl = S_028B78_POLY_OFFSET_NEG_NUM_DB_BITS(-24);
2218 } else {
2219 pa_su_poly_offset_db_fmt_cntl =
2220 S_028B78_POLY_OFFSET_NEG_NUM_DB_BITS(-23) | S_028B78_POLY_OFFSET_DB_IS_FLOAT_FMT(1);
2221 }
2222 }
2223 }
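/* POLY_OFFSET_NEG_NUM_DB_BITS tells the hardware the precision of the depth
 * buffer so that depthBiasConstantFactor is applied in units of the minimum
 * resolvable depth difference: 2^-16 for D16_UNORM, 2^-24 when D32_SFLOAT is
 * forced to behave like a 24-bit UNORM, and 2^-23 (one mantissa ULP, with
 * POLY_OFFSET_DB_IS_FLOAT_FMT set) for the float representation.
 */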
2224
2225 radeon_set_context_reg_seq(cmd_buffer->cs, R_028B7C_PA_SU_POLY_OFFSET_CLAMP, 5);
2226 radeon_emit(cmd_buffer->cs, fui(d->vk.rs.depth_bias.clamp)); /* CLAMP */
2227 radeon_emit(cmd_buffer->cs, slope); /* FRONT SCALE */
2228 radeon_emit(cmd_buffer->cs, fui(d->vk.rs.depth_bias.constant)); /* FRONT OFFSET */
2229 radeon_emit(cmd_buffer->cs, slope); /* BACK SCALE */
2230 radeon_emit(cmd_buffer->cs, fui(d->vk.rs.depth_bias.constant)); /* BACK OFFSET */
2231
2232 radeon_set_context_reg(cmd_buffer->cs, R_028B78_PA_SU_POLY_OFFSET_DB_FMT_CNTL, pa_su_poly_offset_db_fmt_cntl);
2233 }
2234
2235 static void
2236 radv_emit_line_stipple(struct radv_cmd_buffer *cmd_buffer)
2237 {
2238 const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
2239 enum amd_gfx_level gfx_level = cmd_buffer->device->physical_device->rad_info.gfx_level;
2240 /* GFX9 chips fail line-strip CTS tests unless this is set to 0 (no reset). */
2241 uint32_t auto_reset_cntl = (gfx_level == GFX9) ? 0 : 2;
2242
2243 if (radv_primitive_topology_is_line_list(d->vk.ia.primitive_topology))
2244 auto_reset_cntl = 1;
2245
2246 radeon_set_context_reg(cmd_buffer->cs, R_028A0C_PA_SC_LINE_STIPPLE,
2247 S_028A0C_LINE_PATTERN(d->vk.rs.line.stipple.pattern) |
2248 S_028A0C_REPEAT_COUNT(d->vk.rs.line.stipple.factor - 1) |
2249 S_028A0C_AUTO_RESET_CNTL(auto_reset_cntl));
2250 }
2251
2252 static uint32_t
2253 radv_get_pa_su_sc_mode_cntl(const struct radv_cmd_buffer *cmd_buffer)
2254 {
2255 enum amd_gfx_level gfx_level = cmd_buffer->device->physical_device->rad_info.gfx_level;
2256 const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
2257 unsigned pa_su_sc_mode_cntl;
2258
2259 pa_su_sc_mode_cntl =
2260 S_028814_CULL_FRONT(!!(d->vk.rs.cull_mode & VK_CULL_MODE_FRONT_BIT)) |
2261 S_028814_CULL_BACK(!!(d->vk.rs.cull_mode & VK_CULL_MODE_BACK_BIT)) | S_028814_FACE(d->vk.rs.front_face) |
2262 S_028814_POLY_OFFSET_FRONT_ENABLE(d->vk.rs.depth_bias.enable) |
2263 S_028814_POLY_OFFSET_BACK_ENABLE(d->vk.rs.depth_bias.enable) |
2264 S_028814_POLY_OFFSET_PARA_ENABLE(d->vk.rs.depth_bias.enable) |
2265 S_028814_POLY_MODE(d->vk.rs.polygon_mode != V_028814_X_DRAW_TRIANGLES) |
2266 S_028814_POLYMODE_FRONT_PTYPE(d->vk.rs.polygon_mode) | S_028814_POLYMODE_BACK_PTYPE(d->vk.rs.polygon_mode) |
2267 S_028814_PROVOKING_VTX_LAST(d->vk.rs.provoking_vertex == VK_PROVOKING_VERTEX_MODE_LAST_VERTEX_EXT);
2268
2269 if (gfx_level >= GFX10) {
2270 /* Ensure that the SC processes primitive groups in the same order as the PA produced them.
2271 * Needed when either POLY_MODE or PERPENDICULAR_ENDCAP_ENA is set.
2272 */
2273 pa_su_sc_mode_cntl |=
2274 S_028814_KEEP_TOGETHER_ENABLE(d->vk.rs.polygon_mode != V_028814_X_DRAW_TRIANGLES ||
2275 d->vk.rs.line.mode == VK_LINE_RASTERIZATION_MODE_RECTANGULAR_KHR);
2276 }
2277
2278 return pa_su_sc_mode_cntl;
2279 }
2280
2281 static void
2282 radv_emit_culling(struct radv_cmd_buffer *cmd_buffer)
2283 {
2284 unsigned pa_su_sc_mode_cntl = radv_get_pa_su_sc_mode_cntl(cmd_buffer);
2285
2286 radeon_set_context_reg(cmd_buffer->cs, R_028814_PA_SU_SC_MODE_CNTL, pa_su_sc_mode_cntl);
2287 }
2288
2289 static void
2290 radv_emit_provoking_vertex_mode(struct radv_cmd_buffer *cmd_buffer)
2291 {
2292 const struct radv_shader *last_vgt_shader = cmd_buffer->state.last_vgt_shader;
2293 const unsigned stage = last_vgt_shader->info.stage;
2294 const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
2295 const struct radv_userdata_info *loc = radv_get_user_sgpr(last_vgt_shader, AC_UD_NGG_PROVOKING_VTX);
2296 unsigned provoking_vtx = 0;
2297 uint32_t base_reg;
2298
2299 if (loc->sgpr_idx == -1)
2300 return;
2301
2302 if (d->vk.rs.provoking_vertex == VK_PROVOKING_VERTEX_MODE_LAST_VERTEX_EXT) {
2303 if (stage == MESA_SHADER_VERTEX) {
2304 provoking_vtx = radv_conv_prim_to_gs_out(d->vk.ia.primitive_topology, last_vgt_shader->info.is_ngg);
2305 } else {
2306 assert(stage == MESA_SHADER_GEOMETRY);
2307 provoking_vtx = last_vgt_shader->info.gs.vertices_in - 1;
2308 }
2309 }
2310
2311 base_reg = last_vgt_shader->info.user_data_0;
2312 radeon_set_sh_reg(cmd_buffer->cs, base_reg + loc->sgpr_idx * 4, provoking_vtx);
2313 }
2314
2315 static void
2316 radv_emit_primitive_topology(struct radv_cmd_buffer *cmd_buffer)
2317 {
2318 const struct radv_shader *last_vgt_shader = cmd_buffer->state.last_vgt_shader;
2319 const struct radv_userdata_info *loc = radv_get_user_sgpr(last_vgt_shader, AC_UD_NUM_VERTS_PER_PRIM);
2320 const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
2321 uint32_t base_reg;
2322
2323 assert(!cmd_buffer->state.mesh_shading);
2324
2325 if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX7) {
2326 radeon_set_uconfig_reg_idx(cmd_buffer->device->physical_device, cmd_buffer->cs, R_030908_VGT_PRIMITIVE_TYPE, 1,
2327 d->vk.ia.primitive_topology);
2328 } else {
2329 radeon_set_config_reg(cmd_buffer->cs, R_008958_VGT_PRIMITIVE_TYPE, d->vk.ia.primitive_topology);
2330 }
2331
2332 if (loc->sgpr_idx == -1)
2333 return;
2334
2335 base_reg = last_vgt_shader->info.user_data_0;
2336 radeon_set_sh_reg(cmd_buffer->cs, base_reg + loc->sgpr_idx * 4,
2337 radv_conv_prim_to_gs_out(d->vk.ia.primitive_topology, last_vgt_shader->info.is_ngg) + 1);
2338 }
2339
2340 static void
2341 radv_emit_depth_control(struct radv_cmd_buffer *cmd_buffer)
2342 {
2343 const struct radv_rendering_state *render = &cmd_buffer->state.render;
2344 const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
2345 const bool stencil_test_enable =
2346 d->vk.ds.stencil.test_enable && (render->ds_att_aspects & VK_IMAGE_ASPECT_STENCIL_BIT);
2347
2348 radeon_set_context_reg(
2349 cmd_buffer->cs, R_028800_DB_DEPTH_CONTROL,
2350 S_028800_Z_ENABLE(d->vk.ds.depth.test_enable ? 1 : 0) |
2351 S_028800_Z_WRITE_ENABLE(d->vk.ds.depth.write_enable ? 1 : 0) | S_028800_ZFUNC(d->vk.ds.depth.compare_op) |
2352 S_028800_DEPTH_BOUNDS_ENABLE(d->vk.ds.depth.bounds_test.enable ? 1 : 0) |
2353 S_028800_STENCIL_ENABLE(stencil_test_enable) | S_028800_BACKFACE_ENABLE(stencil_test_enable) |
2354 S_028800_STENCILFUNC(d->vk.ds.stencil.front.op.compare) |
2355 S_028800_STENCILFUNC_BF(d->vk.ds.stencil.back.op.compare));
2356 }
2357
2358 static void
2359 radv_emit_stencil_control(struct radv_cmd_buffer *cmd_buffer)
2360 {
2361 const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
2362
2363 radeon_set_context_reg(cmd_buffer->cs, R_02842C_DB_STENCIL_CONTROL,
2364 S_02842C_STENCILFAIL(radv_translate_stencil_op(d->vk.ds.stencil.front.op.fail)) |
2365 S_02842C_STENCILZPASS(radv_translate_stencil_op(d->vk.ds.stencil.front.op.pass)) |
2366 S_02842C_STENCILZFAIL(radv_translate_stencil_op(d->vk.ds.stencil.front.op.depth_fail)) |
2367 S_02842C_STENCILFAIL_BF(radv_translate_stencil_op(d->vk.ds.stencil.back.op.fail)) |
2368 S_02842C_STENCILZPASS_BF(radv_translate_stencil_op(d->vk.ds.stencil.back.op.pass)) |
2369 S_02842C_STENCILZFAIL_BF(radv_translate_stencil_op(d->vk.ds.stencil.back.op.depth_fail)));
2370 }
2371
2372 static bool
2373 radv_should_force_vrs1x1(struct radv_cmd_buffer *cmd_buffer)
2374 {
2375 const struct radv_physical_device *pdevice = cmd_buffer->device->physical_device;
2376 const struct radv_shader *ps = cmd_buffer->state.shaders[MESA_SHADER_FRAGMENT];
2377
2378 return pdevice->rad_info.gfx_level >= GFX10_3 &&
2379 (cmd_buffer->state.ms.sample_shading_enable || (ps && ps->info.ps.force_sample_iter_shading_rate));
2380 }
2381
2382 static void
2383 radv_emit_fragment_shading_rate(struct radv_cmd_buffer *cmd_buffer)
2384 {
2385 const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
2386
2387 /* When per-vertex VRS is forced and the dynamic fragment shading rate is a no-op, ignore
2388 * it. This is needed for vkd3d-proton because it always declares per-draw VRS as dynamic.
2389 */
2390 if (cmd_buffer->device->force_vrs != RADV_FORCE_VRS_1x1 && d->vk.fsr.fragment_size.width == 1 &&
2391 d->vk.fsr.fragment_size.height == 1 &&
2392 d->vk.fsr.combiner_ops[0] == VK_FRAGMENT_SHADING_RATE_COMBINER_OP_KEEP_KHR &&
2393 d->vk.fsr.combiner_ops[1] == VK_FRAGMENT_SHADING_RATE_COMBINER_OP_KEEP_KHR)
2394 return;
2395
2396 uint32_t rate_x = MIN2(2, d->vk.fsr.fragment_size.width) - 1;
2397 uint32_t rate_y = MIN2(2, d->vk.fsr.fragment_size.height) - 1;
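/* The per-draw rate is encoded as log2 of the fragment size per axis; the
 * rate programmed here is clamped to 2x2, e.g. a 2x2 fragment size gives
 * rate_x = rate_y = 1 and larger requested sizes are treated the same.
 */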
2398 uint32_t pipeline_comb_mode = d->vk.fsr.combiner_ops[0];
2399 uint32_t htile_comb_mode = d->vk.fsr.combiner_ops[1];
2400 uint32_t pa_cl_vrs_cntl = 0;
2401
2402 assert(cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX10_3);
2403
2404 if (!cmd_buffer->state.render.vrs_att.iview) {
2405 /* When the current subpass has no VRS attachment, the VRS rates are expected to be 1x1, so we
2406 * can cheat by tweaking the different combiner modes.
2407 */
2408 switch (htile_comb_mode) {
2409 case VK_FRAGMENT_SHADING_RATE_COMBINER_OP_MIN_KHR:
2410 /* The result of min(A, 1x1) is always 1x1. */
2411 FALLTHROUGH;
2412 case VK_FRAGMENT_SHADING_RATE_COMBINER_OP_REPLACE_KHR:
2413 /* Force the per-draw VRS rate to 1x1. */
2414 rate_x = rate_y = 0;
2415
2416 /* As the result of min(A, 1x1) or replace(A, 1x1) are always 1x1, set the vertex rate
2417 * combiner mode as passthrough.
2418 */
2419 pipeline_comb_mode = V_028848_SC_VRS_COMB_MODE_PASSTHRU;
2420 break;
2421 case VK_FRAGMENT_SHADING_RATE_COMBINER_OP_MAX_KHR:
2422 /* The result of max(A, 1x1) is always A. */
2423 FALLTHROUGH;
2424 case VK_FRAGMENT_SHADING_RATE_COMBINER_OP_KEEP_KHR:
2425 /* Nothing to do here because the SAMPLE_ITER combiner mode should already be passthrough. */
2426 break;
2427 default:
2428 break;
2429 }
2430 }
2431
2432 /* Emit per-draw VRS rate which is the first combiner. */
2433 radeon_set_uconfig_reg(cmd_buffer->cs, R_03098C_GE_VRS_RATE, S_03098C_RATE_X(rate_x) | S_03098C_RATE_Y(rate_y));
2434
2435 /* Disable VRS and use the rates from PS_ITER_SAMPLES if:
2436 *
2437 * 1) sample shading is enabled or per-sample interpolation is used by the fragment shader, or
2438 * 2) the fragment shader requires a 1x1 shading rate for some other reason.
2439 */
2440 if (radv_should_force_vrs1x1(cmd_buffer)) {
2441 pa_cl_vrs_cntl |= S_028848_SAMPLE_ITER_COMBINER_MODE(V_028848_SC_VRS_COMB_MODE_OVERRIDE);
2442 }
2443
2444 /* VERTEX_RATE_COMBINER_MODE controls the combiner mode between the
2445 * draw rate and the vertex rate.
2446 */
2447 if (cmd_buffer->state.mesh_shading) {
2448 pa_cl_vrs_cntl |= S_028848_VERTEX_RATE_COMBINER_MODE(V_028848_SC_VRS_COMB_MODE_PASSTHRU) |
2449 S_028848_PRIMITIVE_RATE_COMBINER_MODE(pipeline_comb_mode);
2450 } else {
2451 pa_cl_vrs_cntl |= S_028848_VERTEX_RATE_COMBINER_MODE(pipeline_comb_mode) |
2452 S_028848_PRIMITIVE_RATE_COMBINER_MODE(V_028848_SC_VRS_COMB_MODE_PASSTHRU);
2453 }
2454
2455 /* HTILE_RATE_COMBINER_MODE controls the combiner mode between the primitive rate and the HTILE
2456 * rate.
2457 */
2458 pa_cl_vrs_cntl |= S_028848_HTILE_RATE_COMBINER_MODE(htile_comb_mode);
2459
2460 radeon_set_context_reg(cmd_buffer->cs, R_028848_PA_CL_VRS_CNTL, pa_cl_vrs_cntl);
2461 }
2462
2463 static uint32_t
2464 radv_get_primitive_reset_index(const struct radv_cmd_buffer *cmd_buffer)
2465 {
2466 const uint32_t index_type = G_028A7C_INDEX_TYPE(cmd_buffer->state.index_type);
2467 switch (index_type) {
2468 case V_028A7C_VGT_INDEX_8:
2469 return 0xffu;
2470 case V_028A7C_VGT_INDEX_16:
2471 return 0xffffu;
2472 case V_028A7C_VGT_INDEX_32:
2473 return 0xffffffffu;
2474 default:
2475 unreachable("invalid index type");
2476 }
2477 }
2478
2479 static void
2480 radv_emit_primitive_restart_enable(struct radv_cmd_buffer *cmd_buffer)
2481 {
2482 const enum amd_gfx_level gfx_level = cmd_buffer->device->physical_device->rad_info.gfx_level;
2483 const struct radv_dynamic_state *const d = &cmd_buffer->state.dynamic;
2484 struct radeon_cmdbuf *cs = cmd_buffer->cs;
2485 const bool en = d->vk.ia.primitive_restart_enable;
2486
2487 if (gfx_level >= GFX11) {
2488 radeon_set_uconfig_reg(cs, R_03092C_GE_MULTI_PRIM_IB_RESET_EN,
2489 S_03092C_RESET_EN(en) |
2490 /* This disables primitive restart for non-indexed draws.
2491 * By keeping this set, we don't have to unset RESET_EN
2492 * for non-indexed draws. */
2493 S_03092C_DISABLE_FOR_AUTO_INDEX(1));
2494 } else if (gfx_level >= GFX9) {
2495 radeon_set_uconfig_reg(cs, R_03092C_VGT_MULTI_PRIM_IB_RESET_EN, en);
2496 } else {
2497 radeon_set_context_reg(cs, R_028A94_VGT_MULTI_PRIM_IB_RESET_EN, en);
2498 }
2499
2500 /* GFX6-7: All 32 bits are compared.
2501 * GFX8: Only index type bits are compared.
2502 * GFX9+: Default is same as GFX8, MATCH_ALL_BITS=1 selects GFX6-7 behavior
2503 */
2504 if (en && gfx_level <= GFX7) {
2505 const uint32_t primitive_reset_index = radv_get_primitive_reset_index(cmd_buffer);
2506
2507 if (primitive_reset_index != cmd_buffer->state.last_primitive_reset_index) {
2508 radeon_set_context_reg(cs, R_02840C_VGT_MULTI_PRIM_IB_RESET_INDX, primitive_reset_index);
2509 cmd_buffer->state.last_primitive_reset_index = primitive_reset_index;
2510 }
2511 }
2512 }
2513
2514 static void
2515 radv_emit_clipping(struct radv_cmd_buffer *cmd_buffer)
2516 {
2517 const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
2518 bool depth_clip_enable = radv_get_depth_clip_enable(cmd_buffer);
2519
2520 radeon_set_context_reg(
2521 cmd_buffer->cs, R_028810_PA_CL_CLIP_CNTL,
2522 S_028810_DX_RASTERIZATION_KILL(d->vk.rs.rasterizer_discard_enable) |
2523 S_028810_ZCLIP_NEAR_DISABLE(!depth_clip_enable) | S_028810_ZCLIP_FAR_DISABLE(!depth_clip_enable) |
2524 S_028810_DX_CLIP_SPACE_DEF(!d->vk.vp.depth_clip_negative_one_to_one) | S_028810_DX_LINEAR_ATTR_CLIP_ENA(1));
2525 }
2526
2527 static bool
2528 radv_is_mrt0_dual_src(struct radv_cmd_buffer *cmd_buffer)
2529 {
2530 const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
2531
2532 if (!d->vk.cb.attachments[0].write_mask || !d->vk.cb.attachments[0].blend_enable)
2533 return false;
2534
2535 return radv_can_enable_dual_src(&d->vk.cb.attachments[0]);
2536 }
2537
2538 static void
2539 radv_emit_logic_op(struct radv_cmd_buffer *cmd_buffer)
2540 {
2541 const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
2542 unsigned cb_color_control = 0;
2543
2544 if (d->vk.cb.logic_op_enable) {
2545 cb_color_control |= S_028808_ROP3(d->vk.cb.logic_op);
2546 } else {
2547 cb_color_control |= S_028808_ROP3(V_028808_ROP3_COPY);
2548 }
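/* ROP3 uses the standard 8-bit ternary raster-op encoding; ROP3_COPY (0xcc)
 * simply passes the source through, i.e. logic ops are effectively disabled.
 */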
2549
2550 if (cmd_buffer->device->physical_device->rad_info.has_rbplus) {
2551 /* RB+ doesn't work with dual source blending, logic op and CB_RESOLVE. */
2552 bool mrt0_is_dual_src = radv_is_mrt0_dual_src(cmd_buffer);
2553
2554 cb_color_control |= S_028808_DISABLE_DUAL_QUAD(mrt0_is_dual_src || d->vk.cb.logic_op_enable ||
2555 cmd_buffer->state.custom_blend_mode == V_028808_CB_RESOLVE);
2556 }
2557
2558 if (cmd_buffer->state.custom_blend_mode) {
2559 cb_color_control |= S_028808_MODE(cmd_buffer->state.custom_blend_mode);
2560 } else {
2561 bool color_write_enabled = false;
2562
2563 for (unsigned i = 0; i < MAX_RTS; i++) {
2564 if (d->vk.cb.attachments[i].write_mask) {
2565 color_write_enabled = true;
2566 break;
2567 }
2568 }
2569
2570 if (color_write_enabled) {
2571 cb_color_control |= S_028808_MODE(V_028808_CB_NORMAL);
2572 } else {
2573 cb_color_control |= S_028808_MODE(V_028808_CB_DISABLE);
2574 }
2575 }
2576
2577 radeon_set_context_reg(cmd_buffer->cs, R_028808_CB_COLOR_CONTROL, cb_color_control);
2578 }
2579
2580 static void
2581 radv_emit_color_write(struct radv_cmd_buffer *cmd_buffer)
2582 {
2583 const struct radv_device *device = cmd_buffer->device;
2584 const struct radv_binning_settings *settings = &device->physical_device->binning_settings;
2585 const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
2586 uint32_t color_write_enable = 0, color_write_mask = 0;
2587
2588 u_foreach_bit (i, d->vk.cb.color_write_enables) {
2589 color_write_enable |= 0xfu << (i * 4);
2590 }
2591
2592 for (unsigned i = 0; i < MAX_RTS; i++) {
2593 color_write_mask |= d->vk.cb.attachments[i].write_mask << (4 * i);
2594 }
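/* CB_TARGET_MASK packs one 4-bit RGBA write mask per render target, so e.g.
 * attachment 1 with a full write mask contributes 0xf0. The dynamic
 * color-write-enable state is folded in by masking entire 4-bit groups.
 */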
2595
2596 if (device->pbb_allowed && settings->context_states_per_bin > 1) {
2597 /* Flush DFSM on CB_TARGET_MASK changes. */
2598 radeon_emit(cmd_buffer->cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
2599 radeon_emit(cmd_buffer->cs, EVENT_TYPE(V_028A90_BREAK_BATCH) | EVENT_INDEX(0));
2600 }
2601
2602 radeon_set_context_reg(cmd_buffer->cs, R_028238_CB_TARGET_MASK, color_write_mask & color_write_enable);
2603 }
2604
2605 static void
2606 radv_emit_patch_control_points(struct radv_cmd_buffer *cmd_buffer)
2607 {
2608 const struct radv_physical_device *pdevice = cmd_buffer->device->physical_device;
2609 const struct radv_shader *vs = radv_get_shader(cmd_buffer->state.shaders, MESA_SHADER_VERTEX);
2610 const struct radv_shader *tcs = cmd_buffer->state.shaders[MESA_SHADER_TESS_CTRL];
2611 const struct radv_shader *tes = radv_get_shader(cmd_buffer->state.shaders, MESA_SHADER_TESS_EVAL);
2612 const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
2613 unsigned ls_hs_config, base_reg;
2614
2615 /* Compute tessellation info that depends on the number of patch control points when this state
2616 * is dynamic.
2617 */
2618 if (cmd_buffer->state.uses_dynamic_patch_control_points) {
2619 /* Compute the number of patches. */
2620 cmd_buffer->state.tess_num_patches = get_tcs_num_patches(
2621 d->vk.ts.patch_control_points, tcs->info.tcs.tcs_vertices_out, vs->info.vs.num_linked_outputs,
2622 tcs->info.tcs.num_linked_outputs, tcs->info.tcs.num_linked_patch_outputs,
2623 pdevice->hs.tess_offchip_block_dw_size, pdevice->rad_info.gfx_level, pdevice->rad_info.family);
2624
2625 /* Compute the LDS size. */
2626 cmd_buffer->state.tess_lds_size = calculate_tess_lds_size(
2627 pdevice->rad_info.gfx_level, d->vk.ts.patch_control_points, tcs->info.tcs.tcs_vertices_out,
2628 vs->info.vs.num_linked_outputs, cmd_buffer->state.tess_num_patches, tcs->info.tcs.num_linked_outputs,
2629 tcs->info.tcs.num_linked_patch_outputs);
2630 }
2631
2632 ls_hs_config = S_028B58_NUM_PATCHES(cmd_buffer->state.tess_num_patches) |
2633 S_028B58_HS_NUM_INPUT_CP(d->vk.ts.patch_control_points) |
2634 S_028B58_HS_NUM_OUTPUT_CP(tcs->info.tcs.tcs_vertices_out);
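/* For example (illustrative numbers only), a tessellation state with 3 input
 * control points, 4 TCS output vertices and 8 patches per threadgroup would
 * program NUM_PATCHES = 8, HS_NUM_INPUT_CP = 3 and HS_NUM_OUTPUT_CP = 4.
 */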
2635
2636 if (pdevice->rad_info.gfx_level >= GFX7) {
2637 radeon_set_context_reg_idx(cmd_buffer->cs, R_028B58_VGT_LS_HS_CONFIG, 2, ls_hs_config);
2638 } else {
2639 radeon_set_context_reg(cmd_buffer->cs, R_028B58_VGT_LS_HS_CONFIG, ls_hs_config);
2640 }
2641
2642 if (pdevice->rad_info.gfx_level >= GFX9) {
2643 unsigned hs_rsrc2;
2644
2645 if (tcs->info.merged_shader_compiled_separately) {
2646 radv_shader_combine_cfg_vs_tcs(cmd_buffer->state.shaders[MESA_SHADER_VERTEX], tcs, NULL, &hs_rsrc2);
2647 } else {
2648 hs_rsrc2 = tcs->config.rsrc2;
2649 }
2650
2651 if (pdevice->rad_info.gfx_level >= GFX10) {
2652 hs_rsrc2 |= S_00B42C_LDS_SIZE_GFX10(cmd_buffer->state.tess_lds_size);
2653 } else {
2654 hs_rsrc2 |= S_00B42C_LDS_SIZE_GFX9(cmd_buffer->state.tess_lds_size);
2655 }
2656
2657 radeon_set_sh_reg(cmd_buffer->cs, R_00B42C_SPI_SHADER_PGM_RSRC2_HS, hs_rsrc2);
2658 } else {
2659 unsigned ls_rsrc2 = vs->config.rsrc2 | S_00B52C_LDS_SIZE(cmd_buffer->state.tess_lds_size);
2660
2661 radeon_set_sh_reg(cmd_buffer->cs, R_00B52C_SPI_SHADER_PGM_RSRC2_LS, ls_rsrc2);
2662 }
2663
2664 /* Emit user SGPRs for dynamic patch control points. */
2665 const struct radv_userdata_info *offchip = radv_get_user_sgpr(tcs, AC_UD_TCS_OFFCHIP_LAYOUT);
2666 if (offchip->sgpr_idx == -1)
2667 return;
2668 assert(offchip->num_sgprs == 1);
2669
2670 unsigned tcs_offchip_layout =
2671 SET_SGPR_FIELD(TCS_OFFCHIP_LAYOUT_PATCH_CONTROL_POINTS, d->vk.ts.patch_control_points) |
2672 SET_SGPR_FIELD(TCS_OFFCHIP_LAYOUT_NUM_PATCHES, cmd_buffer->state.tess_num_patches) |
2673 SET_SGPR_FIELD(TCS_OFFCHIP_LAYOUT_LSHS_VERTEX_STRIDE,
2674 get_tcs_input_vertex_stride(vs->info.vs.num_linked_outputs) / 4);
2675
2676 base_reg = tcs->info.user_data_0;
2677 radeon_set_sh_reg(cmd_buffer->cs, base_reg + offchip->sgpr_idx * 4, tcs_offchip_layout);
2678
2679 const struct radv_userdata_info *num_patches = radv_get_user_sgpr(tes, AC_UD_TES_STATE);
2680 assert(num_patches->sgpr_idx != -1 && num_patches->num_sgprs == 1);
2681
2682 const unsigned tes_state = SET_SGPR_FIELD(TES_STATE_NUM_PATCHES, cmd_buffer->state.tess_num_patches) |
2683 SET_SGPR_FIELD(TES_STATE_TCS_VERTICES_OUT, tcs->info.tcs.tcs_vertices_out) |
2684 SET_SGPR_FIELD(TES_STATE_NUM_TCS_OUTPUTS, tcs->info.tcs.num_linked_outputs);
2685
2686 base_reg = tes->info.user_data_0;
2687 radeon_set_sh_reg(cmd_buffer->cs, base_reg + num_patches->sgpr_idx * 4, tes_state);
2688 }
2689
2690 static void
2691 radv_emit_conservative_rast_mode(struct radv_cmd_buffer *cmd_buffer)
2692 {
2693 const struct radv_physical_device *pdevice = cmd_buffer->device->physical_device;
2694 const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
2695
2696 if (pdevice->rad_info.gfx_level >= GFX9) {
2697 uint32_t pa_sc_conservative_rast;
2698
2699 if (d->vk.rs.conservative_mode != VK_CONSERVATIVE_RASTERIZATION_MODE_DISABLED_EXT) {
2700 const struct radv_shader *ps = cmd_buffer->state.shaders[MESA_SHADER_FRAGMENT];
2701 const bool uses_inner_coverage = ps && ps->info.ps.reads_fully_covered;
2702
2703 pa_sc_conservative_rast =
2704 S_028C4C_PREZ_AA_MASK_ENABLE(1) | S_028C4C_POSTZ_AA_MASK_ENABLE(1) | S_028C4C_CENTROID_SAMPLE_OVERRIDE(1);
2705
2706 /* Inner coverage requires underestimate conservative rasterization. */
2707 if (d->vk.rs.conservative_mode == VK_CONSERVATIVE_RASTERIZATION_MODE_OVERESTIMATE_EXT &&
2708 !uses_inner_coverage) {
2709 pa_sc_conservative_rast |= S_028C4C_OVER_RAST_ENABLE(1) | S_028C4C_UNDER_RAST_SAMPLE_SELECT(1) |
2710 S_028C4C_PBB_UNCERTAINTY_REGION_ENABLE(1);
2711 } else {
2712 pa_sc_conservative_rast |= S_028C4C_OVER_RAST_SAMPLE_SELECT(1) | S_028C4C_UNDER_RAST_ENABLE(1);
2713 }
2714 } else {
2715 pa_sc_conservative_rast = S_028C4C_NULL_SQUAD_AA_MASK_ENABLE(1);
2716 }
2717
2718 radeon_set_context_reg(cmd_buffer->cs, R_028C4C_PA_SC_CONSERVATIVE_RASTERIZATION_CNTL, pa_sc_conservative_rast);
2719 }
2720 }
2721
2722 static void
2723 radv_emit_depth_clamp_enable(struct radv_cmd_buffer *cmd_buffer)
2724 {
2725 enum radv_depth_clamp_mode mode = radv_get_depth_clamp_mode(cmd_buffer);
2726
2727 radeon_set_context_reg(cmd_buffer->cs, R_02800C_DB_RENDER_OVERRIDE,
2728 S_02800C_FORCE_HIS_ENABLE0(V_02800C_FORCE_DISABLE) |
2729 S_02800C_FORCE_HIS_ENABLE1(V_02800C_FORCE_DISABLE) |
2730 S_02800C_DISABLE_VIEWPORT_CLAMP(mode == RADV_DEPTH_CLAMP_MODE_DISABLED));
2731 }
2732
2733 static void
2734 radv_emit_rasterization_samples(struct radv_cmd_buffer *cmd_buffer)
2735 {
2736 const struct radv_physical_device *pdevice = cmd_buffer->device->physical_device;
2737 unsigned rasterization_samples = radv_get_rasterization_samples(cmd_buffer);
2738 unsigned ps_iter_samples = radv_get_ps_iter_samples(cmd_buffer);
2739 const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
2740 unsigned spi_baryc_cntl = S_0286E0_FRONT_FACE_ALL_BITS(1);
2741 unsigned pa_sc_mode_cntl_1;
2742
2743 pa_sc_mode_cntl_1 =
2744 S_028A4C_WALK_FENCE_ENABLE(1) | // TODO linear dst fixes
2745 S_028A4C_WALK_FENCE_SIZE(pdevice->rad_info.num_tile_pipes == 2 ? 2 : 3) |
2746 S_028A4C_OUT_OF_ORDER_PRIMITIVE_ENABLE(cmd_buffer->state.uses_out_of_order_rast) |
2747 S_028A4C_OUT_OF_ORDER_WATER_MARK(0x7) |
2748 /* always 1: */
2749 S_028A4C_SUPERTILE_WALK_ORDER_ENABLE(1) | S_028A4C_TILE_WALK_ORDER_ENABLE(1) |
2750 S_028A4C_MULTI_SHADER_ENGINE_PRIM_DISCARD_ENABLE(1) | S_028A4C_FORCE_EOV_CNTDWN_ENABLE(1) |
2751 S_028A4C_FORCE_EOV_REZ_ENABLE(1) |
2752 /* This should only be set when VRS surfaces aren't enabled on GFX11, otherwise the GPU might
2753 * hang.
2754 */
2755 S_028A4C_WALK_ALIGN8_PRIM_FITS_ST(pdevice->rad_info.gfx_level < GFX11 || !cmd_buffer->state.uses_vrs_attachment);
2756
2757 if (!d->sample_location.count)
2758 radv_emit_default_sample_locations(cmd_buffer->cs, rasterization_samples);
2759
2760 if (ps_iter_samples > 1) {
2761 spi_baryc_cntl |= S_0286E0_POS_FLOAT_LOCATION(2);
2762 pa_sc_mode_cntl_1 |= S_028A4C_PS_ITER_SAMPLE(1);
2763 }
2764
2765 if (radv_should_force_vrs1x1(cmd_buffer)) {
2766 /* Make sure sample shading is enabled even if only MSAA1x is used because the SAMPLE_ITER
2767 * combiner is in passthrough mode if PS_ITER_SAMPLE is 0, and it uses the per-draw rate. The
2768 * default VRS rate when sample shading is enabled is 1x1.
2769 */
2770 if (!G_028A4C_PS_ITER_SAMPLE(pa_sc_mode_cntl_1))
2771 pa_sc_mode_cntl_1 |= S_028A4C_PS_ITER_SAMPLE(1);
2772 }
2773
2774 radeon_set_context_reg(cmd_buffer->cs, R_0286E0_SPI_BARYC_CNTL, spi_baryc_cntl);
2775 radeon_set_context_reg(cmd_buffer->cs, R_028A4C_PA_SC_MODE_CNTL_1, pa_sc_mode_cntl_1);
2776 }
2777
2778 static void
2779 radv_emit_fb_color_state(struct radv_cmd_buffer *cmd_buffer, int index, struct radv_color_buffer_info *cb,
2780 struct radv_image_view *iview, VkImageLayout layout)
2781 {
2782 bool is_vi = cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX8;
2783 uint32_t cb_fdcc_control = cb->cb_dcc_control;
2784 uint32_t cb_color_info = cb->cb_color_info;
2785 struct radv_image *image = iview->image;
2786
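   /* Disable DCC (FDCC on GFX11) and FMASK compression in the CB state when the image layout
    * and queue family mask don't allow compressed rendering to this attachment.
    */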
2787 if (!radv_layout_dcc_compressed(cmd_buffer->device, image, iview->vk.base_mip_level, layout,
2788 radv_image_queue_family_mask(image, cmd_buffer->qf, cmd_buffer->qf))) {
2789 if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX11) {
2790 cb_fdcc_control &= C_028C78_FDCC_ENABLE;
2791 } else {
2792 cb_color_info &= C_028C70_DCC_ENABLE;
2793 }
2794 }
2795
2796 const enum radv_fmask_compression fmask_comp = radv_layout_fmask_compression(
2797 cmd_buffer->device, image, layout, radv_image_queue_family_mask(image, cmd_buffer->qf, cmd_buffer->qf));
2798 if (fmask_comp == RADV_FMASK_COMPRESSION_NONE) {
2799 cb_color_info &= C_028C70_COMPRESSION;
2800 }
2801
2802 if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX11) {
2803 radeon_set_context_reg_seq(cmd_buffer->cs, R_028C6C_CB_COLOR0_VIEW + index * 0x3c, 4);
2804 radeon_emit(cmd_buffer->cs, cb->cb_color_view); /* CB_COLOR0_VIEW */
2805 radeon_emit(cmd_buffer->cs, cb->cb_color_info); /* CB_COLOR0_INFO */
2806 radeon_emit(cmd_buffer->cs, cb->cb_color_attrib); /* CB_COLOR0_ATTRIB */
2807 radeon_emit(cmd_buffer->cs, cb_fdcc_control); /* CB_COLOR0_FDCC_CONTROL */
2808
2809 radeon_set_context_reg(cmd_buffer->cs, R_028C60_CB_COLOR0_BASE + index * 0x3c, cb->cb_color_base);
2810 radeon_set_context_reg(cmd_buffer->cs, R_028E40_CB_COLOR0_BASE_EXT + index * 4, cb->cb_color_base >> 32);
2811 radeon_set_context_reg(cmd_buffer->cs, R_028C94_CB_COLOR0_DCC_BASE + index * 0x3c, cb->cb_dcc_base);
2812 radeon_set_context_reg(cmd_buffer->cs, R_028EA0_CB_COLOR0_DCC_BASE_EXT + index * 4, cb->cb_dcc_base >> 32);
2813 radeon_set_context_reg(cmd_buffer->cs, R_028EC0_CB_COLOR0_ATTRIB2 + index * 4, cb->cb_color_attrib2);
2814 radeon_set_context_reg(cmd_buffer->cs, R_028EE0_CB_COLOR0_ATTRIB3 + index * 4, cb->cb_color_attrib3);
2815 } else if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX10) {
2816 radeon_set_context_reg_seq(cmd_buffer->cs, R_028C60_CB_COLOR0_BASE + index * 0x3c, 11);
2817 radeon_emit(cmd_buffer->cs, cb->cb_color_base);
2818 radeon_emit(cmd_buffer->cs, 0);
2819 radeon_emit(cmd_buffer->cs, 0);
2820 radeon_emit(cmd_buffer->cs, cb->cb_color_view);
2821 radeon_emit(cmd_buffer->cs, cb_color_info);
2822 radeon_emit(cmd_buffer->cs, cb->cb_color_attrib);
2823 radeon_emit(cmd_buffer->cs, cb->cb_dcc_control);
2824 radeon_emit(cmd_buffer->cs, cb->cb_color_cmask);
2825 radeon_emit(cmd_buffer->cs, 0);
2826 radeon_emit(cmd_buffer->cs, cb->cb_color_fmask);
2827 radeon_emit(cmd_buffer->cs, 0);
2828
2829 radeon_set_context_reg(cmd_buffer->cs, R_028C94_CB_COLOR0_DCC_BASE + index * 0x3c, cb->cb_dcc_base);
2830
2831 radeon_set_context_reg(cmd_buffer->cs, R_028E40_CB_COLOR0_BASE_EXT + index * 4, cb->cb_color_base >> 32);
2832 radeon_set_context_reg(cmd_buffer->cs, R_028E60_CB_COLOR0_CMASK_BASE_EXT + index * 4, cb->cb_color_cmask >> 32);
2833 radeon_set_context_reg(cmd_buffer->cs, R_028E80_CB_COLOR0_FMASK_BASE_EXT + index * 4, cb->cb_color_fmask >> 32);
2834 radeon_set_context_reg(cmd_buffer->cs, R_028EA0_CB_COLOR0_DCC_BASE_EXT + index * 4, cb->cb_dcc_base >> 32);
2835 radeon_set_context_reg(cmd_buffer->cs, R_028EC0_CB_COLOR0_ATTRIB2 + index * 4, cb->cb_color_attrib2);
2836 radeon_set_context_reg(cmd_buffer->cs, R_028EE0_CB_COLOR0_ATTRIB3 + index * 4, cb->cb_color_attrib3);
2837 } else if (cmd_buffer->device->physical_device->rad_info.gfx_level == GFX9) {
2838 radeon_set_context_reg_seq(cmd_buffer->cs, R_028C60_CB_COLOR0_BASE + index * 0x3c, 11);
2839 radeon_emit(cmd_buffer->cs, cb->cb_color_base);
2840 radeon_emit(cmd_buffer->cs, S_028C64_BASE_256B(cb->cb_color_base >> 32));
2841 radeon_emit(cmd_buffer->cs, cb->cb_color_attrib2);
2842 radeon_emit(cmd_buffer->cs, cb->cb_color_view);
2843 radeon_emit(cmd_buffer->cs, cb_color_info);
2844 radeon_emit(cmd_buffer->cs, cb->cb_color_attrib);
2845 radeon_emit(cmd_buffer->cs, cb->cb_dcc_control);
2846 radeon_emit(cmd_buffer->cs, cb->cb_color_cmask);
2847 radeon_emit(cmd_buffer->cs, S_028C80_BASE_256B(cb->cb_color_cmask >> 32));
2848 radeon_emit(cmd_buffer->cs, cb->cb_color_fmask);
2849 radeon_emit(cmd_buffer->cs, S_028C88_BASE_256B(cb->cb_color_fmask >> 32));
2850
2851 radeon_set_context_reg_seq(cmd_buffer->cs, R_028C94_CB_COLOR0_DCC_BASE + index * 0x3c, 2);
2852 radeon_emit(cmd_buffer->cs, cb->cb_dcc_base);
2853 radeon_emit(cmd_buffer->cs, S_028C98_BASE_256B(cb->cb_dcc_base >> 32));
2854
2855 radeon_set_context_reg(cmd_buffer->cs, R_0287A0_CB_MRT0_EPITCH + index * 4, cb->cb_mrt_epitch);
2856 } else {
2857 radeon_set_context_reg_seq(cmd_buffer->cs, R_028C60_CB_COLOR0_BASE + index * 0x3c, 11);
2858 radeon_emit(cmd_buffer->cs, cb->cb_color_base);
2859 radeon_emit(cmd_buffer->cs, cb->cb_color_pitch);
2860 radeon_emit(cmd_buffer->cs, cb->cb_color_slice);
2861 radeon_emit(cmd_buffer->cs, cb->cb_color_view);
2862 radeon_emit(cmd_buffer->cs, cb_color_info);
2863 radeon_emit(cmd_buffer->cs, cb->cb_color_attrib);
2864 radeon_emit(cmd_buffer->cs, cb->cb_dcc_control);
2865 radeon_emit(cmd_buffer->cs, cb->cb_color_cmask);
2866 radeon_emit(cmd_buffer->cs, cb->cb_color_cmask_slice);
2867 radeon_emit(cmd_buffer->cs, cb->cb_color_fmask);
2868 radeon_emit(cmd_buffer->cs, cb->cb_color_fmask_slice);
2869
2870 if (is_vi) { /* DCC BASE */
2871 radeon_set_context_reg(cmd_buffer->cs, R_028C94_CB_COLOR0_DCC_BASE + index * 0x3c, cb->cb_dcc_base);
2872 }
2873 }
2874
2875 if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX11 ? G_028C78_FDCC_ENABLE(cb_fdcc_control)
2876 : G_028C70_DCC_ENABLE(cb_color_info)) {
2877 /* Drawing with DCC enabled also compresses colorbuffers. */
2878 VkImageSubresourceRange range = {
2879 .aspectMask = iview->vk.aspects,
2880 .baseMipLevel = iview->vk.base_mip_level,
2881 .levelCount = iview->vk.level_count,
2882 .baseArrayLayer = iview->vk.base_array_layer,
2883 .layerCount = iview->vk.layer_count,
2884 };
2885
2886 radv_update_dcc_metadata(cmd_buffer, image, &range, true);
2887 }
2888 }
2889
2890 static void
2891 radv_update_zrange_precision(struct radv_cmd_buffer *cmd_buffer, struct radv_ds_buffer_info *ds,
2892 const struct radv_image_view *iview, bool requires_cond_exec)
2893 {
2894 const struct radv_image *image = iview->image;
2895 uint32_t db_z_info = ds->db_z_info;
2896 uint32_t db_z_info_reg;
2897
2898 if (!cmd_buffer->device->physical_device->rad_info.has_tc_compat_zrange_bug || !radv_image_is_tc_compat_htile(image))
2899 return;
2900
2901 db_z_info &= C_028040_ZRANGE_PRECISION;
2902
2903 if (cmd_buffer->device->physical_device->rad_info.gfx_level == GFX9) {
2904 db_z_info_reg = R_028038_DB_Z_INFO;
2905 } else {
2906 db_z_info_reg = R_028040_DB_Z_INFO;
2907 }
2908
2909    /* When we don't know the last fast clear value, we need to emit a
2910 * conditional packet that will eventually skip the following
2911 * SET_CONTEXT_REG packet.
2912 */
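   /* The predicate at 'va' is written by radv_set_tc_compat_zrange_metadata(); see
    * radv_update_tc_compat_zrange_metadata(), which stores UINT_MAX when the last depth clear
    * value is 0.0f and 0 otherwise, so the ZRANGE_PRECISION update below is dropped in the
    * other cases.
    */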
2913 if (requires_cond_exec) {
2914 uint64_t va = radv_get_tc_compat_zrange_va(image, iview->vk.base_mip_level);
2915
2916 radeon_emit(cmd_buffer->cs, PKT3(PKT3_COND_EXEC, 3, 0));
2917 radeon_emit(cmd_buffer->cs, va);
2918 radeon_emit(cmd_buffer->cs, va >> 32);
2919 radeon_emit(cmd_buffer->cs, 0);
2920 radeon_emit(cmd_buffer->cs, 3); /* SET_CONTEXT_REG size */
2921 }
2922
2923 radeon_set_context_reg(cmd_buffer->cs, db_z_info_reg, db_z_info);
2924 }
2925
2926 static struct radv_image *
2927 radv_cmd_buffer_get_vrs_image(struct radv_cmd_buffer *cmd_buffer)
2928 {
2929 struct radv_device *device = cmd_buffer->device;
2930
2931 if (!device->vrs.image) {
2932 VkResult result;
2933
2934 /* The global VRS state is initialized on-demand to avoid wasting VRAM. */
2935 result = radv_device_init_vrs_state(device);
2936 if (result != VK_SUCCESS) {
2937 vk_command_buffer_set_error(&cmd_buffer->vk, result);
2938 return NULL;
2939 }
2940 }
2941
2942 return device->vrs.image;
2943 }
2944
2945 static void
2946 radv_emit_fb_ds_state(struct radv_cmd_buffer *cmd_buffer, struct radv_ds_buffer_info *ds, struct radv_image_view *iview,
2947 bool depth_compressed, bool stencil_compressed)
2948 {
2949 uint64_t db_htile_data_base = ds->db_htile_data_base;
2950 uint32_t db_htile_surface = ds->db_htile_surface;
2951 uint32_t db_render_control = ds->db_render_control | cmd_buffer->state.db_render_control;
2952 uint32_t db_z_info = ds->db_z_info;
2953
2954 if (!depth_compressed)
2955 db_render_control |= S_028000_DEPTH_COMPRESS_DISABLE(1);
2956 if (!stencil_compressed)
2957 db_render_control |= S_028000_STENCIL_COMPRESS_DISABLE(1);
2958
2959 if (cmd_buffer->device->physical_device->rad_info.gfx_level == GFX10_3) {
2960 if (!cmd_buffer->state.render.vrs_att.iview) {
2961 db_htile_surface &= C_028ABC_VRS_HTILE_ENCODING;
2962 } else {
2963          /* On GFX10.3, when a subpass uses a VRS attachment but HTILE can't be enabled, we fall back to
2964 * our internal HTILE buffer.
2965 */
2966 if (!radv_htile_enabled(iview->image, iview->vk.base_mip_level) && radv_cmd_buffer_get_vrs_image(cmd_buffer)) {
2967 struct radv_buffer *htile_buffer = cmd_buffer->device->vrs.buffer;
2968
2969 assert(!G_028038_TILE_SURFACE_ENABLE(db_z_info) && !db_htile_data_base && !db_htile_surface);
2970 db_z_info |= S_028038_TILE_SURFACE_ENABLE(1);
2971 db_htile_data_base = radv_buffer_get_va(htile_buffer->bo) >> 8;
2972 db_htile_surface = S_028ABC_FULL_CACHE(1) | S_028ABC_PIPE_ALIGNED(1) |
2973 S_028ABC_VRS_HTILE_ENCODING(V_028ABC_VRS_HTILE_4BIT_ENCODING);
2974 }
2975 }
2976 }
2977
2978 radeon_set_context_reg(cmd_buffer->cs, R_028000_DB_RENDER_CONTROL, db_render_control);
2979 radeon_set_context_reg(cmd_buffer->cs, R_028008_DB_DEPTH_VIEW, ds->db_depth_view);
2980 radeon_set_context_reg(cmd_buffer->cs, R_028010_DB_RENDER_OVERRIDE2, ds->db_render_override2);
2981 radeon_set_context_reg(cmd_buffer->cs, R_028ABC_DB_HTILE_SURFACE, db_htile_surface);
2982
2983 if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX10) {
2984 radeon_set_context_reg(cmd_buffer->cs, R_028014_DB_HTILE_DATA_BASE, db_htile_data_base);
2985 radeon_set_context_reg(cmd_buffer->cs, R_02801C_DB_DEPTH_SIZE_XY, ds->db_depth_size);
2986
2987 if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX11) {
2988 radeon_set_context_reg_seq(cmd_buffer->cs, R_028040_DB_Z_INFO, 6);
2989 } else {
2990 radeon_set_context_reg_seq(cmd_buffer->cs, R_02803C_DB_DEPTH_INFO, 7);
2991 radeon_emit(cmd_buffer->cs, S_02803C_RESOURCE_LEVEL(1));
2992 }
2993 radeon_emit(cmd_buffer->cs, db_z_info);
2994 radeon_emit(cmd_buffer->cs, ds->db_stencil_info);
2995 radeon_emit(cmd_buffer->cs, ds->db_z_read_base);
2996 radeon_emit(cmd_buffer->cs, ds->db_stencil_read_base);
2997 radeon_emit(cmd_buffer->cs, ds->db_z_read_base);
2998 radeon_emit(cmd_buffer->cs, ds->db_stencil_read_base);
2999
3000 radeon_set_context_reg_seq(cmd_buffer->cs, R_028068_DB_Z_READ_BASE_HI, 5);
3001 radeon_emit(cmd_buffer->cs, ds->db_z_read_base >> 32);
3002 radeon_emit(cmd_buffer->cs, ds->db_stencil_read_base >> 32);
3003 radeon_emit(cmd_buffer->cs, ds->db_z_read_base >> 32);
3004 radeon_emit(cmd_buffer->cs, ds->db_stencil_read_base >> 32);
3005 radeon_emit(cmd_buffer->cs, db_htile_data_base >> 32);
3006 } else if (cmd_buffer->device->physical_device->rad_info.gfx_level == GFX9) {
3007 radeon_set_context_reg_seq(cmd_buffer->cs, R_028014_DB_HTILE_DATA_BASE, 3);
3008 radeon_emit(cmd_buffer->cs, db_htile_data_base);
3009 radeon_emit(cmd_buffer->cs, S_028018_BASE_HI(db_htile_data_base >> 32));
3010 radeon_emit(cmd_buffer->cs, ds->db_depth_size);
3011
3012 radeon_set_context_reg_seq(cmd_buffer->cs, R_028038_DB_Z_INFO, 10);
3013 radeon_emit(cmd_buffer->cs, db_z_info); /* DB_Z_INFO */
3014 radeon_emit(cmd_buffer->cs, ds->db_stencil_info); /* DB_STENCIL_INFO */
3015 radeon_emit(cmd_buffer->cs, ds->db_z_read_base); /* DB_Z_READ_BASE */
3016 radeon_emit(cmd_buffer->cs, S_028044_BASE_HI(ds->db_z_read_base >> 32)); /* DB_Z_READ_BASE_HI */
3017 radeon_emit(cmd_buffer->cs, ds->db_stencil_read_base); /* DB_STENCIL_READ_BASE */
3018 radeon_emit(cmd_buffer->cs, S_02804C_BASE_HI(ds->db_stencil_read_base >> 32)); /* DB_STENCIL_READ_BASE_HI */
3019 radeon_emit(cmd_buffer->cs, ds->db_z_write_base); /* DB_Z_WRITE_BASE */
3020 radeon_emit(cmd_buffer->cs, S_028054_BASE_HI(ds->db_z_write_base >> 32)); /* DB_Z_WRITE_BASE_HI */
3021 radeon_emit(cmd_buffer->cs, ds->db_stencil_write_base); /* DB_STENCIL_WRITE_BASE */
3022 radeon_emit(cmd_buffer->cs, S_02805C_BASE_HI(ds->db_stencil_write_base >> 32)); /* DB_STENCIL_WRITE_BASE_HI */
3023
3024 radeon_set_context_reg_seq(cmd_buffer->cs, R_028068_DB_Z_INFO2, 2);
3025 radeon_emit(cmd_buffer->cs, ds->db_z_info2);
3026 radeon_emit(cmd_buffer->cs, ds->db_stencil_info2);
3027 } else {
3028 radeon_set_context_reg(cmd_buffer->cs, R_028014_DB_HTILE_DATA_BASE, db_htile_data_base);
3029
3030 radeon_set_context_reg_seq(cmd_buffer->cs, R_02803C_DB_DEPTH_INFO, 9);
3031 radeon_emit(cmd_buffer->cs, ds->db_depth_info); /* R_02803C_DB_DEPTH_INFO */
3032 radeon_emit(cmd_buffer->cs, db_z_info); /* R_028040_DB_Z_INFO */
3033 radeon_emit(cmd_buffer->cs, ds->db_stencil_info); /* R_028044_DB_STENCIL_INFO */
3034 radeon_emit(cmd_buffer->cs, ds->db_z_read_base); /* R_028048_DB_Z_READ_BASE */
3035 radeon_emit(cmd_buffer->cs, ds->db_stencil_read_base); /* R_02804C_DB_STENCIL_READ_BASE */
3036 radeon_emit(cmd_buffer->cs, ds->db_z_write_base); /* R_028050_DB_Z_WRITE_BASE */
3037 radeon_emit(cmd_buffer->cs, ds->db_stencil_write_base); /* R_028054_DB_STENCIL_WRITE_BASE */
3038 radeon_emit(cmd_buffer->cs, ds->db_depth_size); /* R_028058_DB_DEPTH_SIZE */
3039 radeon_emit(cmd_buffer->cs, ds->db_depth_slice); /* R_02805C_DB_DEPTH_SLICE */
3040 }
3041
3042 /* Update the ZRANGE_PRECISION value for the TC-compat bug. */
3043 radv_update_zrange_precision(cmd_buffer, ds, iview, true);
3044 }
3045
3046 static void
3047 radv_emit_null_ds_state(struct radv_cmd_buffer *cmd_buffer)
3048 {
3049 const enum amd_gfx_level gfx_level = cmd_buffer->device->physical_device->rad_info.gfx_level;
3050 unsigned db_render_control = 0;
3051 unsigned num_samples = 0;
3052
3053 /* On GFX11, DB_Z_INFO.NUM_SAMPLES should always match MSAA_EXPOSED_SAMPLES. It affects VRS,
3054 * occlusion queries and Primitive Ordered Pixel Shading if depth and stencil are not bound.
3055 */
3056 if (gfx_level == GFX11) {
3057 num_samples = util_logbase2(radv_get_rasterization_samples(cmd_buffer));
3058 radv_gfx11_set_db_render_control(cmd_buffer->device, 1, &db_render_control);
3059 }
3060
3061 if (gfx_level == GFX9) {
3062 radeon_set_context_reg_seq(cmd_buffer->cs, R_028038_DB_Z_INFO, 2);
3063 } else {
3064 radeon_set_context_reg_seq(cmd_buffer->cs, R_028040_DB_Z_INFO, 2);
3065 }
3066
3067 radeon_emit(cmd_buffer->cs, S_028040_FORMAT(V_028040_Z_INVALID) | S_028040_NUM_SAMPLES(num_samples));
3068 radeon_emit(cmd_buffer->cs, S_028044_FORMAT(V_028044_STENCIL_INVALID));
3069
3070 radeon_set_context_reg(cmd_buffer->cs, R_028000_DB_RENDER_CONTROL, db_render_control);
3071 radeon_set_context_reg(cmd_buffer->cs, R_028010_DB_RENDER_OVERRIDE2,
3072 S_028010_CENTROID_COMPUTATION_MODE(gfx_level >= GFX10_3));
3073 }
3074 /**
3075 * Update the fast clear depth/stencil values if the image is bound as a
3076 * depth/stencil buffer.
3077 */
3078 static void
3079 radv_update_bound_fast_clear_ds(struct radv_cmd_buffer *cmd_buffer, const struct radv_image_view *iview,
3080 VkClearDepthStencilValue ds_clear_value, VkImageAspectFlags aspects)
3081 {
3082 const struct radv_image *image = iview->image;
3083 struct radeon_cmdbuf *cs = cmd_buffer->cs;
3084
3085 if (cmd_buffer->state.render.ds_att.iview == NULL || cmd_buffer->state.render.ds_att.iview->image != image)
3086 return;
3087
3088 if (aspects == (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT)) {
3089 radeon_set_context_reg_seq(cs, R_028028_DB_STENCIL_CLEAR, 2);
3090 radeon_emit(cs, ds_clear_value.stencil);
3091 radeon_emit(cs, fui(ds_clear_value.depth));
3092 } else if (aspects == VK_IMAGE_ASPECT_DEPTH_BIT) {
3093 radeon_set_context_reg(cs, R_02802C_DB_DEPTH_CLEAR, fui(ds_clear_value.depth));
3094 } else {
3095 assert(aspects == VK_IMAGE_ASPECT_STENCIL_BIT);
3096 radeon_set_context_reg(cs, R_028028_DB_STENCIL_CLEAR, ds_clear_value.stencil);
3097 }
3098
3099 /* Update the ZRANGE_PRECISION value for the TC-compat bug. This is
3100 * only needed when clearing Z to 0.0.
3101 */
3102 if ((aspects & VK_IMAGE_ASPECT_DEPTH_BIT) && ds_clear_value.depth == 0.0) {
3103 radv_update_zrange_precision(cmd_buffer, &cmd_buffer->state.render.ds_att.ds, iview, false);
3104 }
3105
3106 cmd_buffer->state.context_roll_without_scissor_emitted = true;
3107 }
3108
3109 /**
3110 * Set the clear depth/stencil values to the image's metadata.
3111 */
3112 static void
3113 radv_set_ds_clear_metadata(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image,
3114 const VkImageSubresourceRange *range, VkClearDepthStencilValue ds_clear_value,
3115 VkImageAspectFlags aspects)
3116 {
3117 struct radeon_cmdbuf *cs = cmd_buffer->cs;
3118 uint32_t level_count = vk_image_subresource_level_count(&image->vk, range);
3119
3120 if (aspects == (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT)) {
3121 uint64_t va = radv_get_ds_clear_value_va(image, range->baseMipLevel);
3122
3123 /* Use the fastest way when both aspects are used. */
3124 ASSERTED unsigned cdw_end = radv_cs_write_data_head(cmd_buffer->device, cmd_buffer->cs, cmd_buffer->qf, V_370_PFP,
3125 va, 2 * level_count, cmd_buffer->state.predicating);
3126
3127 for (uint32_t l = 0; l < level_count; l++) {
3128 radeon_emit(cs, ds_clear_value.stencil);
3129 radeon_emit(cs, fui(ds_clear_value.depth));
3130 }
3131
3132 assert(cmd_buffer->cs->cdw == cdw_end);
3133 } else {
3134 /* Otherwise we need one WRITE_DATA packet per level. */
3135 for (uint32_t l = 0; l < level_count; l++) {
3136 uint64_t va = radv_get_ds_clear_value_va(image, range->baseMipLevel + l);
3137 unsigned value;
3138
3139 if (aspects == VK_IMAGE_ASPECT_DEPTH_BIT) {
3140 value = fui(ds_clear_value.depth);
3141 va += 4;
3142 } else {
3143 assert(aspects == VK_IMAGE_ASPECT_STENCIL_BIT);
3144 value = ds_clear_value.stencil;
3145 }
3146
3147 radv_write_data(cmd_buffer, V_370_PFP, va, 1, &value, cmd_buffer->state.predicating);
3148 }
3149 }
3150 }
3151
3152 /**
3153 * Update the TC-compat metadata value for this image.
3154 */
3155 static void
3156 radv_set_tc_compat_zrange_metadata(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image,
3157 const VkImageSubresourceRange *range, uint32_t value)
3158 {
3159 struct radeon_cmdbuf *cs = cmd_buffer->cs;
3160
3161 if (!cmd_buffer->device->physical_device->rad_info.has_tc_compat_zrange_bug)
3162 return;
3163
3164 uint64_t va = radv_get_tc_compat_zrange_va(image, range->baseMipLevel);
3165 uint32_t level_count = vk_image_subresource_level_count(&image->vk, range);
3166
3167 ASSERTED unsigned cdw_end = radv_cs_write_data_head(cmd_buffer->device, cmd_buffer->cs, cmd_buffer->qf, V_370_PFP,
3168 va, level_count, cmd_buffer->state.predicating);
3169
3170 for (uint32_t l = 0; l < level_count; l++)
3171 radeon_emit(cs, value);
3172
3173 assert(cmd_buffer->cs->cdw == cdw_end);
3174 }
3175
3176 static void
3177 radv_update_tc_compat_zrange_metadata(struct radv_cmd_buffer *cmd_buffer, const struct radv_image_view *iview,
3178 VkClearDepthStencilValue ds_clear_value)
3179 {
3180 VkImageSubresourceRange range = {
3181 .aspectMask = iview->vk.aspects,
3182 .baseMipLevel = iview->vk.base_mip_level,
3183 .levelCount = iview->vk.level_count,
3184 .baseArrayLayer = iview->vk.base_array_layer,
3185 .layerCount = iview->vk.layer_count,
3186 };
3187 uint32_t cond_val;
3188
3189 /* Conditionally set DB_Z_INFO.ZRANGE_PRECISION to 0 when the last
3190 * depth clear value is 0.0f.
3191 */
3192 cond_val = ds_clear_value.depth == 0.0f ? UINT_MAX : 0;
3193
3194 radv_set_tc_compat_zrange_metadata(cmd_buffer, iview->image, &range, cond_val);
3195 }
3196
3197 /**
3198 * Update the clear depth/stencil values for this image.
3199 */
3200 void
3201 radv_update_ds_clear_metadata(struct radv_cmd_buffer *cmd_buffer, const struct radv_image_view *iview,
3202 VkClearDepthStencilValue ds_clear_value, VkImageAspectFlags aspects)
3203 {
3204 VkImageSubresourceRange range = {
3205 .aspectMask = iview->vk.aspects,
3206 .baseMipLevel = iview->vk.base_mip_level,
3207 .levelCount = iview->vk.level_count,
3208 .baseArrayLayer = iview->vk.base_array_layer,
3209 .layerCount = iview->vk.layer_count,
3210 };
3211 struct radv_image *image = iview->image;
3212
3213 assert(radv_htile_enabled(image, range.baseMipLevel));
3214
3215 radv_set_ds_clear_metadata(cmd_buffer, iview->image, &range, ds_clear_value, aspects);
3216
3217 if (radv_image_is_tc_compat_htile(image) && (aspects & VK_IMAGE_ASPECT_DEPTH_BIT)) {
3218 radv_update_tc_compat_zrange_metadata(cmd_buffer, iview, ds_clear_value);
3219 }
3220
3221 radv_update_bound_fast_clear_ds(cmd_buffer, iview, ds_clear_value, aspects);
3222 }
3223
3224 /**
3225 * Load the clear depth/stencil values from the image's metadata.
3226 */
3227 static void
3228 radv_load_ds_clear_metadata(struct radv_cmd_buffer *cmd_buffer, const struct radv_image_view *iview)
3229 {
3230 struct radeon_cmdbuf *cs = cmd_buffer->cs;
3231 const struct radv_image *image = iview->image;
3232 VkImageAspectFlags aspects = vk_format_aspects(image->vk.format);
3233 uint64_t va = radv_get_ds_clear_value_va(image, iview->vk.base_mip_level);
3234 unsigned reg_offset = 0, reg_count = 0;
3235
3236 assert(radv_image_has_htile(image));
3237
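   /* The clear value metadata stores the stencil dword followed by the depth dword (see
    * radv_set_ds_clear_metadata()), matching the DB_STENCIL_CLEAR/DB_DEPTH_CLEAR register pair,
    * so depth-only loads skip the first dword and the first register.
    */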
3238 if (aspects & VK_IMAGE_ASPECT_STENCIL_BIT) {
3239 ++reg_count;
3240 } else {
3241 ++reg_offset;
3242 va += 4;
3243 }
3244 if (aspects & VK_IMAGE_ASPECT_DEPTH_BIT)
3245 ++reg_count;
3246
3247 uint32_t reg = R_028028_DB_STENCIL_CLEAR + 4 * reg_offset;
3248
3249 if (cmd_buffer->device->physical_device->rad_info.has_load_ctx_reg_pkt) {
3250 radeon_emit(cs, PKT3(PKT3_LOAD_CONTEXT_REG_INDEX, 3, 0));
3251 radeon_emit(cs, va);
3252 radeon_emit(cs, va >> 32);
3253 radeon_emit(cs, (reg - SI_CONTEXT_REG_OFFSET) >> 2);
3254 radeon_emit(cs, reg_count);
3255 } else {
3256 radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
3257 radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_SRC_MEM) | COPY_DATA_DST_SEL(COPY_DATA_REG) |
3258 (reg_count == 2 ? COPY_DATA_COUNT_SEL : 0));
3259 radeon_emit(cs, va);
3260 radeon_emit(cs, va >> 32);
3261 radeon_emit(cs, reg >> 2);
3262 radeon_emit(cs, 0);
3263
3264 radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0));
3265 radeon_emit(cs, 0);
3266 }
3267 }
3268
3269 /*
3270  * With DCC, some color surfaces don't require CMASK elimination before being
3271  * used as a texture. This sets a predicate value that determines whether the
3272  * CMASK eliminate pass is required.
3273 */
3274 void
3275 radv_update_fce_metadata(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image,
3276 const VkImageSubresourceRange *range, bool value)
3277 {
3278 if (!image->fce_pred_offset)
3279 return;
3280
3281 uint64_t pred_val = value;
3282 uint64_t va = radv_image_get_fce_pred_va(image, range->baseMipLevel);
3283 uint32_t level_count = vk_image_subresource_level_count(&image->vk, range);
3284
3285 ASSERTED unsigned cdw_end = radv_cs_write_data_head(cmd_buffer->device, cmd_buffer->cs, cmd_buffer->qf, V_370_PFP,
3286 va, 2 * level_count, false);
3287
3288 for (uint32_t l = 0; l < level_count; l++) {
3289 radeon_emit(cmd_buffer->cs, pred_val);
3290 radeon_emit(cmd_buffer->cs, pred_val >> 32);
3291 }
3292
3293 assert(cmd_buffer->cs->cdw == cdw_end);
3294 }
3295
3296 /**
3297 * Update the DCC predicate to reflect the compression state.
3298 */
3299 void
3300 radv_update_dcc_metadata(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image,
3301 const VkImageSubresourceRange *range, bool value)
3302 {
3303 if (image->dcc_pred_offset == 0)
3304 return;
3305
3306 uint64_t pred_val = value;
3307 uint64_t va = radv_image_get_dcc_pred_va(image, range->baseMipLevel);
3308 uint32_t level_count = vk_image_subresource_level_count(&image->vk, range);
3309
3310 assert(radv_dcc_enabled(image, range->baseMipLevel));
3311
3312 ASSERTED unsigned cdw_end = radv_cs_write_data_head(cmd_buffer->device, cmd_buffer->cs, cmd_buffer->qf, V_370_PFP,
3313 va, 2 * level_count, false);
3314
3315 for (uint32_t l = 0; l < level_count; l++) {
3316 radeon_emit(cmd_buffer->cs, pred_val);
3317 radeon_emit(cmd_buffer->cs, pred_val >> 32);
3318 }
3319
3320 assert(cmd_buffer->cs->cdw == cdw_end);
3321 }
3322
3323 /**
3324 * Update the fast clear color values if the image is bound as a color buffer.
3325 */
3326 static void
3327 radv_update_bound_fast_clear_color(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image, int cb_idx,
3328 uint32_t color_values[2])
3329 {
3330 struct radeon_cmdbuf *cs = cmd_buffer->cs;
3331
3332 if (cb_idx >= cmd_buffer->state.render.color_att_count || cmd_buffer->state.render.color_att[cb_idx].iview == NULL ||
3333 cmd_buffer->state.render.color_att[cb_idx].iview->image != image)
3334 return;
3335
3336 ASSERTED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 4);
3337
3338 radeon_set_context_reg_seq(cs, R_028C8C_CB_COLOR0_CLEAR_WORD0 + cb_idx * 0x3c, 2);
3339 radeon_emit(cs, color_values[0]);
3340 radeon_emit(cs, color_values[1]);
3341
3342 assert(cmd_buffer->cs->cdw <= cdw_max);
3343
3344 cmd_buffer->state.context_roll_without_scissor_emitted = true;
3345 }
3346
3347 /**
3348 * Set the clear color values to the image's metadata.
3349 */
3350 static void
3351 radv_set_color_clear_metadata(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image,
3352 const VkImageSubresourceRange *range, uint32_t color_values[2])
3353 {
3354 struct radeon_cmdbuf *cs = cmd_buffer->cs;
3355 uint32_t level_count = vk_image_subresource_level_count(&image->vk, range);
3356
3357 assert(radv_image_has_cmask(image) || radv_dcc_enabled(image, range->baseMipLevel));
3358
3359 if (radv_image_has_clear_value(image)) {
3360 uint64_t va = radv_image_get_fast_clear_va(image, range->baseMipLevel);
3361
3362 ASSERTED unsigned cdw_end = radv_cs_write_data_head(cmd_buffer->device, cmd_buffer->cs, cmd_buffer->qf, V_370_PFP,
3363 va, 2 * level_count, cmd_buffer->state.predicating);
3364
3365 for (uint32_t l = 0; l < level_count; l++) {
3366 radeon_emit(cs, color_values[0]);
3367 radeon_emit(cs, color_values[1]);
3368 }
3369
3370 assert(cmd_buffer->cs->cdw == cdw_end);
3371 } else {
3372       /* Without clear value metadata, only the default value of zero can be set. */
3373 assert(color_values[0] == 0 && color_values[1] == 0);
3374 }
3375 }
3376
3377 /**
3378 * Update the clear color values for this image.
3379 */
3380 void
3381 radv_update_color_clear_metadata(struct radv_cmd_buffer *cmd_buffer, const struct radv_image_view *iview, int cb_idx,
3382 uint32_t color_values[2])
3383 {
3384 struct radv_image *image = iview->image;
3385 VkImageSubresourceRange range = {
3386 .aspectMask = iview->vk.aspects,
3387 .baseMipLevel = iview->vk.base_mip_level,
3388 .levelCount = iview->vk.level_count,
3389 .baseArrayLayer = iview->vk.base_array_layer,
3390 .layerCount = iview->vk.layer_count,
3391 };
3392
3393 assert(radv_image_has_cmask(image) || radv_dcc_enabled(image, iview->vk.base_mip_level));
3394
3395 /* Do not need to update the clear value for images that are fast cleared with the comp-to-single
3396 * mode because the hardware gets the value from the image directly.
3397 */
3398 if (iview->image->support_comp_to_single)
3399 return;
3400
3401 radv_set_color_clear_metadata(cmd_buffer, image, &range, color_values);
3402
3403 radv_update_bound_fast_clear_color(cmd_buffer, image, cb_idx, color_values);
3404 }
3405
3406 /**
3407 * Load the clear color values from the image's metadata.
3408 */
3409 static void
3410 radv_load_color_clear_metadata(struct radv_cmd_buffer *cmd_buffer, struct radv_image_view *iview, int cb_idx)
3411 {
3412 struct radeon_cmdbuf *cs = cmd_buffer->cs;
3413 struct radv_image *image = iview->image;
3414
3415 if (!radv_image_has_cmask(image) && !radv_dcc_enabled(image, iview->vk.base_mip_level))
3416 return;
3417
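   /* Images fast-cleared with the comp-to-single mode don't use the clear value metadata (see
    * radv_update_color_clear_metadata()), so there is nothing to load.
    */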
3418 if (iview->image->support_comp_to_single)
3419 return;
3420
3421 if (!radv_image_has_clear_value(image)) {
3422 uint32_t color_values[2] = {0, 0};
3423 radv_update_bound_fast_clear_color(cmd_buffer, image, cb_idx, color_values);
3424 return;
3425 }
3426
3427 uint64_t va = radv_image_get_fast_clear_va(image, iview->vk.base_mip_level);
3428 uint32_t reg = R_028C8C_CB_COLOR0_CLEAR_WORD0 + cb_idx * 0x3c;
3429
3430 if (cmd_buffer->device->physical_device->rad_info.has_load_ctx_reg_pkt) {
3431 radeon_emit(cs, PKT3(PKT3_LOAD_CONTEXT_REG_INDEX, 3, cmd_buffer->state.predicating));
3432 radeon_emit(cs, va);
3433 radeon_emit(cs, va >> 32);
3434 radeon_emit(cs, (reg - SI_CONTEXT_REG_OFFSET) >> 2);
3435 radeon_emit(cs, 2);
3436 } else {
3437 radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, cmd_buffer->state.predicating));
3438 radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_SRC_MEM) | COPY_DATA_DST_SEL(COPY_DATA_REG) | COPY_DATA_COUNT_SEL);
3439 radeon_emit(cs, va);
3440 radeon_emit(cs, va >> 32);
3441 radeon_emit(cs, reg >> 2);
3442 radeon_emit(cs, 0);
3443
3444 radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, cmd_buffer->state.predicating));
3445 radeon_emit(cs, 0);
3446 }
3447 }
3448
3449 /* GFX9+ metadata cache flushing workaround. Metadata cache coherency is
3450 * broken if the CB caches data of multiple mips of the same image at the
3451 * same time.
3452 *
3453 * Insert some flushes to avoid this.
3454 */
3455 static void
3456 radv_emit_fb_mip_change_flush(struct radv_cmd_buffer *cmd_buffer)
3457 {
3458 struct radv_rendering_state *render = &cmd_buffer->state.render;
3459 bool color_mip_changed = false;
3460
3461 /* Entire workaround is not applicable before GFX9 */
3462 if (cmd_buffer->device->physical_device->rad_info.gfx_level < GFX9)
3463 return;
3464
3465 for (int i = 0; i < render->color_att_count; ++i) {
3466 struct radv_image_view *iview = render->color_att[i].iview;
3467 if (!iview)
3468 continue;
3469
3470 if ((radv_image_has_cmask(iview->image) || radv_dcc_enabled(iview->image, iview->vk.base_mip_level) ||
3471 radv_dcc_enabled(iview->image, cmd_buffer->state.cb_mip[i])) &&
3472 cmd_buffer->state.cb_mip[i] != iview->vk.base_mip_level)
3473 color_mip_changed = true;
3474
3475 cmd_buffer->state.cb_mip[i] = iview->vk.base_mip_level;
3476 }
3477
3478 if (color_mip_changed) {
3479 cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB | RADV_CMD_FLAG_FLUSH_AND_INV_CB_META;
3480 }
3481
3482 const struct radv_image_view *iview = render->ds_att.iview;
3483 if (iview) {
3484 if ((radv_htile_enabled(iview->image, iview->vk.base_mip_level) ||
3485 radv_htile_enabled(iview->image, cmd_buffer->state.ds_mip)) &&
3486 cmd_buffer->state.ds_mip != iview->vk.base_mip_level) {
3487 cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB | RADV_CMD_FLAG_FLUSH_AND_INV_DB_META;
3488 }
3489
3490 cmd_buffer->state.ds_mip = iview->vk.base_mip_level;
3491 }
3492 }
3493
3494 /* This function does the flushes for mip changes if the levels are not zero for
3495 * all render targets. This way we can assume at the start of the next cmd_buffer
3496  * that rendering to mip 0 doesn't need any flushes. Since that is the most common
3497  * case, this saves some flushes. */
3498 static void
3499 radv_emit_mip_change_flush_default(struct radv_cmd_buffer *cmd_buffer)
3500 {
3501 /* Entire workaround is not applicable before GFX9 */
3502 if (cmd_buffer->device->physical_device->rad_info.gfx_level < GFX9)
3503 return;
3504
3505 bool need_color_mip_flush = false;
3506 for (unsigned i = 0; i < 8; ++i) {
3507 if (cmd_buffer->state.cb_mip[i]) {
3508 need_color_mip_flush = true;
3509 break;
3510 }
3511 }
3512
3513 if (need_color_mip_flush) {
3514 cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB | RADV_CMD_FLAG_FLUSH_AND_INV_CB_META;
3515 }
3516
3517 if (cmd_buffer->state.ds_mip) {
3518 cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB | RADV_CMD_FLAG_FLUSH_AND_INV_DB_META;
3519 }
3520
3521 memset(cmd_buffer->state.cb_mip, 0, sizeof(cmd_buffer->state.cb_mip));
3522 cmd_buffer->state.ds_mip = 0;
3523 }
3524
3593 static void
3594 radv_emit_framebuffer_state(struct radv_cmd_buffer *cmd_buffer)
3595 {
3596 struct radv_rendering_state *render = &cmd_buffer->state.render;
3597 int i;
3598 bool disable_constant_encode_ac01 = false;
3599 unsigned color_invalid = cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX11
3600 ? S_028C70_FORMAT_GFX11(V_028C70_COLOR_INVALID)
3601 : S_028C70_FORMAT_GFX6(V_028C70_COLOR_INVALID);
3602 VkExtent2D extent = {MAX_FRAMEBUFFER_WIDTH, MAX_FRAMEBUFFER_HEIGHT};
3603
3604 ASSERTED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 51 + MAX_RTS * 70);
3605
3606 for (i = 0; i < render->color_att_count; ++i) {
3607 struct radv_image_view *iview = render->color_att[i].iview;
3608 if (!iview) {
3609 radeon_set_context_reg(cmd_buffer->cs, R_028C70_CB_COLOR0_INFO + i * 0x3C, color_invalid);
3610 continue;
3611 }
3612
3613 VkImageLayout layout = render->color_att[i].layout;
3614
3615 radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, iview->image->bindings[0].bo);
3616
3617 assert(iview->vk.aspects & (VK_IMAGE_ASPECT_COLOR_BIT | VK_IMAGE_ASPECT_PLANE_0_BIT |
3618 VK_IMAGE_ASPECT_PLANE_1_BIT | VK_IMAGE_ASPECT_PLANE_2_BIT));
3619
3620 if (iview->image->disjoint && iview->vk.aspects == VK_IMAGE_ASPECT_COLOR_BIT) {
3621 for (uint32_t plane_id = 0; plane_id < iview->image->plane_count; plane_id++) {
3622 radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, iview->image->bindings[plane_id].bo);
3623 }
3624 } else {
3625 uint32_t plane_id = iview->image->disjoint ? iview->plane_id : 0;
3626 radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, iview->image->bindings[plane_id].bo);
3627 }
3628
3629 radv_emit_fb_color_state(cmd_buffer, i, &render->color_att[i].cb, iview, layout);
3630
3631 radv_load_color_clear_metadata(cmd_buffer, iview, i);
3632
3633 if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX9 && iview->image->dcc_sign_reinterpret) {
3634 /* Disable constant encoding with the clear value of "1" with different DCC signedness
3635 * because the hardware will fill "1" instead of the clear value.
3636 */
3637 disable_constant_encode_ac01 = true;
3638 }
3639
3640 extent.width = MIN2(extent.width, iview->vk.extent.width);
3641 extent.height = MIN2(extent.height, iview->vk.extent.height);
3642 }
3643 for (; i < cmd_buffer->state.last_subpass_color_count; i++) {
3644 radeon_set_context_reg(cmd_buffer->cs, R_028C70_CB_COLOR0_INFO + i * 0x3C, color_invalid);
3645 }
3646 cmd_buffer->state.last_subpass_color_count = render->color_att_count;
3647
3648 if (render->ds_att.iview) {
3649 struct radv_image_view *iview = render->ds_att.iview;
3650 const struct radv_image *image = iview->image;
3651 radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, image->bindings[0].bo);
3652
3653 uint32_t qf_mask = radv_image_queue_family_mask(image, cmd_buffer->qf, cmd_buffer->qf);
3654 bool depth_compressed =
3655 radv_layout_is_htile_compressed(cmd_buffer->device, image, render->ds_att.layout, qf_mask);
3656 bool stencil_compressed =
3657 radv_layout_is_htile_compressed(cmd_buffer->device, image, render->ds_att.stencil_layout, qf_mask);
3658
3659 radv_emit_fb_ds_state(cmd_buffer, &render->ds_att.ds, iview, depth_compressed, stencil_compressed);
3660
3661 if (depth_compressed || stencil_compressed) {
3662 /* Only load the depth/stencil fast clear values when
3663 * compressed rendering is enabled.
3664 */
3665 radv_load_ds_clear_metadata(cmd_buffer, iview);
3666 }
3667
3668 extent.width = MIN2(extent.width, iview->vk.extent.width);
3669 extent.height = MIN2(extent.height, iview->vk.extent.height);
3670 } else if (cmd_buffer->device->physical_device->rad_info.gfx_level == GFX10_3 && render->vrs_att.iview &&
3671 radv_cmd_buffer_get_vrs_image(cmd_buffer)) {
3672 /* When a subpass uses a VRS attachment without binding a depth/stencil attachment, we have to
3673 * bind our internal depth buffer that contains the VRS data as part of HTILE.
3674 */
3675 VkImageLayout layout = VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL;
3676 struct radv_buffer *htile_buffer = cmd_buffer->device->vrs.buffer;
3677 struct radv_image *image = cmd_buffer->device->vrs.image;
3678 struct radv_ds_buffer_info ds;
3679 struct radv_image_view iview;
3680
3681 radv_image_view_init(&iview, cmd_buffer->device,
3682 &(VkImageViewCreateInfo){
3683 .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO,
3684 .image = radv_image_to_handle(image),
3685 .viewType = radv_meta_get_view_type(image),
3686 .format = image->vk.format,
3687 .subresourceRange =
3688 {
3689 .aspectMask = VK_IMAGE_ASPECT_DEPTH_BIT,
3690 .baseMipLevel = 0,
3691 .levelCount = 1,
3692 .baseArrayLayer = 0,
3693 .layerCount = 1,
3694 },
3695 },
3696 0, NULL);
3697
3698 radv_initialise_vrs_surface(image, htile_buffer, &ds);
3699
3700 radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, htile_buffer->bo);
3701
3702 bool depth_compressed = radv_layout_is_htile_compressed(
3703 cmd_buffer->device, image, layout, radv_image_queue_family_mask(image, cmd_buffer->qf, cmd_buffer->qf));
3704 radv_emit_fb_ds_state(cmd_buffer, &ds, &iview, depth_compressed, false);
3705
3706 radv_image_view_finish(&iview);
3707 } else {
3708 radv_emit_null_ds_state(cmd_buffer);
3709 }
3710
3711 if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX11) {
3712 bool vrs_surface_enable = render->vrs_att.iview != NULL;
3713 unsigned xmax = 0, ymax = 0;
3714 uint64_t va = 0;
3715
3716 if (vrs_surface_enable) {
3717 struct radv_image *vrs_image = render->vrs_att.iview->image;
3718
3719 va = radv_buffer_get_va(vrs_image->bindings[0].bo) + vrs_image->bindings[0].offset;
3720 va |= vrs_image->planes[0].surface.tile_swizzle << 8;
3721
3722 xmax = vrs_image->vk.extent.width - 1;
3723 ymax = vrs_image->vk.extent.height - 1;
3724 }
3725
3726 radeon_set_context_reg_seq(cmd_buffer->cs, R_0283F0_PA_SC_VRS_RATE_BASE, 3);
3727 radeon_emit(cmd_buffer->cs, va >> 8);
3728 radeon_emit(cmd_buffer->cs, S_0283F4_BASE_256B(va >> 40));
3729 radeon_emit(cmd_buffer->cs, S_0283F8_X_MAX(xmax) | S_0283F8_Y_MAX(ymax));
3730
3731 radeon_set_context_reg(cmd_buffer->cs, R_0283D0_PA_SC_VRS_OVERRIDE_CNTL,
3732 S_0283D0_VRS_SURFACE_ENABLE(vrs_surface_enable));
3733 }
3734
3735 if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX8) {
3736 bool disable_constant_encode = cmd_buffer->device->physical_device->rad_info.has_dcc_constant_encode;
3737 enum amd_gfx_level gfx_level = cmd_buffer->device->physical_device->rad_info.gfx_level;
3738
3739 if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX11) {
3740 radeon_set_context_reg(cmd_buffer->cs, R_028424_CB_FDCC_CONTROL, S_028424_SAMPLE_MASK_TRACKER_WATERMARK(0));
3741 } else {
3742 uint8_t watermark = gfx_level >= GFX10 ? 6 : 4;
3743
3744 radeon_set_context_reg(cmd_buffer->cs, R_028424_CB_DCC_CONTROL,
3745 S_028424_OVERWRITE_COMBINER_MRT_SHARING_DISABLE(gfx_level <= GFX9) |
3746 S_028424_OVERWRITE_COMBINER_WATERMARK(watermark) |
3747 S_028424_DISABLE_CONSTANT_ENCODE_AC01(disable_constant_encode_ac01) |
3748 S_028424_DISABLE_CONSTANT_ENCODE_REG(disable_constant_encode));
3749 }
3750 }
3751
3752 radeon_set_context_reg(cmd_buffer->cs, R_028034_PA_SC_SCREEN_SCISSOR_BR,
3753 S_028034_BR_X(extent.width) | S_028034_BR_Y(extent.height));
3754
3755 assert(cmd_buffer->cs->cdw <= cdw_max);
3756
3757 cmd_buffer->state.dirty &= ~RADV_CMD_DIRTY_FRAMEBUFFER;
3758 }
3759
3760 static void
3761 radv_emit_guardband_state(struct radv_cmd_buffer *cmd_buffer)
3762 {
3763 const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
3764 unsigned rast_prim = radv_get_rasterization_prim(cmd_buffer);
3765
3766 radv_write_guardband(cmd_buffer->cs, d->vk.vp.viewport_count, d->vk.vp.viewports, rast_prim, d->vk.rs.polygon_mode,
3767 d->vk.rs.line.width);
3768
3769 cmd_buffer->state.dirty &= ~RADV_CMD_DIRTY_GUARDBAND;
3770 }
3771
3772 /* Bind an internal index buffer for GPUs that hang with 0-sized index buffers, in order to handle
3773  * robustness2, which requires out-of-bounds accesses to return 0.
3774 */
3775 static void
3776 radv_handle_zero_index_buffer_bug(struct radv_cmd_buffer *cmd_buffer, uint64_t *index_va, uint32_t *remaining_indexes)
3777 {
3778 const uint32_t zero = 0;
3779 uint32_t offset;
3780
3781 if (!radv_cmd_buffer_upload_data(cmd_buffer, sizeof(uint32_t), &zero, &offset)) {
3782 vk_command_buffer_set_error(&cmd_buffer->vk, VK_ERROR_OUT_OF_HOST_MEMORY);
3783 return;
3784 }
3785
3786 *index_va = radv_buffer_get_va(cmd_buffer->upload.upload_bo) + offset;
3787 *remaining_indexes = 1;
3788 }
3789
3790 static void
3791 radv_emit_index_buffer(struct radv_cmd_buffer *cmd_buffer)
3792 {
3793 struct radeon_cmdbuf *cs = cmd_buffer->cs;
3794 struct radv_cmd_state *state = &cmd_buffer->state;
3795 uint32_t max_index_count = state->max_index_count;
3796 uint64_t index_va = state->index_va;
3797
3798 /* With indirect generated commands the index buffer bind may be part of the
3799 * indirect command buffer, in which case the app may not have bound any yet. */
3800 if (state->index_type < 0)
3801 return;
3802
3803 /* Handle indirect draw calls with NULL index buffer if the GPU doesn't support them. */
3804 if (!max_index_count && cmd_buffer->device->physical_device->rad_info.has_zero_index_buffer_bug) {
3805 radv_handle_zero_index_buffer_bug(cmd_buffer, &index_va, &max_index_count);
3806 }
3807
3808 radeon_emit(cs, PKT3(PKT3_INDEX_BASE, 1, 0));
3809 radeon_emit(cs, index_va);
3810 radeon_emit(cs, index_va >> 32);
3811
3812 radeon_emit(cs, PKT3(PKT3_INDEX_BUFFER_SIZE, 0, 0));
3813 radeon_emit(cs, max_index_count);
3814
3815 cmd_buffer->state.dirty &= ~RADV_CMD_DIRTY_INDEX_BUFFER;
3816 }
3817
3818 static void
3819 radv_flush_occlusion_query_state(struct radv_cmd_buffer *cmd_buffer)
3820 {
3821 const enum amd_gfx_level gfx_level = cmd_buffer->device->physical_device->rad_info.gfx_level;
3822 const bool enable_occlusion_queries =
3823 cmd_buffer->state.active_occlusion_queries || cmd_buffer->state.inherited_occlusion_queries;
3824 uint32_t db_count_control;
3825
3826 if (!enable_occlusion_queries) {
3827 db_count_control = S_028004_ZPASS_INCREMENT_DISABLE(gfx_level < GFX11);
3828 } else {
3829 uint32_t sample_rate = util_logbase2(cmd_buffer->state.render.max_samples);
3830 bool gfx10_perfect =
3831 gfx_level >= GFX10 && (cmd_buffer->state.perfect_occlusion_queries_enabled ||
3832 cmd_buffer->state.inherited_query_control_flags & VK_QUERY_CONTROL_PRECISE_BIT);
3833
3834 if (gfx_level >= GFX7) {
3835 /* Always enable PERFECT_ZPASS_COUNTS due to issues with partially
3836 * covered tiles, discards, and early depth testing. For more details,
3837 * see https://gitlab.freedesktop.org/mesa/mesa/-/issues/3218 */
3838 db_count_control = S_028004_PERFECT_ZPASS_COUNTS(1) |
3839 S_028004_DISABLE_CONSERVATIVE_ZPASS_COUNTS(gfx10_perfect) |
3840 S_028004_SAMPLE_RATE(sample_rate) | S_028004_ZPASS_ENABLE(1) |
3841 S_028004_SLICE_EVEN_ENABLE(1) | S_028004_SLICE_ODD_ENABLE(1);
3842 } else {
3843 db_count_control = S_028004_PERFECT_ZPASS_COUNTS(1) | S_028004_SAMPLE_RATE(sample_rate);
3844 }
3845 }
3846
3847 if (db_count_control != cmd_buffer->state.last_db_count_control) {
3848 radeon_set_context_reg(cmd_buffer->cs, R_028004_DB_COUNT_CONTROL, db_count_control);
3849
3850 cmd_buffer->state.context_roll_without_scissor_emitted = true;
3851
3852 cmd_buffer->state.last_db_count_control = db_count_control;
3853 }
3854
3855 cmd_buffer->state.dirty &= ~RADV_CMD_DIRTY_OCCLUSION_QUERY;
3856 }
3857
3858 unsigned
3859 radv_instance_rate_prolog_index(unsigned num_attributes, uint32_t instance_rate_inputs)
3860 {
3861    /* instance_rate_vs_prologs is a flattened array of arrays of arrays of different sizes, or a
3862 * single array sorted in ascending order using:
3863 * - total number of attributes
3864 * - number of instanced attributes
3865 * - index of first instanced attribute
3866 */
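   /* For example: with num_attributes = 4 and instance_rate_inputs = 0b0110 (two consecutive
    * instanced attributes starting at index 1), start_index = total_to_offset[3] = 10,
    * offset_from_start_index = count_to_offset_total16[1] - (16 - 4) * 1 = 4 and first = 1,
    * so the returned prolog index is 10 + 4 + 1 = 15.
    */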
3867
3868 /* From total number of attributes to offset. */
3869 static const uint16_t total_to_offset[16] = {0, 1, 4, 10, 20, 35, 56, 84, 120, 165, 220, 286, 364, 455, 560, 680};
3870 unsigned start_index = total_to_offset[num_attributes - 1];
3871
3872 /* From number of instanced attributes to offset. This would require a different LUT depending on
3873 * the total number of attributes, but we can exploit a pattern to use just the LUT for 16 total
3874 * attributes.
3875 */
3876 static const uint8_t count_to_offset_total16[16] = {0, 16, 31, 45, 58, 70, 81, 91,
3877 100, 108, 115, 121, 126, 130, 133, 135};
3878 unsigned count = util_bitcount(instance_rate_inputs);
3879 unsigned offset_from_start_index = count_to_offset_total16[count - 1] - ((16 - num_attributes) * (count - 1));
3880
3881 unsigned first = ffs(instance_rate_inputs) - 1;
3882 return start_index + offset_from_start_index + first;
3883 }
3884
3885 static struct radv_shader_part *
3886 lookup_vs_prolog(struct radv_cmd_buffer *cmd_buffer, const struct radv_shader *vs_shader, uint32_t *nontrivial_divisors)
3887 {
3888 assert(vs_shader->info.vs.dynamic_inputs);
3889
3890 const struct radv_vs_input_state *state = &cmd_buffer->state.dynamic_vs_input;
3891 struct radv_device *device = cmd_buffer->device;
3892
3893 unsigned num_attributes = util_last_bit(vs_shader->info.vs.vb_desc_usage_mask);
3894 uint32_t attribute_mask = BITFIELD_MASK(num_attributes);
3895
3896 uint32_t instance_rate_inputs = state->instance_rate_inputs & attribute_mask;
3897 uint32_t zero_divisors = state->zero_divisors & attribute_mask;
3898 *nontrivial_divisors = state->nontrivial_divisors & attribute_mask;
3899 uint32_t misaligned_mask = cmd_buffer->state.vbo_misaligned_mask;
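   /* Lazily recompute which attributes are misaligned with respect to their format alignment
    * requirement, using the currently bound vertex buffer offsets and strides (only GFX6 and
    * GFX10+ track this, as the assert below shows).
    */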
3900 if (cmd_buffer->state.vbo_misaligned_mask_invalid) {
3901 assert(device->physical_device->rad_info.gfx_level == GFX6 ||
3902 device->physical_device->rad_info.gfx_level >= GFX10);
3903
3904 u_foreach_bit (index, cmd_buffer->state.vbo_misaligned_mask_invalid & attribute_mask) {
3905 uint8_t binding = state->bindings[index];
3906 if (!(cmd_buffer->state.vbo_bound_mask & BITFIELD_BIT(binding)))
3907 continue;
3908
3909 uint8_t req = state->format_align_req_minus_1[index];
3910 uint64_t vb_offset = cmd_buffer->vertex_bindings[binding].offset;
3911 uint64_t vb_stride;
3912
3913 if (cmd_buffer->state.uses_dynamic_vertex_binding_stride) {
3914 vb_stride = cmd_buffer->vertex_bindings[binding].stride;
3915 } else {
3916 vb_stride = cmd_buffer->state.graphics_pipeline->binding_stride[binding];
3917 }
3918
3919 VkDeviceSize offset = vb_offset + state->offsets[index];
3920 if ((offset & req) || (vb_stride & req))
3921 misaligned_mask |= BITFIELD_BIT(index);
3922 }
3923 cmd_buffer->state.vbo_misaligned_mask = misaligned_mask;
3924 cmd_buffer->state.vbo_misaligned_mask_invalid &= ~attribute_mask;
3925 }
3926 misaligned_mask |= state->nontrivial_formats;
3927 misaligned_mask &= attribute_mask;
3928
3929 const bool can_use_simple_input =
3930 cmd_buffer->state.shaders[MESA_SHADER_VERTEX] &&
3931 !cmd_buffer->state.shaders[MESA_SHADER_VERTEX]->info.merged_shader_compiled_separately &&
3932 cmd_buffer->state.shaders[MESA_SHADER_VERTEX]->info.is_ngg == device->physical_device->use_ngg &&
3933 cmd_buffer->state.shaders[MESA_SHADER_VERTEX]->info.wave_size == device->physical_device->ge_wave_size;
3934
3935 /* The instance ID input VGPR is placed differently when as_ls=true. as_ls is also needed to
3936     * work around the LS VGPR initialization bug.
3937 */
3938 bool as_ls =
3939 vs_shader->info.vs.as_ls && (instance_rate_inputs || device->physical_device->rad_info.has_ls_vgpr_init_bug);
3940
3941 /* try to use a pre-compiled prolog first */
3942 struct radv_shader_part *prolog = NULL;
3943 if (can_use_simple_input && !as_ls && !misaligned_mask && !state->alpha_adjust_lo && !state->alpha_adjust_hi) {
3944 if (!instance_rate_inputs) {
3945 prolog = device->simple_vs_prologs[num_attributes - 1];
3946 } else if (num_attributes <= 16 && !*nontrivial_divisors && !zero_divisors &&
3947 util_bitcount(instance_rate_inputs) ==
3948 (util_last_bit(instance_rate_inputs) - ffs(instance_rate_inputs) + 1)) {
3949 unsigned index = radv_instance_rate_prolog_index(num_attributes, instance_rate_inputs);
3950 prolog = device->instance_rate_vs_prologs[index];
3951 }
3952 }
3953 if (prolog)
3954 return prolog;
3955
3956 struct radv_vs_prolog_key key;
3957 memset(&key, 0, sizeof(key));
3958 key.instance_rate_inputs = instance_rate_inputs;
3959 key.nontrivial_divisors = *nontrivial_divisors;
3960 key.zero_divisors = zero_divisors;
3961 /* If the attribute is aligned, post shuffle is implemented using DST_SEL instead. */
3962 key.post_shuffle = state->post_shuffle & misaligned_mask;
3963 key.alpha_adjust_hi = state->alpha_adjust_hi & attribute_mask;
3964 key.alpha_adjust_lo = state->alpha_adjust_lo & attribute_mask;
3965 u_foreach_bit (index, misaligned_mask)
3966 key.formats[index] = state->formats[index];
3967 key.num_attributes = num_attributes;
3968 key.misaligned_mask = misaligned_mask;
3969 key.as_ls = as_ls;
3970 key.is_ngg = vs_shader->info.is_ngg;
3971 key.wave32 = vs_shader->info.wave_size == 32;
3972
3973 if (vs_shader->info.merged_shader_compiled_separately) {
3974 assert(vs_shader->info.next_stage == MESA_SHADER_TESS_CTRL || vs_shader->info.next_stage == MESA_SHADER_GEOMETRY);
3975 key.next_stage = vs_shader->info.next_stage;
3976 } else {
3977 key.next_stage = vs_shader->info.stage;
3978 }
3979
3980 return radv_shader_part_cache_get(device, &device->vs_prologs, &cmd_buffer->vs_prologs, &key);
3981 }
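/* Illustrative note on the misalignment test above: format_align_req_minus_1 holds the
 * required alignment minus one, so a format that needs 4-byte alignment stores 3 (0b11). A
 * binding bound at offset 6 with stride 16 then fails "(offset & req) || (vb_stride & req)"
 * because 6 & 3 != 0, the attribute is recorded in misaligned_mask, the pre-compiled
 * simple/instance-rate prolog paths are skipped, and the misalignment plus the attribute
 * format end up in the prolog key so a matching prolog is fetched from (or added to) the
 * cache.
 */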
3982
3983 static void
3984 emit_prolog_regs(struct radv_cmd_buffer *cmd_buffer, const struct radv_shader *vs_shader,
3985 const struct radv_shader_part *prolog)
3986 {
3987 uint32_t rsrc1, rsrc2;
3988
3989 /* no need to re-emit anything in this case */
3990 if (cmd_buffer->state.emitted_vs_prolog == prolog)
3991 return;
3992
3993 enum amd_gfx_level chip = cmd_buffer->device->physical_device->rad_info.gfx_level;
3994
3995 assert(cmd_buffer->state.emitted_graphics_pipeline == cmd_buffer->state.graphics_pipeline);
3996
3997 if (vs_shader->info.merged_shader_compiled_separately) {
3998 if (vs_shader->info.next_stage == MESA_SHADER_GEOMETRY) {
3999 radv_shader_combine_cfg_vs_gs(vs_shader, cmd_buffer->state.shaders[MESA_SHADER_GEOMETRY], &rsrc1, &rsrc2);
4000 } else {
4001 assert(vs_shader->info.next_stage == MESA_SHADER_TESS_CTRL);
4002
4003 radv_shader_combine_cfg_vs_tcs(vs_shader, cmd_buffer->state.shaders[MESA_SHADER_TESS_CTRL], &rsrc1, &rsrc2);
4004 }
4005 } else {
4006 rsrc1 = vs_shader->config.rsrc1;
4007 }
4008
4009 if (chip < GFX10 && G_00B228_SGPRS(prolog->rsrc1) > G_00B228_SGPRS(rsrc1))
4010 rsrc1 = (rsrc1 & C_00B228_SGPRS) | (prolog->rsrc1 & ~C_00B228_SGPRS);
4011
4012 /* The main shader must not use fewer VGPRs than the prolog, otherwise shared VGPRs might not
4013 * work.
4014 */
4015 assert(G_00B848_VGPRS(vs_shader->config.rsrc1) >= G_00B848_VGPRS(prolog->rsrc1));
4016
4017 unsigned pgm_lo_reg = R_00B120_SPI_SHADER_PGM_LO_VS;
4018 unsigned rsrc1_reg = R_00B128_SPI_SHADER_PGM_RSRC1_VS;
4019 if (vs_shader->info.is_ngg || cmd_buffer->state.shaders[MESA_SHADER_GEOMETRY] == vs_shader ||
4020 (vs_shader->info.merged_shader_compiled_separately && vs_shader->info.next_stage == MESA_SHADER_GEOMETRY)) {
4021 pgm_lo_reg = chip >= GFX10 ? R_00B320_SPI_SHADER_PGM_LO_ES : R_00B210_SPI_SHADER_PGM_LO_ES;
4022 rsrc1_reg = R_00B228_SPI_SHADER_PGM_RSRC1_GS;
4023 } else if (cmd_buffer->state.shaders[MESA_SHADER_TESS_CTRL] == vs_shader ||
4024 (vs_shader->info.merged_shader_compiled_separately &&
4025 vs_shader->info.next_stage == MESA_SHADER_TESS_CTRL)) {
4026 pgm_lo_reg = chip >= GFX10 ? R_00B520_SPI_SHADER_PGM_LO_LS : R_00B410_SPI_SHADER_PGM_LO_LS;
4027 rsrc1_reg = R_00B428_SPI_SHADER_PGM_RSRC1_HS;
4028 } else if (vs_shader->info.vs.as_ls) {
4029 pgm_lo_reg = R_00B520_SPI_SHADER_PGM_LO_LS;
4030 rsrc1_reg = R_00B528_SPI_SHADER_PGM_RSRC1_LS;
4031 } else if (vs_shader->info.vs.as_es) {
4032 pgm_lo_reg = R_00B320_SPI_SHADER_PGM_LO_ES;
4033 rsrc1_reg = R_00B328_SPI_SHADER_PGM_RSRC1_ES;
4034 }
4035
4036 radeon_set_sh_reg(cmd_buffer->cs, pgm_lo_reg, prolog->va >> 8);
4037
4038 if (chip < GFX10 || vs_shader->info.merged_shader_compiled_separately) {
4039 radeon_set_sh_reg(cmd_buffer->cs, rsrc1_reg, rsrc1);
4040
4041 if (vs_shader->info.merged_shader_compiled_separately) {
4042 if (vs_shader->info.next_stage == MESA_SHADER_GEOMETRY) {
4043 const struct radv_shader *gs = cmd_buffer->state.shaders[MESA_SHADER_GEOMETRY];
4044
4045 radeon_set_sh_reg(cmd_buffer->cs, rsrc1_reg + 4, rsrc2 | S_00B22C_LDS_SIZE(gs->info.gs_ring_info.lds_size));
4046 } else {
4047 radeon_set_sh_reg(cmd_buffer->cs, rsrc1_reg + 4, rsrc2);
4048 }
4049 }
4050 } else {
4051 assert(rsrc1 == vs_shader->config.rsrc1);
4052 }
4053
4054 radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, prolog->bo);
4055 }
4056
4057 static void
4058 emit_prolog_inputs(struct radv_cmd_buffer *cmd_buffer, const struct radv_shader *vs_shader,
4059 uint32_t nontrivial_divisors)
4060 {
4061 /* no need to re-emit anything in this case */
4062 if (!nontrivial_divisors && cmd_buffer->state.emitted_vs_prolog &&
4063 !cmd_buffer->state.emitted_vs_prolog->nontrivial_divisors)
4064 return;
4065
4066 const struct radv_vs_input_state *state = &cmd_buffer->state.dynamic_vs_input;
4067 uint64_t input_va = radv_shader_get_va(vs_shader);
4068
4069 if (nontrivial_divisors) {
4070 unsigned inputs_offset;
4071 uint32_t *inputs;
4072 unsigned size = 8 + util_bitcount(nontrivial_divisors) * 8;
4073 if (!radv_cmd_buffer_upload_alloc(cmd_buffer, size, &inputs_offset, (void **)&inputs))
4074 return;
4075
4076 *(inputs++) = input_va;
4077 *(inputs++) = input_va >> 32;
4078
4079 u_foreach_bit (index, nontrivial_divisors) {
4080 uint32_t div = state->divisors[index];
4081 if (div == 0) {
4082 *(inputs++) = 0;
4083 *(inputs++) = 1;
4084 } else if (util_is_power_of_two_or_zero(div)) {
4085 *(inputs++) = util_logbase2(div) | (1 << 8);
4086 *(inputs++) = 0xffffffffu;
4087 } else {
4088 struct util_fast_udiv_info info = util_compute_fast_udiv_info(div, 32, 32);
4089 *(inputs++) = info.pre_shift | (info.increment << 8) | (info.post_shift << 16);
4090 *(inputs++) = info.multiplier;
4091 }
4092 }
4093
4094 input_va = radv_buffer_get_va(cmd_buffer->upload.upload_bo) + inputs_offset;
4095 }
4096
4097 const struct radv_userdata_info *loc = &vs_shader->info.user_sgprs_locs.shader_data[AC_UD_VS_PROLOG_INPUTS];
4098 uint32_t base_reg = vs_shader->info.user_data_0;
4099 assert(loc->sgpr_idx != -1);
4100 assert(loc->num_sgprs == 2);
4101 radv_emit_shader_pointer(cmd_buffer->device, cmd_buffer->cs, base_reg + loc->sgpr_idx * 4, input_va, true);
4102 }
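/* Illustrative note on the divisor encoding above: each nontrivial divisor is uploaded as two
 * dwords consumed by the prolog (after the two dwords holding input_va). A zero divisor is
 * encoded as {0, 1}; a power-of-two divisor such as 8 becomes
 * {util_logbase2(8) | (1 << 8), 0xffffffff} = {0x103, 0xffffffff}; any other value packs the
 * util_compute_fast_udiv_info() result as
 * {pre_shift | (increment << 8) | (post_shift << 16), multiplier}, which lets the prolog
 * divide the instance index without an integer division instruction.
 */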
4103
4104 static void
4105 radv_emit_vertex_input(struct radv_cmd_buffer *cmd_buffer)
4106 {
4107 const struct radv_shader *vs_shader = radv_get_shader(cmd_buffer->state.shaders, MESA_SHADER_VERTEX);
4108
4109 assert(!cmd_buffer->state.mesh_shading);
4110
4111 if (!vs_shader->info.vs.has_prolog)
4112 return;
4113
4114 uint32_t nontrivial_divisors;
4115 struct radv_shader_part *prolog = lookup_vs_prolog(cmd_buffer, vs_shader, &nontrivial_divisors);
4116 if (!prolog) {
4117 vk_command_buffer_set_error(&cmd_buffer->vk, VK_ERROR_OUT_OF_HOST_MEMORY);
4118 return;
4119 }
4120 emit_prolog_regs(cmd_buffer, vs_shader, prolog);
4121 emit_prolog_inputs(cmd_buffer, vs_shader, nontrivial_divisors);
4122
4123 cmd_buffer->shader_upload_seq = MAX2(cmd_buffer->shader_upload_seq, prolog->upload_seq);
4124
4125 cmd_buffer->state.emitted_vs_prolog = prolog;
4126
4127 if (radv_device_fault_detection_enabled(cmd_buffer->device))
4128 radv_save_vs_prolog(cmd_buffer, prolog);
4129 }
4130
4131 static void
4132 radv_emit_tess_domain_origin(struct radv_cmd_buffer *cmd_buffer)
4133 {
4134 const struct radv_physical_device *pdevice = cmd_buffer->device->physical_device;
4135 const struct radv_shader *tes = radv_get_shader(cmd_buffer->state.shaders, MESA_SHADER_TESS_EVAL);
4136 const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
4137 unsigned type = 0, partitioning = 0, distribution_mode = 0;
4138 unsigned topology;
4139
4140 switch (tes->info.tes._primitive_mode) {
4141 case TESS_PRIMITIVE_TRIANGLES:
4142 type = V_028B6C_TESS_TRIANGLE;
4143 break;
4144 case TESS_PRIMITIVE_QUADS:
4145 type = V_028B6C_TESS_QUAD;
4146 break;
4147 case TESS_PRIMITIVE_ISOLINES:
4148 type = V_028B6C_TESS_ISOLINE;
4149 break;
4150 default:
4151 unreachable("Invalid tess primitive type");
4152 }
4153
4154 switch (tes->info.tes.spacing) {
4155 case TESS_SPACING_EQUAL:
4156 partitioning = V_028B6C_PART_INTEGER;
4157 break;
4158 case TESS_SPACING_FRACTIONAL_ODD:
4159 partitioning = V_028B6C_PART_FRAC_ODD;
4160 break;
4161 case TESS_SPACING_FRACTIONAL_EVEN:
4162 partitioning = V_028B6C_PART_FRAC_EVEN;
4163 break;
4164 default:
4165 unreachable("Invalid tess spacing type");
4166 }
4167
4168 if (pdevice->rad_info.has_distributed_tess) {
4169 if (pdevice->rad_info.family == CHIP_FIJI || pdevice->rad_info.family >= CHIP_POLARIS10)
4170 distribution_mode = V_028B6C_TRAPEZOIDS;
4171 else
4172 distribution_mode = V_028B6C_DONUTS;
4173 } else {
4174 distribution_mode = V_028B6C_NO_DIST;
4175 }
4176
4177 if (tes->info.tes.point_mode) {
4178 topology = V_028B6C_OUTPUT_POINT;
4179 } else if (tes->info.tes._primitive_mode == TESS_PRIMITIVE_ISOLINES) {
4180 topology = V_028B6C_OUTPUT_LINE;
4181 } else {
4182 bool ccw = tes->info.tes.ccw;
4183
4184 if (d->vk.ts.domain_origin != VK_TESSELLATION_DOMAIN_ORIGIN_UPPER_LEFT) {
4185 ccw = !ccw;
4186 }
4187
4188 topology = ccw ? V_028B6C_OUTPUT_TRIANGLE_CCW : V_028B6C_OUTPUT_TRIANGLE_CW;
4189 }
4190
4191 radeon_set_context_reg(cmd_buffer->cs, R_028B6C_VGT_TF_PARAM,
4192 S_028B6C_TYPE(type) | S_028B6C_PARTITIONING(partitioning) | S_028B6C_TOPOLOGY(topology) |
4193 S_028B6C_DISTRIBUTION_MODE(distribution_mode));
4194 }
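/* Illustrative example for the winding flip above: a TES that declares ccw combined with a
 * dynamic domain origin of VK_TESSELLATION_DOMAIN_ORIGIN_LOWER_LEFT ends up with
 * V_028B6C_OUTPUT_TRIANGLE_CW, because any origin other than upper-left inverts the declared
 * winding.
 */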
4195
4196 static void
4197 radv_emit_alpha_to_coverage_enable(struct radv_cmd_buffer *cmd_buffer)
4198 {
4199 const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
4200 unsigned db_alpha_to_mask = 0;
4201
4202 if (cmd_buffer->device->instance->debug_flags & RADV_DEBUG_NO_ATOC_DITHERING) {
4203 db_alpha_to_mask = S_028B70_ALPHA_TO_MASK_OFFSET0(2) | S_028B70_ALPHA_TO_MASK_OFFSET1(2) |
4204 S_028B70_ALPHA_TO_MASK_OFFSET2(2) | S_028B70_ALPHA_TO_MASK_OFFSET3(2) |
4205 S_028B70_OFFSET_ROUND(0);
4206 } else {
4207 db_alpha_to_mask = S_028B70_ALPHA_TO_MASK_OFFSET0(3) | S_028B70_ALPHA_TO_MASK_OFFSET1(1) |
4208 S_028B70_ALPHA_TO_MASK_OFFSET2(0) | S_028B70_ALPHA_TO_MASK_OFFSET3(2) |
4209 S_028B70_OFFSET_ROUND(1);
4210 }
4211
4212 db_alpha_to_mask |= S_028B70_ALPHA_TO_MASK_ENABLE(d->vk.ms.alpha_to_coverage_enable);
4213
4214 radeon_set_context_reg(cmd_buffer->cs, R_028B70_DB_ALPHA_TO_MASK, db_alpha_to_mask);
4215 }
4216
4217 static void
4218 radv_emit_sample_mask(struct radv_cmd_buffer *cmd_buffer)
4219 {
4220 const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
4221
4222 radeon_set_context_reg_seq(cmd_buffer->cs, R_028C38_PA_SC_AA_MASK_X0Y0_X1Y0, 2);
4223 radeon_emit(cmd_buffer->cs, d->vk.ms.sample_mask | ((uint32_t)d->vk.ms.sample_mask << 16));
4224 radeon_emit(cmd_buffer->cs, d->vk.ms.sample_mask | ((uint32_t)d->vk.ms.sample_mask << 16));
4225 }
4226
4227 static void
4228 radv_emit_color_blend(struct radv_cmd_buffer *cmd_buffer)
4229 {
4230 const struct radv_physical_device *pdevice = cmd_buffer->device->physical_device;
4231 const enum amd_gfx_level gfx_level = pdevice->rad_info.gfx_level;
4232 const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
4233 unsigned cb_blend_control[MAX_RTS], sx_mrt_blend_opt[MAX_RTS];
4234 bool mrt0_is_dual_src = radv_is_mrt0_dual_src(cmd_buffer);
4235
4236 for (unsigned i = 0; i < MAX_RTS; i++) {
4237 VkBlendOp eqRGB = d->vk.cb.attachments[i].color_blend_op;
4238 VkBlendFactor srcRGB = d->vk.cb.attachments[i].src_color_blend_factor;
4239 VkBlendFactor dstRGB = d->vk.cb.attachments[i].dst_color_blend_factor;
4240 VkBlendOp eqA = d->vk.cb.attachments[i].alpha_blend_op;
4241 VkBlendFactor srcA = d->vk.cb.attachments[i].src_alpha_blend_factor;
4242 VkBlendFactor dstA = d->vk.cb.attachments[i].dst_alpha_blend_factor;
4243 unsigned srcRGB_opt, dstRGB_opt, srcA_opt, dstA_opt;
4244 unsigned blend_cntl = 0;
4245
4246 cb_blend_control[i] = sx_mrt_blend_opt[i] = 0;
4247
4248 /* Ignore other blend targets if dual-source blending is enabled to prevent wrong behaviour.
4249 */
4250 if (i > 0 && mrt0_is_dual_src)
4251 continue;
4252
4253 if (!d->vk.cb.attachments[i].blend_enable) {
4254 sx_mrt_blend_opt[i] |= S_028760_COLOR_COMB_FCN(V_028760_OPT_COMB_BLEND_DISABLED) |
4255 S_028760_ALPHA_COMB_FCN(V_028760_OPT_COMB_BLEND_DISABLED);
4256 continue;
4257 }
4258
4259 radv_normalize_blend_factor(eqRGB, &srcRGB, &dstRGB);
4260 radv_normalize_blend_factor(eqA, &srcA, &dstA);
4261
4262 /* Blending optimizations for RB+.
4263 * These transformations don't change the behavior.
4264 *
4265 * First, get rid of DST in the blend factors:
4266 * func(src * DST, dst * 0) ---> func(src * 0, dst * SRC)
4267 */
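/* Illustrative example: an ADD blend with srcRGB = DST_COLOR and dstRGB = ZERO computes
 * src*dst + dst*0; rewriting it as srcRGB = ZERO, dstRGB = SRC_COLOR computes src*0 + dst*src,
 * the same value, but the source factor no longer reads DST, so the stricter dstRGB_opt
 * fallback below (for radv_blend_factor_uses_dst(srcRGB)) is avoided.
 */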
4268 radv_blend_remove_dst(&eqRGB, &srcRGB, &dstRGB, VK_BLEND_FACTOR_DST_COLOR, VK_BLEND_FACTOR_SRC_COLOR);
4269
4270 radv_blend_remove_dst(&eqA, &srcA, &dstA, VK_BLEND_FACTOR_DST_COLOR, VK_BLEND_FACTOR_SRC_COLOR);
4271
4272 radv_blend_remove_dst(&eqA, &srcA, &dstA, VK_BLEND_FACTOR_DST_ALPHA, VK_BLEND_FACTOR_SRC_ALPHA);
4273
4274 /* Look up the ideal settings from tables. */
4275 srcRGB_opt = radv_translate_blend_opt_factor(srcRGB, false);
4276 dstRGB_opt = radv_translate_blend_opt_factor(dstRGB, false);
4277 srcA_opt = radv_translate_blend_opt_factor(srcA, true);
4278 dstA_opt = radv_translate_blend_opt_factor(dstA, true);
4279
4280 /* Handle interdependencies. */
4281 if (radv_blend_factor_uses_dst(srcRGB))
4282 dstRGB_opt = V_028760_BLEND_OPT_PRESERVE_NONE_IGNORE_NONE;
4283 if (radv_blend_factor_uses_dst(srcA))
4284 dstA_opt = V_028760_BLEND_OPT_PRESERVE_NONE_IGNORE_NONE;
4285
4286 if (srcRGB == VK_BLEND_FACTOR_SRC_ALPHA_SATURATE &&
4287 (dstRGB == VK_BLEND_FACTOR_ZERO || dstRGB == VK_BLEND_FACTOR_SRC_ALPHA ||
4288 dstRGB == VK_BLEND_FACTOR_SRC_ALPHA_SATURATE))
4289 dstRGB_opt = V_028760_BLEND_OPT_PRESERVE_NONE_IGNORE_A0;
4290
4291 /* Set the final value. */
4292 sx_mrt_blend_opt[i] = S_028760_COLOR_SRC_OPT(srcRGB_opt) | S_028760_COLOR_DST_OPT(dstRGB_opt) |
4293 S_028760_COLOR_COMB_FCN(radv_translate_blend_opt_function(eqRGB)) |
4294 S_028760_ALPHA_SRC_OPT(srcA_opt) | S_028760_ALPHA_DST_OPT(dstA_opt) |
4295 S_028760_ALPHA_COMB_FCN(radv_translate_blend_opt_function(eqA));
4296
4297 blend_cntl |= S_028780_ENABLE(1);
4298 blend_cntl |= S_028780_COLOR_COMB_FCN(radv_translate_blend_function(eqRGB));
4299 blend_cntl |= S_028780_COLOR_SRCBLEND(radv_translate_blend_factor(gfx_level, srcRGB));
4300 blend_cntl |= S_028780_COLOR_DESTBLEND(radv_translate_blend_factor(gfx_level, dstRGB));
4301 if (srcA != srcRGB || dstA != dstRGB || eqA != eqRGB) {
4302 blend_cntl |= S_028780_SEPARATE_ALPHA_BLEND(1);
4303 blend_cntl |= S_028780_ALPHA_COMB_FCN(radv_translate_blend_function(eqA));
4304 blend_cntl |= S_028780_ALPHA_SRCBLEND(radv_translate_blend_factor(gfx_level, srcA));
4305 blend_cntl |= S_028780_ALPHA_DESTBLEND(radv_translate_blend_factor(gfx_level, dstA));
4306 }
4307 cb_blend_control[i] = blend_cntl;
4308 }
4309
4310 if (pdevice->rad_info.has_rbplus) {
4311 /* Disable RB+ blend optimizations for dual source blending. */
4312 if (mrt0_is_dual_src) {
4313 for (unsigned i = 0; i < MAX_RTS; i++) {
4314 sx_mrt_blend_opt[i] =
4315 S_028760_COLOR_COMB_FCN(V_028760_OPT_COMB_NONE) | S_028760_ALPHA_COMB_FCN(V_028760_OPT_COMB_NONE);
4316 }
4317 }
4318
4319 /* Disable RB+ blend optimizations on GFX11 when alpha-to-coverage is enabled. */
4320 if (gfx_level >= GFX11 && d->vk.ms.alpha_to_coverage_enable) {
4321 sx_mrt_blend_opt[0] =
4322 S_028760_COLOR_COMB_FCN(V_028760_OPT_COMB_NONE) | S_028760_ALPHA_COMB_FCN(V_028760_OPT_COMB_NONE);
4323 }
4324 }
4325
4326 radeon_set_context_reg_seq(cmd_buffer->cs, R_028780_CB_BLEND0_CONTROL, MAX_RTS);
4327 radeon_emit_array(cmd_buffer->cs, cb_blend_control, MAX_RTS);
4328
4329 if (pdevice->rad_info.has_rbplus) {
4330 radeon_set_context_reg_seq(cmd_buffer->cs, R_028760_SX_MRT0_BLEND_OPT, MAX_RTS);
4331 radeon_emit_array(cmd_buffer->cs, sx_mrt_blend_opt, MAX_RTS);
4332 }
4333 }
4334
4335 static struct radv_shader_part *
4336 lookup_ps_epilog(struct radv_cmd_buffer *cmd_buffer)
4337 {
4338 const struct radv_shader *ps = cmd_buffer->state.shaders[MESA_SHADER_FRAGMENT];
4339 const struct radv_rendering_state *render = &cmd_buffer->state.render;
4340 const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
4341 struct radv_device *device = cmd_buffer->device;
4342 struct radv_ps_epilog_state state = {0};
4343
4344 state.color_attachment_count = render->color_att_count;
4345 for (unsigned i = 0; i < render->color_att_count; ++i) {
4346 state.color_attachment_formats[i] = render->color_att[i].format;
4347 }
4348
4349 for (unsigned i = 0; i < MAX_RTS; i++) {
4350 VkBlendOp eqRGB = d->vk.cb.attachments[i].color_blend_op;
4351 VkBlendFactor srcRGB = d->vk.cb.attachments[i].src_color_blend_factor;
4352 VkBlendFactor dstRGB = d->vk.cb.attachments[i].dst_color_blend_factor;
4353
4354 state.color_write_mask |= d->vk.cb.attachments[i].write_mask << (4 * i);
4355 state.color_blend_enable |= d->vk.cb.attachments[i].blend_enable << (4 * i);
4356
4357 radv_normalize_blend_factor(eqRGB, &srcRGB, &dstRGB);
4358
4359 if (srcRGB == VK_BLEND_FACTOR_SRC_ALPHA || dstRGB == VK_BLEND_FACTOR_SRC_ALPHA ||
4360 srcRGB == VK_BLEND_FACTOR_SRC_ALPHA_SATURATE || dstRGB == VK_BLEND_FACTOR_SRC_ALPHA_SATURATE ||
4361 srcRGB == VK_BLEND_FACTOR_ONE_MINUS_SRC_ALPHA || dstRGB == VK_BLEND_FACTOR_ONE_MINUS_SRC_ALPHA)
4362 state.need_src_alpha |= 1 << i;
4363 }
4364
4365 state.mrt0_is_dual_src = radv_is_mrt0_dual_src(cmd_buffer);
4366
4367 if (d->vk.ms.alpha_to_coverage_enable) {
4368 /* Select a color export format with alpha when alpha to coverage is enabled. */
4369 state.need_src_alpha |= 0x1;
4370 }
4371
4372 if (ps) {
4373 state.colors_written = ps->info.ps.colors_written;
4374
4375 if (ps->info.ps.exports_mrtz_via_epilog) {
4376 assert(device->physical_device->rad_info.gfx_level >= GFX11);
4377 state.export_depth = ps->info.ps.writes_z;
4378 state.export_stencil = ps->info.ps.writes_stencil;
4379 state.export_sample_mask = ps->info.ps.writes_sample_mask;
4380 state.alpha_to_coverage_via_mrtz = d->vk.ms.alpha_to_coverage_enable;
4381 }
4382 }
4383
4384 struct radv_ps_epilog_key key = radv_generate_ps_epilog_key(device, &state);
4385
4386 /* Clear color attachments that aren't exported by the FS to match IO shader arguments. */
4387 key.spi_shader_col_format &= ps ? ps->info.ps.colors_written : 0;
4388
4389 return radv_shader_part_cache_get(device, &device->ps_epilogs, &cmd_buffer->ps_epilogs, &key);
4390 }
4391
4392 static struct radv_shader_part *
4393 lookup_tcs_epilog(struct radv_cmd_buffer *cmd_buffer)
4394 {
4395 const struct radv_shader *tcs = cmd_buffer->state.shaders[MESA_SHADER_TESS_CTRL];
4396 const struct radv_shader *tes = radv_get_shader(cmd_buffer->state.shaders, MESA_SHADER_TESS_EVAL);
4397 struct radv_device *device = cmd_buffer->device;
4398
4399 struct radv_tcs_epilog_key key = {
4400 .primitive_mode = tes->info.tes._primitive_mode,
4401 .tes_reads_tessfactors = tes->info.tes.reads_tess_factors,
4402 .tcs_out_patch_fits_subgroup = tcs->info.wave_size % tcs->info.tcs.tcs_vertices_out == 0,
4403 };
4404
4405 return radv_shader_part_cache_get(device, &device->tcs_epilogs, &cmd_buffer->tcs_epilogs, &key);
4406 }
4407
4408 static void
4409 radv_emit_msaa_state(struct radv_cmd_buffer *cmd_buffer)
4410 {
4411 const struct radv_physical_device *pdevice = cmd_buffer->device->physical_device;
4412 const struct radv_shader *ps = cmd_buffer->state.shaders[MESA_SHADER_FRAGMENT];
4413 unsigned rasterization_samples = radv_get_rasterization_samples(cmd_buffer);
4414 const struct radv_rendering_state *render = &cmd_buffer->state.render;
4415 const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
4416 unsigned log_samples = util_logbase2(rasterization_samples);
4417 unsigned pa_sc_aa_config = 0;
4418 unsigned max_sample_dist = 0;
4419 unsigned db_eqaa;
4420
4421 db_eqaa = S_028804_HIGH_QUALITY_INTERSECTIONS(1) | S_028804_INCOHERENT_EQAA_READS(1) |
4422 S_028804_STATIC_ANCHOR_ASSOCIATIONS(1);
4423
4424 if (pdevice->rad_info.gfx_level >= GFX9 &&
4425 d->vk.rs.conservative_mode != VK_CONSERVATIVE_RASTERIZATION_MODE_DISABLED_EXT) {
4426 /* Adjust MSAA state if conservative rasterization is enabled. */
4427 db_eqaa |= S_028804_OVERRASTERIZATION_AMOUNT(4);
4428 pa_sc_aa_config |= S_028BE0_AA_MASK_CENTROID_DTMN(1);
4429 }
4430
4431 if (!d->sample_location.count) {
4432 max_sample_dist = radv_get_default_max_sample_dist(log_samples);
4433 } else {
4434 uint32_t num_samples = (uint32_t)d->sample_location.per_pixel;
4435 VkOffset2D sample_locs[4][8]; /* 8 is the max. sample count supported */
4436
4437 /* Convert the user sample locations to hardware sample locations. */
4438 radv_convert_user_sample_locs(&d->sample_location, 0, 0, sample_locs[0]);
4439 radv_convert_user_sample_locs(&d->sample_location, 1, 0, sample_locs[1]);
4440 radv_convert_user_sample_locs(&d->sample_location, 0, 1, sample_locs[2]);
4441 radv_convert_user_sample_locs(&d->sample_location, 1, 1, sample_locs[3]);
4442
4443 /* Compute the maximum sample distance from the specified locations. */
4444 for (unsigned i = 0; i < 4; ++i) {
4445 for (uint32_t j = 0; j < num_samples; j++) {
4446 VkOffset2D offset = sample_locs[i][j];
4447 max_sample_dist = MAX2(max_sample_dist, MAX2(abs(offset.x), abs(offset.y)));
4448 }
4449 }
4450 }
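/* Illustrative note: max_sample_dist is simply the largest |x| or |y| component over all
 * converted per-pixel sample offsets; e.g. offsets of (4, -2) and (-6, 1) give
 * max_sample_dist = 6, which is programmed into S_028BE0_MAX_SAMPLE_DIST() below when MSAA is
 * enabled.
 */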
4451
4452 if (rasterization_samples > 1) {
4453 unsigned z_samples = MAX2(render->ds_samples, rasterization_samples);
4454 unsigned ps_iter_samples = radv_get_ps_iter_samples(cmd_buffer);
4455 unsigned log_z_samples = util_logbase2(z_samples);
4456 unsigned log_ps_iter_samples = util_logbase2(ps_iter_samples);
4457 bool uses_underestimate = d->vk.rs.conservative_mode == VK_CONSERVATIVE_RASTERIZATION_MODE_UNDERESTIMATE_EXT;
4458
4459 db_eqaa |= S_028804_MAX_ANCHOR_SAMPLES(log_z_samples) | S_028804_PS_ITER_SAMPLES(log_ps_iter_samples) |
4460 S_028804_MASK_EXPORT_NUM_SAMPLES(log_samples) | S_028804_ALPHA_TO_MASK_NUM_SAMPLES(log_samples);
4461
4462 pa_sc_aa_config |= S_028BE0_MSAA_NUM_SAMPLES(uses_underestimate ? 0 : log_samples) |
4463 S_028BE0_MAX_SAMPLE_DIST(max_sample_dist) | S_028BE0_MSAA_EXPOSED_SAMPLES(log_samples) |
4464 S_028BE0_COVERED_CENTROID_IS_CENTER(pdevice->rad_info.gfx_level >= GFX10_3);
4465
4466 if (d->vk.rs.line.mode == VK_LINE_RASTERIZATION_MODE_RECTANGULAR_SMOOTH_KHR)
4467 db_eqaa |= S_028804_OVERRASTERIZATION_AMOUNT(log_samples);
4468 }
4469
4470 pa_sc_aa_config |= S_028BE0_COVERAGE_TO_SHADER_SELECT(ps && ps->info.ps.reads_fully_covered);
4471
4472 /* On GFX11, DB_Z_INFO.NUM_SAMPLES should always match MSAA_EXPOSED_SAMPLES. It affects VRS,
4473 * occlusion queries and Primitive Ordered Pixel Shading if depth and stencil are not bound.
4474 * This is normally emitted as framebuffer state, but if no attachments are bound the sample
4475 * count is independent of the framebuffer state and hence may need to be updated with MSAA
4476 * state.
4477 * Checking the format, not the image view, because the latter may not exist in a secondary
4478 * command buffer.
4479 */
4480 if (pdevice->rad_info.gfx_level == GFX11 && render->ds_att.format == VK_FORMAT_UNDEFINED) {
4481 assert(!render->ds_att.iview);
4482 radeon_set_context_reg(cmd_buffer->cs, R_028040_DB_Z_INFO,
4483 S_028040_FORMAT(V_028040_Z_INVALID) | S_028040_NUM_SAMPLES(log_samples));
4484 }
4485 radeon_set_context_reg(cmd_buffer->cs, R_028804_DB_EQAA, db_eqaa);
4486 radeon_set_context_reg(cmd_buffer->cs, R_028BE0_PA_SC_AA_CONFIG, pa_sc_aa_config);
4487 radeon_set_context_reg(
4488 cmd_buffer->cs, R_028A48_PA_SC_MODE_CNTL_0,
4489 S_028A48_ALTERNATE_RBS_PER_TILE(pdevice->rad_info.gfx_level >= GFX9) | S_028A48_VPORT_SCISSOR_ENABLE(1) |
4490 S_028A48_LINE_STIPPLE_ENABLE(d->vk.rs.line.stipple.enable) | S_028A48_MSAA_ENABLE(rasterization_samples > 1));
4491 }
4492
4493 static void
4494 radv_emit_line_rasterization_mode(struct radv_cmd_buffer *cmd_buffer)
4495 {
4496 const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
4497
4498 /* The DX10 diamond test is unnecessary with Vulkan, and it decreases line rasterization
4499 * performance.
4500 */
4501 radeon_set_context_reg(
4502 cmd_buffer->cs, R_028BDC_PA_SC_LINE_CNTL,
4503 S_028BDC_PERPENDICULAR_ENDCAP_ENA(d->vk.rs.line.mode == VK_LINE_RASTERIZATION_MODE_RECTANGULAR_KHR));
4504 }
4505
4506 static void
4507 radv_cmd_buffer_flush_dynamic_state(struct radv_cmd_buffer *cmd_buffer, const uint64_t states)
4508 {
4509 if (states & (RADV_CMD_DIRTY_DYNAMIC_VIEWPORT | RADV_CMD_DIRTY_DYNAMIC_DEPTH_CLIP_ENABLE |
4510 RADV_CMD_DIRTY_DYNAMIC_DEPTH_CLIP_NEGATIVE_ONE_TO_ONE | RADV_CMD_DIRTY_DYNAMIC_DEPTH_CLAMP_ENABLE))
4511 radv_emit_viewport(cmd_buffer);
4512
4513 if (states & (RADV_CMD_DIRTY_DYNAMIC_SCISSOR | RADV_CMD_DIRTY_DYNAMIC_VIEWPORT) &&
4514 !cmd_buffer->device->physical_device->rad_info.has_gfx9_scissor_bug)
4515 radv_emit_scissor(cmd_buffer);
4516
4517 if (states & RADV_CMD_DIRTY_DYNAMIC_LINE_WIDTH)
4518 radv_emit_line_width(cmd_buffer);
4519
4520 if (states & RADV_CMD_DIRTY_DYNAMIC_BLEND_CONSTANTS)
4521 radv_emit_blend_constants(cmd_buffer);
4522
4523 if (states & (RADV_CMD_DIRTY_DYNAMIC_STENCIL_REFERENCE | RADV_CMD_DIRTY_DYNAMIC_STENCIL_WRITE_MASK |
4524 RADV_CMD_DIRTY_DYNAMIC_STENCIL_COMPARE_MASK))
4525 radv_emit_stencil(cmd_buffer);
4526
4527 if (states & RADV_CMD_DIRTY_DYNAMIC_DEPTH_BOUNDS)
4528 radv_emit_depth_bounds(cmd_buffer);
4529
4530 if (states & RADV_CMD_DIRTY_DYNAMIC_DEPTH_BIAS)
4531 radv_emit_depth_bias(cmd_buffer);
4532
4533 if (states & (RADV_CMD_DIRTY_DYNAMIC_DISCARD_RECTANGLE | RADV_CMD_DIRTY_DYNAMIC_DISCARD_RECTANGLE_ENABLE |
4534 RADV_CMD_DIRTY_DYNAMIC_DISCARD_RECTANGLE_MODE))
4535 radv_emit_discard_rectangle(cmd_buffer);
4536
4537 if (states & RADV_CMD_DIRTY_DYNAMIC_CONSERVATIVE_RAST_MODE)
4538 radv_emit_conservative_rast_mode(cmd_buffer);
4539
4540 if (states & RADV_CMD_DIRTY_DYNAMIC_SAMPLE_LOCATIONS)
4541 radv_emit_sample_locations(cmd_buffer);
4542
4543 if (states & RADV_CMD_DIRTY_DYNAMIC_LINE_STIPPLE)
4544 radv_emit_line_stipple(cmd_buffer);
4545
4546 if (states & (RADV_CMD_DIRTY_DYNAMIC_CULL_MODE | RADV_CMD_DIRTY_DYNAMIC_FRONT_FACE |
4547 RADV_CMD_DIRTY_DYNAMIC_DEPTH_BIAS_ENABLE | RADV_CMD_DIRTY_DYNAMIC_POLYGON_MODE |
4548 RADV_CMD_DIRTY_DYNAMIC_PROVOKING_VERTEX_MODE | RADV_CMD_DIRTY_DYNAMIC_LINE_RASTERIZATION_MODE))
4549 radv_emit_culling(cmd_buffer);
4550
4551 if (states & (RADV_CMD_DIRTY_DYNAMIC_PROVOKING_VERTEX_MODE | RADV_CMD_DIRTY_DYNAMIC_PRIMITIVE_TOPOLOGY))
4552 radv_emit_provoking_vertex_mode(cmd_buffer);
4553
4554 if (states & RADV_CMD_DIRTY_DYNAMIC_PRIMITIVE_TOPOLOGY)
4555 radv_emit_primitive_topology(cmd_buffer);
4556
4557 if (states & (RADV_CMD_DIRTY_DYNAMIC_DEPTH_TEST_ENABLE | RADV_CMD_DIRTY_DYNAMIC_DEPTH_WRITE_ENABLE |
4558 RADV_CMD_DIRTY_DYNAMIC_DEPTH_COMPARE_OP | RADV_CMD_DIRTY_DYNAMIC_DEPTH_BOUNDS_TEST_ENABLE |
4559 RADV_CMD_DIRTY_DYNAMIC_STENCIL_TEST_ENABLE | RADV_CMD_DIRTY_DYNAMIC_STENCIL_OP))
4560 radv_emit_depth_control(cmd_buffer);
4561
4562 if (states & RADV_CMD_DIRTY_DYNAMIC_STENCIL_OP)
4563 radv_emit_stencil_control(cmd_buffer);
4564
4565 if (states & RADV_CMD_DIRTY_DYNAMIC_FRAGMENT_SHADING_RATE)
4566 radv_emit_fragment_shading_rate(cmd_buffer);
4567
4568 if (states & RADV_CMD_DIRTY_DYNAMIC_PRIMITIVE_RESTART_ENABLE)
4569 radv_emit_primitive_restart_enable(cmd_buffer);
4570
4571 if (states & (RADV_CMD_DIRTY_DYNAMIC_RASTERIZER_DISCARD_ENABLE | RADV_CMD_DIRTY_DYNAMIC_DEPTH_CLIP_ENABLE |
4572 RADV_CMD_DIRTY_DYNAMIC_DEPTH_CLIP_NEGATIVE_ONE_TO_ONE | RADV_CMD_DIRTY_DYNAMIC_DEPTH_CLAMP_ENABLE))
4573 radv_emit_clipping(cmd_buffer);
4574
4575 if (states & (RADV_CMD_DIRTY_DYNAMIC_LOGIC_OP | RADV_CMD_DIRTY_DYNAMIC_LOGIC_OP_ENABLE |
4576 RADV_CMD_DIRTY_DYNAMIC_COLOR_WRITE_MASK | RADV_CMD_DIRTY_DYNAMIC_COLOR_BLEND_ENABLE |
4577 RADV_CMD_DIRTY_DYNAMIC_COLOR_BLEND_EQUATION))
4578 radv_emit_logic_op(cmd_buffer);
4579
4580 if (states & (RADV_CMD_DIRTY_DYNAMIC_COLOR_WRITE_ENABLE | RADV_CMD_DIRTY_DYNAMIC_COLOR_WRITE_MASK))
4581 radv_emit_color_write(cmd_buffer);
4582
4583 if (states & RADV_CMD_DIRTY_DYNAMIC_VERTEX_INPUT)
4584 radv_emit_vertex_input(cmd_buffer);
4585
4586 if (states & RADV_CMD_DIRTY_DYNAMIC_PATCH_CONTROL_POINTS)
4587 radv_emit_patch_control_points(cmd_buffer);
4588
4589 if (states & RADV_CMD_DIRTY_DYNAMIC_TESS_DOMAIN_ORIGIN)
4590 radv_emit_tess_domain_origin(cmd_buffer);
4591
4592 if (states & RADV_CMD_DIRTY_DYNAMIC_ALPHA_TO_COVERAGE_ENABLE)
4593 radv_emit_alpha_to_coverage_enable(cmd_buffer);
4594
4595 if (states & RADV_CMD_DIRTY_DYNAMIC_SAMPLE_MASK)
4596 radv_emit_sample_mask(cmd_buffer);
4597
4598 if (states & (RADV_CMD_DIRTY_DYNAMIC_DEPTH_CLAMP_ENABLE | RADV_CMD_DIRTY_DYNAMIC_DEPTH_CLIP_ENABLE))
4599 radv_emit_depth_clamp_enable(cmd_buffer);
4600
4601 if (states & (RADV_CMD_DIRTY_DYNAMIC_COLOR_BLEND_ENABLE | RADV_CMD_DIRTY_DYNAMIC_COLOR_WRITE_MASK |
4602 RADV_CMD_DIRTY_DYNAMIC_COLOR_BLEND_EQUATION | RADV_CMD_DIRTY_DYNAMIC_ALPHA_TO_COVERAGE_ENABLE))
4603 radv_emit_color_blend(cmd_buffer);
4604
4605 if (states & RADV_CMD_DIRTY_DYNAMIC_LINE_RASTERIZATION_MODE)
4606 radv_emit_line_rasterization_mode(cmd_buffer);
4607
4608 if (states & (RADV_CMD_DIRTY_DYNAMIC_RASTERIZATION_SAMPLES | RADV_CMD_DIRTY_DYNAMIC_LINE_RASTERIZATION_MODE))
4609 radv_emit_rasterization_samples(cmd_buffer);
4610
4611 if (states & (RADV_CMD_DIRTY_DYNAMIC_LINE_STIPPLE_ENABLE | RADV_CMD_DIRTY_DYNAMIC_CONSERVATIVE_RAST_MODE |
4612 RADV_CMD_DIRTY_DYNAMIC_SAMPLE_LOCATIONS | RADV_CMD_DIRTY_DYNAMIC_RASTERIZATION_SAMPLES |
4613 RADV_CMD_DIRTY_DYNAMIC_LINE_RASTERIZATION_MODE))
4614 radv_emit_msaa_state(cmd_buffer);
4615
4616 /* RADV_CMD_DIRTY_DYNAMIC_ATTACHMENT_FEEDBACK_LOOP_ENABLE is handled by radv_emit_db_shader_control. */
4617
4618 cmd_buffer->state.dirty &= ~states;
4619 }
4620
4621 static void
4622 radv_flush_push_descriptors(struct radv_cmd_buffer *cmd_buffer, struct radv_descriptor_state *descriptors_state)
4623 {
4624 struct radv_descriptor_set *set = (struct radv_descriptor_set *)&descriptors_state->push_set.set;
4625 unsigned bo_offset;
4626
4627 if (!radv_cmd_buffer_upload_data(cmd_buffer, set->header.size, set->header.mapped_ptr, &bo_offset))
4628 return;
4629
4630 set->header.va = radv_buffer_get_va(cmd_buffer->upload.upload_bo);
4631 set->header.va += bo_offset;
4632 }
4633
4634 static void
4635 radv_flush_indirect_descriptor_sets(struct radv_cmd_buffer *cmd_buffer, VkPipelineBindPoint bind_point)
4636 {
4637 struct radv_descriptor_state *descriptors_state = radv_get_descriptors_state(cmd_buffer, bind_point);
4638 uint32_t size = MAX_SETS * 4;
4639 uint32_t offset;
4640 void *ptr;
4641
4642 if (!radv_cmd_buffer_upload_alloc(cmd_buffer, size, &offset, &ptr))
4643 return;
4644
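/* Descriptive note: one dword is reserved per descriptor set (hence MAX_SETS * 4) and only
 * the low 32 bits of each set VA are written, which presumes descriptor-set memory is
 * allocated so that the shader can address it with 32-bit pointers.
 */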
4645 for (unsigned i = 0; i < MAX_SETS; i++) {
4646 uint32_t *uptr = ((uint32_t *)ptr) + i;
4647 uint64_t set_va = 0;
4648 if (descriptors_state->valid & (1u << i))
4649 set_va = radv_descriptor_get_va(descriptors_state, i);
4650
4651 uptr[0] = set_va & 0xffffffff;
4652 }
4653
4654 struct radeon_cmdbuf *cs = cmd_buffer->cs;
4655 struct radv_device *device = cmd_buffer->device;
4656 uint64_t va = radv_buffer_get_va(cmd_buffer->upload.upload_bo);
4657 va += offset;
4658
4659 ASSERTED unsigned cdw_max = radeon_check_space(device->ws, cs, MESA_VULKAN_SHADER_STAGES * 3);
4660
4661 if (bind_point == VK_PIPELINE_BIND_POINT_GRAPHICS) {
4662 for (unsigned s = MESA_SHADER_VERTEX; s <= MESA_SHADER_FRAGMENT; s++)
4663 if (radv_cmdbuf_has_stage(cmd_buffer, s))
4664 radv_emit_userdata_address(device, cs, cmd_buffer->state.shaders[s],
4665 cmd_buffer->state.shaders[s]->info.user_data_0, AC_UD_INDIRECT_DESCRIPTOR_SETS,
4666 va);
4667
4668 if (radv_cmdbuf_has_stage(cmd_buffer, MESA_SHADER_MESH))
4669 radv_emit_userdata_address(device, cs, cmd_buffer->state.shaders[MESA_SHADER_MESH],
4670 cmd_buffer->state.shaders[MESA_SHADER_MESH]->info.user_data_0,
4671 AC_UD_INDIRECT_DESCRIPTOR_SETS, va);
4672
4673 if (radv_cmdbuf_has_stage(cmd_buffer, MESA_SHADER_TASK)) {
4674 radeon_check_space(device->ws, cmd_buffer->gang.cs, 3);
4675 radv_emit_userdata_address(device, cmd_buffer->gang.cs, cmd_buffer->state.shaders[MESA_SHADER_TASK],
4676 cmd_buffer->state.shaders[MESA_SHADER_TASK]->info.user_data_0,
4677 AC_UD_INDIRECT_DESCRIPTOR_SETS, va);
4678 }
4679 } else {
4680 struct radv_shader *compute_shader = bind_point == VK_PIPELINE_BIND_POINT_COMPUTE
4681 ? cmd_buffer->state.shaders[MESA_SHADER_COMPUTE]
4682 : cmd_buffer->state.rt_prolog;
4683
4684 radv_emit_userdata_address(device, cs, compute_shader, compute_shader->info.user_data_0,
4685 AC_UD_INDIRECT_DESCRIPTOR_SETS, va);
4686 }
4687
4688 assert(cmd_buffer->cs->cdw <= cdw_max);
4689 }
4690
4691 ALWAYS_INLINE static void
4692 radv_flush_descriptors(struct radv_cmd_buffer *cmd_buffer, VkShaderStageFlags stages, VkPipelineBindPoint bind_point)
4693 {
4694 struct radv_descriptor_state *descriptors_state = radv_get_descriptors_state(cmd_buffer, bind_point);
4695 struct radv_device *device = cmd_buffer->device;
4696 struct radeon_cmdbuf *cs = cmd_buffer->cs;
4697 bool flush_indirect_descriptors;
4698
4699 if (!descriptors_state->dirty)
4700 return;
4701
4702 flush_indirect_descriptors = descriptors_state->need_indirect_descriptor_sets;
4703
4704 if (flush_indirect_descriptors)
4705 radv_flush_indirect_descriptor_sets(cmd_buffer, bind_point);
4706
4707 ASSERTED unsigned cdw_max = radeon_check_space(device->ws, cs, MAX_SETS * MESA_VULKAN_SHADER_STAGES * 4);
4708
4709 if (stages & VK_SHADER_STAGE_COMPUTE_BIT) {
4710 struct radv_shader *compute_shader = bind_point == VK_PIPELINE_BIND_POINT_COMPUTE
4711 ? cmd_buffer->state.shaders[MESA_SHADER_COMPUTE]
4712 : cmd_buffer->state.rt_prolog;
4713
4714 radv_emit_descriptor_pointers(device, cs, compute_shader, compute_shader->info.user_data_0, descriptors_state);
4715 } else {
4716 radv_foreach_stage(stage, stages & ~VK_SHADER_STAGE_TASK_BIT_EXT)
4717 {
4718 if (!cmd_buffer->state.shaders[stage])
4719 continue;
4720
4721 radv_emit_descriptor_pointers(device, cs, cmd_buffer->state.shaders[stage],
4722 cmd_buffer->state.shaders[stage]->info.user_data_0, descriptors_state);
4723 }
4724
4725 if (stages & VK_SHADER_STAGE_TASK_BIT_EXT) {
4726 radv_emit_descriptor_pointers(device, cmd_buffer->gang.cs, cmd_buffer->state.shaders[MESA_SHADER_TASK],
4727 cmd_buffer->state.shaders[MESA_SHADER_TASK]->info.user_data_0,
4728 descriptors_state);
4729 }
4730 }
4731
4732 descriptors_state->dirty = 0;
4733
4734 assert(cmd_buffer->cs->cdw <= cdw_max);
4735
4736 if (radv_device_fault_detection_enabled(cmd_buffer->device))
4737 radv_save_descriptors(cmd_buffer, bind_point);
4738 }
4739
4740 static void
4741 radv_emit_all_inline_push_consts(struct radv_device *device, struct radeon_cmdbuf *cs, struct radv_shader *shader,
4742 uint32_t base_reg, uint32_t *values, bool *need_push_constants)
4743 {
4744 if (radv_get_user_sgpr(shader, AC_UD_PUSH_CONSTANTS)->sgpr_idx != -1)
4745 *need_push_constants |= true;
4746
4747 const uint64_t mask = shader->info.inline_push_constant_mask;
4748 if (!mask)
4749 return;
4750
4751 const uint8_t base = ffs(mask) - 1;
4752 if (mask == u_bit_consecutive64(base, util_last_bit64(mask) - base)) {
4753 /* consecutive inline push constants */
4754 radv_emit_inline_push_consts(device, cs, shader, base_reg, AC_UD_INLINE_PUSH_CONSTANTS, values + base);
4755 } else {
4756 /* sparse inline push constants */
4757 uint32_t consts[AC_MAX_INLINE_PUSH_CONSTS];
4758 unsigned num_consts = 0;
4759 u_foreach_bit64 (idx, mask)
4760 consts[num_consts++] = values[idx];
4761 radv_emit_inline_push_consts(device, cs, shader, base_reg, AC_UD_INLINE_PUSH_CONSTANTS, consts);
4762 }
4763 }
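/* Illustrative example for the mask handling above: inline_push_constant_mask = 0x38
 * (bits 3..5) equals u_bit_consecutive64(3, 3), so the consecutive path emits the run starting
 * at values[3] directly, while a sparse mask such as 0x5 (bits 0 and 2) first gathers
 * values[0] and values[2] into the local consts[] array.
 */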
4764
4765 ALWAYS_INLINE static VkShaderStageFlags
4766 radv_must_flush_constants(const struct radv_cmd_buffer *cmd_buffer, VkShaderStageFlags stages,
4767 VkPipelineBindPoint bind_point)
4768 {
4769 const struct radv_push_constant_state *push_constants = radv_get_push_constants_state(cmd_buffer, bind_point);
4770
4771 if (push_constants->size || push_constants->dynamic_offset_count)
4772 return stages & cmd_buffer->push_constant_stages;
4773
4774 return 0;
4775 }
4776
4777 static void
4778 radv_flush_constants(struct radv_cmd_buffer *cmd_buffer, VkShaderStageFlags stages, VkPipelineBindPoint bind_point)
4779 {
4780 struct radv_device *device = cmd_buffer->device;
4781 struct radeon_cmdbuf *cs = cmd_buffer->cs;
4782 struct radv_descriptor_state *descriptors_state = radv_get_descriptors_state(cmd_buffer, bind_point);
4783 const struct radv_push_constant_state *push_constants = radv_get_push_constants_state(cmd_buffer, bind_point);
4784 struct radv_shader *shader, *prev_shader;
4785 bool need_push_constants = false;
4786 unsigned offset;
4787 void *ptr;
4788 uint64_t va;
4789 uint32_t internal_stages = stages;
4790 uint32_t dirty_stages = 0;
4791
4792 switch (bind_point) {
4793 case VK_PIPELINE_BIND_POINT_GRAPHICS:
4794 break;
4795 case VK_PIPELINE_BIND_POINT_COMPUTE:
4796 dirty_stages = RADV_RT_STAGE_BITS;
4797 break;
4798 case VK_PIPELINE_BIND_POINT_RAY_TRACING_KHR:
4799 internal_stages = VK_SHADER_STAGE_COMPUTE_BIT;
4800 dirty_stages = VK_SHADER_STAGE_COMPUTE_BIT;
4801 break;
4802 default:
4803 unreachable("Unhandled bind point");
4804 }
4805
4806 if (internal_stages & VK_SHADER_STAGE_COMPUTE_BIT) {
4807 struct radv_shader *compute_shader = bind_point == VK_PIPELINE_BIND_POINT_COMPUTE
4808 ? cmd_buffer->state.shaders[MESA_SHADER_COMPUTE]
4809 : cmd_buffer->state.rt_prolog;
4810
4811 radv_emit_all_inline_push_consts(device, cs, compute_shader, compute_shader->info.user_data_0,
4812 (uint32_t *)cmd_buffer->push_constants, &need_push_constants);
4813 } else {
4814 radv_foreach_stage(stage, internal_stages & ~VK_SHADER_STAGE_TASK_BIT_EXT)
4815 {
4816 shader = radv_get_shader(cmd_buffer->state.shaders, stage);
4817
4818 if (!shader)
4819 continue;
4820
4821 radv_emit_all_inline_push_consts(device, cs, shader, shader->info.user_data_0,
4822 (uint32_t *)cmd_buffer->push_constants, &need_push_constants);
4823 }
4824
4825 if (internal_stages & VK_SHADER_STAGE_TASK_BIT_EXT) {
4826 radv_emit_all_inline_push_consts(device, cmd_buffer->gang.cs, cmd_buffer->state.shaders[MESA_SHADER_TASK],
4827 cmd_buffer->state.shaders[MESA_SHADER_TASK]->info.user_data_0,
4828 (uint32_t *)cmd_buffer->push_constants, &need_push_constants);
4829 }
4830 }
4831
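/* Descriptive note: the upload below packs the raw push-constant data first, immediately
 * followed by 16 bytes of dynamic-buffer descriptor data per dynamic offset; both are then
 * addressed through the single AC_UD_PUSH_CONSTANTS pointer emitted further down.
 */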
4832 if (need_push_constants) {
4833 if (!radv_cmd_buffer_upload_alloc(cmd_buffer, push_constants->size + 16 * push_constants->dynamic_offset_count,
4834 &offset, &ptr))
4835 return;
4836
4837 memcpy(ptr, cmd_buffer->push_constants, push_constants->size);
4838 memcpy((char *)ptr + push_constants->size, descriptors_state->dynamic_buffers,
4839 16 * push_constants->dynamic_offset_count);
4840
4841 va = radv_buffer_get_va(cmd_buffer->upload.upload_bo);
4842 va += offset;
4843
4844 ASSERTED unsigned cdw_max =
4845 radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, MESA_VULKAN_SHADER_STAGES * 4);
4846
4847 if (internal_stages & VK_SHADER_STAGE_COMPUTE_BIT) {
4848 struct radv_shader *compute_shader = bind_point == VK_PIPELINE_BIND_POINT_COMPUTE
4849 ? cmd_buffer->state.shaders[MESA_SHADER_COMPUTE]
4850 : cmd_buffer->state.rt_prolog;
4851
4852 radv_emit_userdata_address(device, cs, compute_shader, compute_shader->info.user_data_0, AC_UD_PUSH_CONSTANTS,
4853 va);
4854 } else {
4855 prev_shader = NULL;
4856 radv_foreach_stage(stage, internal_stages & ~VK_SHADER_STAGE_TASK_BIT_EXT)
4857 {
4858 shader = radv_get_shader(cmd_buffer->state.shaders, stage);
4859
4860 /* Avoid redundantly emitting the address for merged stages. */
4861 if (shader && shader != prev_shader) {
4862 radv_emit_userdata_address(device, cs, shader, shader->info.user_data_0, AC_UD_PUSH_CONSTANTS, va);
4863
4864 prev_shader = shader;
4865 }
4866 }
4867
4868 if (internal_stages & VK_SHADER_STAGE_TASK_BIT_EXT) {
4869 radv_emit_userdata_address(device, cmd_buffer->gang.cs, cmd_buffer->state.shaders[MESA_SHADER_TASK],
4870 cmd_buffer->state.shaders[MESA_SHADER_TASK]->info.user_data_0,
4871 AC_UD_PUSH_CONSTANTS, va);
4872 }
4873 }
4874
4875 assert(cmd_buffer->cs->cdw <= cdw_max);
4876 }
4877
4878 cmd_buffer->push_constant_stages &= ~stages;
4879 cmd_buffer->push_constant_stages |= dirty_stages;
4880 }
4881
4882 void
4883 radv_write_vertex_descriptors(const struct radv_cmd_buffer *cmd_buffer, const struct radv_graphics_pipeline *pipeline,
4884 bool full_null_descriptors, void *vb_ptr)
4885 {
4886 struct radv_shader *vs_shader = radv_get_shader(cmd_buffer->state.shaders, MESA_SHADER_VERTEX);
4887 enum amd_gfx_level chip = cmd_buffer->device->physical_device->rad_info.gfx_level;
4888 enum radeon_family family = cmd_buffer->device->physical_device->rad_info.family;
4889 unsigned desc_index = 0;
4890 uint32_t mask = vs_shader->info.vs.vb_desc_usage_mask;
4891 uint64_t va;
4892 const struct radv_vs_input_state *vs_state =
4893 vs_shader->info.vs.dynamic_inputs ? &cmd_buffer->state.dynamic_vs_input : NULL;
4894 assert(!vs_state || vs_shader->info.vs.use_per_attribute_vb_descs);
4895
4896 const struct ac_vtx_format_info *vtx_info_table = vs_state ? ac_get_vtx_format_info_table(chip, family) : NULL;
4897
4898 while (mask) {
4899 unsigned i = u_bit_scan(&mask);
4900 uint32_t *desc = &((uint32_t *)vb_ptr)[desc_index++ * 4];
4901 uint32_t offset, rsrc_word3;
4902
4903 if (vs_state && !(vs_state->attribute_mask & BITFIELD_BIT(i))) {
4904 /* No vertex attribute description given: assume that the shader doesn't use this
4905 * location (vb_desc_usage_mask can be larger than attribute usage) and use a null
4906 * descriptor to avoid hangs (prologs load all attributes, even if there are holes).
4907 */
4908 memset(desc, 0, 4 * 4);
4909 continue;
4910 }
4911
4912 unsigned binding = vs_state ? cmd_buffer->state.dynamic_vs_input.bindings[i]
4913 : (vs_shader->info.vs.use_per_attribute_vb_descs ? pipeline->attrib_bindings[i] : i);
4914 struct radv_buffer *buffer = cmd_buffer->vertex_binding_buffers[binding];
4915 unsigned num_records;
4916 unsigned stride;
4917
4918 if (vs_state && !(vs_state->nontrivial_formats & BITFIELD_BIT(i))) {
4919 const struct ac_vtx_format_info *vtx_info = &vtx_info_table[vs_state->formats[i]];
4920 unsigned hw_format = vtx_info->hw_format[vtx_info->num_channels - 1];
4921
4922 if (chip >= GFX10) {
4923 rsrc_word3 = vtx_info->dst_sel | S_008F0C_FORMAT(hw_format);
4924 } else {
4925 rsrc_word3 =
4926 vtx_info->dst_sel | S_008F0C_NUM_FORMAT((hw_format >> 4) & 0x7) | S_008F0C_DATA_FORMAT(hw_format & 0xf);
4927 }
4928 } else {
4929 rsrc_word3 = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
4930 S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W);
4931 if (chip >= GFX10)
4932 rsrc_word3 |= S_008F0C_FORMAT(V_008F0C_GFX10_FORMAT_32_UINT);
4933 else
4934 rsrc_word3 |=
4935 S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_UINT) | S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
4936 }
4937
4938 if (cmd_buffer->state.uses_dynamic_vertex_binding_stride) {
4939 stride = cmd_buffer->vertex_bindings[binding].stride;
4940 } else {
4941 stride = pipeline->binding_stride[binding];
4942 }
4943
4944 if (!buffer) {
4945 if (full_null_descriptors) {
4946 /* Put all the info in for the DGC generation shader in case the VBO gets overridden. */
4947 desc[0] = 0;
4948 desc[1] = S_008F04_STRIDE(stride);
4949 desc[2] = 0;
4950 desc[3] = rsrc_word3;
4951 } else if (vs_state) {
4952 /* Stride needs to be non-zero on GFX9, or else bounds checking is disabled. We need
4953 * to include the format/word3 so that the alpha channel is 1 for formats without an
4954 * alpha channel.
4955 */
4956 desc[0] = 0;
4957 desc[1] = S_008F04_STRIDE(16);
4958 desc[2] = 0;
4959 desc[3] = rsrc_word3;
4960 } else {
4961 memset(desc, 0, 4 * 4);
4962 }
4963
4964 continue;
4965 }
4966
4967 va = radv_buffer_get_va(buffer->bo);
4968
4969 offset = cmd_buffer->vertex_bindings[binding].offset;
4970 va += offset + buffer->offset;
4971 if (vs_state)
4972 va += vs_state->offsets[i];
4973
4974 if (cmd_buffer->vertex_bindings[binding].size) {
4975 num_records = cmd_buffer->vertex_bindings[binding].size;
4976 } else {
4977 num_records = vk_buffer_range(&buffer->vk, offset, VK_WHOLE_SIZE);
4978 }
4979
4980 if (vs_shader->info.vs.use_per_attribute_vb_descs) {
4981 uint32_t attrib_end = vs_state ? vs_state->offsets[i] + vs_state->format_sizes[i] : pipeline->attrib_ends[i];
4982
4983 if (num_records < attrib_end) {
4984 num_records = 0; /* not enough space for one vertex */
4985 } else if (stride == 0) {
4986 num_records = 1; /* only one vertex */
4987 } else {
4988 num_records = (num_records - attrib_end) / stride + 1;
4989 /* If attrib_offset>stride, then the compiler will increase the vertex index by
4990 * attrib_offset/stride and decrease the offset by attrib_offset%stride. This is
4991 * only allowed with static strides.
4992 */
4993 num_records += pipeline ? pipeline->attrib_index_offset[i] : 0;
4994 }
4995
4996 /* GFX10 uses OOB_SELECT_RAW if stride==0, so convert num_records from elements into
4997 * bytes in that case. GFX8 always uses bytes.
4998 */
4999 if (num_records && (chip == GFX8 || (chip != GFX9 && !stride))) {
5000 num_records = (num_records - 1) * stride + attrib_end;
5001 } else if (!num_records) {
5002 /* On GFX9, bounds checking appears to be disabled if both
5003 * num_records and stride are zero, which is why the vs_state path below writes a non-zero
5004 * stride. This doesn't seem necessary on GFX8, GFX10 and GFX10.3, but it doesn't hurt.
5005 */
5006 if (full_null_descriptors) {
5007 /* Put all the info in for the DGC generation shader in case the VBO gets overridden.
5008 */
5009 desc[0] = 0;
5010 desc[1] = S_008F04_STRIDE(stride);
5011 desc[2] = 0;
5012 desc[3] = rsrc_word3;
5013 } else if (vs_state) {
5014 desc[0] = 0;
5015 desc[1] = S_008F04_STRIDE(16);
5016 desc[2] = 0;
5017 desc[3] = rsrc_word3;
5018 } else {
5019 memset(desc, 0, 16);
5020 }
5021
5022 continue;
5023 }
5024 } else {
5025 if (chip != GFX8 && stride)
5026 num_records = DIV_ROUND_UP(num_records, stride);
5027 }
5028
5029 if (chip >= GFX10) {
5030 /* OOB_SELECT chooses the out-of-bounds check:
5031 * - 1: index >= NUM_RECORDS (Structured)
5032 * - 3: offset >= NUM_RECORDS (Raw)
5033 */
5034 int oob_select = stride ? V_008F0C_OOB_SELECT_STRUCTURED : V_008F0C_OOB_SELECT_RAW;
5035 rsrc_word3 |= S_008F0C_OOB_SELECT(oob_select) | S_008F0C_RESOURCE_LEVEL(chip < GFX11);
5036 }
5037
5038 desc[0] = va;
5039 desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32) | S_008F04_STRIDE(stride);
5040 desc[2] = num_records;
5041 desc[3] = rsrc_word3;
5042 }
5043 }
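/* Worked example (illustrative only) for the per-attribute num_records math above: with a
 * 100-byte binding range, stride = 16 and attrib_end = 12, the element count is
 * (100 - 12) / 16 + 1 = 6 vertices. On GFX8 (and on non-GFX9 chips when stride == 0) that is
 * converted back to bytes, here (6 - 1) * 16 + 12 = 92, so the bounds check still ends exactly
 * at the last readable attribute.
 */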
5044
5045 static void
5046 radv_flush_vertex_descriptors(struct radv_cmd_buffer *cmd_buffer)
5047 {
5048 struct radv_shader *vs = radv_get_shader(cmd_buffer->state.shaders, MESA_SHADER_VERTEX);
5049
5050 if (!vs->info.vs.vb_desc_usage_mask)
5051 return;
5052
5053 /* Mesh shaders don't have vertex descriptors. */
5054 assert(!cmd_buffer->state.mesh_shading);
5055
5056 struct radv_graphics_pipeline *pipeline = cmd_buffer->state.graphics_pipeline;
5057 unsigned vb_desc_alloc_size = util_bitcount(vs->info.vs.vb_desc_usage_mask) * 16;
5058 unsigned vb_offset;
5059 void *vb_ptr;
5060 uint64_t va;
5061
5062 /* allocate some descriptor state for vertex buffers */
5063 if (!radv_cmd_buffer_upload_alloc(cmd_buffer, vb_desc_alloc_size, &vb_offset, &vb_ptr))
5064 return;
5065
5066 radv_write_vertex_descriptors(cmd_buffer, pipeline, false, vb_ptr);
5067
5068 va = radv_buffer_get_va(cmd_buffer->upload.upload_bo);
5069 va += vb_offset;
5070
5071 radv_emit_userdata_address(cmd_buffer->device, cmd_buffer->cs, vs, vs->info.user_data_0, AC_UD_VS_VERTEX_BUFFERS,
5072 va);
5073
5074 cmd_buffer->state.vb_va = va;
5075 cmd_buffer->state.vb_size = vb_desc_alloc_size;
5076 cmd_buffer->state.prefetch_L2_mask |= RADV_PREFETCH_VBO_DESCRIPTORS;
5077
5078 if (radv_device_fault_detection_enabled(cmd_buffer->device))
5079 radv_save_vertex_descriptors(cmd_buffer, (uintptr_t)vb_ptr);
5080
5081 cmd_buffer->state.dirty &= ~RADV_CMD_DIRTY_VERTEX_BUFFER;
5082 }
5083
5084 static void
5085 radv_emit_streamout_buffers(struct radv_cmd_buffer *cmd_buffer, uint64_t va)
5086 {
5087 const struct radv_shader *last_vgt_shader = cmd_buffer->state.last_vgt_shader;
5088 const struct radv_userdata_info *loc = radv_get_user_sgpr(last_vgt_shader, AC_UD_STREAMOUT_BUFFERS);
5089 uint32_t base_reg;
5090
5091 if (loc->sgpr_idx == -1)
5092 return;
5093
5094 base_reg = last_vgt_shader->info.user_data_0;
5095
5096 radv_emit_shader_pointer(cmd_buffer->device, cmd_buffer->cs, base_reg + loc->sgpr_idx * 4, va, false);
5097
5098 if (cmd_buffer->state.gs_copy_shader) {
5099 loc = &cmd_buffer->state.gs_copy_shader->info.user_sgprs_locs.shader_data[AC_UD_STREAMOUT_BUFFERS];
5100 if (loc->sgpr_idx != -1) {
5101 base_reg = R_00B130_SPI_SHADER_USER_DATA_VS_0;
5102
5103 radv_emit_shader_pointer(cmd_buffer->device, cmd_buffer->cs, base_reg + loc->sgpr_idx * 4, va, false);
5104 }
5105 }
5106 }
5107
5108 static void
5109 radv_flush_streamout_descriptors(struct radv_cmd_buffer *cmd_buffer)
5110 {
5111 if (cmd_buffer->state.dirty & RADV_CMD_DIRTY_STREAMOUT_BUFFER) {
5112 struct radv_streamout_binding *sb = cmd_buffer->streamout_bindings;
5113 struct radv_streamout_state *so = &cmd_buffer->state.streamout;
5114 unsigned so_offset;
5115 uint64_t desc_va;
5116 void *so_ptr;
5117
5118 /* Allocate some descriptor state for streamout buffers. */
5119 if (!radv_cmd_buffer_upload_alloc(cmd_buffer, MAX_SO_BUFFERS * 16, &so_offset, &so_ptr))
5120 return;
5121
5122 for (uint32_t i = 0; i < MAX_SO_BUFFERS; i++) {
5123 struct radv_buffer *buffer = sb[i].buffer;
5124 uint32_t *desc = &((uint32_t *)so_ptr)[i * 4];
5125 uint32_t size = 0;
5126 uint64_t va = 0;
5127
5128 if (so->enabled_mask & (1 << i)) {
5129 va = radv_buffer_get_va(buffer->bo) + buffer->offset;
5130
5131 va += sb[i].offset;
5132
5133 /* Set the descriptor.
5134 *
5135 * On GFX8, the format must be non-INVALID, otherwise
5136 * the buffer will be considered not bound and store
5137 * instructions will be no-ops.
5138 */
5139 size = 0xffffffff;
5140
5141 if (cmd_buffer->device->physical_device->use_ngg_streamout) {
5142 /* With NGG streamout, the buffer size is used to determine the max emit per buffer
5143 * and also disables the buffer when it's 0.
5144 */
5145 size = radv_is_streamout_enabled(cmd_buffer) ? sb[i].size : 0;
5146 }
5147 }
5148
5149 uint32_t rsrc_word3 = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
5150 S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W);
5151
5152 if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX11) {
5153 rsrc_word3 |=
5154 S_008F0C_FORMAT(V_008F0C_GFX11_FORMAT_32_FLOAT) | S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW);
5155 } else if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX10) {
5156 rsrc_word3 |= S_008F0C_FORMAT(V_008F0C_GFX10_FORMAT_32_FLOAT) |
5157 S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) | S_008F0C_RESOURCE_LEVEL(1);
5158 } else {
5159 rsrc_word3 |= S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
5160 }
5161
5162 desc[0] = va;
5163 desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32);
5164 desc[2] = size;
5165 desc[3] = rsrc_word3;
5166 }
5167
5168 desc_va = radv_buffer_get_va(cmd_buffer->upload.upload_bo);
5169 desc_va += so_offset;
5170
5171 radv_emit_streamout_buffers(cmd_buffer, desc_va);
5172 }
5173
5174 cmd_buffer->state.dirty &= ~RADV_CMD_DIRTY_STREAMOUT_BUFFER;
5175 }
5176
5177 static void
5178 radv_flush_shader_query_state_gfx(struct radv_cmd_buffer *cmd_buffer)
5179 {
5180 const struct radv_shader *last_vgt_shader = cmd_buffer->state.last_vgt_shader;
5181 const struct radv_userdata_info *loc = radv_get_user_sgpr(last_vgt_shader, AC_UD_SHADER_QUERY_STATE);
5182 enum radv_shader_query_state shader_query_state = radv_shader_query_none;
5183 uint32_t base_reg;
5184
5185 if (loc->sgpr_idx == -1)
5186 return;
5187
5188 assert(last_vgt_shader->info.is_ngg || last_vgt_shader->info.stage == MESA_SHADER_GEOMETRY);
5189
5190 /* By default, shader queries are disabled. They are enabled if the command buffer has active GDS
5191 * queries, or if it's a secondary command buffer that inherits the number of generated
5192 * primitives.
5193 */
5194 if (cmd_buffer->state.active_pipeline_gds_queries ||
5195 (cmd_buffer->state.inherited_pipeline_statistics &
5196 (VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_PRIMITIVES_BIT |
5197 VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_INVOCATIONS_BIT)) ||
5198 (cmd_buffer->device->physical_device->emulate_mesh_shader_queries &&
5199 (cmd_buffer->state.inherited_pipeline_statistics & VK_QUERY_PIPELINE_STATISTIC_MESH_SHADER_INVOCATIONS_BIT_EXT)))
5200 shader_query_state |= radv_shader_query_pipeline_stat;
5201
5202 if (cmd_buffer->state.active_prims_gen_gds_queries)
5203 shader_query_state |= radv_shader_query_prim_gen;
5204
5205 if (cmd_buffer->state.active_prims_xfb_gds_queries && radv_is_streamout_enabled(cmd_buffer)) {
5206 shader_query_state |= radv_shader_query_prim_xfb | radv_shader_query_prim_gen;
5207 }
5208
5209 base_reg = last_vgt_shader->info.user_data_0;
5210 assert(loc->sgpr_idx != -1);
5211
5212 radeon_set_sh_reg(cmd_buffer->cs, base_reg + loc->sgpr_idx * 4, shader_query_state);
5213 }
5214
5215 static void
5216 radv_flush_shader_query_state_ace(struct radv_cmd_buffer *cmd_buffer, struct radv_shader *task_shader)
5217 {
5218 const struct radv_userdata_info *loc = radv_get_user_sgpr(task_shader, AC_UD_SHADER_QUERY_STATE);
5219 enum radv_shader_query_state shader_query_state = radv_shader_query_none;
5220 uint32_t base_reg;
5221
5222 if (loc->sgpr_idx == -1)
5223 return;
5224
5225 /* By default, shader queries are disabled. They are enabled if the command buffer has active ACE
5226 * queries, or if it's a secondary command buffer that inherits the number of task shader
5227 * invocations.
5228 */
5229 if (cmd_buffer->state.active_pipeline_ace_queries ||
5230 (cmd_buffer->state.inherited_pipeline_statistics & VK_QUERY_PIPELINE_STATISTIC_TASK_SHADER_INVOCATIONS_BIT_EXT))
5231 shader_query_state |= radv_shader_query_pipeline_stat;
5232
5233 base_reg = task_shader->info.user_data_0;
5234 assert(loc->sgpr_idx != -1);
5235
5236 radeon_set_sh_reg(cmd_buffer->gang.cs, base_reg + loc->sgpr_idx * 4, shader_query_state);
5237 }
5238
5239 static void
5240 radv_flush_shader_query_state(struct radv_cmd_buffer *cmd_buffer)
5241 {
5242 radv_flush_shader_query_state_gfx(cmd_buffer);
5243
5244 if (radv_cmdbuf_has_stage(cmd_buffer, MESA_SHADER_TASK) &&
5245 cmd_buffer->device->physical_device->emulate_mesh_shader_queries)
5246 radv_flush_shader_query_state_ace(cmd_buffer, cmd_buffer->state.shaders[MESA_SHADER_TASK]);
5247
5248 cmd_buffer->state.dirty &= ~RADV_CMD_DIRTY_SHADER_QUERY;
5249 }
5250
5251 static void
5252 radv_flush_force_vrs_state(struct radv_cmd_buffer *cmd_buffer)
5253 {
5254 const struct radv_shader *last_vgt_shader = cmd_buffer->state.last_vgt_shader;
5255
5256 if (!last_vgt_shader->info.force_vrs_per_vertex) {
5257 /* Un-set the SGPR index so we know to re-emit it later. */
5258 cmd_buffer->state.last_vrs_rates_sgpr_idx = -1;
5259 return;
5260 }
5261
5262 const struct radv_userdata_info *loc;
5263 uint32_t base_reg;
5264
5265 if (cmd_buffer->state.gs_copy_shader) {
5266 loc = &cmd_buffer->state.gs_copy_shader->info.user_sgprs_locs.shader_data[AC_UD_FORCE_VRS_RATES];
5267 base_reg = R_00B130_SPI_SHADER_USER_DATA_VS_0;
5268 } else {
5269 loc = radv_get_user_sgpr(last_vgt_shader, AC_UD_FORCE_VRS_RATES);
5270 base_reg = last_vgt_shader->info.user_data_0;
5271 }
5272
5273 assert(loc->sgpr_idx != -1);
5274
5275 enum amd_gfx_level gfx_level = cmd_buffer->device->physical_device->rad_info.gfx_level;
5276 uint32_t vrs_rates = 0;
5277
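   /* Note (illustrative, derived from the cases below): on GFX10.3 and earlier the rate is packed
    * directly into the SGPR value, with bit 2 selecting a 2x horizontal rate and bit 4 a 2x vertical
    * rate; GFX11 uses the VRS_SHADING_RATE_* encoding instead.
    */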
5278 switch (cmd_buffer->device->force_vrs) {
5279 case RADV_FORCE_VRS_2x2:
5280 vrs_rates = gfx_level >= GFX11 ? V_0283D0_VRS_SHADING_RATE_2X2 : (1u << 2) | (1u << 4);
5281 break;
5282 case RADV_FORCE_VRS_2x1:
5283 vrs_rates = gfx_level >= GFX11 ? V_0283D0_VRS_SHADING_RATE_2X1 : (1u << 2) | (0u << 4);
5284 break;
5285 case RADV_FORCE_VRS_1x2:
5286 vrs_rates = gfx_level >= GFX11 ? V_0283D0_VRS_SHADING_RATE_1X2 : (0u << 2) | (1u << 4);
5287 break;
5288 default:
5289 break;
5290 }
5291
5292 if (cmd_buffer->state.last_vrs_rates != vrs_rates || cmd_buffer->state.last_vrs_rates_sgpr_idx != loc->sgpr_idx) {
5293 radeon_set_sh_reg(cmd_buffer->cs, base_reg + loc->sgpr_idx * 4, vrs_rates);
5294 }
5295
5296 cmd_buffer->state.last_vrs_rates = vrs_rates;
5297 cmd_buffer->state.last_vrs_rates_sgpr_idx = loc->sgpr_idx;
5298 }
5299
5300 static void
5301 radv_upload_graphics_shader_descriptors(struct radv_cmd_buffer *cmd_buffer)
5302 {
5303 if (cmd_buffer->state.dirty & RADV_CMD_DIRTY_VERTEX_BUFFER)
5304 radv_flush_vertex_descriptors(cmd_buffer);
5305
5306 radv_flush_streamout_descriptors(cmd_buffer);
5307
5308 VkShaderStageFlags stages = VK_SHADER_STAGE_ALL_GRAPHICS;
5309 radv_flush_descriptors(cmd_buffer, stages, VK_PIPELINE_BIND_POINT_GRAPHICS);
5310
5311 const VkShaderStageFlags pc_stages = radv_must_flush_constants(cmd_buffer, stages, VK_PIPELINE_BIND_POINT_GRAPHICS);
5312 if (pc_stages)
5313 radv_flush_constants(cmd_buffer, pc_stages, VK_PIPELINE_BIND_POINT_GRAPHICS);
5314
5315 radv_flush_force_vrs_state(cmd_buffer);
5316 }
5317
5318 struct radv_draw_info {
5319 /**
5320 * Number of vertices.
5321 */
5322 uint32_t count;
5323
5324 /**
5325 * First instance id.
5326 */
5327 uint32_t first_instance;
5328
5329 /**
5330 * Number of instances.
5331 */
5332 uint32_t instance_count;
5333
5334 /**
5335 * Whether it's an indexed draw.
5336 */
5337 bool indexed;
5338
5339 /**
5340 * Indirect draw parameters resource.
5341 */
5342 struct radv_buffer *indirect;
5343 uint64_t indirect_offset;
5344 uint32_t stride;
5345
5346 /**
5347 * Draw count parameters resource.
5348 */
5349 struct radv_buffer *count_buffer;
5350 uint64_t count_buffer_offset;
5351
5352 /**
5353 * Stream output parameters resource.
5354 */
5355 struct radv_buffer *strmout_buffer;
5356 uint64_t strmout_buffer_offset;
5357 };
5358
5359 static void
5360 radv_emit_ia_multi_vgt_param(struct radv_cmd_buffer *cmd_buffer, bool instanced_draw, bool indirect_draw,
5361 bool count_from_stream_output, uint32_t draw_vertex_count)
5362 {
5363 const struct radeon_info *info = &cmd_buffer->device->physical_device->rad_info;
5364 struct radv_cmd_state *state = &cmd_buffer->state;
5365 const unsigned patch_control_points = state->dynamic.vk.ts.patch_control_points;
5366 const unsigned topology = state->dynamic.vk.ia.primitive_topology;
5367 const bool prim_restart_enable = state->dynamic.vk.ia.primitive_restart_enable;
5368 struct radeon_cmdbuf *cs = cmd_buffer->cs;
5369 unsigned ia_multi_vgt_param;
5370
5371 ia_multi_vgt_param = radv_get_ia_multi_vgt_param(cmd_buffer, instanced_draw, indirect_draw, count_from_stream_output,
5372 draw_vertex_count, topology, prim_restart_enable,
5373 patch_control_points, state->tess_num_patches);
5374
5375 if (state->last_ia_multi_vgt_param != ia_multi_vgt_param) {
5376 if (info->gfx_level == GFX9) {
5377 radeon_set_uconfig_reg_idx(cmd_buffer->device->physical_device, cs, R_030960_IA_MULTI_VGT_PARAM, 4,
5378 ia_multi_vgt_param);
5379 } else if (info->gfx_level >= GFX7) {
5380 radeon_set_context_reg_idx(cs, R_028AA8_IA_MULTI_VGT_PARAM, 1, ia_multi_vgt_param);
5381 } else {
5382 radeon_set_context_reg(cs, R_028AA8_IA_MULTI_VGT_PARAM, ia_multi_vgt_param);
5383 }
5384 state->last_ia_multi_vgt_param = ia_multi_vgt_param;
5385 }
5386 }
5387
5388 static void
5389 gfx10_emit_ge_cntl(struct radv_cmd_buffer *cmd_buffer)
5390 {
5391 const struct radv_shader *last_vgt_shader = cmd_buffer->state.last_vgt_shader;
5392 struct radv_cmd_state *state = &cmd_buffer->state;
5393 bool break_wave_at_eoi = false;
5394 unsigned primgroup_size;
5395 unsigned ge_cntl;
5396
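   /* For NGG, GE_CNTL is presumably programmed elsewhere as part of the NGG state; only the
    * legacy (non-NGG) path is handled below.
    */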
5397 if (last_vgt_shader->info.is_ngg)
5398 return;
5399
5400 if (radv_cmdbuf_has_stage(cmd_buffer, MESA_SHADER_TESS_CTRL)) {
5401 primgroup_size = state->tess_num_patches;
5402
5403 if (cmd_buffer->state.shaders[MESA_SHADER_TESS_CTRL]->info.uses_prim_id ||
5404 radv_get_shader(cmd_buffer->state.shaders, MESA_SHADER_TESS_EVAL)->info.uses_prim_id) {
5405 break_wave_at_eoi = true;
5406 }
5407 } else if (radv_cmdbuf_has_stage(cmd_buffer, MESA_SHADER_GEOMETRY)) {
5408 const struct radv_legacy_gs_info *gs_state = &cmd_buffer->state.shaders[MESA_SHADER_GEOMETRY]->info.gs_ring_info;
5409 primgroup_size = G_028A44_GS_PRIMS_PER_SUBGRP(gs_state->vgt_gs_onchip_cntl);
5410 } else {
5411 primgroup_size = 128; /* recommended without a GS and tess */
5412 }
5413
5414 ge_cntl = S_03096C_PRIM_GRP_SIZE_GFX10(primgroup_size) | S_03096C_VERT_GRP_SIZE(256) | /* disable vertex grouping */
5415 S_03096C_PACKET_TO_ONE_PA(0) /* this should only be set if LINE_STIPPLE_TEX_ENA == 1 */ |
5416 S_03096C_BREAK_WAVE_AT_EOI(break_wave_at_eoi);
5417
5418 if (state->last_ge_cntl != ge_cntl) {
5419 radeon_set_uconfig_reg(cmd_buffer->cs, R_03096C_GE_CNTL, ge_cntl);
5420 state->last_ge_cntl = ge_cntl;
5421 }
5422 }
5423
5424 static void
5425 radv_emit_draw_registers(struct radv_cmd_buffer *cmd_buffer, const struct radv_draw_info *draw_info)
5426 {
5427 const struct radeon_info *info = &cmd_buffer->device->physical_device->rad_info;
5428 struct radv_cmd_state *state = &cmd_buffer->state;
5429 struct radeon_cmdbuf *cs = cmd_buffer->cs;
5430 uint32_t topology = state->dynamic.vk.ia.primitive_topology;
5431 bool disable_instance_packing = false;
5432
5433 /* Draw state. */
5434 if (info->gfx_level >= GFX10) {
5435 gfx10_emit_ge_cntl(cmd_buffer);
5436 } else {
5437 radv_emit_ia_multi_vgt_param(cmd_buffer, draw_info->instance_count > 1, draw_info->indirect,
5438 !!draw_info->strmout_buffer, draw_info->indirect ? 0 : draw_info->count);
5439 }
5440
5441 /* RDNA2 is affected by a hardware bug: when instance packing is enabled for adjacent primitive
5442 * topologies and instance_count > 1, the pipeline stats generated by the GE are incorrect. The
5443 * workaround needs to be applied to both indexed and non-indexed draws.
5444 */
5445 if (info->gfx_level == GFX10_3 && state->active_pipeline_queries > 0 &&
5446 (draw_info->instance_count > 1 || draw_info->indirect) &&
5447 (topology == V_008958_DI_PT_LINELIST_ADJ || topology == V_008958_DI_PT_LINESTRIP_ADJ ||
5448 topology == V_008958_DI_PT_TRILIST_ADJ || topology == V_008958_DI_PT_TRISTRIP_ADJ)) {
5449 disable_instance_packing = true;
5450 }
5451
5452 if ((draw_info->indexed && state->index_type != state->last_index_type) ||
5453 (info->gfx_level == GFX10_3 &&
5454 (state->last_index_type == -1 ||
5455 disable_instance_packing != G_028A7C_DISABLE_INSTANCE_PACKING(state->last_index_type)))) {
5456 uint32_t index_type = state->index_type | S_028A7C_DISABLE_INSTANCE_PACKING(disable_instance_packing);
5457
5458 if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX9) {
5459 radeon_set_uconfig_reg_idx(cmd_buffer->device->physical_device, cs, R_03090C_VGT_INDEX_TYPE, 2, index_type);
5460 } else {
5461 radeon_emit(cs, PKT3(PKT3_INDEX_TYPE, 0, 0));
5462 radeon_emit(cs, index_type);
5463 }
5464
5465 state->last_index_type = index_type;
5466 }
5467 }
5468
5469 static void
5470 radv_stage_flush(struct radv_cmd_buffer *cmd_buffer, VkPipelineStageFlags2 src_stage_mask)
5471 {
5472 /* For simplicity, if the barrier wants to wait for the task shader,
5473 * just make it wait for the mesh shader too.
5474 */
5475 if (src_stage_mask & VK_PIPELINE_STAGE_2_TASK_SHADER_BIT_EXT)
5476 src_stage_mask |= VK_PIPELINE_STAGE_2_MESH_SHADER_BIT_EXT;
5477
5478 if (src_stage_mask & (VK_PIPELINE_STAGE_2_COPY_BIT | VK_PIPELINE_STAGE_2_RESOLVE_BIT | VK_PIPELINE_STAGE_2_BLIT_BIT |
5479 VK_PIPELINE_STAGE_2_CLEAR_BIT)) {
5480 /* Be conservative for now. */
5481 src_stage_mask |= VK_PIPELINE_STAGE_2_ALL_TRANSFER_BIT;
5482 }
5483
5484 if (src_stage_mask &
5485 (VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT | VK_PIPELINE_STAGE_2_ALL_TRANSFER_BIT |
5486 VK_PIPELINE_STAGE_2_COMMAND_PREPROCESS_BIT_NV | VK_PIPELINE_STAGE_2_ACCELERATION_STRUCTURE_BUILD_BIT_KHR |
5487 VK_PIPELINE_STAGE_2_ACCELERATION_STRUCTURE_COPY_BIT_KHR | VK_PIPELINE_STAGE_2_RAY_TRACING_SHADER_BIT_KHR |
5488 VK_PIPELINE_STAGE_2_BOTTOM_OF_PIPE_BIT | VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT)) {
5489 cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_CS_PARTIAL_FLUSH;
5490 }
5491
5492 if (src_stage_mask & (VK_PIPELINE_STAGE_2_FRAGMENT_SHADER_BIT | VK_PIPELINE_STAGE_2_EARLY_FRAGMENT_TESTS_BIT |
5493 VK_PIPELINE_STAGE_2_LATE_FRAGMENT_TESTS_BIT | VK_PIPELINE_STAGE_2_COLOR_ATTACHMENT_OUTPUT_BIT |
5494 VK_PIPELINE_STAGE_2_ALL_TRANSFER_BIT | VK_PIPELINE_STAGE_2_BOTTOM_OF_PIPE_BIT |
5495 VK_PIPELINE_STAGE_2_ALL_GRAPHICS_BIT | VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT)) {
5496 cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_PS_PARTIAL_FLUSH;
5497 } else if (src_stage_mask &
5498 (VK_PIPELINE_STAGE_2_DRAW_INDIRECT_BIT | VK_PIPELINE_STAGE_2_VERTEX_INPUT_BIT |
5499 VK_PIPELINE_STAGE_2_VERTEX_SHADER_BIT | VK_PIPELINE_STAGE_2_TESSELLATION_CONTROL_SHADER_BIT |
5500 VK_PIPELINE_STAGE_2_TESSELLATION_EVALUATION_SHADER_BIT | VK_PIPELINE_STAGE_2_GEOMETRY_SHADER_BIT |
5501 VK_PIPELINE_STAGE_2_MESH_SHADER_BIT_EXT | VK_PIPELINE_STAGE_2_TRANSFORM_FEEDBACK_BIT_EXT |
5502 VK_PIPELINE_STAGE_2_PRE_RASTERIZATION_SHADERS_BIT)) {
5503 cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_VS_PARTIAL_FLUSH;
5504 }
5505 }
5506
5507 static bool
5508 can_skip_buffer_l2_flushes(struct radv_device *device)
5509 {
5510 return device->physical_device->rad_info.gfx_level == GFX9 ||
5511 (device->physical_device->rad_info.gfx_level >= GFX10 &&
5512 !device->physical_device->rad_info.tcc_rb_non_coherent);
5513 }
5514
5515 /*
5516 * In Vulkan, barriers have two kinds of operations:
5517 *
5518 * - visibility (implemented with radv_src_access_flush)
5519 * - availability (implemented with radv_dst_access_flush)
5520 *
5521 * For a memory operation to observe the result of a previous memory operation,
5522 * one needs to do a visibility operation from the source memory and then an
5523 * availability operation to the target memory.
5524 *
5525 * The complication is the availability and visibility operations do not need to
5526 * be in the same barrier.
5527 *
5528 * The cleanest way to implement this is to define the visibility operation to
5529 * bring the caches to a "state of rest", in which none of the caches below that
5530 * level are dirty.
5531 *
5532 * For GFX8 and earlier this would be VRAM/GTT with none of the caches dirty.
5533 *
5534 * For GFX9+ we can define the state at rest to be L2 instead of VRAM for all
5535 * buffers and for images marked as coherent, and VRAM/GTT for non-coherent
5536 * images. However, given the existence of memory barriers which do not specify
5537 * the image/buffer it often devolves to just VRAM/GTT anyway.
5538 *
5539 * To help reduce the invalidations for GPUs that have L2 coherency between the
5540 * RB and the shader caches, we always invalidate L2 on the src side, as we can
5541 * use our knowledge of past usage to optimize flushes away.
5542 */
5543
5544 enum radv_cmd_flush_bits
5545 radv_src_access_flush(struct radv_cmd_buffer *cmd_buffer, VkAccessFlags2 src_flags, const struct radv_image *image)
5546 {
5547 bool has_CB_meta = true, has_DB_meta = true;
5548 bool image_is_coherent = image ? image->l2_coherent : false;
5549 enum radv_cmd_flush_bits flush_bits = 0;
5550
5551 if (image) {
5552 if (!radv_image_has_CB_metadata(image))
5553 has_CB_meta = false;
5554 if (!radv_image_has_htile(image))
5555 has_DB_meta = false;
5556 }
5557
5558 u_foreach_bit64 (b, src_flags) {
5559 switch ((VkAccessFlags2)BITFIELD64_BIT(b)) {
5560 case VK_ACCESS_2_COMMAND_PREPROCESS_WRITE_BIT_NV:
5561 flush_bits |= RADV_CMD_FLAG_INV_L2;
5562 break;
5563 case VK_ACCESS_2_SHADER_WRITE_BIT:
5564 case VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT:
5565 /* Since the STORAGE bit isn't set, we know that this is a meta operation.
5566 * On the dst flush side we skip CB/DB flushes without the STORAGE bit, so
5567 * set it here. */
5568 if (image && !(image->vk.usage & VK_IMAGE_USAGE_STORAGE_BIT)) {
5569 if (vk_format_is_depth_or_stencil(image->vk.format)) {
5570 flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB;
5571 } else {
5572 flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB;
5573 }
5574 }
5575
5576 if (!image_is_coherent)
5577 flush_bits |= RADV_CMD_FLAG_INV_L2;
5578 break;
5579 case VK_ACCESS_2_ACCELERATION_STRUCTURE_WRITE_BIT_KHR:
5580 case VK_ACCESS_2_TRANSFORM_FEEDBACK_WRITE_BIT_EXT:
5581 case VK_ACCESS_2_TRANSFORM_FEEDBACK_COUNTER_WRITE_BIT_EXT:
5582 if (!image_is_coherent)
5583 flush_bits |= RADV_CMD_FLAG_WB_L2;
5584 break;
5585 case VK_ACCESS_2_COLOR_ATTACHMENT_WRITE_BIT:
5586 flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB;
5587 if (has_CB_meta)
5588 flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB_META;
5589 break;
5590 case VK_ACCESS_2_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT:
5591 flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB;
5592 if (has_DB_meta)
5593 flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB_META;
5594 break;
5595 case VK_ACCESS_2_TRANSFER_WRITE_BIT:
5596 flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB | RADV_CMD_FLAG_FLUSH_AND_INV_DB;
5597
5598 if (!image_is_coherent)
5599 flush_bits |= RADV_CMD_FLAG_INV_L2;
5600 if (has_CB_meta)
5601 flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB_META;
5602 if (has_DB_meta)
5603 flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB_META;
5604 break;
5605 case VK_ACCESS_2_MEMORY_WRITE_BIT:
5606 flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB | RADV_CMD_FLAG_FLUSH_AND_INV_DB;
5607
5608 if (!image_is_coherent)
5609 flush_bits |= RADV_CMD_FLAG_INV_L2;
5610 if (has_CB_meta)
5611 flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB_META;
5612 if (has_DB_meta)
5613 flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB_META;
5614 break;
5615 default:
5616 break;
5617 }
5618 }
5619 return flush_bits;
5620 }
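
/* Illustrative example, derived from the cases above: a source access of
 * VK_ACCESS_2_COLOR_ATTACHMENT_WRITE_BIT on an image with CB metadata yields
 * RADV_CMD_FLAG_FLUSH_AND_INV_CB | RADV_CMD_FLAG_FLUSH_AND_INV_CB_META.
 */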
5621
5622 enum radv_cmd_flush_bits
5623 radv_dst_access_flush(struct radv_cmd_buffer *cmd_buffer, VkAccessFlags2 dst_flags, const struct radv_image *image)
5624 {
5625 bool has_CB_meta = true, has_DB_meta = true;
5626 enum radv_cmd_flush_bits flush_bits = 0;
5627 bool flush_CB = true, flush_DB = true;
5628 bool image_is_coherent = image ? image->l2_coherent : false;
5629
5630 if (image) {
5631 if (!(image->vk.usage & VK_IMAGE_USAGE_STORAGE_BIT)) {
5632 flush_CB = false;
5633 flush_DB = false;
5634 }
5635
5636 if (!radv_image_has_CB_metadata(image))
5637 has_CB_meta = false;
5638 if (!radv_image_has_htile(image))
5639 has_DB_meta = false;
5640 }
5641
5642 /* None of the L2 invalidations below apply to the CB/DB. So if there are no incoherent images
5643 * in the L2 cache in CB/DB mode, they are already usable from all the other L2 clients. */
5644 image_is_coherent |= can_skip_buffer_l2_flushes(cmd_buffer->device) && !cmd_buffer->state.rb_noncoherent_dirty;
5645
5646 u_foreach_bit64 (b, dst_flags) {
5647 switch ((VkAccessFlags2)BITFIELD64_BIT(b)) {
5648 case VK_ACCESS_2_INDIRECT_COMMAND_READ_BIT:
5649 /* SMEM loads are used to read the compute dispatch size in shaders. */
5650 if (!cmd_buffer->device->load_grid_size_from_user_sgpr)
5651 flush_bits |= RADV_CMD_FLAG_INV_SCACHE;
5652
5653 /* Ensure the DGC meta shader can read the commands. */
5654 if (radv_uses_device_generated_commands(cmd_buffer->device)) {
5655 flush_bits |= RADV_CMD_FLAG_INV_SCACHE | RADV_CMD_FLAG_INV_VCACHE;
5656
5657 if (cmd_buffer->device->physical_device->rad_info.gfx_level < GFX9)
5658 flush_bits |= RADV_CMD_FLAG_INV_L2;
5659 }
5660
5661 break;
5662 case VK_ACCESS_2_INDEX_READ_BIT:
5663 case VK_ACCESS_2_TRANSFORM_FEEDBACK_COUNTER_WRITE_BIT_EXT:
5664 break;
5665 case VK_ACCESS_2_UNIFORM_READ_BIT:
5666 flush_bits |= RADV_CMD_FLAG_INV_VCACHE | RADV_CMD_FLAG_INV_SCACHE;
5667 break;
5668 case VK_ACCESS_2_VERTEX_ATTRIBUTE_READ_BIT:
5669 case VK_ACCESS_2_INPUT_ATTACHMENT_READ_BIT:
5670 case VK_ACCESS_2_TRANSFER_READ_BIT:
5671 case VK_ACCESS_2_TRANSFER_WRITE_BIT:
5672 flush_bits |= RADV_CMD_FLAG_INV_VCACHE;
5673
5674 if (has_CB_meta || has_DB_meta)
5675 flush_bits |= RADV_CMD_FLAG_INV_L2_METADATA;
5676 if (!image_is_coherent)
5677 flush_bits |= RADV_CMD_FLAG_INV_L2;
5678 break;
5679 case VK_ACCESS_2_DESCRIPTOR_BUFFER_READ_BIT_EXT:
5680 flush_bits |= RADV_CMD_FLAG_INV_SCACHE;
5681 break;
5682 case VK_ACCESS_2_SHADER_BINDING_TABLE_READ_BIT_KHR:
5683 case VK_ACCESS_2_SHADER_READ_BIT:
5684 case VK_ACCESS_2_SHADER_STORAGE_READ_BIT:
5685 /* Unlike LLVM, ACO uses SMEM for SSBOs and we have to
5686 * invalidate the scalar cache. */
5687 if (!cmd_buffer->device->physical_device->use_llvm && !image)
5688 flush_bits |= RADV_CMD_FLAG_INV_SCACHE;
5689 FALLTHROUGH;
5690 case VK_ACCESS_2_SHADER_SAMPLED_READ_BIT:
5691 flush_bits |= RADV_CMD_FLAG_INV_VCACHE;
5692 if (has_CB_meta || has_DB_meta)
5693 flush_bits |= RADV_CMD_FLAG_INV_L2_METADATA;
5694 if (!image_is_coherent)
5695 flush_bits |= RADV_CMD_FLAG_INV_L2;
5696 break;
5697 case VK_ACCESS_2_COMMAND_PREPROCESS_READ_BIT_NV:
5698 case VK_ACCESS_2_ACCELERATION_STRUCTURE_READ_BIT_KHR:
5699 flush_bits |= RADV_CMD_FLAG_INV_VCACHE;
5700 if (cmd_buffer->device->physical_device->rad_info.gfx_level < GFX9)
5701 flush_bits |= RADV_CMD_FLAG_INV_L2;
5702 break;
5703 case VK_ACCESS_2_SHADER_WRITE_BIT:
5704 case VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT:
5705 case VK_ACCESS_2_ACCELERATION_STRUCTURE_WRITE_BIT_KHR:
5706 break;
5707 case VK_ACCESS_2_COLOR_ATTACHMENT_READ_BIT:
5708 case VK_ACCESS_2_COLOR_ATTACHMENT_WRITE_BIT:
5709 if (flush_CB)
5710 flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB;
5711 if (has_CB_meta)
5712 flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB_META;
5713 break;
5714 case VK_ACCESS_2_DEPTH_STENCIL_ATTACHMENT_READ_BIT:
5715 case VK_ACCESS_2_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT:
5716 if (flush_DB)
5717 flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB;
5718 if (has_DB_meta)
5719 flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB_META;
5720 break;
5721 case VK_ACCESS_2_MEMORY_READ_BIT:
5722 case VK_ACCESS_2_MEMORY_WRITE_BIT:
5723 flush_bits |= RADV_CMD_FLAG_INV_VCACHE | RADV_CMD_FLAG_INV_SCACHE;
5724 if (!image_is_coherent)
5725 flush_bits |= RADV_CMD_FLAG_INV_L2;
5726 if (flush_CB)
5727 flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB;
5728 if (has_CB_meta)
5729 flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB_META;
5730 if (flush_DB)
5731 flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB;
5732 if (has_DB_meta)
5733 flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB_META;
5734 break;
5735 default:
5736 break;
5737 }
5738 }
5739 return flush_bits;
5740 }
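
/* Illustrative example, derived from the cases above: a destination access of
 * VK_ACCESS_2_UNIFORM_READ_BIT invalidates both the vector and scalar caches
 * (RADV_CMD_FLAG_INV_VCACHE | RADV_CMD_FLAG_INV_SCACHE).
 */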
5741
5742 void
5743 radv_emit_resolve_barrier(struct radv_cmd_buffer *cmd_buffer, const struct radv_resolve_barrier *barrier)
5744 {
5745 struct radv_rendering_state *render = &cmd_buffer->state.render;
5746
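   /* Make the source writes to every bound color/depth attachment visible, wait for the source
    * stages, then invalidate the destination caches for the same attachments.
    */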
5747 for (uint32_t i = 0; i < render->color_att_count; i++) {
5748 struct radv_image_view *iview = render->color_att[i].iview;
5749 if (!iview)
5750 continue;
5751
5752 cmd_buffer->state.flush_bits |= radv_src_access_flush(cmd_buffer, barrier->src_access_mask, iview->image);
5753 }
5754 if (render->ds_att.iview) {
5755 cmd_buffer->state.flush_bits |=
5756 radv_src_access_flush(cmd_buffer, barrier->src_access_mask, render->ds_att.iview->image);
5757 }
5758
5759 radv_stage_flush(cmd_buffer, barrier->src_stage_mask);
5760
5761 for (uint32_t i = 0; i < render->color_att_count; i++) {
5762 struct radv_image_view *iview = render->color_att[i].iview;
5763 if (!iview)
5764 continue;
5765
5766 cmd_buffer->state.flush_bits |= radv_dst_access_flush(cmd_buffer, barrier->dst_access_mask, iview->image);
5767 }
5768 if (render->ds_att.iview) {
5769 cmd_buffer->state.flush_bits |=
5770 radv_dst_access_flush(cmd_buffer, barrier->dst_access_mask, render->ds_att.iview->image);
5771 }
5772
5773 radv_gang_barrier(cmd_buffer, barrier->src_stage_mask, barrier->dst_stage_mask);
5774 }
5775
5776 static void
5777 radv_handle_image_transition_separate(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image,
5778 VkImageLayout src_layout, VkImageLayout dst_layout,
5779 VkImageLayout src_stencil_layout, VkImageLayout dst_stencil_layout,
5780 uint32_t src_family_index, uint32_t dst_family_index,
5781 const VkImageSubresourceRange *range,
5782 struct radv_sample_locations_state *sample_locs)
5783 {
5784 /* If we have a stencil layout that's different from depth, we need to
5785 * perform the stencil transition separately.
5786 */
5787 if ((range->aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT) &&
5788 (src_layout != src_stencil_layout || dst_layout != dst_stencil_layout)) {
5789 VkImageSubresourceRange aspect_range = *range;
5790 /* Depth-only transitions. */
5791 if (range->aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT) {
5792 aspect_range.aspectMask = VK_IMAGE_ASPECT_DEPTH_BIT;
5793 radv_handle_image_transition(cmd_buffer, image, src_layout, dst_layout, src_family_index, dst_family_index,
5794 &aspect_range, sample_locs);
5795 }
5796
5797 /* Stencil-only transitions. */
5798 aspect_range.aspectMask = VK_IMAGE_ASPECT_STENCIL_BIT;
5799 radv_handle_image_transition(cmd_buffer, image, src_stencil_layout, dst_stencil_layout, src_family_index,
5800 dst_family_index, &aspect_range, sample_locs);
5801 } else {
5802 radv_handle_image_transition(cmd_buffer, image, src_layout, dst_layout, src_family_index, dst_family_index, range,
5803 sample_locs);
5804 }
5805 }
5806
5807 static void
5808 radv_handle_rendering_image_transition(struct radv_cmd_buffer *cmd_buffer, struct radv_image_view *view,
5809 uint32_t layer_count, uint32_t view_mask, VkImageLayout initial_layout,
5810 VkImageLayout initial_stencil_layout, VkImageLayout final_layout,
5811 VkImageLayout final_stencil_layout,
5812 struct radv_sample_locations_state *sample_locs)
5813 {
5814 VkImageSubresourceRange range;
5815 range.aspectMask = view->image->vk.aspects;
5816 range.baseMipLevel = view->vk.base_mip_level;
5817 range.levelCount = 1;
5818
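   /* With multiview, transition each contiguous range of layers covered by the view mask;
    * otherwise transition the bound layer range in one go.
    */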
5819 if (view_mask) {
5820 while (view_mask) {
5821 int start, count;
5822 u_bit_scan_consecutive_range(&view_mask, &start, &count);
5823
5824 range.baseArrayLayer = view->vk.base_array_layer + start;
5825 range.layerCount = count;
5826
5827 radv_handle_image_transition_separate(cmd_buffer, view->image, initial_layout, final_layout,
5828 initial_stencil_layout, final_stencil_layout, 0, 0, &range, sample_locs);
5829 }
5830 } else {
5831 range.baseArrayLayer = view->vk.base_array_layer;
5832 range.layerCount = layer_count;
5833 radv_handle_image_transition_separate(cmd_buffer, view->image, initial_layout, final_layout,
5834 initial_stencil_layout, final_stencil_layout, 0, 0, &range, sample_locs);
5835 }
5836 }
5837
5838 VKAPI_ATTR VkResult VKAPI_CALL
5839 radv_BeginCommandBuffer(VkCommandBuffer commandBuffer, const VkCommandBufferBeginInfo *pBeginInfo)
5840 {
5841 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5842 VkResult result = VK_SUCCESS;
5843
5844 vk_command_buffer_begin(&cmd_buffer->vk, pBeginInfo);
5845
5846 if (cmd_buffer->qf == RADV_QUEUE_SPARSE)
5847 return result;
5848
5849 memset(&cmd_buffer->state, 0, sizeof(cmd_buffer->state));
5850 cmd_buffer->state.last_index_type = -1;
5851 cmd_buffer->state.last_num_instances = -1;
5852 cmd_buffer->state.last_vertex_offset_valid = false;
5853 cmd_buffer->state.last_first_instance = -1;
5854 cmd_buffer->state.last_drawid = -1;
5855 cmd_buffer->state.last_subpass_color_count = MAX_RTS;
5856 cmd_buffer->state.predication_type = -1;
5857 cmd_buffer->state.last_sx_ps_downconvert = -1;
5858 cmd_buffer->state.last_sx_blend_opt_epsilon = -1;
5859 cmd_buffer->state.last_sx_blend_opt_control = -1;
5860 cmd_buffer->state.mesh_shading = false;
5861 cmd_buffer->state.last_vrs_rates = -1;
5862 cmd_buffer->state.last_vrs_rates_sgpr_idx = -1;
5863 cmd_buffer->state.last_pa_sc_binner_cntl_0 = -1;
5864 cmd_buffer->state.last_db_count_control = -1;
5865 cmd_buffer->state.last_db_shader_control = -1;
5866 cmd_buffer->usage_flags = pBeginInfo->flags;
5867
5868 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_ALL | RADV_CMD_DIRTY_GUARDBAND | RADV_CMD_DIRTY_OCCLUSION_QUERY |
5869 RADV_CMD_DIRTY_DB_SHADER_CONTROL;
5870
5871 if (cmd_buffer->qf == RADV_QUEUE_GENERAL)
5872 vk_dynamic_graphics_state_init(&cmd_buffer->state.dynamic.vk);
5873
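   /* Likely used to emulate inverted conditional rendering on the compute queue (MEC): allocate a
    * zero-initialized dword that mec_inv_pred_va points at.
    */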
5874 if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX7) {
5875 uint32_t pred_value = 0;
5876 uint32_t pred_offset;
5877 if (!radv_cmd_buffer_upload_data(cmd_buffer, 4, &pred_value, &pred_offset))
5878 vk_command_buffer_set_error(&cmd_buffer->vk, VK_ERROR_OUT_OF_HOST_MEMORY);
5879
5880 cmd_buffer->mec_inv_pred_emitted = false;
5881 cmd_buffer->mec_inv_pred_va = radv_buffer_get_va(cmd_buffer->upload.upload_bo) + pred_offset;
5882 }
5883
5884 if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX9 && cmd_buffer->qf == RADV_QUEUE_GENERAL) {
5885 unsigned num_db = cmd_buffer->device->physical_device->rad_info.max_render_backends;
5886 unsigned fence_offset, eop_bug_offset;
5887 void *fence_ptr;
5888
5889 radv_cmd_buffer_upload_alloc(cmd_buffer, 8, &fence_offset, &fence_ptr);
5890 memset(fence_ptr, 0, 8);
5891
5892 cmd_buffer->gfx9_fence_va = radv_buffer_get_va(cmd_buffer->upload.upload_bo);
5893 cmd_buffer->gfx9_fence_va += fence_offset;
5894
5895 radv_emit_clear_data(cmd_buffer, V_370_PFP, cmd_buffer->gfx9_fence_va, 8);
5896
5897 if (cmd_buffer->device->physical_device->rad_info.gfx_level == GFX9) {
5898 /* Allocate a buffer for the EOP bug on GFX9. */
5899 radv_cmd_buffer_upload_alloc(cmd_buffer, 16 * num_db, &eop_bug_offset, &fence_ptr);
5900 memset(fence_ptr, 0, 16 * num_db);
5901 cmd_buffer->gfx9_eop_bug_va = radv_buffer_get_va(cmd_buffer->upload.upload_bo);
5902 cmd_buffer->gfx9_eop_bug_va += eop_bug_offset;
5903
5904 radv_emit_clear_data(cmd_buffer, V_370_PFP, cmd_buffer->gfx9_eop_bug_va, 16 * num_db);
5905 }
5906 }
5907
5908 if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY &&
5909 (pBeginInfo->flags & VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT)) {
5910
5911 char gcbiar_data[VK_GCBIARR_DATA_SIZE(MAX_RTS)];
5912 const VkRenderingInfo *resume_info =
5913 vk_get_command_buffer_inheritance_as_rendering_resume(cmd_buffer->vk.level, pBeginInfo, gcbiar_data);
5914 if (resume_info) {
5915 radv_CmdBeginRendering(commandBuffer, resume_info);
5916 } else {
5917 const VkCommandBufferInheritanceRenderingInfo *inheritance_info =
5918 vk_get_command_buffer_inheritance_rendering_info(cmd_buffer->vk.level, pBeginInfo);
5919
5920 radv_cmd_buffer_reset_rendering(cmd_buffer);
5921 struct radv_rendering_state *render = &cmd_buffer->state.render;
5922 render->active = true;
5923 render->view_mask = inheritance_info->viewMask;
5924 render->max_samples = inheritance_info->rasterizationSamples;
5925 render->color_att_count = inheritance_info->colorAttachmentCount;
5926 for (uint32_t i = 0; i < render->color_att_count; i++) {
5927 render->color_att[i] = (struct radv_attachment){
5928 .format = inheritance_info->pColorAttachmentFormats[i],
5929 };
5930 }
5931 assert(inheritance_info->depthAttachmentFormat == VK_FORMAT_UNDEFINED ||
5932 inheritance_info->stencilAttachmentFormat == VK_FORMAT_UNDEFINED ||
5933 inheritance_info->depthAttachmentFormat == inheritance_info->stencilAttachmentFormat);
5934 render->ds_att = (struct radv_attachment){.iview = NULL};
5935 if (inheritance_info->depthAttachmentFormat != VK_FORMAT_UNDEFINED)
5936 render->ds_att.format = inheritance_info->depthAttachmentFormat;
5937 if (inheritance_info->stencilAttachmentFormat != VK_FORMAT_UNDEFINED)
5938 render->ds_att.format = inheritance_info->stencilAttachmentFormat;
5939
5940 if (vk_format_has_depth(render->ds_att.format))
5941 render->ds_att_aspects |= VK_IMAGE_ASPECT_DEPTH_BIT;
5942 if (vk_format_has_stencil(render->ds_att.format))
5943 render->ds_att_aspects |= VK_IMAGE_ASPECT_STENCIL_BIT;
5944 }
5945
5946 cmd_buffer->state.inherited_pipeline_statistics = pBeginInfo->pInheritanceInfo->pipelineStatistics;
5947
5948 if (cmd_buffer->state.inherited_pipeline_statistics &
5949 (VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_PRIMITIVES_BIT |
5950 VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_INVOCATIONS_BIT))
5951 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_SHADER_QUERY;
5952
5953 cmd_buffer->state.inherited_occlusion_queries = pBeginInfo->pInheritanceInfo->occlusionQueryEnable;
5954 cmd_buffer->state.inherited_query_control_flags = pBeginInfo->pInheritanceInfo->queryFlags;
5955 if (cmd_buffer->state.inherited_occlusion_queries)
5956 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_OCCLUSION_QUERY;
5957 }
5958
5959 if (radv_device_fault_detection_enabled(cmd_buffer->device))
5960 radv_cmd_buffer_trace_emit(cmd_buffer);
5961
5962 radv_describe_begin_cmd_buffer(cmd_buffer);
5963
5964 return result;
5965 }
5966
5967 VKAPI_ATTR void VKAPI_CALL
5968 radv_CmdBindVertexBuffers2(VkCommandBuffer commandBuffer, uint32_t firstBinding, uint32_t bindingCount,
5969 const VkBuffer *pBuffers, const VkDeviceSize *pOffsets, const VkDeviceSize *pSizes,
5970 const VkDeviceSize *pStrides)
5971 {
5972 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5973 struct radv_vertex_binding *vb = cmd_buffer->vertex_bindings;
5974 const struct radv_vs_input_state *state = &cmd_buffer->state.dynamic_vs_input;
5975
5976 /* We have to defer setting up vertex buffers since we need the buffer
5977 * stride from the pipeline. */
5978
5979 assert(firstBinding + bindingCount <= MAX_VBS);
5980 enum amd_gfx_level chip = cmd_buffer->device->physical_device->rad_info.gfx_level;
5981
5982 if (firstBinding + bindingCount > cmd_buffer->used_vertex_bindings)
5983 cmd_buffer->used_vertex_bindings = firstBinding + bindingCount;
5984
5985 uint32_t misaligned_mask_invalid = 0;
5986
5987 for (uint32_t i = 0; i < bindingCount; i++) {
5988 RADV_FROM_HANDLE(radv_buffer, buffer, pBuffers[i]);
5989 uint32_t idx = firstBinding + i;
5990 VkDeviceSize size = pSizes ? pSizes[i] : 0;
5991 /* If pStrides is NULL, it shouldn't overwrite the strides specified by CmdSetVertexInputEXT. */
5992 VkDeviceSize stride = pStrides ? pStrides[i] : vb[idx].stride;
5993
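      /* GFX6 and GFX10+ have vertex fetch alignment constraints; if the buffer presence or the low
       * offset/stride bits change, the cached per-binding alignment state is invalidated below and
       * possibly a different VS prolog is needed.
       */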
5994 if (!!cmd_buffer->vertex_binding_buffers[idx] != !!buffer ||
5995 (buffer && ((vb[idx].offset & 0x3) != (pOffsets[i] & 0x3) || (vb[idx].stride & 0x3) != (stride & 0x3)))) {
5996 misaligned_mask_invalid |= state->bindings_match_attrib ? BITFIELD_BIT(idx) : 0xffffffff;
5997 }
5998
5999 cmd_buffer->vertex_binding_buffers[idx] = buffer;
6000 vb[idx].offset = pOffsets[i];
6001 vb[idx].size = buffer ? vk_buffer_range(&buffer->vk, pOffsets[i], size) : size;
6002 vb[idx].stride = stride;
6003
6004 uint32_t bit = BITFIELD_BIT(idx);
6005 if (buffer) {
6006 radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, cmd_buffer->vertex_binding_buffers[idx]->bo);
6007 cmd_buffer->state.vbo_bound_mask |= bit;
6008 } else {
6009 cmd_buffer->state.vbo_bound_mask &= ~bit;
6010 }
6011 }
6012
6013 if ((chip == GFX6 || chip >= GFX10) && misaligned_mask_invalid) {
6014 cmd_buffer->state.vbo_misaligned_mask_invalid = misaligned_mask_invalid;
6015 cmd_buffer->state.vbo_misaligned_mask &= ~misaligned_mask_invalid;
6016 }
6017
6018 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_VERTEX_BUFFER | RADV_CMD_DIRTY_DYNAMIC_VERTEX_INPUT;
6019 }
6020
6021 static uint32_t
6022 vk_to_index_type(VkIndexType type)
6023 {
6024 switch (type) {
6025 case VK_INDEX_TYPE_UINT8_KHR:
6026 return V_028A7C_VGT_INDEX_8;
6027 case VK_INDEX_TYPE_UINT16:
6028 return V_028A7C_VGT_INDEX_16;
6029 case VK_INDEX_TYPE_UINT32:
6030 return V_028A7C_VGT_INDEX_32;
6031 default:
6032 unreachable("invalid index type");
6033 }
6034 }
6035
6036 uint32_t
6037 radv_get_vgt_index_size(uint32_t type)
6038 {
6039 uint32_t index_type = G_028A7C_INDEX_TYPE(type);
6040 switch (index_type) {
6041 case V_028A7C_VGT_INDEX_8:
6042 return 1;
6043 case V_028A7C_VGT_INDEX_16:
6044 return 2;
6045 case V_028A7C_VGT_INDEX_32:
6046 return 4;
6047 default:
6048 unreachable("invalid index type");
6049 }
6050 }
6051
6052 VKAPI_ATTR void VKAPI_CALL
6053 radv_CmdBindIndexBuffer2KHR(VkCommandBuffer commandBuffer, VkBuffer buffer, VkDeviceSize offset, VkDeviceSize size,
6054 VkIndexType indexType)
6055 {
6056 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
6057 RADV_FROM_HANDLE(radv_buffer, index_buffer, buffer);
6058
6059 cmd_buffer->state.index_type = vk_to_index_type(indexType);
6060
6061 if (index_buffer) {
6062 cmd_buffer->state.index_va = radv_buffer_get_va(index_buffer->bo);
6063 cmd_buffer->state.index_va += index_buffer->offset + offset;
6064
6065 int index_size = radv_get_vgt_index_size(vk_to_index_type(indexType));
6066 cmd_buffer->state.max_index_count = (vk_buffer_range(&index_buffer->vk, offset, size)) / index_size;
6067 radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, index_buffer->bo);
6068 } else {
6069 cmd_buffer->state.index_va = 0;
6070 cmd_buffer->state.max_index_count = 0;
6071 }
6072
6073 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_INDEX_BUFFER;
6074
6075 /* Primitive restart state depends on the index type. */
6076 if (cmd_buffer->state.dynamic.vk.ia.primitive_restart_enable)
6077 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_PRIMITIVE_RESTART_ENABLE;
6078 }
6079
6080 static void
6081 radv_bind_descriptor_set(struct radv_cmd_buffer *cmd_buffer, VkPipelineBindPoint bind_point,
6082 struct radv_descriptor_set *set, unsigned idx)
6083 {
6084 struct radeon_winsys *ws = cmd_buffer->device->ws;
6085
6086 radv_set_descriptor_set(cmd_buffer, bind_point, set, idx);
6087
6088 assert(set);
6089 assert(!(set->header.layout->flags & VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR));
6090
6091 if (!cmd_buffer->device->use_global_bo_list) {
6092 for (unsigned j = 0; j < set->header.buffer_count; ++j)
6093 if (set->descriptors[j])
6094 radv_cs_add_buffer(ws, cmd_buffer->cs, set->descriptors[j]);
6095 }
6096
6097 if (set->header.bo)
6098 radv_cs_add_buffer(ws, cmd_buffer->cs, set->header.bo);
6099 }
6100
6101 static void
6102 radv_bind_descriptor_sets(struct radv_cmd_buffer *cmd_buffer,
6103 const VkBindDescriptorSetsInfoKHR *pBindDescriptorSetsInfo, VkPipelineBindPoint bind_point)
6104 {
6105 RADV_FROM_HANDLE(radv_pipeline_layout, layout, pBindDescriptorSetsInfo->layout);
6106 const bool no_dynamic_bounds = cmd_buffer->device->instance->debug_flags & RADV_DEBUG_NO_DYNAMIC_BOUNDS;
6107 struct radv_descriptor_state *descriptors_state = radv_get_descriptors_state(cmd_buffer, bind_point);
6108 unsigned dyn_idx = 0;
6109
6110 for (unsigned i = 0; i < pBindDescriptorSetsInfo->descriptorSetCount; ++i) {
6111 unsigned set_idx = i + pBindDescriptorSetsInfo->firstSet;
6112 RADV_FROM_HANDLE(radv_descriptor_set, set, pBindDescriptorSetsInfo->pDescriptorSets[i]);
6113
6114 if (!set)
6115 continue;
6116
6117 /* If the set is already bound we only need to update the
6118 * (potentially changed) dynamic offsets. */
6119 if (descriptors_state->sets[set_idx] != set || !(descriptors_state->valid & (1u << set_idx))) {
6120 radv_bind_descriptor_set(cmd_buffer, bind_point, set, set_idx);
6121 }
6122
6123 for (unsigned j = 0; j < set->header.layout->dynamic_offset_count; ++j, ++dyn_idx) {
6124 unsigned idx = j + layout->set[i + pBindDescriptorSetsInfo->firstSet].dynamic_offset_start;
6125 uint32_t *dst = descriptors_state->dynamic_buffers + idx * 4;
6126 assert(dyn_idx < pBindDescriptorSetsInfo->dynamicOffsetCount);
6127
6128 struct radv_descriptor_range *range = set->header.dynamic_descriptors + j;
6129
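         /* Build the 4-dword buffer descriptor for this dynamic buffer: base address (lo/hi),
          * range, and the dword3 swizzle/format bits that differ per gfx level.
          */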
6130 if (!range->va) {
6131 memset(dst, 0, 4 * 4);
6132 } else {
6133 uint64_t va = range->va + pBindDescriptorSetsInfo->pDynamicOffsets[dyn_idx];
6134 dst[0] = va;
6135 dst[1] = S_008F04_BASE_ADDRESS_HI(va >> 32);
6136 dst[2] = no_dynamic_bounds ? 0xffffffffu : range->size;
6137 dst[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
6138 S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W);
6139
6140 if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX11) {
6141 dst[3] |= S_008F0C_FORMAT(V_008F0C_GFX11_FORMAT_32_FLOAT) | S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW);
6142 } else if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX10) {
6143 dst[3] |= S_008F0C_FORMAT(V_008F0C_GFX10_FORMAT_32_FLOAT) |
6144 S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) | S_008F0C_RESOURCE_LEVEL(1);
6145 } else {
6146 dst[3] |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
6147 S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
6148 }
6149 }
6150
6151 cmd_buffer->push_constant_stages |= set->header.layout->dynamic_shader_stages;
6152 }
6153 }
6154 }
6155
6156 VKAPI_ATTR void VKAPI_CALL
6157 radv_CmdBindDescriptorSets2KHR(VkCommandBuffer commandBuffer,
6158 const VkBindDescriptorSetsInfoKHR *pBindDescriptorSetsInfo)
6159 {
6160 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
6161
6162 if (pBindDescriptorSetsInfo->stageFlags & VK_SHADER_STAGE_COMPUTE_BIT) {
6163 radv_bind_descriptor_sets(cmd_buffer, pBindDescriptorSetsInfo, VK_PIPELINE_BIND_POINT_COMPUTE);
6164 }
6165
6166 if (pBindDescriptorSetsInfo->stageFlags & RADV_GRAPHICS_STAGE_BITS) {
6167 radv_bind_descriptor_sets(cmd_buffer, pBindDescriptorSetsInfo, VK_PIPELINE_BIND_POINT_GRAPHICS);
6168 }
6169
6170 if (pBindDescriptorSetsInfo->stageFlags & RADV_RT_STAGE_BITS) {
6171 radv_bind_descriptor_sets(cmd_buffer, pBindDescriptorSetsInfo, VK_PIPELINE_BIND_POINT_RAY_TRACING_KHR);
6172 }
6173 }
6174
6175 static bool
6176 radv_init_push_descriptor_set(struct radv_cmd_buffer *cmd_buffer, struct radv_descriptor_set *set,
6177 struct radv_descriptor_set_layout *layout, VkPipelineBindPoint bind_point)
6178 {
6179 struct radv_descriptor_state *descriptors_state = radv_get_descriptors_state(cmd_buffer, bind_point);
6180 set->header.size = layout->size;
6181
6182 if (set->header.layout != layout) {
6183 if (set->header.layout)
6184 vk_descriptor_set_layout_unref(&cmd_buffer->device->vk, &set->header.layout->vk);
6185 vk_descriptor_set_layout_ref(&layout->vk);
6186 set->header.layout = layout;
6187 }
6188
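   /* Grow the host-side storage for the push set geometrically (at least 1024 bytes), capped at
    * 96 bytes per descriptor times MAX_PUSH_DESCRIPTORS.
    */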
6189 if (descriptors_state->push_set.capacity < set->header.size) {
6190 size_t new_size = MAX2(set->header.size, 1024);
6191 new_size = MAX2(new_size, 2 * descriptors_state->push_set.capacity);
6192 new_size = MIN2(new_size, 96 * MAX_PUSH_DESCRIPTORS);
6193
6194 free(set->header.mapped_ptr);
6195 set->header.mapped_ptr = malloc(new_size);
6196
6197 if (!set->header.mapped_ptr) {
6198 descriptors_state->push_set.capacity = 0;
6199 vk_command_buffer_set_error(&cmd_buffer->vk, VK_ERROR_OUT_OF_HOST_MEMORY);
6200 return false;
6201 }
6202
6203 descriptors_state->push_set.capacity = new_size;
6204 }
6205
6206 return true;
6207 }
6208
6209 void
6210 radv_meta_push_descriptor_set(struct radv_cmd_buffer *cmd_buffer, VkPipelineBindPoint pipelineBindPoint,
6211 VkPipelineLayout _layout, uint32_t set, uint32_t descriptorWriteCount,
6212 const VkWriteDescriptorSet *pDescriptorWrites)
6213 {
6214 RADV_FROM_HANDLE(radv_pipeline_layout, layout, _layout);
6215 struct radv_descriptor_set *push_set = (struct radv_descriptor_set *)&cmd_buffer->meta_push_descriptors;
6216 unsigned bo_offset;
6217
6218 assert(set == 0);
6219 assert(layout->set[set].layout->flags & VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR);
6220
6221 push_set->header.size = layout->set[set].layout->size;
6222 push_set->header.layout = layout->set[set].layout;
6223
6224 if (!radv_cmd_buffer_upload_alloc(cmd_buffer, push_set->header.size, &bo_offset,
6225 (void **)&push_set->header.mapped_ptr))
6226 return;
6227
6228 push_set->header.va = radv_buffer_get_va(cmd_buffer->upload.upload_bo);
6229 push_set->header.va += bo_offset;
6230
6231 radv_cmd_update_descriptor_sets(cmd_buffer->device, cmd_buffer, radv_descriptor_set_to_handle(push_set),
6232 descriptorWriteCount, pDescriptorWrites, 0, NULL);
6233
6234 radv_set_descriptor_set(cmd_buffer, pipelineBindPoint, push_set, set);
6235 }
6236
6237 static void
6238 radv_push_descriptor_set(struct radv_cmd_buffer *cmd_buffer, const VkPushDescriptorSetInfoKHR *pPushDescriptorSetInfo,
6239 VkPipelineBindPoint bind_point)
6240 {
6241 RADV_FROM_HANDLE(radv_pipeline_layout, layout, pPushDescriptorSetInfo->layout);
6242 struct radv_descriptor_state *descriptors_state = radv_get_descriptors_state(cmd_buffer, bind_point);
6243 struct radv_descriptor_set *push_set = (struct radv_descriptor_set *)&descriptors_state->push_set.set;
6244
6245 assert(layout->set[pPushDescriptorSetInfo->set].layout->flags &
6246 VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR);
6247
6248 if (!radv_init_push_descriptor_set(cmd_buffer, push_set, layout->set[pPushDescriptorSetInfo->set].layout,
6249 bind_point))
6250 return;
6251
6252 /* Check that there are no inline uniform block updates when calling vkCmdPushDescriptorSetKHR()
6253 * because it is invalid, according to the Vulkan spec.
6254 */
6255 for (int i = 0; i < pPushDescriptorSetInfo->descriptorWriteCount; i++) {
6256 ASSERTED const VkWriteDescriptorSet *writeset = &pPushDescriptorSetInfo->pDescriptorWrites[i];
6257 assert(writeset->descriptorType != VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK);
6258 }
6259
6260 radv_cmd_update_descriptor_sets(cmd_buffer->device, cmd_buffer, radv_descriptor_set_to_handle(push_set),
6261 pPushDescriptorSetInfo->descriptorWriteCount,
6262 pPushDescriptorSetInfo->pDescriptorWrites, 0, NULL);
6263
6264 radv_set_descriptor_set(cmd_buffer, bind_point, push_set, pPushDescriptorSetInfo->set);
6265
6266 radv_flush_push_descriptors(cmd_buffer, descriptors_state);
6267 }
6268
6269 VKAPI_ATTR void VKAPI_CALL
6270 radv_CmdPushDescriptorSet2KHR(VkCommandBuffer commandBuffer, const VkPushDescriptorSetInfoKHR *pPushDescriptorSetInfo)
6271 {
6272 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
6273
6274 if (pPushDescriptorSetInfo->stageFlags & VK_SHADER_STAGE_COMPUTE_BIT) {
6275 radv_push_descriptor_set(cmd_buffer, pPushDescriptorSetInfo, VK_PIPELINE_BIND_POINT_COMPUTE);
6276 }
6277
6278 if (pPushDescriptorSetInfo->stageFlags & RADV_GRAPHICS_STAGE_BITS) {
6279 radv_push_descriptor_set(cmd_buffer, pPushDescriptorSetInfo, VK_PIPELINE_BIND_POINT_GRAPHICS);
6280 }
6281
6282 if (pPushDescriptorSetInfo->stageFlags & RADV_RT_STAGE_BITS) {
6283 radv_push_descriptor_set(cmd_buffer, pPushDescriptorSetInfo, VK_PIPELINE_BIND_POINT_RAY_TRACING_KHR);
6284 }
6285 }
6286
6287 VKAPI_ATTR void VKAPI_CALL
6288 radv_CmdPushDescriptorSetWithTemplate2KHR(
6289 VkCommandBuffer commandBuffer, const VkPushDescriptorSetWithTemplateInfoKHR *pPushDescriptorSetWithTemplateInfo)
6290 {
6291 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
6292 RADV_FROM_HANDLE(radv_pipeline_layout, layout, pPushDescriptorSetWithTemplateInfo->layout);
6293 RADV_FROM_HANDLE(radv_descriptor_update_template, templ,
6294 pPushDescriptorSetWithTemplateInfo->descriptorUpdateTemplate);
6295 struct radv_descriptor_state *descriptors_state = radv_get_descriptors_state(cmd_buffer, templ->bind_point);
6296 struct radv_descriptor_set *push_set = (struct radv_descriptor_set *)&descriptors_state->push_set.set;
6297
6298 assert(layout->set[pPushDescriptorSetWithTemplateInfo->set].layout->flags &
6299 VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR);
6300
6301 if (!radv_init_push_descriptor_set(cmd_buffer, push_set, layout->set[pPushDescriptorSetWithTemplateInfo->set].layout,
6302 templ->bind_point))
6303 return;
6304
6305 radv_cmd_update_descriptor_set_with_template(cmd_buffer->device, cmd_buffer, push_set,
6306 pPushDescriptorSetWithTemplateInfo->descriptorUpdateTemplate,
6307 pPushDescriptorSetWithTemplateInfo->pData);
6308
6309 radv_set_descriptor_set(cmd_buffer, templ->bind_point, push_set, pPushDescriptorSetWithTemplateInfo->set);
6310
6311 radv_flush_push_descriptors(cmd_buffer, descriptors_state);
6312 }
6313
6314 VKAPI_ATTR void VKAPI_CALL
6315 radv_CmdPushConstants2KHR(VkCommandBuffer commandBuffer, const VkPushConstantsInfoKHR *pPushConstantsInfo)
6316 {
6317 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
6318 memcpy(cmd_buffer->push_constants + pPushConstantsInfo->offset, pPushConstantsInfo->pValues,
6319 pPushConstantsInfo->size);
6320 cmd_buffer->push_constant_stages |= pPushConstantsInfo->stageFlags;
6321 }
6322
6323 VKAPI_ATTR VkResult VKAPI_CALL
6324 radv_EndCommandBuffer(VkCommandBuffer commandBuffer)
6325 {
6326 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
6327
6328 if (cmd_buffer->qf == RADV_QUEUE_SPARSE)
6329 return vk_command_buffer_end(&cmd_buffer->vk);
6330
6331 radv_emit_mip_change_flush_default(cmd_buffer);
6332
6333 const bool is_gfx_or_ace = cmd_buffer->qf == RADV_QUEUE_GENERAL || cmd_buffer->qf == RADV_QUEUE_COMPUTE;
6334
6335 if (is_gfx_or_ace) {
6336 if (cmd_buffer->device->physical_device->rad_info.gfx_level == GFX6)
6337 cmd_buffer->state.flush_bits |=
6338 RADV_CMD_FLAG_CS_PARTIAL_FLUSH | RADV_CMD_FLAG_PS_PARTIAL_FLUSH | RADV_CMD_FLAG_WB_L2;
6339
6340 /* Make sure to sync all pending active queries at the end of
6341 * the command buffer.
6342 */
6343 cmd_buffer->state.flush_bits |= cmd_buffer->active_query_flush_bits;
6344
6345 /* Flush noncoherent images on GFX9+ so we can assume they're clean at the start of a
6346 * command buffer.
6347 */
6348 if (cmd_buffer->state.rb_noncoherent_dirty && !can_skip_buffer_l2_flushes(cmd_buffer->device))
6349 cmd_buffer->state.flush_bits |= radv_src_access_flush(
6350 cmd_buffer, VK_ACCESS_2_COLOR_ATTACHMENT_WRITE_BIT | VK_ACCESS_2_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT, NULL);
6351
6352 /* Since NGG streamout uses GDS, we need to make GDS idle when
6353 * we leave the IB, otherwise another process might overwrite
6354 * it while our shaders are busy.
6355 */
6356 if (cmd_buffer->gds_needed)
6357 cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_PS_PARTIAL_FLUSH;
6358 }
6359
6360 /* Finalize the internal compute command stream, if it exists. */
6361 if (cmd_buffer->gang.cs) {
6362 VkResult result = radv_gang_finalize(cmd_buffer);
6363 if (result != VK_SUCCESS)
6364 return vk_error(cmd_buffer, result);
6365 }
6366
6367 if (is_gfx_or_ace) {
6368 radv_emit_cache_flush(cmd_buffer);
6369
6370 /* Make sure CP DMA is idle at the end of IBs because the kernel
6371 * doesn't wait for it.
6372 */
6373 radv_cp_dma_wait_for_idle(cmd_buffer);
6374 }
6375
6376 radv_describe_end_cmd_buffer(cmd_buffer);
6377
6378 VkResult result = cmd_buffer->device->ws->cs_finalize(cmd_buffer->cs);
6379 if (result != VK_SUCCESS)
6380 return vk_error(cmd_buffer, result);
6381
6382 return vk_command_buffer_end(&cmd_buffer->vk);
6383 }
6384
6385 static void
6386 radv_emit_compute_pipeline(struct radv_cmd_buffer *cmd_buffer, struct radv_compute_pipeline *pipeline)
6387 {
6388 if (pipeline == cmd_buffer->state.emitted_compute_pipeline)
6389 return;
6390
6391 assert(!pipeline->base.ctx_cs.cdw);
6392
6393 cmd_buffer->state.emitted_compute_pipeline = pipeline;
6394
6395 radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, pipeline->base.cs.cdw);
6396 radeon_emit_array(cmd_buffer->cs, pipeline->base.cs.buf, pipeline->base.cs.cdw);
6397
6398 if (pipeline->base.type == RADV_PIPELINE_COMPUTE) {
6399 radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, cmd_buffer->state.shaders[MESA_SHADER_COMPUTE]->bo);
6400 } else {
6401 radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, cmd_buffer->state.rt_prolog->bo);
6402
6403 if (cmd_buffer->state.shaders[MESA_SHADER_INTERSECTION])
6404 radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs,
6405 cmd_buffer->state.shaders[MESA_SHADER_INTERSECTION]->bo);
6406
6407 struct radv_ray_tracing_pipeline *rt_pipeline = radv_pipeline_to_ray_tracing(&pipeline->base);
6408 for (unsigned i = 0; i < rt_pipeline->stage_count; ++i) {
6409 struct radv_shader *shader = rt_pipeline->stages[i].shader;
6410 if (shader)
6411 radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, shader->bo);
6412 }
6413 }
6414
6415 if (radv_device_fault_detection_enabled(cmd_buffer->device))
6416 radv_save_pipeline(cmd_buffer, &pipeline->base);
6417 }
6418
6419 static void
6420 radv_mark_descriptor_sets_dirty(struct radv_cmd_buffer *cmd_buffer, VkPipelineBindPoint bind_point)
6421 {
6422 struct radv_descriptor_state *descriptors_state = radv_get_descriptors_state(cmd_buffer, bind_point);
6423
6424 descriptors_state->dirty |= descriptors_state->valid;
6425 }
6426
6427 static void
6428 radv_bind_vs_input_state(struct radv_cmd_buffer *cmd_buffer, const struct radv_graphics_pipeline *pipeline)
6429 {
6430 const struct radv_shader *vs_shader = radv_get_shader(cmd_buffer->state.shaders, MESA_SHADER_VERTEX);
6431 const struct radv_vs_input_state *src = &pipeline->vs_input_state;
6432
6433 /* Bind the vertex input state from the pipeline when the VS has a prolog and the state isn't
6434 * dynamic. This can happen when the pre-rasterization stages and the vertex input state are from
6435 * two different libraries. Otherwise, if the VS has a prolog, the state is dynamic and there is
6436 * nothing to bind.
6437 */
6438 if (!vs_shader || !vs_shader->info.vs.has_prolog || (pipeline->dynamic_states & RADV_DYNAMIC_VERTEX_INPUT))
6439 return;
6440
6441 cmd_buffer->state.dynamic_vs_input = *src;
6442
6443 if (cmd_buffer->device->physical_device->rad_info.gfx_level == GFX6 ||
6444 cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX10) {
6445 cmd_buffer->state.vbo_misaligned_mask = 0;
6446 cmd_buffer->state.vbo_misaligned_mask_invalid = src->attribute_mask;
6447 }
6448
6449 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_VERTEX_INPUT;
6450 }
6451
6452 static void
6453 radv_bind_multisample_state(struct radv_cmd_buffer *cmd_buffer, const struct radv_multisample_state *ms)
6454 {
6455 if (ms->sample_shading_enable) {
6456 cmd_buffer->state.ms.sample_shading_enable = true;
6457 cmd_buffer->state.ms.min_sample_shading = ms->min_sample_shading;
6458 }
6459 }
6460
6461 static void
6462 radv_bind_custom_blend_mode(struct radv_cmd_buffer *cmd_buffer, unsigned custom_blend_mode)
6463 {
6464 /* Re-emit CB_COLOR_CONTROL when the custom blending mode changes. */
6465 if (cmd_buffer->state.custom_blend_mode != custom_blend_mode)
6466 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_LOGIC_OP | RADV_CMD_DIRTY_DYNAMIC_LOGIC_OP_ENABLE;
6467
6468 cmd_buffer->state.custom_blend_mode = custom_blend_mode;
6469 }
6470
6471 static void
6472 radv_bind_pre_rast_shader(struct radv_cmd_buffer *cmd_buffer, const struct radv_shader *shader)
6473 {
6474 bool mesh_shading = shader->info.stage == MESA_SHADER_MESH;
6475 const struct radv_userdata_info *loc;
6476
6477 assert(shader->info.stage == MESA_SHADER_VERTEX || shader->info.stage == MESA_SHADER_TESS_CTRL ||
6478 shader->info.stage == MESA_SHADER_TESS_EVAL || shader->info.stage == MESA_SHADER_GEOMETRY ||
6479 shader->info.stage == MESA_SHADER_MESH);
6480
6481 if (radv_get_user_sgpr(shader, AC_UD_NGG_PROVOKING_VTX)->sgpr_idx != -1) {
6482 /* Re-emit the provoking vertex mode state because the SGPR idx can be different. */
6483 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_PROVOKING_VERTEX_MODE;
6484 }
6485
6486 if (radv_get_user_sgpr(shader, AC_UD_STREAMOUT_BUFFERS)->sgpr_idx != -1) {
6487 /* Re-emit the streamout buffers because the SGPR idx can be different; with NGG streamout
6488 * they always need to be emitted because a buffer size of 0 is used to disable streamout.
6489 */
6490 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_STREAMOUT_BUFFER;
6491
6492 if (cmd_buffer->device->physical_device->use_ngg_streamout) {
6493 /* GFX11 needs GDS OA for streamout. */
6494 cmd_buffer->gds_oa_needed = true;
6495 }
6496 }
6497
6498 if (radv_get_user_sgpr(shader, AC_UD_NUM_VERTS_PER_PRIM)->sgpr_idx != -1) {
6499 /* Re-emit the primitive topology because the SGPR idx can be different. */
6500 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_PRIMITIVE_TOPOLOGY;
6501 }
6502
6503 if (radv_get_user_sgpr(shader, AC_UD_SHADER_QUERY_STATE)->sgpr_idx != -1) {
6504 /* Re-emit the shader query state because the SGPR idx can be different. */
6505 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_SHADER_QUERY;
6506 }
6507
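   /* Only stages that own the draw-parameter user SGPRs update the tracking below: VS, mesh,
    * or a merged GS/TCS that was compiled monolithically.
    */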
6508 const bool needs_vtx_sgpr =
6509 shader->info.stage == MESA_SHADER_VERTEX || shader->info.stage == MESA_SHADER_MESH ||
6510 (shader->info.stage == MESA_SHADER_GEOMETRY && !shader->info.merged_shader_compiled_separately) ||
6511 (shader->info.stage == MESA_SHADER_TESS_CTRL && !shader->info.merged_shader_compiled_separately);
6512
6513 loc = radv_get_user_sgpr(shader, AC_UD_VS_BASE_VERTEX_START_INSTANCE);
6514 if (needs_vtx_sgpr && loc->sgpr_idx != -1) {
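      /* Convert the user SGPR index to a register offset: user SGPRs start at user_data_0 and
       * each USER_DATA register is 4 bytes apart.
       */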
6515 cmd_buffer->state.vtx_base_sgpr = shader->info.user_data_0 + loc->sgpr_idx * 4;
6516 cmd_buffer->state.vtx_emit_num = loc->num_sgprs;
6517 cmd_buffer->state.uses_drawid = shader->info.vs.needs_draw_id;
6518 cmd_buffer->state.uses_baseinstance = shader->info.vs.needs_base_instance;
6519
6520 if (shader->info.merged_shader_compiled_separately) {
6521 /* Merged shaders compiled separately (e.g. VS+TCS) always declare these user SGPRs
6522 * because the input arguments must match.
6523 */
6524 cmd_buffer->state.uses_drawid = true;
6525 cmd_buffer->state.uses_baseinstance = true;
6526 }
6527
6528 /* Re-emit some vertex states because the SGPR idx can be different. */
6529 cmd_buffer->state.last_first_instance = -1;
6530 cmd_buffer->state.last_vertex_offset_valid = false;
6531 cmd_buffer->state.last_drawid = -1;
6532 }
6533
6534 if (mesh_shading != cmd_buffer->state.mesh_shading) {
6535 /* Re-emit VRS state because the combiner is different (vertex vs primitive). Re-emit
6536 * primitive topology because the mesh shading pipeline clobbered it.
6537 */
6538 cmd_buffer->state.dirty |=
6539 RADV_CMD_DIRTY_DYNAMIC_FRAGMENT_SHADING_RATE | RADV_CMD_DIRTY_DYNAMIC_PRIMITIVE_TOPOLOGY;
6540 }
6541
6542 cmd_buffer->state.mesh_shading = mesh_shading;
6543 }
6544
6545 static void
6546 radv_bind_vertex_shader(struct radv_cmd_buffer *cmd_buffer, const struct radv_shader *vs)
6547 {
6548 radv_bind_pre_rast_shader(cmd_buffer, vs);
6549
6550 /* Re-emit states that need to be updated when the vertex shader is compiled separately
6551 * because shader configs are combined.
6552 */
6553 if (vs->info.merged_shader_compiled_separately && vs->info.next_stage == MESA_SHADER_TESS_CTRL) {
6554 cmd_buffer->state.emitted_tcs_epilog = NULL;
6555 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_PATCH_CONTROL_POINTS;
6556 }
6557
6558 /* Can't put anything else here due to merged shaders */
6559 }
6560
6561 static void
6562 radv_bind_tess_ctrl_shader(struct radv_cmd_buffer *cmd_buffer, const struct radv_shader *tcs)
6563 {
6564 radv_bind_pre_rast_shader(cmd_buffer, tcs);
6565
6566 cmd_buffer->tess_rings_needed = true;
6567
6568 /* Always re-emit patch control points/domain origin when a new pipeline with tessellation is
6569 * bound because a bunch of parameters (user SGPRs, TCS vertices out, ccw, etc) can be different.
6570 */
6571 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_PATCH_CONTROL_POINTS | RADV_CMD_DIRTY_DYNAMIC_TESS_DOMAIN_ORIGIN;
6572
6573 /* Re-emit the TCS epilog when a new tessellation control shader is bound. */
6574 if (tcs->info.has_epilog)
6575 cmd_buffer->state.emitted_tcs_epilog = NULL;
6576
6577 /* Re-emit the VS prolog when the tessellation control shader is compiled separately because
6578 * shader configs are combined and need to be updated.
6579 */
6580 if (tcs->info.merged_shader_compiled_separately)
6581 cmd_buffer->state.emitted_vs_prolog = NULL;
6582 }
6583
6584 static void
6585 radv_bind_tess_eval_shader(struct radv_cmd_buffer *cmd_buffer, const struct radv_shader *tes)
6586 {
6587 radv_bind_pre_rast_shader(cmd_buffer, tes);
6588
6589 /* Can't put anything else here due to merged shaders */
6590 }
6591
6592 static void
6593 radv_bind_geometry_shader(struct radv_cmd_buffer *cmd_buffer, const struct radv_shader *gs)
6594 {
6595 radv_bind_pre_rast_shader(cmd_buffer, gs);
6596
6597 cmd_buffer->esgs_ring_size_needed = MAX2(cmd_buffer->esgs_ring_size_needed, gs->info.gs_ring_info.esgs_ring_size);
6598 cmd_buffer->gsvs_ring_size_needed = MAX2(cmd_buffer->gsvs_ring_size_needed, gs->info.gs_ring_info.gsvs_ring_size);
6599
6600 /* Re-emit the VS prolog when the geometry shader is compiled separately because shader configs
6601 * are combined and need to be updated.
6602 */
6603 if (gs->info.merged_shader_compiled_separately)
6604 cmd_buffer->state.emitted_vs_prolog = NULL;
6605 }
6606
6607 static void
6608 radv_bind_mesh_shader(struct radv_cmd_buffer *cmd_buffer, const struct radv_shader *ms)
6609 {
6610 radv_bind_pre_rast_shader(cmd_buffer, ms);
6611
6612 cmd_buffer->mesh_scratch_ring_needed |= ms->info.ms.needs_ms_scratch_ring;
6613 }
6614
6615 static void
6616 radv_bind_fragment_shader(struct radv_cmd_buffer *cmd_buffer, const struct radv_shader *ps)
6617 {
6618 const enum amd_gfx_level gfx_level = cmd_buffer->device->physical_device->rad_info.gfx_level;
6619 const struct radv_shader *previous_ps = cmd_buffer->state.shaders[MESA_SHADER_FRAGMENT];
6620 const float min_sample_shading = 1.0f;
6621
6622 if (ps->info.ps.needs_sample_positions) {
6623 cmd_buffer->sample_positions_needed = true;
6624 }
6625
6626 /* Re-emit the FS state because the SGPR idx can be different. */
6627 if (radv_get_user_sgpr(ps, AC_UD_PS_STATE)->sgpr_idx != -1) {
6628 cmd_buffer->state.dirty |=
6629 RADV_CMD_DIRTY_DYNAMIC_RASTERIZATION_SAMPLES | RADV_CMD_DIRTY_DYNAMIC_LINE_RASTERIZATION_MODE;
6630 }
6631
6632 /* Re-emit the conservative rasterization mode because inner coverage is different. */
6633 if (!previous_ps || previous_ps->info.ps.reads_fully_covered != ps->info.ps.reads_fully_covered)
6634 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_CONSERVATIVE_RAST_MODE;
6635
6636 if (gfx_level >= GFX10_3 && (!previous_ps || previous_ps->info.ps.force_sample_iter_shading_rate !=
6637 ps->info.ps.force_sample_iter_shading_rate))
6638 cmd_buffer->state.dirty |=
6639 RADV_CMD_DIRTY_DYNAMIC_RASTERIZATION_SAMPLES | RADV_CMD_DIRTY_DYNAMIC_FRAGMENT_SHADING_RATE;
6640
6641 if (cmd_buffer->state.ms.sample_shading_enable != ps->info.ps.uses_sample_shading) {
6642 cmd_buffer->state.ms.sample_shading_enable = ps->info.ps.uses_sample_shading;
6643 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_RASTERIZATION_SAMPLES;
6644
6645 if (gfx_level >= GFX10_3)
6646 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_FRAGMENT_SHADING_RATE;
6647 }
6648
6649 if (cmd_buffer->state.ms.min_sample_shading != min_sample_shading) {
6650 cmd_buffer->state.ms.min_sample_shading = min_sample_shading;
6651 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_RASTERIZATION_SAMPLES;
6652 }
6653
6654 if (!previous_ps || previous_ps->info.ps.db_shader_control != ps->info.ps.db_shader_control ||
6655 previous_ps->info.ps.pops_is_per_sample != ps->info.ps.pops_is_per_sample)
6656 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DB_SHADER_CONTROL;
6657
6658 /* Re-emit the PS epilog when a new fragment shader is bound. */
6659 if (ps->info.has_epilog)
6660 cmd_buffer->state.emitted_ps_epilog = NULL;
6661 }
6662
6663 static void
6664 radv_bind_task_shader(struct radv_cmd_buffer *cmd_buffer, const struct radv_shader *ts)
6665 {
6666 if (!radv_gang_init(cmd_buffer))
6667 return;
6668
6669 cmd_buffer->task_rings_needed = true;
6670 }
6671
6672 /* Binds or unbinds (when shader is NULL) a shader stage in the command buffer state. */
6673 static void
6674 radv_bind_shader(struct radv_cmd_buffer *cmd_buffer, struct radv_shader *shader, gl_shader_stage stage)
6675 {
6676 const struct radv_device *device = cmd_buffer->device;
6677
6678 if (!shader) {
6679 cmd_buffer->state.shaders[stage] = NULL;
6680 cmd_buffer->state.active_stages &= ~mesa_to_vk_shader_stage(stage);
6681
6682 /* Reset some dynamic states when a shader stage is unbound. */
6683 switch (stage) {
6684 case MESA_SHADER_FRAGMENT:
6685 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_CONSERVATIVE_RAST_MODE |
6686 RADV_CMD_DIRTY_DYNAMIC_RASTERIZATION_SAMPLES |
6687 RADV_CMD_DIRTY_DYNAMIC_FRAGMENT_SHADING_RATE | RADV_CMD_DIRTY_DB_SHADER_CONTROL;
6688 break;
6689 default:
6690 break;
6691 }
6692 return;
6693 }
6694
6695 switch (stage) {
6696 case MESA_SHADER_VERTEX:
6697 radv_bind_vertex_shader(cmd_buffer, shader);
6698 break;
6699 case MESA_SHADER_TESS_CTRL:
6700 radv_bind_tess_ctrl_shader(cmd_buffer, shader);
6701 break;
6702 case MESA_SHADER_TESS_EVAL:
6703 radv_bind_tess_eval_shader(cmd_buffer, shader);
6704 break;
6705 case MESA_SHADER_GEOMETRY:
6706 radv_bind_geometry_shader(cmd_buffer, shader);
6707 break;
6708 case MESA_SHADER_FRAGMENT:
6709 radv_bind_fragment_shader(cmd_buffer, shader);
6710 break;
6711 case MESA_SHADER_MESH:
6712 radv_bind_mesh_shader(cmd_buffer, shader);
6713 break;
6714 case MESA_SHADER_TASK:
6715 radv_bind_task_shader(cmd_buffer, shader);
6716 break;
6717 case MESA_SHADER_COMPUTE: {
6718 cmd_buffer->compute_scratch_size_per_wave_needed =
6719 MAX2(cmd_buffer->compute_scratch_size_per_wave_needed, shader->config.scratch_bytes_per_wave);
6720
6721 const unsigned max_stage_waves = radv_get_max_scratch_waves(device, shader);
6722 cmd_buffer->compute_scratch_waves_wanted = MAX2(cmd_buffer->compute_scratch_waves_wanted, max_stage_waves);
6723 break;
6724 }
6725 case MESA_SHADER_INTERSECTION:
6726 /* no-op */
6727 break;
6728 default:
6729 unreachable("invalid shader stage");
6730 }
6731
6732 cmd_buffer->state.shaders[stage] = shader;
6733 cmd_buffer->state.active_stages |= mesa_to_vk_shader_stage(stage);
6734
6735 if (mesa_to_vk_shader_stage(stage) & RADV_GRAPHICS_STAGE_BITS) {
6736 cmd_buffer->scratch_size_per_wave_needed =
6737 MAX2(cmd_buffer->scratch_size_per_wave_needed, shader->config.scratch_bytes_per_wave);
6738
6739 const unsigned max_stage_waves = radv_get_max_scratch_waves(device, shader);
6740 cmd_buffer->scratch_waves_wanted = MAX2(cmd_buffer->scratch_waves_wanted, max_stage_waves);
6741 }
6742 }
6743
6744 static void
6745 radv_reset_shader_object_state(struct radv_cmd_buffer *cmd_buffer, VkPipelineBindPoint pipelineBindPoint)
6746 {
6747 switch (pipelineBindPoint) {
6748 case VK_PIPELINE_BIND_POINT_COMPUTE:
6749 if (cmd_buffer->state.shader_objs[MESA_SHADER_COMPUTE]) {
6750 radv_bind_shader(cmd_buffer, NULL, MESA_SHADER_COMPUTE);
6751 cmd_buffer->state.shader_objs[MESA_SHADER_COMPUTE] = NULL;
6752 }
6753 break;
6754 case VK_PIPELINE_BIND_POINT_GRAPHICS:
6755 radv_foreach_stage(s, RADV_GRAPHICS_STAGE_BITS)
6756 {
6757 if (cmd_buffer->state.shader_objs[s]) {
6758 radv_bind_shader(cmd_buffer, NULL, s);
6759 cmd_buffer->state.shader_objs[s] = NULL;
6760 }
6761 }
6762 break;
6763 default:
6764 break;
6765 }
6766
6767 cmd_buffer->state.dirty &= ~RADV_CMD_DIRTY_GRAPHICS_SHADERS;
6768 }
6769
6770 VKAPI_ATTR void VKAPI_CALL
6771 radv_CmdBindPipeline(VkCommandBuffer commandBuffer, VkPipelineBindPoint pipelineBindPoint, VkPipeline _pipeline)
6772 {
6773 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
6774 RADV_FROM_HANDLE(radv_pipeline, pipeline, _pipeline);
6775
6776 radv_reset_shader_object_state(cmd_buffer, pipelineBindPoint);
6777
6778 switch (pipelineBindPoint) {
6779 case VK_PIPELINE_BIND_POINT_COMPUTE: {
6780 struct radv_compute_pipeline *compute_pipeline = radv_pipeline_to_compute(pipeline);
6781
6782 if (cmd_buffer->state.compute_pipeline == compute_pipeline)
6783 return;
6784 radv_mark_descriptor_sets_dirty(cmd_buffer, pipelineBindPoint);
6785
6786 radv_bind_shader(cmd_buffer, compute_pipeline->base.shaders[MESA_SHADER_COMPUTE], MESA_SHADER_COMPUTE);
6787
6788 cmd_buffer->state.compute_pipeline = compute_pipeline;
6789 cmd_buffer->push_constant_stages |= VK_SHADER_STAGE_COMPUTE_BIT;
6790 break;
6791 }
6792 case VK_PIPELINE_BIND_POINT_RAY_TRACING_KHR: {
6793 struct radv_ray_tracing_pipeline *rt_pipeline = radv_pipeline_to_ray_tracing(pipeline);
6794
6795 if (cmd_buffer->state.rt_pipeline == rt_pipeline)
6796 return;
6797 radv_mark_descriptor_sets_dirty(cmd_buffer, pipelineBindPoint);
6798
6799 radv_bind_shader(cmd_buffer, rt_pipeline->base.base.shaders[MESA_SHADER_INTERSECTION], MESA_SHADER_INTERSECTION);
6800 cmd_buffer->state.rt_prolog = rt_pipeline->prolog;
6801
6802 cmd_buffer->state.rt_pipeline = rt_pipeline;
6803 cmd_buffer->push_constant_stages |= RADV_RT_STAGE_BITS;
6804
6805 /* Bind the stack size when it's not dynamic. */
6806 if (rt_pipeline->stack_size != -1u)
6807 cmd_buffer->state.rt_stack_size = rt_pipeline->stack_size;
6808
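      /* Account for the RT prolog's scratch wave requirement on the compute ring. */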
6809 const unsigned max_scratch_waves = radv_get_max_scratch_waves(cmd_buffer->device, rt_pipeline->prolog);
6810 cmd_buffer->compute_scratch_waves_wanted = MAX2(cmd_buffer->compute_scratch_waves_wanted, max_scratch_waves);
6811 break;
6812 }
6813 case VK_PIPELINE_BIND_POINT_GRAPHICS: {
6814 struct radv_graphics_pipeline *graphics_pipeline = radv_pipeline_to_graphics(pipeline);
6815
6816 /* Bind the non-dynamic graphics state from the pipeline unconditionally because some of it
6817 * might have been overwritten between two binds of the same pipeline.
6818 */
6819 radv_bind_dynamic_state(cmd_buffer, &graphics_pipeline->dynamic_state);
6820
6821 if (cmd_buffer->state.graphics_pipeline == graphics_pipeline)
6822 return;
6823 radv_mark_descriptor_sets_dirty(cmd_buffer, pipelineBindPoint);
6824
6825 radv_foreach_stage(
6826 stage, (cmd_buffer->state.active_stages | graphics_pipeline->active_stages) & RADV_GRAPHICS_STAGE_BITS)
6827 {
6828 radv_bind_shader(cmd_buffer, graphics_pipeline->base.shaders[stage], stage);
6829 }
6830
6831 cmd_buffer->state.gs_copy_shader = graphics_pipeline->base.gs_copy_shader;
6832 cmd_buffer->state.last_vgt_shader = graphics_pipeline->base.shaders[graphics_pipeline->last_vgt_api_stage];
6833
6834 cmd_buffer->state.graphics_pipeline = graphics_pipeline;
6835
6836 cmd_buffer->state.has_nggc = graphics_pipeline->has_ngg_culling;
6837 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_PIPELINE;
6838 cmd_buffer->push_constant_stages |= graphics_pipeline->active_stages;
6839
6840 /* Prefetch all pipeline shaders at first draw time. */
6841 cmd_buffer->state.prefetch_L2_mask |= RADV_PREFETCH_SHADERS;
6842
6843 if (cmd_buffer->device->physical_device->rad_info.has_vgt_flush_ngg_legacy_bug &&
6844 cmd_buffer->state.emitted_graphics_pipeline && cmd_buffer->state.emitted_graphics_pipeline->is_ngg &&
6845 !cmd_buffer->state.graphics_pipeline->is_ngg) {
6846 /* Transitioning from NGG to legacy GS requires
6847 * VGT_FLUSH on GFX10 and Navi21. VGT_FLUSH
6848 * is also emitted at the beginning of IBs when legacy
6849 * GS ring pointers are set.
6850 */
6851 cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_VGT_FLUSH;
6852 }
6853
6854 cmd_buffer->state.uses_dynamic_patch_control_points =
6855 !!(graphics_pipeline->dynamic_states & RADV_DYNAMIC_PATCH_CONTROL_POINTS);
6856
6857 if (graphics_pipeline->active_stages & VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT) {
6858 if (!cmd_buffer->state.uses_dynamic_patch_control_points) {
6859 /* Bind the tessellation state from the pipeline when it's not dynamic. */
6860 struct radv_shader *tcs = cmd_buffer->state.shaders[MESA_SHADER_TESS_CTRL];
6861
6862 cmd_buffer->state.tess_num_patches = tcs->info.num_tess_patches;
6863 cmd_buffer->state.tess_lds_size = tcs->info.tcs.num_lds_blocks;
6864 }
6865 }
6866
6867 const struct radv_shader *vs = radv_get_shader(graphics_pipeline->base.shaders, MESA_SHADER_VERTEX);
6868 if (vs) {
6869 /* Re-emit the VS prolog when a new vertex shader is bound. */
6870 if (vs->info.vs.has_prolog) {
6871 cmd_buffer->state.emitted_vs_prolog = NULL;
6872 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_VERTEX_INPUT;
6873 }
6874
6875 /* Re-emit the vertex buffer descriptors because they are really tied to the pipeline. */
6876 if (vs->info.vs.vb_desc_usage_mask) {
6877 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_VERTEX_BUFFER;
6878 }
6879 }
6880
6881 if (cmd_buffer->device->physical_device->rad_info.rbplus_allowed &&
6882 (!cmd_buffer->state.emitted_graphics_pipeline ||
6883 cmd_buffer->state.col_format_non_compacted != graphics_pipeline->col_format_non_compacted)) {
6884 cmd_buffer->state.col_format_non_compacted = graphics_pipeline->col_format_non_compacted;
6885 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_RBPLUS;
6886 }
6887
6888 radv_bind_vs_input_state(cmd_buffer, graphics_pipeline);
6889
6890 radv_bind_multisample_state(cmd_buffer, &graphics_pipeline->ms);
6891
6892 radv_bind_custom_blend_mode(cmd_buffer, graphics_pipeline->custom_blend_mode);
6893
6894 cmd_buffer->state.db_render_control = graphics_pipeline->db_render_control;
6895
6896 cmd_buffer->state.rast_prim = graphics_pipeline->rast_prim;
6897
6898 cmd_buffer->state.ia_multi_vgt_param = graphics_pipeline->ia_multi_vgt_param;
6899
6900 cmd_buffer->state.uses_out_of_order_rast = graphics_pipeline->uses_out_of_order_rast;
6901 cmd_buffer->state.uses_vrs_attachment = graphics_pipeline->uses_vrs_attachment;
6902 cmd_buffer->state.uses_dynamic_vertex_binding_stride =
6903 !!(graphics_pipeline->dynamic_states & (RADV_DYNAMIC_VERTEX_INPUT_BINDING_STRIDE | RADV_DYNAMIC_VERTEX_INPUT));
6904 break;
6905 }
6906 default:
6907 assert(!"invalid bind point");
6908 break;
6909 }
6910
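   /* State common to all bind points: push constant size, dynamic offset count and whether
    * indirect descriptor sets are required.
    */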
6911 cmd_buffer->push_constant_state[vk_to_bind_point(pipelineBindPoint)].size = pipeline->push_constant_size;
6912 cmd_buffer->push_constant_state[vk_to_bind_point(pipelineBindPoint)].dynamic_offset_count =
6913 pipeline->dynamic_offset_count;
6914 cmd_buffer->descriptors[vk_to_bind_point(pipelineBindPoint)].need_indirect_descriptor_sets =
6915 pipeline->need_indirect_descriptor_sets;
6916
6917 if (cmd_buffer->device->shader_use_invisible_vram)
6918 cmd_buffer->shader_upload_seq = MAX2(cmd_buffer->shader_upload_seq, pipeline->shader_upload_seq);
6919 }
6920
6921 VKAPI_ATTR void VKAPI_CALL
6922 radv_CmdSetViewport(VkCommandBuffer commandBuffer, uint32_t firstViewport, uint32_t viewportCount,
6923 const VkViewport *pViewports)
6924 {
6925 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
6926 struct radv_cmd_state *state = &cmd_buffer->state;
6927 ASSERTED const uint32_t total_count = firstViewport + viewportCount;
6928
6929 assert(firstViewport < MAX_VIEWPORTS);
6930 assert(total_count >= 1 && total_count <= MAX_VIEWPORTS);
6931
6932 if (state->dynamic.vk.vp.viewport_count < total_count)
6933 state->dynamic.vk.vp.viewport_count = total_count;
6934
6935 memcpy(state->dynamic.vk.vp.viewports + firstViewport, pViewports, viewportCount * sizeof(*pViewports));
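   /* Precompute the hardware viewport transform (scale/translate) for each viewport that was
    * just set.
    */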
6936 for (unsigned i = 0; i < viewportCount; i++) {
6937 radv_get_viewport_xform(&pViewports[i], state->dynamic.hw_vp.xform[i + firstViewport].scale,
6938 state->dynamic.hw_vp.xform[i + firstViewport].translate);
6939 }
6940
6941 state->dirty |= RADV_CMD_DIRTY_DYNAMIC_VIEWPORT | RADV_CMD_DIRTY_GUARDBAND;
6942 }
6943
6944 VKAPI_ATTR void VKAPI_CALL
6945 radv_CmdSetScissor(VkCommandBuffer commandBuffer, uint32_t firstScissor, uint32_t scissorCount,
6946 const VkRect2D *pScissors)
6947 {
6948 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
6949 struct radv_cmd_state *state = &cmd_buffer->state;
6950 ASSERTED const uint32_t total_count = firstScissor + scissorCount;
6951
6952 assert(firstScissor < MAX_SCISSORS);
6953 assert(total_count >= 1 && total_count <= MAX_SCISSORS);
6954
6955 if (state->dynamic.vk.vp.scissor_count < total_count)
6956 state->dynamic.vk.vp.scissor_count = total_count;
6957
6958 memcpy(state->dynamic.vk.vp.scissors + firstScissor, pScissors, scissorCount * sizeof(*pScissors));
6959
6960 state->dirty |= RADV_CMD_DIRTY_DYNAMIC_SCISSOR;
6961 }
6962
6963 VKAPI_ATTR void VKAPI_CALL
6964 radv_CmdSetLineWidth(VkCommandBuffer commandBuffer, float lineWidth)
6965 {
6966 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
6967 struct radv_cmd_state *state = &cmd_buffer->state;
6968
6969 state->dynamic.vk.rs.line.width = lineWidth;
6970
6971 state->dirty |= RADV_CMD_DIRTY_DYNAMIC_LINE_WIDTH | RADV_CMD_DIRTY_GUARDBAND;
6972 }
6973
6974 VKAPI_ATTR void VKAPI_CALL
6975 radv_CmdSetBlendConstants(VkCommandBuffer commandBuffer, const float blendConstants[4])
6976 {
6977 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
6978 struct radv_cmd_state *state = &cmd_buffer->state;
6979
6980 memcpy(state->dynamic.vk.cb.blend_constants, blendConstants, sizeof(float) * 4);
6981
6982 state->dirty |= RADV_CMD_DIRTY_DYNAMIC_BLEND_CONSTANTS;
6983 }
6984
6985 VKAPI_ATTR void VKAPI_CALL
6986 radv_CmdSetDepthBounds(VkCommandBuffer commandBuffer, float minDepthBounds, float maxDepthBounds)
6987 {
6988 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
6989 struct radv_cmd_state *state = &cmd_buffer->state;
6990
6991 state->dynamic.vk.ds.depth.bounds_test.min = minDepthBounds;
6992 state->dynamic.vk.ds.depth.bounds_test.max = maxDepthBounds;
6993
6994 state->dirty |= RADV_CMD_DIRTY_DYNAMIC_DEPTH_BOUNDS;
6995 }
6996
6997 VKAPI_ATTR void VKAPI_CALL
6998 radv_CmdSetStencilCompareMask(VkCommandBuffer commandBuffer, VkStencilFaceFlags faceMask, uint32_t compareMask)
6999 {
7000 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
7001 struct radv_cmd_state *state = &cmd_buffer->state;
7002
7003 if (faceMask & VK_STENCIL_FACE_FRONT_BIT)
7004 state->dynamic.vk.ds.stencil.front.compare_mask = compareMask;
7005 if (faceMask & VK_STENCIL_FACE_BACK_BIT)
7006 state->dynamic.vk.ds.stencil.back.compare_mask = compareMask;
7007
7008 state->dirty |= RADV_CMD_DIRTY_DYNAMIC_STENCIL_COMPARE_MASK;
7009 }
7010
7011 VKAPI_ATTR void VKAPI_CALL
7012 radv_CmdSetStencilWriteMask(VkCommandBuffer commandBuffer, VkStencilFaceFlags faceMask, uint32_t writeMask)
7013 {
7014 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
7015 struct radv_cmd_state *state = &cmd_buffer->state;
7016
7017 if (faceMask & VK_STENCIL_FACE_FRONT_BIT)
7018 state->dynamic.vk.ds.stencil.front.write_mask = writeMask;
7019 if (faceMask & VK_STENCIL_FACE_BACK_BIT)
7020 state->dynamic.vk.ds.stencil.back.write_mask = writeMask;
7021
7022 state->dirty |= RADV_CMD_DIRTY_DYNAMIC_STENCIL_WRITE_MASK;
7023 }
7024
7025 VKAPI_ATTR void VKAPI_CALL
7026 radv_CmdSetStencilReference(VkCommandBuffer commandBuffer, VkStencilFaceFlags faceMask, uint32_t reference)
7027 {
7028 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
7029 struct radv_cmd_state *state = &cmd_buffer->state;
7030
7031 if (faceMask & VK_STENCIL_FACE_FRONT_BIT)
7032 state->dynamic.vk.ds.stencil.front.reference = reference;
7033 if (faceMask & VK_STENCIL_FACE_BACK_BIT)
7034 state->dynamic.vk.ds.stencil.back.reference = reference;
7035
7036 state->dirty |= RADV_CMD_DIRTY_DYNAMIC_STENCIL_REFERENCE;
7037 }
7038
7039 VKAPI_ATTR void VKAPI_CALL
7040 radv_CmdSetDiscardRectangleEXT(VkCommandBuffer commandBuffer, uint32_t firstDiscardRectangle,
7041 uint32_t discardRectangleCount, const VkRect2D *pDiscardRectangles)
7042 {
7043 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
7044 struct radv_cmd_state *state = &cmd_buffer->state;
7045 ASSERTED const uint32_t total_count = firstDiscardRectangle + discardRectangleCount;
7046
7047 assert(firstDiscardRectangle < MAX_DISCARD_RECTANGLES);
7048 assert(total_count >= 1 && total_count <= MAX_DISCARD_RECTANGLES);
7049
7050 typed_memcpy(&state->dynamic.vk.dr.rectangles[firstDiscardRectangle], pDiscardRectangles, discardRectangleCount);
7051
7052 state->dirty |= RADV_CMD_DIRTY_DYNAMIC_DISCARD_RECTANGLE;
7053 }
7054
7055 VKAPI_ATTR void VKAPI_CALL
7056 radv_CmdSetSampleLocationsEXT(VkCommandBuffer commandBuffer, const VkSampleLocationsInfoEXT *pSampleLocationsInfo)
7057 {
7058 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
7059 struct radv_cmd_state *state = &cmd_buffer->state;
7060
7061 assert(pSampleLocationsInfo->sampleLocationsCount <= MAX_SAMPLE_LOCATIONS);
7062
7063 state->dynamic.sample_location.per_pixel = pSampleLocationsInfo->sampleLocationsPerPixel;
7064 state->dynamic.sample_location.grid_size = pSampleLocationsInfo->sampleLocationGridSize;
7065 state->dynamic.sample_location.count = pSampleLocationsInfo->sampleLocationsCount;
7066 typed_memcpy(&state->dynamic.sample_location.locations[0], pSampleLocationsInfo->pSampleLocations,
7067 pSampleLocationsInfo->sampleLocationsCount);
7068
7069 state->dirty |= RADV_CMD_DIRTY_DYNAMIC_SAMPLE_LOCATIONS;
7070 }
7071
7072 VKAPI_ATTR void VKAPI_CALL
7073 radv_CmdSetLineStippleKHR(VkCommandBuffer commandBuffer, uint32_t lineStippleFactor, uint16_t lineStipplePattern)
7074 {
7075 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
7076 struct radv_cmd_state *state = &cmd_buffer->state;
7077
7078 state->dynamic.vk.rs.line.stipple.factor = lineStippleFactor;
7079 state->dynamic.vk.rs.line.stipple.pattern = lineStipplePattern;
7080
7081 state->dirty |= RADV_CMD_DIRTY_DYNAMIC_LINE_STIPPLE;
7082 }
7083
7084 VKAPI_ATTR void VKAPI_CALL
7085 radv_CmdSetCullMode(VkCommandBuffer commandBuffer, VkCullModeFlags cullMode)
7086 {
7087 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
7088 struct radv_cmd_state *state = &cmd_buffer->state;
7089
7090 state->dynamic.vk.rs.cull_mode = cullMode;
7091
7092 state->dirty |= RADV_CMD_DIRTY_DYNAMIC_CULL_MODE;
7093 }
7094
7095 VKAPI_ATTR void VKAPI_CALL
7096 radv_CmdSetFrontFace(VkCommandBuffer commandBuffer, VkFrontFace frontFace)
7097 {
7098 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
7099 struct radv_cmd_state *state = &cmd_buffer->state;
7100
7101 state->dynamic.vk.rs.front_face = frontFace;
7102
7103 state->dirty |= RADV_CMD_DIRTY_DYNAMIC_FRONT_FACE;
7104 }
7105
7106 VKAPI_ATTR void VKAPI_CALL
7107 radv_CmdSetPrimitiveTopology(VkCommandBuffer commandBuffer, VkPrimitiveTopology primitiveTopology)
7108 {
7109 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
7110 struct radv_cmd_state *state = &cmd_buffer->state;
7111 unsigned primitive_topology = radv_translate_prim(primitiveTopology);
7112
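   /* Line stipple and the guardband depend on the topology class (line list vs other lines/points
    * vs triangles), so mark them dirty when that class changes.
    */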
7113 if (radv_primitive_topology_is_line_list(state->dynamic.vk.ia.primitive_topology) !=
7114 radv_primitive_topology_is_line_list(primitive_topology))
7115 state->dirty |= RADV_CMD_DIRTY_DYNAMIC_LINE_STIPPLE;
7116
7117 if (radv_prim_is_points_or_lines(state->dynamic.vk.ia.primitive_topology) !=
7118 radv_prim_is_points_or_lines(primitive_topology))
7119 state->dirty |= RADV_CMD_DIRTY_GUARDBAND;
7120
7121 state->dynamic.vk.ia.primitive_topology = primitive_topology;
7122
7123 state->dirty |= RADV_CMD_DIRTY_DYNAMIC_PRIMITIVE_TOPOLOGY;
7124 }
7125
7126 VKAPI_ATTR void VKAPI_CALL
7127 radv_CmdSetViewportWithCount(VkCommandBuffer commandBuffer, uint32_t viewportCount, const VkViewport *pViewports)
7128 {
7129 radv_CmdSetViewport(commandBuffer, 0, viewportCount, pViewports);
7130 }
7131
7132 VKAPI_ATTR void VKAPI_CALL
7133 radv_CmdSetScissorWithCount(VkCommandBuffer commandBuffer, uint32_t scissorCount, const VkRect2D *pScissors)
7134 {
7135 radv_CmdSetScissor(commandBuffer, 0, scissorCount, pScissors);
7136 }
7137
7138 VKAPI_ATTR void VKAPI_CALL
7139 radv_CmdSetDepthTestEnable(VkCommandBuffer commandBuffer, VkBool32 depthTestEnable)
7140
7141 {
7142 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
7143 struct radv_cmd_state *state = &cmd_buffer->state;
7144
7145 state->dynamic.vk.ds.depth.test_enable = depthTestEnable;
7146
7147 state->dirty |= RADV_CMD_DIRTY_DYNAMIC_DEPTH_TEST_ENABLE;
7148 }
7149
7150 VKAPI_ATTR void VKAPI_CALL
7151 radv_CmdSetDepthWriteEnable(VkCommandBuffer commandBuffer, VkBool32 depthWriteEnable)
7152 {
7153 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
7154 struct radv_cmd_state *state = &cmd_buffer->state;
7155
7156 state->dynamic.vk.ds.depth.write_enable = depthWriteEnable;
7157
7158 state->dirty |= RADV_CMD_DIRTY_DYNAMIC_DEPTH_WRITE_ENABLE;
7159 }
7160
7161 VKAPI_ATTR void VKAPI_CALL
7162 radv_CmdSetDepthCompareOp(VkCommandBuffer commandBuffer, VkCompareOp depthCompareOp)
7163 {
7164 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
7165 struct radv_cmd_state *state = &cmd_buffer->state;
7166
7167 state->dynamic.vk.ds.depth.compare_op = depthCompareOp;
7168
7169 state->dirty |= RADV_CMD_DIRTY_DYNAMIC_DEPTH_COMPARE_OP;
7170 }
7171
7172 VKAPI_ATTR void VKAPI_CALL
7173 radv_CmdSetDepthBoundsTestEnable(VkCommandBuffer commandBuffer, VkBool32 depthBoundsTestEnable)
7174 {
7175 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
7176 struct radv_cmd_state *state = &cmd_buffer->state;
7177
7178 state->dynamic.vk.ds.depth.bounds_test.enable = depthBoundsTestEnable;
7179
7180 state->dirty |= RADV_CMD_DIRTY_DYNAMIC_DEPTH_BOUNDS_TEST_ENABLE;
7181 }
7182
7183 VKAPI_ATTR void VKAPI_CALL
7184 radv_CmdSetStencilTestEnable(VkCommandBuffer commandBuffer, VkBool32 stencilTestEnable)
7185 {
7186 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
7187 struct radv_cmd_state *state = &cmd_buffer->state;
7188
7189 state->dynamic.vk.ds.stencil.test_enable = stencilTestEnable;
7190
7191 state->dirty |= RADV_CMD_DIRTY_DYNAMIC_STENCIL_TEST_ENABLE;
7192 }
7193
7194 VKAPI_ATTR void VKAPI_CALL
7195 radv_CmdSetStencilOp(VkCommandBuffer commandBuffer, VkStencilFaceFlags faceMask, VkStencilOp failOp, VkStencilOp passOp,
7196 VkStencilOp depthFailOp, VkCompareOp compareOp)
7197 {
7198 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
7199 struct radv_cmd_state *state = &cmd_buffer->state;
7200
7201 if (faceMask & VK_STENCIL_FACE_FRONT_BIT) {
7202 state->dynamic.vk.ds.stencil.front.op.fail = failOp;
7203 state->dynamic.vk.ds.stencil.front.op.pass = passOp;
7204 state->dynamic.vk.ds.stencil.front.op.depth_fail = depthFailOp;
7205 state->dynamic.vk.ds.stencil.front.op.compare = compareOp;
7206 }
7207
7208 if (faceMask & VK_STENCIL_FACE_BACK_BIT) {
7209 state->dynamic.vk.ds.stencil.back.op.fail = failOp;
7210 state->dynamic.vk.ds.stencil.back.op.pass = passOp;
7211 state->dynamic.vk.ds.stencil.back.op.depth_fail = depthFailOp;
7212 state->dynamic.vk.ds.stencil.back.op.compare = compareOp;
7213 }
7214
7215 state->dirty |= RADV_CMD_DIRTY_DYNAMIC_STENCIL_OP;
7216 }
7217
7218 VKAPI_ATTR void VKAPI_CALL
7219 radv_CmdSetFragmentShadingRateKHR(VkCommandBuffer commandBuffer, const VkExtent2D *pFragmentSize,
7220 const VkFragmentShadingRateCombinerOpKHR combinerOps[2])
7221 {
7222 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
7223 struct radv_cmd_state *state = &cmd_buffer->state;
7224
7225 state->dynamic.vk.fsr.fragment_size = *pFragmentSize;
7226 for (unsigned i = 0; i < 2; i++)
7227 state->dynamic.vk.fsr.combiner_ops[i] = combinerOps[i];
7228
7229 state->dirty |= RADV_CMD_DIRTY_DYNAMIC_FRAGMENT_SHADING_RATE;
7230 }
7231
7232 VKAPI_ATTR void VKAPI_CALL
7233 radv_CmdSetDepthBiasEnable(VkCommandBuffer commandBuffer, VkBool32 depthBiasEnable)
7234 {
7235 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
7236 struct radv_cmd_state *state = &cmd_buffer->state;
7237
7238 state->dynamic.vk.rs.depth_bias.enable = depthBiasEnable;
7239
7240 state->dirty |= RADV_CMD_DIRTY_DYNAMIC_DEPTH_BIAS_ENABLE;
7241 }
7242
7243 VKAPI_ATTR void VKAPI_CALL
7244 radv_CmdSetPrimitiveRestartEnable(VkCommandBuffer commandBuffer, VkBool32 primitiveRestartEnable)
7245 {
7246 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
7247 struct radv_cmd_state *state = &cmd_buffer->state;
7248
7249 state->dynamic.vk.ia.primitive_restart_enable = primitiveRestartEnable;
7250
7251 state->dirty |= RADV_CMD_DIRTY_DYNAMIC_PRIMITIVE_RESTART_ENABLE;
7252 }
7253
7254 VKAPI_ATTR void VKAPI_CALL
7255 radv_CmdSetRasterizerDiscardEnable(VkCommandBuffer commandBuffer, VkBool32 rasterizerDiscardEnable)
7256 {
7257 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
7258 struct radv_cmd_state *state = &cmd_buffer->state;
7259
7260 state->dynamic.vk.rs.rasterizer_discard_enable = rasterizerDiscardEnable;
7261
7262 state->dirty |= RADV_CMD_DIRTY_DYNAMIC_RASTERIZER_DISCARD_ENABLE;
7263 }
7264
7265 VKAPI_ATTR void VKAPI_CALL
7266 radv_CmdSetPatchControlPointsEXT(VkCommandBuffer commandBuffer, uint32_t patchControlPoints)
7267 {
7268 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
7269 struct radv_cmd_state *state = &cmd_buffer->state;
7270
7271 state->dynamic.vk.ts.patch_control_points = patchControlPoints;
7272
7273 state->dirty |= RADV_CMD_DIRTY_DYNAMIC_PATCH_CONTROL_POINTS;
7274 }
7275
7276 VKAPI_ATTR void VKAPI_CALL
7277 radv_CmdSetLogicOpEXT(VkCommandBuffer commandBuffer, VkLogicOp logicOp)
7278 {
7279 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
7280 struct radv_cmd_state *state = &cmd_buffer->state;
7281 unsigned logic_op = radv_translate_blend_logic_op(logicOp);
7282
7283 state->dynamic.vk.cb.logic_op = logic_op;
7284
7285 state->dirty |= RADV_CMD_DIRTY_DYNAMIC_LOGIC_OP;
7286 }
7287
7288 VKAPI_ATTR void VKAPI_CALL
7289 radv_CmdSetColorWriteEnableEXT(VkCommandBuffer commandBuffer, uint32_t attachmentCount,
7290 const VkBool32 *pColorWriteEnables)
7291 {
7292 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
7293 struct radv_cmd_state *state = &cmd_buffer->state;
7294 uint8_t color_write_enable = 0;
7295
7296 assert(attachmentCount <= MAX_RTS);
7297
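   /* Pack the per-attachment booleans into a bitmask: bit i set means color writes are enabled
    * for attachment i.
    */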
7298 for (uint32_t i = 0; i < attachmentCount; i++) {
7299 if (pColorWriteEnables[i]) {
7300 color_write_enable |= BITFIELD_BIT(i);
7301 }
7302 }
7303
7304 state->dynamic.vk.cb.color_write_enables = color_write_enable;
7305
7306 state->dirty |= RADV_CMD_DIRTY_DYNAMIC_COLOR_WRITE_ENABLE;
7307 }
7308
7309 VKAPI_ATTR void VKAPI_CALL
7310 radv_CmdSetVertexInputEXT(VkCommandBuffer commandBuffer, uint32_t vertexBindingDescriptionCount,
7311 const VkVertexInputBindingDescription2EXT *pVertexBindingDescriptions,
7312 uint32_t vertexAttributeDescriptionCount,
7313 const VkVertexInputAttributeDescription2EXT *pVertexAttributeDescriptions)
7314 {
7315 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
7316 struct radv_cmd_state *state = &cmd_buffer->state;
7317 struct radv_vs_input_state *vs_state = &state->dynamic_vs_input;
7318
7319 const VkVertexInputBindingDescription2EXT *bindings[MAX_VBS];
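   /* Index the binding descriptions by binding number so each attribute below can look up its
    * binding directly.
    */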
7320 for (unsigned i = 0; i < vertexBindingDescriptionCount; i++)
7321 bindings[pVertexBindingDescriptions[i].binding] = &pVertexBindingDescriptions[i];
7322
7323 state->vbo_misaligned_mask = 0;
7324 state->vbo_misaligned_mask_invalid = 0;
7325
7326 vs_state->attribute_mask = 0;
7327 vs_state->instance_rate_inputs = 0;
7328 vs_state->nontrivial_divisors = 0;
7329 vs_state->zero_divisors = 0;
7330 vs_state->post_shuffle = 0;
7331 vs_state->alpha_adjust_lo = 0;
7332 vs_state->alpha_adjust_hi = 0;
7333 vs_state->nontrivial_formats = 0;
7334 vs_state->bindings_match_attrib = true;
7335
7336 enum amd_gfx_level chip = cmd_buffer->device->physical_device->rad_info.gfx_level;
7337 enum radeon_family family = cmd_buffer->device->physical_device->rad_info.family;
7338 const struct ac_vtx_format_info *vtx_info_table = ac_get_vtx_format_info_table(chip, family);
7339
7340 for (unsigned i = 0; i < vertexAttributeDescriptionCount; i++) {
7341 const VkVertexInputAttributeDescription2EXT *attrib = &pVertexAttributeDescriptions[i];
7342 const VkVertexInputBindingDescription2EXT *binding = bindings[attrib->binding];
7343 unsigned loc = attrib->location;
7344
7345 vs_state->attribute_mask |= 1u << loc;
7346 vs_state->bindings[loc] = attrib->binding;
7347 if (attrib->binding != loc)
7348 vs_state->bindings_match_attrib = false;
7349 if (binding->inputRate == VK_VERTEX_INPUT_RATE_INSTANCE) {
7350 vs_state->instance_rate_inputs |= 1u << loc;
7351 vs_state->divisors[loc] = binding->divisor;
7352 if (binding->divisor == 0) {
7353 vs_state->zero_divisors |= 1u << loc;
7354 } else if (binding->divisor > 1) {
7355 vs_state->nontrivial_divisors |= 1u << loc;
7356 }
7357 }
7358 cmd_buffer->vertex_bindings[attrib->binding].stride = binding->stride;
7359 vs_state->offsets[loc] = attrib->offset;
7360
7361 enum pipe_format format = vk_format_map[attrib->format];
7362 const struct ac_vtx_format_info *vtx_info = &vtx_info_table[format];
7363
7364 vs_state->formats[loc] = format;
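      /* Alignment requirement for this format: 4 bytes when channels are dword-sized, otherwise
       * the element size. Misaligned attributes are tracked in vbo_misaligned_mask below
       * (GFX6 and GFX10+ only).
       */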
7365 uint8_t align_req_minus_1 = vtx_info->chan_byte_size >= 4 ? 3 : (vtx_info->element_size - 1);
7366 vs_state->format_align_req_minus_1[loc] = align_req_minus_1;
7367 vs_state->format_sizes[loc] = vtx_info->element_size;
7368 vs_state->alpha_adjust_lo |= (vtx_info->alpha_adjust & 0x1) << loc;
7369 vs_state->alpha_adjust_hi |= (vtx_info->alpha_adjust >> 1) << loc;
7370 if (G_008F0C_DST_SEL_X(vtx_info->dst_sel) == V_008F0C_SQ_SEL_Z)
7371 vs_state->post_shuffle |= BITFIELD_BIT(loc);
7372
7373 if (!(vtx_info->has_hw_format & BITFIELD_BIT(vtx_info->num_channels - 1)))
7374 vs_state->nontrivial_formats |= BITFIELD_BIT(loc);
7375
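      /* Example: a binding stride of 6 with a dword-channel format (align_req_minus_1 == 3)
       * gives 6 & 3 == 2, so the attribute is flagged as misaligned.
       */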
7376 if ((chip == GFX6 || chip >= GFX10) && state->vbo_bound_mask & BITFIELD_BIT(attrib->binding)) {
7377 if (binding->stride & align_req_minus_1) {
7378 state->vbo_misaligned_mask |= BITFIELD_BIT(loc);
7379 } else if ((cmd_buffer->vertex_bindings[attrib->binding].offset + vs_state->offsets[loc]) &
7380 align_req_minus_1) {
7381 state->vbo_misaligned_mask |= BITFIELD_BIT(loc);
7382 }
7383 }
7384 }
7385
7386 state->dirty |= RADV_CMD_DIRTY_VERTEX_BUFFER | RADV_CMD_DIRTY_DYNAMIC_VERTEX_INPUT;
7387 }
7388
7389 VKAPI_ATTR void VKAPI_CALL
7390 radv_CmdSetPolygonModeEXT(VkCommandBuffer commandBuffer, VkPolygonMode polygonMode)
7391 {
7392 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
7393 struct radv_cmd_state *state = &cmd_buffer->state;
7394 unsigned polygon_mode = radv_translate_fill(polygonMode);
7395
7396 if (radv_polygon_mode_is_points_or_lines(state->dynamic.vk.rs.polygon_mode) !=
7397 radv_polygon_mode_is_points_or_lines(polygon_mode))
7398 state->dirty |= RADV_CMD_DIRTY_GUARDBAND;
7399
7400 state->dynamic.vk.rs.polygon_mode = polygon_mode;
7401
7402 state->dirty |= RADV_CMD_DIRTY_DYNAMIC_POLYGON_MODE;
7403 }
7404
7405 VKAPI_ATTR void VKAPI_CALL
7406 radv_CmdSetTessellationDomainOriginEXT(VkCommandBuffer commandBuffer, VkTessellationDomainOrigin domainOrigin)
7407 {
7408 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
7409 struct radv_cmd_state *state = &cmd_buffer->state;
7410
7411 state->dynamic.vk.ts.domain_origin = domainOrigin;
7412
7413 state->dirty |= RADV_CMD_DIRTY_DYNAMIC_TESS_DOMAIN_ORIGIN;
7414 }
7415
7416 VKAPI_ATTR void VKAPI_CALL
7417 radv_CmdSetLogicOpEnableEXT(VkCommandBuffer commandBuffer, VkBool32 logicOpEnable)
7418 {
7419 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
7420 struct radv_cmd_state *state = &cmd_buffer->state;
7421
7422 state->dynamic.vk.cb.logic_op_enable = logicOpEnable;
7423
7424 state->dirty |= RADV_CMD_DIRTY_DYNAMIC_LOGIC_OP_ENABLE;
7425 }
7426
7427 VKAPI_ATTR void VKAPI_CALL
7428 radv_CmdSetLineStippleEnableEXT(VkCommandBuffer commandBuffer, VkBool32 stippledLineEnable)
7429 {
7430 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
7431 struct radv_cmd_state *state = &cmd_buffer->state;
7432
7433 state->dynamic.vk.rs.line.stipple.enable = stippledLineEnable;
7434
7435 state->dirty |= RADV_CMD_DIRTY_DYNAMIC_LINE_STIPPLE_ENABLE;
7436 }
7437
7438 VKAPI_ATTR void VKAPI_CALL
7439 radv_CmdSetAlphaToCoverageEnableEXT(VkCommandBuffer commandBuffer, VkBool32 alphaToCoverageEnable)
7440 {
7441 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
7442 struct radv_cmd_state *state = &cmd_buffer->state;
7443
7444 state->dynamic.vk.ms.alpha_to_coverage_enable = alphaToCoverageEnable;
7445
7446 state->dirty |= RADV_CMD_DIRTY_DYNAMIC_ALPHA_TO_COVERAGE_ENABLE;
7447 }
7448
7449 VKAPI_ATTR void VKAPI_CALL
7450 radv_CmdSetSampleMaskEXT(VkCommandBuffer commandBuffer, VkSampleCountFlagBits samples, const VkSampleMask *pSampleMask)
7451 {
7452 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
7453 struct radv_cmd_state *state = &cmd_buffer->state;
7454
7455 state->dynamic.vk.ms.sample_mask = pSampleMask[0] & 0xffff;
7456
7457 state->dirty |= RADV_CMD_DIRTY_DYNAMIC_SAMPLE_MASK;
7458 }
7459
7460 VKAPI_ATTR void VKAPI_CALL
7461 radv_CmdSetDepthClipEnableEXT(VkCommandBuffer commandBuffer, VkBool32 depthClipEnable)
7462 {
7463 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
7464 struct radv_cmd_state *state = &cmd_buffer->state;
7465
7466 state->dynamic.vk.rs.depth_clip_enable = depthClipEnable;
7467
7468 state->dirty |= RADV_CMD_DIRTY_DYNAMIC_DEPTH_CLIP_ENABLE;
7469 }
7470
7471 VKAPI_ATTR void VKAPI_CALL
7472 radv_CmdSetConservativeRasterizationModeEXT(VkCommandBuffer commandBuffer,
7473 VkConservativeRasterizationModeEXT conservativeRasterizationMode)
7474 {
7475 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
7476 struct radv_cmd_state *state = &cmd_buffer->state;
7477
7478 state->dynamic.vk.rs.conservative_mode = conservativeRasterizationMode;
7479
7480 state->dirty |= RADV_CMD_DIRTY_DYNAMIC_CONSERVATIVE_RAST_MODE;
7481 }
7482
7483 VKAPI_ATTR void VKAPI_CALL
7484 radv_CmdSetDepthClipNegativeOneToOneEXT(VkCommandBuffer commandBuffer, VkBool32 negativeOneToOne)
7485 {
7486 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
7487 struct radv_cmd_state *state = &cmd_buffer->state;
7488
7489 state->dynamic.vk.vp.depth_clip_negative_one_to_one = negativeOneToOne;
7490
7491 state->dirty |= RADV_CMD_DIRTY_DYNAMIC_DEPTH_CLIP_NEGATIVE_ONE_TO_ONE;
7492 }
7493
7494 VKAPI_ATTR void VKAPI_CALL
7495 radv_CmdSetProvokingVertexModeEXT(VkCommandBuffer commandBuffer, VkProvokingVertexModeEXT provokingVertexMode)
7496 {
7497 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
7498 struct radv_cmd_state *state = &cmd_buffer->state;
7499
7500 state->dynamic.vk.rs.provoking_vertex = provokingVertexMode;
7501
7502 state->dirty |= RADV_CMD_DIRTY_DYNAMIC_PROVOKING_VERTEX_MODE;
7503 }
7504
7505 VKAPI_ATTR void VKAPI_CALL
7506 radv_CmdSetDepthClampEnableEXT(VkCommandBuffer commandBuffer, VkBool32 depthClampEnable)
7507 {
7508 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
7509 struct radv_cmd_state *state = &cmd_buffer->state;
7510
7511 state->dynamic.vk.rs.depth_clamp_enable = depthClampEnable;
7512
7513 state->dirty |= RADV_CMD_DIRTY_DYNAMIC_DEPTH_CLAMP_ENABLE;
7514 }
7515
7516 VKAPI_ATTR void VKAPI_CALL
7517 radv_CmdSetColorWriteMaskEXT(VkCommandBuffer commandBuffer, uint32_t firstAttachment, uint32_t attachmentCount,
7518 const VkColorComponentFlags *pColorWriteMasks)
7519 {
7520 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
7521 struct radv_cmd_state *state = &cmd_buffer->state;
7522
7523 assert(firstAttachment + attachmentCount <= MAX_RTS);
7524
7525 for (uint32_t i = 0; i < attachmentCount; i++) {
7526 uint32_t idx = firstAttachment + i;
7527
7528 state->dynamic.vk.cb.attachments[idx].write_mask = pColorWriteMasks[i];
7529 }
7530
7531 state->dirty |= RADV_CMD_DIRTY_DYNAMIC_COLOR_WRITE_MASK;
7532
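   /* RB+ register values depend on the color write masks, so re-emit them when the hardware
    * supports RB+.
    */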
7533 if (cmd_buffer->device->physical_device->rad_info.rbplus_allowed)
7534 state->dirty |= RADV_CMD_DIRTY_RBPLUS;
7535 }
7536
7537 VKAPI_ATTR void VKAPI_CALL
7538 radv_CmdSetColorBlendEnableEXT(VkCommandBuffer commandBuffer, uint32_t firstAttachment, uint32_t attachmentCount,
7539 const VkBool32 *pColorBlendEnables)
7540 {
7541 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
7542 struct radv_cmd_state *state = &cmd_buffer->state;
7543
7544 assert(firstAttachment + attachmentCount <= MAX_RTS);
7545
7546 for (uint32_t i = 0; i < attachmentCount; i++) {
7547 uint32_t idx = firstAttachment + i;
7548
7549 state->dynamic.vk.cb.attachments[idx].blend_enable = pColorBlendEnables[i];
7550 }
7551
7552 state->dirty |= RADV_CMD_DIRTY_DYNAMIC_COLOR_BLEND_ENABLE;
7553 }
7554
7555 VKAPI_ATTR void VKAPI_CALL
7556 radv_CmdSetRasterizationSamplesEXT(VkCommandBuffer commandBuffer, VkSampleCountFlagBits rasterizationSamples)
7557 {
7558 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
7559 struct radv_cmd_state *state = &cmd_buffer->state;
7560
7561 state->dynamic.vk.ms.rasterization_samples = rasterizationSamples;
7562
7563 state->dirty |= RADV_CMD_DIRTY_DYNAMIC_RASTERIZATION_SAMPLES;
7564 }
7565
7566 VKAPI_ATTR void VKAPI_CALL
7567 radv_CmdSetLineRasterizationModeEXT(VkCommandBuffer commandBuffer, VkLineRasterizationModeKHR lineRasterizationMode)
7568 {
7569 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
7570 struct radv_cmd_state *state = &cmd_buffer->state;
7571
7572 state->dynamic.vk.rs.line.mode = lineRasterizationMode;
7573
7574 state->dirty |= RADV_CMD_DIRTY_DYNAMIC_LINE_RASTERIZATION_MODE;
7575 }
7576
7577 VKAPI_ATTR void VKAPI_CALL
7578 radv_CmdSetColorBlendEquationEXT(VkCommandBuffer commandBuffer, uint32_t firstAttachment, uint32_t attachmentCount,
7579 const VkColorBlendEquationEXT *pColorBlendEquations)
7580 {
7581 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
7582 struct radv_cmd_state *state = &cmd_buffer->state;
7583
7584 assert(firstAttachment + attachmentCount <= MAX_RTS);
7585 for (uint32_t i = 0; i < attachmentCount; i++) {
7586 unsigned idx = firstAttachment + i;
7587
7588 state->dynamic.vk.cb.attachments[idx].src_color_blend_factor = pColorBlendEquations[i].srcColorBlendFactor;
7589 state->dynamic.vk.cb.attachments[idx].dst_color_blend_factor = pColorBlendEquations[i].dstColorBlendFactor;
7590 state->dynamic.vk.cb.attachments[idx].color_blend_op = pColorBlendEquations[i].colorBlendOp;
7591 state->dynamic.vk.cb.attachments[idx].src_alpha_blend_factor = pColorBlendEquations[i].srcAlphaBlendFactor;
7592 state->dynamic.vk.cb.attachments[idx].dst_alpha_blend_factor = pColorBlendEquations[i].dstAlphaBlendFactor;
7593 state->dynamic.vk.cb.attachments[idx].alpha_blend_op = pColorBlendEquations[i].alphaBlendOp;
7594 }
7595
7596 state->dirty |= RADV_CMD_DIRTY_DYNAMIC_COLOR_BLEND_EQUATION;
7597 }
7598
7599 VKAPI_ATTR void VKAPI_CALL
7600 radv_CmdSetSampleLocationsEnableEXT(VkCommandBuffer commandBuffer, VkBool32 sampleLocationsEnable)
7601 {
7602 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
7603 struct radv_cmd_state *state = &cmd_buffer->state;
7604
7605 state->dynamic.vk.ms.sample_locations_enable = sampleLocationsEnable;
7606
7607 state->dirty |= RADV_CMD_DIRTY_DYNAMIC_SAMPLE_LOCATIONS_ENABLE;
7608 }
7609
7610 VKAPI_ATTR void VKAPI_CALL
7611 radv_CmdSetDiscardRectangleEnableEXT(VkCommandBuffer commandBuffer, VkBool32 discardRectangleEnable)
7612 {
7613 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
7614 struct radv_cmd_state *state = &cmd_buffer->state;
7615
7616 state->dynamic.vk.dr.enable = discardRectangleEnable;
7617 state->dynamic.vk.dr.rectangle_count = discardRectangleEnable ? MAX_DISCARD_RECTANGLES : 0;
7618
7619 state->dirty |= RADV_CMD_DIRTY_DYNAMIC_DISCARD_RECTANGLE_ENABLE;
7620 }
7621
7622 VKAPI_ATTR void VKAPI_CALL
7623 radv_CmdSetDiscardRectangleModeEXT(VkCommandBuffer commandBuffer, VkDiscardRectangleModeEXT discardRectangleMode)
7624 {
7625 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
7626 struct radv_cmd_state *state = &cmd_buffer->state;
7627
7628 state->dynamic.vk.dr.mode = discardRectangleMode;
7629
7630 state->dirty |= RADV_CMD_DIRTY_DYNAMIC_DISCARD_RECTANGLE_MODE;
7631 }
7632
7633 VKAPI_ATTR void VKAPI_CALL
7634 radv_CmdSetAttachmentFeedbackLoopEnableEXT(VkCommandBuffer commandBuffer, VkImageAspectFlags aspectMask)
7635 {
7636 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
7637 struct radv_cmd_state *state = &cmd_buffer->state;
7638
7639 state->dynamic.feedback_loop_aspects = aspectMask;
7640
7641 state->dirty |= RADV_CMD_DIRTY_DYNAMIC_ATTACHMENT_FEEDBACK_LOOP_ENABLE;
7642 }
7643
7644 VKAPI_ATTR void VKAPI_CALL
7645 radv_CmdSetDepthBias2EXT(VkCommandBuffer commandBuffer, const VkDepthBiasInfoEXT *pDepthBiasInfo)
7646 {
7647 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
7648 struct radv_cmd_state *state = &cmd_buffer->state;
7649
7650 const VkDepthBiasRepresentationInfoEXT *dbr_info =
7651 vk_find_struct_const(pDepthBiasInfo->pNext, DEPTH_BIAS_REPRESENTATION_INFO_EXT);
7652
7653 state->dynamic.vk.rs.depth_bias.constant = pDepthBiasInfo->depthBiasConstantFactor;
7654 state->dynamic.vk.rs.depth_bias.clamp = pDepthBiasInfo->depthBiasClamp;
7655 state->dynamic.vk.rs.depth_bias.slope = pDepthBiasInfo->depthBiasSlopeFactor;
7656 state->dynamic.vk.rs.depth_bias.representation =
7657 dbr_info ? dbr_info->depthBiasRepresentation : VK_DEPTH_BIAS_REPRESENTATION_LEAST_REPRESENTABLE_VALUE_FORMAT_EXT;
7658
7659 state->dirty |= RADV_CMD_DIRTY_DYNAMIC_DEPTH_BIAS;
7660 }
7661
7662 VKAPI_ATTR void VKAPI_CALL
7663 radv_CmdExecuteCommands(VkCommandBuffer commandBuffer, uint32_t commandBufferCount, const VkCommandBuffer *pCmdBuffers)
7664 {
7665 RADV_FROM_HANDLE(radv_cmd_buffer, primary, commandBuffer);
7666
7667 assert(commandBufferCount > 0);
7668
7669 radv_emit_mip_change_flush_default(primary);
7670
7671 /* Emit pending flushes on primary prior to executing secondary */
7672 radv_emit_cache_flush(primary);
7673
7674 /* Make sure CP DMA is idle on primary prior to executing secondary. */
7675 radv_cp_dma_wait_for_idle(primary);
7676
7677 for (uint32_t i = 0; i < commandBufferCount; i++) {
7678 RADV_FROM_HANDLE(radv_cmd_buffer, secondary, pCmdBuffers[i]);
7679
7680 /* Do not launch an IB2 for secondary command buffers that contain
7681 * DRAW_{INDEX}_INDIRECT_{MULTI} on GFX6-7 because it's illegal and hangs the GPU.
7682 */
7683 const bool allow_ib2 =
7684 !secondary->state.uses_draw_indirect || secondary->device->physical_device->rad_info.gfx_level >= GFX8;
7685
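      /* Propagate the secondary's scratch and ring-size requirements to the primary. */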
7686 primary->scratch_size_per_wave_needed =
7687 MAX2(primary->scratch_size_per_wave_needed, secondary->scratch_size_per_wave_needed);
7688 primary->scratch_waves_wanted = MAX2(primary->scratch_waves_wanted, secondary->scratch_waves_wanted);
7689 primary->compute_scratch_size_per_wave_needed =
7690 MAX2(primary->compute_scratch_size_per_wave_needed, secondary->compute_scratch_size_per_wave_needed);
7691 primary->compute_scratch_waves_wanted =
7692 MAX2(primary->compute_scratch_waves_wanted, secondary->compute_scratch_waves_wanted);
7693
7694 if (secondary->esgs_ring_size_needed > primary->esgs_ring_size_needed)
7695 primary->esgs_ring_size_needed = secondary->esgs_ring_size_needed;
7696 if (secondary->gsvs_ring_size_needed > primary->gsvs_ring_size_needed)
7697 primary->gsvs_ring_size_needed = secondary->gsvs_ring_size_needed;
7698 if (secondary->tess_rings_needed)
7699 primary->tess_rings_needed = true;
7700 if (secondary->task_rings_needed)
7701 primary->task_rings_needed = true;
7702 if (secondary->mesh_scratch_ring_needed)
7703 primary->mesh_scratch_ring_needed = true;
7704 if (secondary->sample_positions_needed)
7705 primary->sample_positions_needed = true;
7706 if (secondary->gds_needed)
7707 primary->gds_needed = true;
7708 if (secondary->gds_oa_needed)
7709 primary->gds_oa_needed = true;
7710
7711 primary->shader_upload_seq = MAX2(primary->shader_upload_seq, secondary->shader_upload_seq);
7712
7713 if (!secondary->state.render.has_image_views && primary->state.render.active &&
7714 (primary->state.dirty & RADV_CMD_DIRTY_FRAMEBUFFER)) {
7715 /* Emit the framebuffer state from primary if secondary
7716 * has been recorded without a framebuffer, otherwise
7717 * fast color/depth clears can't work.
7718 */
7719 radv_emit_framebuffer_state(primary);
7720 }
7721
7722 if (secondary->gang.cs) {
7723 if (!radv_gang_init(primary))
7724 return;
7725
7726 struct radeon_cmdbuf *ace_primary = primary->gang.cs;
7727 struct radeon_cmdbuf *ace_secondary = secondary->gang.cs;
7728
7729 /* Emit pending flushes on primary prior to executing secondary. */
7730 radv_gang_cache_flush(primary);
7731
7732 /* Wait for gang semaphores, if necessary. */
7733 if (radv_flush_gang_leader_semaphore(primary))
7734 radv_wait_gang_leader(primary);
7735 if (radv_flush_gang_follower_semaphore(primary))
7736 radv_wait_gang_follower(primary);
7737
7738 /* Execute the secondary compute cmdbuf.
7739 * Don't use IB2 packets because they are not supported on compute queues.
7740 */
7741 primary->device->ws->cs_execute_secondary(ace_primary, ace_secondary, false);
7742 }
7743
7744 /* Update pending ACE internal flush bits from the secondary cmdbuf */
7745 primary->gang.flush_bits |= secondary->gang.flush_bits;
7746
7747 /* Increment gang semaphores if secondary was dirty.
7748 * This happens when the secondary cmdbuf has a barrier which
7749 * isn't consumed by a draw call.
7750 */
7751 if (radv_gang_leader_sem_dirty(secondary))
7752 primary->gang.sem.leader_value++;
7753 if (radv_gang_follower_sem_dirty(secondary))
7754 primary->gang.sem.follower_value++;
7755
7756 primary->device->ws->cs_execute_secondary(primary->cs, secondary->cs, allow_ib2);
7757
7758 /* When the secondary command buffer is compute only we don't
7759 * need to re-emit the current graphics pipeline.
7760 */
7761 if (secondary->state.emitted_graphics_pipeline) {
7762 primary->state.emitted_graphics_pipeline = secondary->state.emitted_graphics_pipeline;
7763 }
7764
7765 /* When the secondary command buffer is graphics only we don't
7766 * need to re-emit the current compute pipeline.
7767 */
7768 if (secondary->state.emitted_compute_pipeline) {
7769 primary->state.emitted_compute_pipeline = secondary->state.emitted_compute_pipeline;
7770 }
7771
7772 if (secondary->state.last_primitive_reset_index) {
7773 primary->state.last_primitive_reset_index = secondary->state.last_primitive_reset_index;
7774 }
7775
7776 if (secondary->state.last_ia_multi_vgt_param) {
7777 primary->state.last_ia_multi_vgt_param = secondary->state.last_ia_multi_vgt_param;
7778 }
7779
7780 if (secondary->state.last_ge_cntl) {
7781 primary->state.last_ge_cntl = secondary->state.last_ge_cntl;
7782 }
7783
7784 primary->state.last_num_instances = secondary->state.last_num_instances;
7785 primary->state.last_subpass_color_count = secondary->state.last_subpass_color_count;
7786 primary->state.last_sx_ps_downconvert = secondary->state.last_sx_ps_downconvert;
7787 primary->state.last_sx_blend_opt_epsilon = secondary->state.last_sx_blend_opt_epsilon;
7788 primary->state.last_sx_blend_opt_control = secondary->state.last_sx_blend_opt_control;
7789
7790 if (secondary->state.last_index_type != -1) {
7791 primary->state.last_index_type = secondary->state.last_index_type;
7792 }
7793
7794 primary->state.last_vrs_rates = secondary->state.last_vrs_rates;
7795 primary->state.last_vrs_rates_sgpr_idx = secondary->state.last_vrs_rates_sgpr_idx;
7796
7797 primary->state.last_pa_sc_binner_cntl_0 = secondary->state.last_pa_sc_binner_cntl_0;
7798
7799 primary->state.last_db_shader_control = secondary->state.last_db_shader_control;
7800
7801 primary->state.rb_noncoherent_dirty |= secondary->state.rb_noncoherent_dirty;
7802 }
7803
7804 /* After executing commands from secondary buffers we have to dirty
7805 * some states.
7806 */
7807 primary->state.dirty |= RADV_CMD_DIRTY_PIPELINE | RADV_CMD_DIRTY_INDEX_BUFFER | RADV_CMD_DIRTY_GUARDBAND |
7808 RADV_CMD_DIRTY_DYNAMIC_ALL | RADV_CMD_DIRTY_SHADER_QUERY | RADV_CMD_DIRTY_OCCLUSION_QUERY |
7809 RADV_CMD_DIRTY_DB_SHADER_CONTROL;
7810 radv_mark_descriptor_sets_dirty(primary, VK_PIPELINE_BIND_POINT_GRAPHICS);
7811 radv_mark_descriptor_sets_dirty(primary, VK_PIPELINE_BIND_POINT_COMPUTE);
7812
7813 primary->state.last_first_instance = -1;
7814 primary->state.last_drawid = -1;
7815 primary->state.last_vertex_offset_valid = false;
7816 primary->state.last_db_count_control = -1;
7817 }
7818
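/* Track whether any bound color or depth/stencil attachment bypasses the L2 cache
 * (non-coherent render backends), so that the extra cache flushes this requires can be
 * emitted later.
 */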
7819 static void
7820 radv_mark_noncoherent_rb(struct radv_cmd_buffer *cmd_buffer)
7821 {
7822 struct radv_rendering_state *render = &cmd_buffer->state.render;
7823
7824 /* Have to be conservative in cmdbuffers with inherited attachments. */
7825 if (!render->has_image_views) {
7826 cmd_buffer->state.rb_noncoherent_dirty = true;
7827 return;
7828 }
7829
7830 for (uint32_t i = 0; i < render->color_att_count; i++) {
7831 if (render->color_att[i].iview && !render->color_att[i].iview->image->l2_coherent) {
7832 cmd_buffer->state.rb_noncoherent_dirty = true;
7833 return;
7834 }
7835 }
7836 if (render->ds_att.iview && !render->ds_att.iview->image->l2_coherent)
7837 cmd_buffer->state.rb_noncoherent_dirty = true;
7838 }
7839
7840 static VkImageLayout
7841 attachment_initial_layout(const VkRenderingAttachmentInfo *att)
7842 {
7843 const VkRenderingAttachmentInitialLayoutInfoMESA *layout_info =
7844 vk_find_struct_const(att->pNext, RENDERING_ATTACHMENT_INITIAL_LAYOUT_INFO_MESA);
7845 if (layout_info != NULL)
7846 return layout_info->initialLayout;
7847
7848 return att->imageLayout;
7849 }
7850
7851 VKAPI_ATTR void VKAPI_CALL
7852 radv_CmdBeginRendering(VkCommandBuffer commandBuffer, const VkRenderingInfo *pRenderingInfo)
7853 {
7854 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
7855
7856 const struct VkSampleLocationsInfoEXT *sample_locs_info =
7857 vk_find_struct_const(pRenderingInfo->pNext, SAMPLE_LOCATIONS_INFO_EXT);
7858
7859 struct radv_sample_locations_state sample_locations = {
7860 .count = 0,
7861 };
7862 if (sample_locs_info) {
7863 sample_locations = (struct radv_sample_locations_state){
7864 .per_pixel = sample_locs_info->sampleLocationsPerPixel,
7865 .grid_size = sample_locs_info->sampleLocationGridSize,
7866 .count = sample_locs_info->sampleLocationsCount,
7867 };
7868 typed_memcpy(sample_locations.locations, sample_locs_info->pSampleLocations,
7869 sample_locs_info->sampleLocationsCount);
7870 }
7871
7872 /* Dynamic rendering does not have implicit transitions, so limit the marker to
7873 * when a render pass is used.
7874 * Additionally, some internal meta operations called inside a barrier may issue
7875 * render calls (with dynamic rendering), so this makes sure those cases don't
7876 * create a nested barrier scope.
7877 */
7878 if (cmd_buffer->vk.render_pass)
7879 radv_describe_barrier_start(cmd_buffer, RGP_BARRIER_EXTERNAL_RENDER_PASS_SYNC);
7880 uint32_t color_samples = 0, ds_samples = 0;
7881 struct radv_attachment color_att[MAX_RTS];
7882 for (uint32_t i = 0; i < pRenderingInfo->colorAttachmentCount; i++) {
7883 const VkRenderingAttachmentInfo *att_info = &pRenderingInfo->pColorAttachments[i];
7884
7885 color_att[i] = (struct radv_attachment){.iview = NULL};
7886 if (att_info->imageView == VK_NULL_HANDLE)
7887 continue;
7888
7889 VK_FROM_HANDLE(radv_image_view, iview, att_info->imageView);
7890 color_att[i].format = iview->vk.format;
7891 color_att[i].iview = iview;
7892 color_att[i].layout = att_info->imageLayout;
7893 radv_initialise_color_surface(cmd_buffer->device, &color_att[i].cb, iview);
7894
7895 if (att_info->resolveMode != VK_RESOLVE_MODE_NONE && att_info->resolveImageView != VK_NULL_HANDLE) {
7896 color_att[i].resolve_mode = att_info->resolveMode;
7897 color_att[i].resolve_iview = radv_image_view_from_handle(att_info->resolveImageView);
7898 color_att[i].resolve_layout = att_info->resolveImageLayout;
7899 }
7900
7901 color_samples = MAX2(color_samples, color_att[i].iview->vk.image->samples);
7902
7903 VkImageLayout initial_layout = attachment_initial_layout(att_info);
7904 if (initial_layout != color_att[i].layout) {
7905 assert(!(pRenderingInfo->flags & VK_RENDERING_RESUMING_BIT));
7906 radv_handle_rendering_image_transition(cmd_buffer, color_att[i].iview, pRenderingInfo->layerCount,
7907 pRenderingInfo->viewMask, initial_layout, VK_IMAGE_LAYOUT_UNDEFINED,
7908 color_att[i].layout, VK_IMAGE_LAYOUT_UNDEFINED, &sample_locations);
7909 }
7910 }
7911
7912 struct radv_attachment ds_att = {.iview = NULL};
7913 VkImageAspectFlags ds_att_aspects = 0;
7914 const VkRenderingAttachmentInfo *d_att_info = pRenderingInfo->pDepthAttachment;
7915 const VkRenderingAttachmentInfo *s_att_info = pRenderingInfo->pStencilAttachment;
7916 if ((d_att_info != NULL && d_att_info->imageView != VK_NULL_HANDLE) ||
7917 (s_att_info != NULL && s_att_info->imageView != VK_NULL_HANDLE)) {
7918 struct radv_image_view *d_iview = NULL, *s_iview = NULL;
7919 struct radv_image_view *d_res_iview = NULL, *s_res_iview = NULL;
7920 VkImageLayout initial_depth_layout = VK_IMAGE_LAYOUT_UNDEFINED;
7921 VkImageLayout initial_stencil_layout = VK_IMAGE_LAYOUT_UNDEFINED;
7922
7923 if (d_att_info != NULL && d_att_info->imageView != VK_NULL_HANDLE) {
7924 d_iview = radv_image_view_from_handle(d_att_info->imageView);
7925 initial_depth_layout = attachment_initial_layout(d_att_info);
7926 ds_att.layout = d_att_info->imageLayout;
7927
7928 if (d_att_info->resolveMode != VK_RESOLVE_MODE_NONE && d_att_info->resolveImageView != VK_NULL_HANDLE) {
7929 d_res_iview = radv_image_view_from_handle(d_att_info->resolveImageView);
7930 ds_att.resolve_mode = d_att_info->resolveMode;
7931 ds_att.resolve_layout = d_att_info->resolveImageLayout;
7932 }
7933 }
7934
7935 if (s_att_info != NULL && s_att_info->imageView != VK_NULL_HANDLE) {
7936 s_iview = radv_image_view_from_handle(s_att_info->imageView);
7937 initial_stencil_layout = attachment_initial_layout(s_att_info);
7938 ds_att.stencil_layout = s_att_info->imageLayout;
7939
7940 if (s_att_info->resolveMode != VK_RESOLVE_MODE_NONE && s_att_info->resolveImageView != VK_NULL_HANDLE) {
7941 s_res_iview = radv_image_view_from_handle(s_att_info->resolveImageView);
7942 ds_att.stencil_resolve_mode = s_att_info->resolveMode;
7943 ds_att.stencil_resolve_layout = s_att_info->resolveImageLayout;
7944 }
7945 }
7946
7947 assert(d_iview == NULL || s_iview == NULL || d_iview == s_iview);
7948 ds_att.iview = d_iview ? d_iview : s_iview;
 ds_att.format = ds_att.iview->vk.format;
7949
7950 if (d_iview && s_iview) {
7951 ds_att_aspects = VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT;
7952 } else if (d_iview) {
7953 ds_att_aspects = VK_IMAGE_ASPECT_DEPTH_BIT;
7954 } else {
7955 ds_att_aspects = VK_IMAGE_ASPECT_STENCIL_BIT;
7956 }
7957
7958 radv_initialise_ds_surface(cmd_buffer->device, &ds_att.ds, ds_att.iview, ds_att_aspects);
7959
7960 assert(d_res_iview == NULL || s_res_iview == NULL || d_res_iview == s_res_iview);
7961 ds_att.resolve_iview = d_res_iview ? d_res_iview : s_res_iview;
7962
7963 ds_samples = ds_att.iview->vk.image->samples;
7964
7965 if (initial_depth_layout != ds_att.layout || initial_stencil_layout != ds_att.stencil_layout) {
7966 assert(!(pRenderingInfo->flags & VK_RENDERING_RESUMING_BIT));
7967 radv_handle_rendering_image_transition(cmd_buffer, ds_att.iview, pRenderingInfo->layerCount,
7968 pRenderingInfo->viewMask, initial_depth_layout, initial_stencil_layout,
7969 ds_att.layout, ds_att.stencil_layout, &sample_locations);
7970 }
7971 }
7972 if (cmd_buffer->vk.render_pass)
7973 radv_describe_barrier_end(cmd_buffer);
7974
7975 const VkRenderingFragmentShadingRateAttachmentInfoKHR *fsr_info =
7976 vk_find_struct_const(pRenderingInfo->pNext, RENDERING_FRAGMENT_SHADING_RATE_ATTACHMENT_INFO_KHR);
7977 struct radv_attachment vrs_att = {.iview = NULL};
7978 VkExtent2D vrs_texel_size = {.width = 0};
7979 if (fsr_info && fsr_info->imageView) {
7980 VK_FROM_HANDLE(radv_image_view, iview, fsr_info->imageView);
7981 vrs_att = (struct radv_attachment){
7982 .format = iview->vk.format,
7983 .iview = iview,
7984 .layout = fsr_info->imageLayout,
7985 };
7986 vrs_texel_size = fsr_info->shadingRateAttachmentTexelSize;
7987 }
7988
7989 /* Now that we've done any layout transitions which may invoke meta, we can
7990 * fill out the actual rendering info and set up for the client's render pass.
7991 */
7992 radv_cmd_buffer_reset_rendering(cmd_buffer);
7993
7994 struct radv_rendering_state *render = &cmd_buffer->state.render;
7995 render->active = true;
7996 render->has_image_views = true;
7997 render->area = pRenderingInfo->renderArea;
7998 render->view_mask = pRenderingInfo->viewMask;
7999 render->layer_count = pRenderingInfo->layerCount;
8000 render->color_samples = color_samples;
8001 render->ds_samples = ds_samples;
8002 render->max_samples = MAX2(color_samples, ds_samples);
8003 render->sample_locations = sample_locations;
8004 render->color_att_count = pRenderingInfo->colorAttachmentCount;
8005 typed_memcpy(render->color_att, color_att, render->color_att_count);
8006 render->ds_att = ds_att;
8007 render->ds_att_aspects = ds_att_aspects;
8008 render->vrs_att = vrs_att;
8009 render->vrs_texel_size = vrs_texel_size;
8010 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_FRAMEBUFFER;
8011
8012 if (cmd_buffer->device->physical_device->rad_info.rbplus_allowed)
8013 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_RBPLUS;
8014
8015 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_DEPTH_BIAS | RADV_CMD_DIRTY_DYNAMIC_STENCIL_TEST_ENABLE;
8016
8017 if (render->vrs_att.iview && cmd_buffer->device->physical_device->rad_info.gfx_level == GFX10_3) {
8018 if (render->ds_att.iview &&
8019 radv_htile_enabled(render->ds_att.iview->image, render->ds_att.iview->vk.base_mip_level)) {
8020 /* When we have a VRS attachment and a depth/stencil attachment, we just need to copy the
8021 * VRS rates to the HTILE buffer of the attachment.
8022 */
8023 struct radv_image_view *ds_iview = render->ds_att.iview;
8024 struct radv_image *ds_image = ds_iview->image;
8025 uint32_t level = ds_iview->vk.base_mip_level;
8026
8027 /* HTILE buffer */
8028 uint64_t htile_offset = ds_image->bindings[0].offset + ds_image->planes[0].surface.meta_offset +
8029 ds_image->planes[0].surface.u.gfx9.meta_levels[level].offset;
8030 uint64_t htile_size = ds_image->planes[0].surface.u.gfx9.meta_levels[level].size;
8031 struct radv_buffer htile_buffer;
8032
8033 radv_buffer_init(&htile_buffer, cmd_buffer->device, ds_image->bindings[0].bo, htile_size, htile_offset);
8034
8035 assert(render->area.offset.x + render->area.extent.width <= ds_image->vk.extent.width &&
8036 render->area.offset.y + render->area.extent.height <= ds_image->vk.extent.height);
8037
8038 /* Copy the VRS rates to the HTILE buffer. */
8039 radv_copy_vrs_htile(cmd_buffer, render->vrs_att.iview->image, &render->area, ds_image, &htile_buffer, true);
8040
8041 radv_buffer_finish(&htile_buffer);
8042 } else {
8043 /* When a subpass uses a VRS attachment without binding a depth/stencil attachment, or when
8044 * HTILE isn't enabled, we use a fallback that copies the VRS rates to our internal HTILE buffer.
8045 */
8046 struct radv_image *ds_image = radv_cmd_buffer_get_vrs_image(cmd_buffer);
8047
8048 if (ds_image && render->area.offset.x < ds_image->vk.extent.width &&
8049 render->area.offset.y < ds_image->vk.extent.height) {
8050 /* HTILE buffer */
8051 struct radv_buffer *htile_buffer = cmd_buffer->device->vrs.buffer;
8052
8053 VkRect2D area = render->area;
8054 area.extent.width = MIN2(area.extent.width, ds_image->vk.extent.width - area.offset.x);
8055 area.extent.height = MIN2(area.extent.height, ds_image->vk.extent.height - area.offset.y);
8056
8057 /* Copy the VRS rates to the HTILE buffer. */
8058 radv_copy_vrs_htile(cmd_buffer, render->vrs_att.iview->image, &area, ds_image, htile_buffer, false);
8059 }
8060 }
8061 }
8062
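   /* Program the hardware window scissor to the render area so rasterization is clipped to it. */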
8063 radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 6);
8064 radeon_set_context_reg(cmd_buffer->cs, R_028204_PA_SC_WINDOW_SCISSOR_TL,
8065 S_028204_TL_X(render->area.offset.x) | S_028204_TL_Y(render->area.offset.y));
8066 radeon_set_context_reg(cmd_buffer->cs, R_028208_PA_SC_WINDOW_SCISSOR_BR,
8067 S_028208_BR_X(render->area.offset.x + render->area.extent.width) |
8068 S_028208_BR_Y(render->area.offset.y + render->area.extent.height));
8069
8070 radv_emit_fb_mip_change_flush(cmd_buffer);
8071
8072 if (!(pRenderingInfo->flags & VK_RENDERING_RESUMING_BIT))
8073 radv_cmd_buffer_clear_rendering(cmd_buffer, pRenderingInfo);
8074 }
8075
8076 VKAPI_ATTR void VKAPI_CALL
8077 radv_CmdEndRendering(VkCommandBuffer commandBuffer)
8078 {
8079 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
8080
8081 radv_mark_noncoherent_rb(cmd_buffer);
8082 radv_cmd_buffer_resolve_rendering(cmd_buffer);
8083 radv_cmd_buffer_reset_rendering(cmd_buffer);
8084 }
8085
8086 static void
8087 radv_emit_view_index_per_stage(struct radeon_cmdbuf *cs, const struct radv_shader *shader, uint32_t base_reg,
8088 unsigned index)
8089 {
8090 const struct radv_userdata_info *loc = radv_get_user_sgpr(shader, AC_UD_VIEW_INDEX);
8091
8092 if (loc->sgpr_idx == -1)
8093 return;
8094
8095 radeon_set_sh_reg(cs, base_reg + loc->sgpr_idx * 4, index);
8096 }
8097
8098 static void
8099 radv_emit_view_index(struct radv_cmd_buffer *cmd_buffer, unsigned index)
8100 {
8101 struct radeon_cmdbuf *cs = cmd_buffer->cs;
8102
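   /* The task shader is skipped here because it lives on the gang (ACE) command stream;
    * its view index is emitted separately below.
    */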
8103 radv_foreach_stage(stage, cmd_buffer->state.active_stages & ~VK_SHADER_STAGE_TASK_BIT_EXT)
8104 {
8105 const struct radv_shader *shader = radv_get_shader(cmd_buffer->state.shaders, stage);
8106
8107 radv_emit_view_index_per_stage(cs, shader, shader->info.user_data_0, index);
8108 }
8109
8110 if (cmd_buffer->state.gs_copy_shader) {
8111 radv_emit_view_index_per_stage(cs, cmd_buffer->state.gs_copy_shader, R_00B130_SPI_SHADER_USER_DATA_VS_0, index);
8112 }
8113
8114 if (cmd_buffer->state.active_stages & VK_SHADER_STAGE_TASK_BIT_EXT) {
8115 radv_emit_view_index_per_stage(cmd_buffer->gang.cs, cmd_buffer->state.shaders[MESA_SHADER_TASK],
8116 cmd_buffer->state.shaders[MESA_SHADER_TASK]->info.user_data_0, index);
8117 }
8118 }
8119
8120 /**
8121 * Emulates predication for MEC using COND_EXEC.
8122 * When the current command buffer is predicating, emit a COND_EXEC packet
8123 * so that the MEC skips the next few dwords worth of packets.
8124 *
8125 * To make it work with inverted conditional rendering, we allocate
8126 * space in the upload BO and emit some packets to invert the condition.
8127 */
8128 static void
8129 radv_cs_emit_compute_predication(struct radv_cmd_state *state, struct radeon_cmdbuf *cs, uint64_t inv_va,
8130 bool *inv_emitted, unsigned dwords)
8131 {
8132 if (!state->predicating)
8133 return;
8134
8135 uint64_t va = state->predication_va;
8136
8137 if (!state->predication_type) {
8138 /* Invert the condition the first time it is needed. */
8139 if (!*inv_emitted) {
8140 *inv_emitted = true;
8141
8142 /* Write 1 to the inverted predication VA. */
8143 radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
8144 radeon_emit(cs,
8145 COPY_DATA_SRC_SEL(COPY_DATA_IMM) | COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) | COPY_DATA_WR_CONFIRM);
8146 radeon_emit(cs, 1);
8147 radeon_emit(cs, 0);
8148 radeon_emit(cs, inv_va);
8149 radeon_emit(cs, inv_va >> 32);
8150
8151 /* If the API predication VA == 0, skip next command. */
8152 radeon_emit(cs, PKT3(PKT3_COND_EXEC, 3, 0));
8153 radeon_emit(cs, va);
8154 radeon_emit(cs, va >> 32);
8155 radeon_emit(cs, 0);
8156 radeon_emit(cs, 6); /* 1x COPY_DATA size */
8157
8158 /* Write 0 to the new predication VA (when the API condition != 0) */
8159 radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
8160 radeon_emit(cs,
8161 COPY_DATA_SRC_SEL(COPY_DATA_IMM) | COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) | COPY_DATA_WR_CONFIRM);
8162 radeon_emit(cs, 0);
8163 radeon_emit(cs, 0);
8164 radeon_emit(cs, inv_va);
8165 radeon_emit(cs, inv_va >> 32);
8166 }
8167
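   /* The inverted VA now holds the negation of the API condition, so predicate on it
    * instead of the user-provided address.
    */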
8168 va = inv_va;
8169 }
8170
8171 radeon_emit(cs, PKT3(PKT3_COND_EXEC, 3, 0));
8172 radeon_emit(cs, va);
8173 radeon_emit(cs, va >> 32);
8174 radeon_emit(cs, 0); /* Cache policy */
8175 radeon_emit(cs, dwords); /* Size of the predicated packet(s) in DWORDs. */
8176 }
8177
8178 static void
8179 radv_cs_emit_draw_packet(struct radv_cmd_buffer *cmd_buffer, uint32_t vertex_count, uint32_t use_opaque)
8180 {
8181 radeon_emit(cmd_buffer->cs, PKT3(PKT3_DRAW_INDEX_AUTO, 1, cmd_buffer->state.predicating));
8182 radeon_emit(cmd_buffer->cs, vertex_count);
8183 radeon_emit(cmd_buffer->cs, V_0287F0_DI_SRC_SEL_AUTO_INDEX | use_opaque);
8184 }
8185
8186 /**
8187 * Emit a PKT3_DRAW_INDEX_2 packet to render "index_count" vertices.
8188 *
8189 * The starting address "index_va" may point anywhere within the index buffer. The number of
8190 * indexes allocated in the index buffer *past that point* is specified by "max_index_count".
8191 * Hardware uses this information to return 0 for out-of-bounds reads.
8192 */
8193 static void
8194 radv_cs_emit_draw_indexed_packet(struct radv_cmd_buffer *cmd_buffer, uint64_t index_va, uint32_t max_index_count,
8195 uint32_t index_count, bool not_eop)
8196 {
8197 radeon_emit(cmd_buffer->cs, PKT3(PKT3_DRAW_INDEX_2, 4, cmd_buffer->state.predicating));
8198 radeon_emit(cmd_buffer->cs, max_index_count);
8199 radeon_emit(cmd_buffer->cs, index_va);
8200 radeon_emit(cmd_buffer->cs, index_va >> 32);
8201 radeon_emit(cmd_buffer->cs, index_count);
8202 /* NOT_EOP allows merging multiple draws into 1 wave, but only user VGPRs
8203 * can be changed between draws and GS fast launch must be disabled.
8204 * NOT_EOP doesn't work on gfx9 and older.
8205 */
8206 radeon_emit(cmd_buffer->cs, V_0287F0_DI_SRC_SEL_DMA | S_0287F0_NOT_EOP(not_eop));
8207 }
8208
8209 /* MUST inline this function to avoid massive perf loss in drawoverhead */
8210 ALWAYS_INLINE static void
8211 radv_cs_emit_indirect_draw_packet(struct radv_cmd_buffer *cmd_buffer, bool indexed, uint32_t draw_count,
8212 uint64_t count_va, uint32_t stride)
8213 {
8214 struct radeon_cmdbuf *cs = cmd_buffer->cs;
8215 const unsigned di_src_sel = indexed ? V_0287F0_DI_SRC_SEL_DMA : V_0287F0_DI_SRC_SEL_AUTO_INDEX;
8216 bool draw_id_enable = cmd_buffer->state.uses_drawid;
8217 uint32_t base_reg = cmd_buffer->state.vtx_base_sgpr;
8218 uint32_t vertex_offset_reg, start_instance_reg = 0, draw_id_reg = 0;
8219 bool predicating = cmd_buffer->state.predicating;
8220 bool mesh = cmd_buffer->state.mesh_shading;
8221 assert(base_reg);
8222
8223 /* just reset draw state for vertex data */
8224 cmd_buffer->state.last_first_instance = -1;
8225 cmd_buffer->state.last_num_instances = -1;
8226 cmd_buffer->state.last_drawid = -1;
8227 cmd_buffer->state.last_vertex_offset_valid = false;
8228
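   /* The indirect draw packets expect user SGPR locations as dword offsets relative to the
    * start of the SH register space, hence the subtraction and the shift by 2.
    */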
8229 vertex_offset_reg = (base_reg - SI_SH_REG_OFFSET) >> 2;
8230 if (cmd_buffer->state.uses_baseinstance)
8231 start_instance_reg = ((base_reg + (draw_id_enable ? 8 : 4)) - SI_SH_REG_OFFSET) >> 2;
8232 if (draw_id_enable)
8233 draw_id_reg = ((base_reg + mesh * 12 + 4) - SI_SH_REG_OFFSET) >> 2;
8234
8235 if (draw_count == 1 && !count_va && !draw_id_enable) {
8236 radeon_emit(cs, PKT3(indexed ? PKT3_DRAW_INDEX_INDIRECT : PKT3_DRAW_INDIRECT, 3, predicating));
8237 radeon_emit(cs, 0);
8238 radeon_emit(cs, vertex_offset_reg);
8239 radeon_emit(cs, start_instance_reg);
8240 radeon_emit(cs, di_src_sel);
8241 } else {
8242 radeon_emit(cs, PKT3(indexed ? PKT3_DRAW_INDEX_INDIRECT_MULTI : PKT3_DRAW_INDIRECT_MULTI, 8, predicating));
8243 radeon_emit(cs, 0);
8244 radeon_emit(cs, vertex_offset_reg);
8245 radeon_emit(cs, start_instance_reg);
8246 radeon_emit(cs, draw_id_reg | S_2C3_DRAW_INDEX_ENABLE(draw_id_enable) | S_2C3_COUNT_INDIRECT_ENABLE(!!count_va));
8247 radeon_emit(cs, draw_count); /* count */
8248 radeon_emit(cs, count_va); /* count_addr */
8249 radeon_emit(cs, count_va >> 32);
8250 radeon_emit(cs, stride); /* stride */
8251 radeon_emit(cs, di_src_sel);
8252 }
8253
8254 cmd_buffer->state.uses_draw_indirect = true;
8255 }
8256
8257 ALWAYS_INLINE static void
8258 radv_cs_emit_indirect_mesh_draw_packet(struct radv_cmd_buffer *cmd_buffer, uint32_t draw_count, uint64_t count_va,
8259 uint32_t stride)
8260 {
8261 const struct radv_shader *mesh_shader = cmd_buffer->state.shaders[MESA_SHADER_MESH];
8262 struct radeon_cmdbuf *cs = cmd_buffer->cs;
8263 uint32_t base_reg = cmd_buffer->state.vtx_base_sgpr;
8264 bool predicating = cmd_buffer->state.predicating;
8265 assert(base_reg || (!cmd_buffer->state.uses_drawid && !mesh_shader->info.cs.uses_grid_size));
8266
8267 /* Reset draw state. */
8268 cmd_buffer->state.last_first_instance = -1;
8269 cmd_buffer->state.last_num_instances = -1;
8270 cmd_buffer->state.last_drawid = -1;
8271 cmd_buffer->state.last_vertex_offset_valid = false;
8272
8273 uint32_t xyz_dim_enable = mesh_shader->info.cs.uses_grid_size;
8274 uint32_t xyz_dim_reg = !xyz_dim_enable ? 0 : (base_reg - SI_SH_REG_OFFSET) >> 2;
8275 uint32_t draw_id_enable = !!cmd_buffer->state.uses_drawid;
8276 uint32_t draw_id_reg = !draw_id_enable ? 0 : (base_reg + (xyz_dim_enable ? 12 : 0) - SI_SH_REG_OFFSET) >> 2;
8277
8278 uint32_t mode1_enable = !cmd_buffer->device->physical_device->mesh_fast_launch_2;
8279
8280 radeon_emit(cs, PKT3(PKT3_DISPATCH_MESH_INDIRECT_MULTI, 7, predicating) | PKT3_RESET_FILTER_CAM_S(1));
8281 radeon_emit(cs, 0); /* data_offset */
8282 radeon_emit(cs, S_4C1_XYZ_DIM_REG(xyz_dim_reg) | S_4C1_DRAW_INDEX_REG(draw_id_reg));
8283 if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX11)
8284 radeon_emit(cs, S_4C2_DRAW_INDEX_ENABLE(draw_id_enable) | S_4C2_COUNT_INDIRECT_ENABLE(!!count_va) |
8285 S_4C2_XYZ_DIM_ENABLE(xyz_dim_enable) | S_4C2_MODE1_ENABLE(mode1_enable));
8286 else
8287 radeon_emit(cs, S_4C2_DRAW_INDEX_ENABLE(draw_id_enable) | S_4C2_COUNT_INDIRECT_ENABLE(!!count_va));
8288 radeon_emit(cs, draw_count);
8289 radeon_emit(cs, count_va & 0xFFFFFFFF);
8290 radeon_emit(cs, count_va >> 32);
8291 radeon_emit(cs, stride);
8292 radeon_emit(cs, V_0287F0_DI_SRC_SEL_AUTO_INDEX);
8293 }
8294
8295 ALWAYS_INLINE static void
8296 radv_cs_emit_dispatch_taskmesh_direct_ace_packet(struct radv_cmd_buffer *cmd_buffer, const uint32_t x, const uint32_t y,
8297 const uint32_t z)
8298 {
8299 struct radv_shader *task_shader = cmd_buffer->state.shaders[MESA_SHADER_TASK];
8300 struct radeon_cmdbuf *cs = cmd_buffer->gang.cs;
8301 const bool predicating = cmd_buffer->state.predicating;
8302 const uint32_t dispatch_initiator =
8303 cmd_buffer->device->dispatch_initiator_task | S_00B800_CS_W32_EN(task_shader->info.wave_size == 32);
8304
8305 const struct radv_userdata_info *ring_entry_loc = radv_get_user_sgpr(task_shader, AC_UD_TASK_RING_ENTRY);
8306 assert(ring_entry_loc && ring_entry_loc->sgpr_idx != -1 && ring_entry_loc->num_sgprs == 1);
8307
8308 uint32_t ring_entry_reg = (R_00B900_COMPUTE_USER_DATA_0 + ring_entry_loc->sgpr_idx * 4 - SI_SH_REG_OFFSET) >> 2;
8309
8310 radeon_emit(cs, PKT3(PKT3_DISPATCH_TASKMESH_DIRECT_ACE, 4, predicating) | PKT3_SHADER_TYPE_S(1));
8311 radeon_emit(cs, x);
8312 radeon_emit(cs, y);
8313 radeon_emit(cs, z);
8314 radeon_emit(cs, dispatch_initiator);
8315 radeon_emit(cs, ring_entry_reg & 0xFFFF);
8316 }
8317
8318 ALWAYS_INLINE static void
8319 radv_cs_emit_dispatch_taskmesh_indirect_multi_ace_packet(struct radv_cmd_buffer *cmd_buffer, uint64_t data_va,
8320 uint32_t draw_count, uint64_t count_va, uint32_t stride)
8321 {
8322 assert((data_va & 0x03) == 0);
8323 assert((count_va & 0x03) == 0);
8324
8325 struct radv_shader *task_shader = cmd_buffer->state.shaders[MESA_SHADER_TASK];
8326 struct radeon_cmdbuf *cs = cmd_buffer->gang.cs;
8327
8328 const uint32_t xyz_dim_enable = task_shader->info.cs.uses_grid_size;
8329 const uint32_t draw_id_enable = task_shader->info.vs.needs_draw_id;
8330 const uint32_t dispatch_initiator =
8331 cmd_buffer->device->dispatch_initiator_task | S_00B800_CS_W32_EN(task_shader->info.wave_size == 32);
8332
8333 const struct radv_userdata_info *ring_entry_loc = radv_get_user_sgpr(task_shader, AC_UD_TASK_RING_ENTRY);
8334 const struct radv_userdata_info *xyz_dim_loc = radv_get_user_sgpr(task_shader, AC_UD_CS_GRID_SIZE);
8335 const struct radv_userdata_info *draw_id_loc = radv_get_user_sgpr(task_shader, AC_UD_CS_TASK_DRAW_ID);
8336
8337 assert(ring_entry_loc->sgpr_idx != -1 && ring_entry_loc->num_sgprs == 1);
8338 assert(!xyz_dim_enable || (xyz_dim_loc->sgpr_idx != -1 && xyz_dim_loc->num_sgprs == 3));
8339 assert(!draw_id_enable || (draw_id_loc->sgpr_idx != -1 && draw_id_loc->num_sgprs == 1));
8340
8341 const uint32_t ring_entry_reg =
8342 (R_00B900_COMPUTE_USER_DATA_0 + ring_entry_loc->sgpr_idx * 4 - SI_SH_REG_OFFSET) >> 2;
8343 const uint32_t xyz_dim_reg =
8344 !xyz_dim_enable ? 0 : (R_00B900_COMPUTE_USER_DATA_0 + xyz_dim_loc->sgpr_idx * 4 - SI_SH_REG_OFFSET) >> 2;
8345 const uint32_t draw_id_reg =
8346 !draw_id_enable ? 0 : (R_00B900_COMPUTE_USER_DATA_0 + draw_id_loc->sgpr_idx * 4 - SI_SH_REG_OFFSET) >> 2;
8347
8348 radeon_emit(cs, PKT3(PKT3_DISPATCH_TASKMESH_INDIRECT_MULTI_ACE, 9, 0) | PKT3_SHADER_TYPE_S(1));
8349 radeon_emit(cs, data_va);
8350 radeon_emit(cs, data_va >> 32);
8351 radeon_emit(cs, S_AD2_RING_ENTRY_REG(ring_entry_reg));
8352 radeon_emit(cs, S_AD3_COUNT_INDIRECT_ENABLE(!!count_va) | S_AD3_DRAW_INDEX_ENABLE(draw_id_enable) |
8353 S_AD3_XYZ_DIM_ENABLE(xyz_dim_enable) | S_AD3_DRAW_INDEX_REG(draw_id_reg));
8354 radeon_emit(cs, S_AD4_XYZ_DIM_REG(xyz_dim_reg));
8355 radeon_emit(cs, draw_count);
8356 radeon_emit(cs, count_va);
8357 radeon_emit(cs, count_va >> 32);
8358 radeon_emit(cs, stride);
8359 radeon_emit(cs, dispatch_initiator);
8360 }
8361
8362 ALWAYS_INLINE static void
8363 radv_cs_emit_dispatch_taskmesh_gfx_packet(struct radv_cmd_buffer *cmd_buffer)
8364 {
8365 const struct radv_shader *mesh_shader = cmd_buffer->state.shaders[MESA_SHADER_MESH];
8366 struct radeon_cmdbuf *cs = cmd_buffer->cs;
8367 bool predicating = cmd_buffer->state.predicating;
8368
8369 const struct radv_userdata_info *ring_entry_loc =
8370 radv_get_user_sgpr(cmd_buffer->state.last_vgt_shader, AC_UD_TASK_RING_ENTRY);
8371
8372 assert(ring_entry_loc->sgpr_idx != -1);
8373
8374 uint32_t xyz_dim_en = mesh_shader->info.cs.uses_grid_size;
8375 uint32_t xyz_dim_reg = !xyz_dim_en ? 0 : (cmd_buffer->state.vtx_base_sgpr - SI_SH_REG_OFFSET) >> 2;
8376 uint32_t ring_entry_reg = ((mesh_shader->info.user_data_0 - SI_SH_REG_OFFSET) >> 2) + ring_entry_loc->sgpr_idx;
8377 uint32_t mode1_en = !cmd_buffer->device->physical_device->mesh_fast_launch_2;
8378 uint32_t linear_dispatch_en = cmd_buffer->state.shaders[MESA_SHADER_TASK]->info.cs.linear_taskmesh_dispatch;
8379 const bool sqtt_en = !!cmd_buffer->device->sqtt.bo;
8380
8381 radeon_emit(cs, PKT3(PKT3_DISPATCH_TASKMESH_GFX, 2, predicating) | PKT3_RESET_FILTER_CAM_S(1));
8382 radeon_emit(cs, S_4D0_RING_ENTRY_REG(ring_entry_reg) | S_4D0_XYZ_DIM_REG(xyz_dim_reg));
8383 if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX11)
8384 radeon_emit(cs, S_4D1_XYZ_DIM_ENABLE(xyz_dim_en) | S_4D1_MODE1_ENABLE(mode1_en) |
8385 S_4D1_LINEAR_DISPATCH_ENABLE(linear_dispatch_en) | S_4D1_THREAD_TRACE_MARKER_ENABLE(sqtt_en));
8386 else
8387 radeon_emit(cs, S_4D1_THREAD_TRACE_MARKER_ENABLE(sqtt_en));
8388 radeon_emit(cs, V_0287F0_DI_SRC_SEL_AUTO_INDEX);
8389 }
8390
8391 ALWAYS_INLINE static void
8392 radv_emit_userdata_vertex_internal(struct radv_cmd_buffer *cmd_buffer, const struct radv_draw_info *info,
8393 const uint32_t vertex_offset)
8394 {
8395 struct radv_cmd_state *state = &cmd_buffer->state;
8396 struct radeon_cmdbuf *cs = cmd_buffer->cs;
8397 const bool uses_baseinstance = state->uses_baseinstance;
8398 const bool uses_drawid = state->uses_drawid;
8399
8400 radeon_set_sh_reg_seq(cs, state->vtx_base_sgpr, state->vtx_emit_num);
8401
8402 radeon_emit(cs, vertex_offset);
8403 state->last_vertex_offset_valid = true;
8404 state->last_vertex_offset = vertex_offset;
8405 if (uses_drawid) {
8406 radeon_emit(cs, 0);
8407 state->last_drawid = 0;
8408 }
8409 if (uses_baseinstance) {
8410 radeon_emit(cs, info->first_instance);
8411 state->last_first_instance = info->first_instance;
8412 }
8413 }
8414
8415 ALWAYS_INLINE static void
8416 radv_emit_userdata_vertex(struct radv_cmd_buffer *cmd_buffer, const struct radv_draw_info *info,
8417 const uint32_t vertex_offset)
8418 {
8419 const struct radv_cmd_state *state = &cmd_buffer->state;
8420 const bool uses_baseinstance = state->uses_baseinstance;
8421 const bool uses_drawid = state->uses_drawid;
8422
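   /* Only re-emit the vertex user SGPRs when any of them changed since the last draw;
    * redundant SET_SH_REG packets would only waste command buffer space.
    */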
8423 if (!state->last_vertex_offset_valid || vertex_offset != state->last_vertex_offset ||
8424 (uses_drawid && 0 != state->last_drawid) ||
8425 (uses_baseinstance && info->first_instance != state->last_first_instance))
8426 radv_emit_userdata_vertex_internal(cmd_buffer, info, vertex_offset);
8427 }
8428
8429 ALWAYS_INLINE static void
8430 radv_emit_userdata_vertex_drawid(struct radv_cmd_buffer *cmd_buffer, uint32_t vertex_offset, uint32_t drawid)
8431 {
8432 struct radv_cmd_state *state = &cmd_buffer->state;
8433 struct radeon_cmdbuf *cs = cmd_buffer->cs;
8434 radeon_set_sh_reg_seq(cs, state->vtx_base_sgpr, 1 + !!drawid);
8435 radeon_emit(cs, vertex_offset);
8436 state->last_vertex_offset_valid = true;
8437 state->last_vertex_offset = vertex_offset;
8438 if (drawid)
8439 radeon_emit(cs, drawid);
8440 }
8441
8442 ALWAYS_INLINE static void
8443 radv_emit_userdata_mesh(struct radv_cmd_buffer *cmd_buffer, const uint32_t x, const uint32_t y, const uint32_t z)
8444 {
8445 struct radv_cmd_state *state = &cmd_buffer->state;
8446 const struct radv_shader *mesh_shader = state->shaders[MESA_SHADER_MESH];
8447 struct radeon_cmdbuf *cs = cmd_buffer->cs;
8448 const bool uses_drawid = state->uses_drawid;
8449 const bool uses_grid_size = mesh_shader->info.cs.uses_grid_size;
8450
8451 if (!uses_drawid && !uses_grid_size)
8452 return;
8453
8454 radeon_set_sh_reg_seq(cs, state->vtx_base_sgpr, state->vtx_emit_num);
8455 if (uses_grid_size) {
8456 radeon_emit(cs, x);
8457 radeon_emit(cs, y);
8458 radeon_emit(cs, z);
8459 }
8460 if (uses_drawid) {
8461 radeon_emit(cs, 0);
8462 state->last_drawid = 0;
8463 }
8464 }
8465
8466 ALWAYS_INLINE static void
8467 radv_emit_userdata_task(struct radv_cmd_buffer *cmd_buffer, uint32_t x, uint32_t y, uint32_t z, uint32_t draw_id)
8468 {
8469 struct radv_shader *task_shader = cmd_buffer->state.shaders[MESA_SHADER_TASK];
8470 struct radeon_cmdbuf *cs = cmd_buffer->gang.cs;
8471
8472 const struct radv_userdata_info *xyz_loc = radv_get_user_sgpr(task_shader, AC_UD_CS_GRID_SIZE);
8473 const struct radv_userdata_info *draw_id_loc = radv_get_user_sgpr(task_shader, AC_UD_CS_TASK_DRAW_ID);
8474
8475 if (xyz_loc->sgpr_idx != -1) {
8476 assert(xyz_loc->num_sgprs == 3);
8477 unsigned xyz_reg = R_00B900_COMPUTE_USER_DATA_0 + xyz_loc->sgpr_idx * 4;
8478
8479 radeon_set_sh_reg_seq(cs, xyz_reg, 3);
8480 radeon_emit(cs, x);
8481 radeon_emit(cs, y);
8482 radeon_emit(cs, z);
8483 }
8484
8485 if (draw_id_loc->sgpr_idx != -1) {
8486 assert(draw_id_loc->num_sgprs == 1);
8487 unsigned draw_id_reg = R_00B900_COMPUTE_USER_DATA_0 + draw_id_loc->sgpr_idx * 4;
8488
8489 radeon_set_sh_reg_seq(cs, draw_id_reg, 1);
8490 radeon_emit(cs, draw_id);
8491 }
8492 }
8493
8494 ALWAYS_INLINE static void
8495 radv_emit_draw_packets_indexed(struct radv_cmd_buffer *cmd_buffer, const struct radv_draw_info *info,
8496 uint32_t drawCount, const VkMultiDrawIndexedInfoEXT *minfo, uint32_t stride,
8497 const int32_t *vertexOffset)
8498
8499 {
8500 struct radv_cmd_state *state = &cmd_buffer->state;
8501 struct radeon_cmdbuf *cs = cmd_buffer->cs;
8502 const int index_size = radv_get_vgt_index_size(state->index_type);
8503 unsigned i = 0;
8504 const bool uses_drawid = state->uses_drawid;
8505 const bool can_eop = !uses_drawid && cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX10;
8506
8507 if (uses_drawid) {
8508 if (vertexOffset) {
8509 radv_emit_userdata_vertex(cmd_buffer, info, *vertexOffset);
8510 vk_foreach_multi_draw_indexed (draw, i, minfo, drawCount, stride) {
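            /* Clamp so that a firstIndex past the end of the index buffer yields zero
             * remaining indices instead of underflowing the unsigned subtraction.
             */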
8511 uint32_t remaining_indexes = MAX2(state->max_index_count, draw->firstIndex) - draw->firstIndex;
8512 uint64_t index_va = state->index_va + draw->firstIndex * index_size;
8513
8514 /* Handle draw calls with 0-sized index buffers if the GPU can't support them. */
8515 if (!remaining_indexes && cmd_buffer->device->physical_device->rad_info.has_zero_index_buffer_bug)
8516 radv_handle_zero_index_buffer_bug(cmd_buffer, &index_va, &remaining_indexes);
8517
8518 if (i > 0)
8519 radeon_set_sh_reg(cs, state->vtx_base_sgpr + sizeof(uint32_t), i);
8520
8521 if (!state->render.view_mask) {
8522 radv_cs_emit_draw_indexed_packet(cmd_buffer, index_va, remaining_indexes, draw->indexCount, false);
8523 } else {
8524 u_foreach_bit (view, state->render.view_mask) {
8525 radv_emit_view_index(cmd_buffer, view);
8526
8527 radv_cs_emit_draw_indexed_packet(cmd_buffer, index_va, remaining_indexes, draw->indexCount, false);
8528 }
8529 }
8530 }
8531 } else {
8532 vk_foreach_multi_draw_indexed (draw, i, minfo, drawCount, stride) {
8533 uint32_t remaining_indexes = MAX2(state->max_index_count, draw->firstIndex) - draw->firstIndex;
8534 uint64_t index_va = state->index_va + draw->firstIndex * index_size;
8535
8536 /* Handle draw calls with 0-sized index buffers if the GPU can't support them. */
8537 if (!remaining_indexes && cmd_buffer->device->physical_device->rad_info.has_zero_index_buffer_bug)
8538 radv_handle_zero_index_buffer_bug(cmd_buffer, &index_va, &remaining_indexes);
8539
8540 if (i > 0) {
8541 assert(state->last_vertex_offset_valid);
8542 if (state->last_vertex_offset != draw->vertexOffset)
8543 radv_emit_userdata_vertex_drawid(cmd_buffer, draw->vertexOffset, i);
8544 else
8545 radeon_set_sh_reg(cs, state->vtx_base_sgpr + sizeof(uint32_t), i);
8546 } else
8547 radv_emit_userdata_vertex(cmd_buffer, info, draw->vertexOffset);
8548
8549 if (!state->render.view_mask) {
8550 radv_cs_emit_draw_indexed_packet(cmd_buffer, index_va, remaining_indexes, draw->indexCount, false);
8551 } else {
8552 u_foreach_bit (view, state->render.view_mask) {
8553 radv_emit_view_index(cmd_buffer, view);
8554
8555 radv_cs_emit_draw_indexed_packet(cmd_buffer, index_va, remaining_indexes, draw->indexCount, false);
8556 }
8557 }
8558 }
8559 }
8560 if (drawCount > 1) {
8561 state->last_drawid = drawCount - 1;
8562 }
8563 } else {
8564 if (vertexOffset) {
8565 if (cmd_buffer->device->physical_device->rad_info.gfx_level == GFX10) {
8566 /* GFX10 has a bug that consecutive draw packets with NOT_EOP must not have
8567 * count == 0 for the last draw that doesn't have NOT_EOP.
8568 */
8569 while (drawCount > 1) {
8570 const VkMultiDrawIndexedInfoEXT *last =
8571 (const VkMultiDrawIndexedInfoEXT *)(((const uint8_t *)minfo) + (drawCount - 1) * stride);
8572 if (last->indexCount)
8573 break;
8574 drawCount--;
8575 }
8576 }
8577
8578 radv_emit_userdata_vertex(cmd_buffer, info, *vertexOffset);
8579 vk_foreach_multi_draw_indexed (draw, i, minfo, drawCount, stride) {
8580 uint32_t remaining_indexes = MAX2(state->max_index_count, draw->firstIndex) - draw->firstIndex;
8581 uint64_t index_va = state->index_va + draw->firstIndex * index_size;
8582
8583 /* Handle draw calls with 0-sized index buffers if the GPU can't support them. */
8584 if (!remaining_indexes && cmd_buffer->device->physical_device->rad_info.has_zero_index_buffer_bug)
8585 radv_handle_zero_index_buffer_bug(cmd_buffer, &index_va, &remaining_indexes);
8586
8587 if (!state->render.view_mask) {
8588 radv_cs_emit_draw_indexed_packet(cmd_buffer, index_va, remaining_indexes, draw->indexCount,
8589 can_eop && i < drawCount - 1);
8590 } else {
8591 u_foreach_bit (view, state->render.view_mask) {
8592 radv_emit_view_index(cmd_buffer, view);
8593
8594 radv_cs_emit_draw_indexed_packet(cmd_buffer, index_va, remaining_indexes, draw->indexCount, false);
8595 }
8596 }
8597 }
8598 } else {
8599 vk_foreach_multi_draw_indexed (draw, i, minfo, drawCount, stride) {
8600 uint32_t remaining_indexes = MAX2(state->max_index_count, draw->firstIndex) - draw->firstIndex;
8601 uint64_t index_va = state->index_va + draw->firstIndex * index_size;
8602
8603 /* Handle draw calls with 0-sized index buffers if the GPU can't support them. */
8604 if (!remaining_indexes && cmd_buffer->device->physical_device->rad_info.has_zero_index_buffer_bug)
8605 radv_handle_zero_index_buffer_bug(cmd_buffer, &index_va, &remaining_indexes);
8606
8607 const VkMultiDrawIndexedInfoEXT *next =
8608 (const VkMultiDrawIndexedInfoEXT *)(i < drawCount - 1 ? ((uint8_t *)draw + stride) : NULL);
8609 const bool offset_changes = next && next->vertexOffset != draw->vertexOffset;
8610 radv_emit_userdata_vertex(cmd_buffer, info, draw->vertexOffset);
8611
8612 if (!state->render.view_mask) {
8613 radv_cs_emit_draw_indexed_packet(cmd_buffer, index_va, remaining_indexes, draw->indexCount,
8614 can_eop && !offset_changes && i < drawCount - 1);
8615 } else {
8616 u_foreach_bit (view, state->render.view_mask) {
8617 radv_emit_view_index(cmd_buffer, view);
8618
8619 radv_cs_emit_draw_indexed_packet(cmd_buffer, index_va, remaining_indexes, draw->indexCount, false);
8620 }
8621 }
8622 }
8623 }
8624 if (drawCount > 1) {
8625 state->last_drawid = drawCount - 1;
8626 }
8627 }
8628 }
8629
8630 ALWAYS_INLINE static void
8631 radv_emit_direct_draw_packets(struct radv_cmd_buffer *cmd_buffer, const struct radv_draw_info *info, uint32_t drawCount,
8632 const VkMultiDrawInfoEXT *minfo, uint32_t use_opaque, uint32_t stride)
8633 {
8634 unsigned i = 0;
8635 const uint32_t view_mask = cmd_buffer->state.render.view_mask;
8636 const bool uses_drawid = cmd_buffer->state.uses_drawid;
8637 uint32_t last_start = 0;
8638
8639 vk_foreach_multi_draw (draw, i, minfo, drawCount, stride) {
8640 if (!i)
8641 radv_emit_userdata_vertex(cmd_buffer, info, draw->firstVertex);
8642 else
8643 radv_emit_userdata_vertex_drawid(cmd_buffer, draw->firstVertex, uses_drawid ? i : 0);
8644
8645 if (!view_mask) {
8646 radv_cs_emit_draw_packet(cmd_buffer, draw->vertexCount, use_opaque);
8647 } else {
8648 u_foreach_bit (view, view_mask) {
8649 radv_emit_view_index(cmd_buffer, view);
8650 radv_cs_emit_draw_packet(cmd_buffer, draw->vertexCount, use_opaque);
8651 }
8652 }
8653 last_start = draw->firstVertex;
8654 }
8655 if (drawCount > 1) {
8656 struct radv_cmd_state *state = &cmd_buffer->state;
8657 assert(state->last_vertex_offset_valid);
8658 state->last_vertex_offset = last_start;
8659 if (uses_drawid)
8660 state->last_drawid = drawCount - 1;
8661 }
8662 }
8663
8664 static void
8665 radv_cs_emit_mesh_dispatch_packet(struct radv_cmd_buffer *cmd_buffer, uint32_t x, uint32_t y, uint32_t z)
8666 {
8667 radeon_emit(cmd_buffer->cs, PKT3(PKT3_DISPATCH_MESH_DIRECT, 3, cmd_buffer->state.predicating));
8668 radeon_emit(cmd_buffer->cs, x);
8669 radeon_emit(cmd_buffer->cs, y);
8670 radeon_emit(cmd_buffer->cs, z);
8671 radeon_emit(cmd_buffer->cs, S_0287F0_SOURCE_SELECT(V_0287F0_DI_SRC_SEL_AUTO_INDEX));
8672 }
8673
8674 ALWAYS_INLINE static void
8675 radv_emit_direct_mesh_draw_packet(struct radv_cmd_buffer *cmd_buffer, uint32_t x, uint32_t y, uint32_t z)
8676 {
8677 const uint32_t view_mask = cmd_buffer->state.render.view_mask;
8678
8679 radv_emit_userdata_mesh(cmd_buffer, x, y, z);
8680
8681 if (cmd_buffer->device->physical_device->mesh_fast_launch_2) {
8682 if (!view_mask) {
8683 radv_cs_emit_mesh_dispatch_packet(cmd_buffer, x, y, z);
8684 } else {
8685 u_foreach_bit (view, view_mask) {
8686 radv_emit_view_index(cmd_buffer, view);
8687 radv_cs_emit_mesh_dispatch_packet(cmd_buffer, x, y, z);
8688 }
8689 }
8690 } else {
8691 const uint32_t count = x * y * z;
8692 if (!view_mask) {
8693 radv_cs_emit_draw_packet(cmd_buffer, count, 0);
8694 } else {
8695 u_foreach_bit (view, view_mask) {
8696 radv_emit_view_index(cmd_buffer, view);
8697 radv_cs_emit_draw_packet(cmd_buffer, count, 0);
8698 }
8699 }
8700 }
8701 }
8702
8703 ALWAYS_INLINE static void
8704 radv_emit_indirect_mesh_draw_packets(struct radv_cmd_buffer *cmd_buffer, const struct radv_draw_info *info)
8705 {
8706 const struct radv_cmd_state *state = &cmd_buffer->state;
8707 struct radeon_winsys *ws = cmd_buffer->device->ws;
8708 struct radeon_cmdbuf *cs = cmd_buffer->cs;
8709 const uint64_t va = radv_buffer_get_va(info->indirect->bo) + info->indirect->offset + info->indirect_offset;
8710 const uint64_t count_va = !info->count_buffer ? 0
8711 : radv_buffer_get_va(info->count_buffer->bo) +
8712 info->count_buffer->offset + info->count_buffer_offset;
8713
8714 radv_cs_add_buffer(ws, cs, info->indirect->bo);
8715
8716 if (info->count_buffer) {
8717 radv_cs_add_buffer(ws, cs, info->count_buffer->bo);
8718 }
8719
8720 radeon_emit(cs, PKT3(PKT3_SET_BASE, 2, 0));
8721 radeon_emit(cs, 1);
8722 radeon_emit(cs, va);
8723 radeon_emit(cs, va >> 32);
8724
8725 if (state->uses_drawid) {
8726 const struct radv_shader *mesh_shader = state->shaders[MESA_SHADER_MESH];
8727 unsigned reg = state->vtx_base_sgpr + (mesh_shader->info.cs.uses_grid_size ? 12 : 0);
8728 radeon_set_sh_reg_seq(cs, reg, 1);
8729 radeon_emit(cs, 0);
8730 }
8731
8732 if (!state->render.view_mask) {
8733 radv_cs_emit_indirect_mesh_draw_packet(cmd_buffer, info->count, count_va, info->stride);
8734 } else {
8735 u_foreach_bit (i, state->render.view_mask) {
8736 radv_emit_view_index(cmd_buffer, i);
8737 radv_cs_emit_indirect_mesh_draw_packet(cmd_buffer, info->count, count_va, info->stride);
8738 }
8739 }
8740 }
8741
8742 ALWAYS_INLINE static void
8743 radv_emit_direct_taskmesh_draw_packets(struct radv_cmd_buffer *cmd_buffer, uint32_t x, uint32_t y, uint32_t z)
8744 {
8745 const uint32_t view_mask = cmd_buffer->state.render.view_mask;
8746 const unsigned num_views = MAX2(1, util_bitcount(view_mask));
8747 unsigned ace_predication_size = num_views * 6; /* DISPATCH_TASKMESH_DIRECT_ACE size */
8748
8749 radv_emit_userdata_task(cmd_buffer, x, y, z, 0);
8750 radv_cs_emit_compute_predication(&cmd_buffer->state, cmd_buffer->gang.cs, cmd_buffer->mec_inv_pred_va,
8751 &cmd_buffer->mec_inv_pred_emitted, ace_predication_size);
8752
8753 if (!view_mask) {
8754 radv_cs_emit_dispatch_taskmesh_direct_ace_packet(cmd_buffer, x, y, z);
8755 radv_cs_emit_dispatch_taskmesh_gfx_packet(cmd_buffer);
8756 } else {
8757 u_foreach_bit (view, view_mask) {
8758 radv_emit_view_index(cmd_buffer, view);
8759 radv_cs_emit_dispatch_taskmesh_direct_ace_packet(cmd_buffer, x, y, z);
8760 radv_cs_emit_dispatch_taskmesh_gfx_packet(cmd_buffer);
8761 }
8762 }
8763 }
8764
8765 static void
8766 radv_emit_indirect_taskmesh_draw_packets(struct radv_cmd_buffer *cmd_buffer, const struct radv_draw_info *info)
8767 {
8768 const uint32_t view_mask = cmd_buffer->state.render.view_mask;
8769 struct radeon_winsys *ws = cmd_buffer->device->ws;
8770 const unsigned num_views = MAX2(1, util_bitcount(view_mask));
8771 unsigned ace_predication_size = num_views * 11; /* DISPATCH_TASKMESH_INDIRECT_MULTI_ACE size */
8772 struct radeon_cmdbuf *ace_cs = cmd_buffer->gang.cs;
8773
8774 const uint64_t va = radv_buffer_get_va(info->indirect->bo) + info->indirect->offset + info->indirect_offset;
8775 const uint64_t count_va = !info->count_buffer ? 0
8776 : radv_buffer_get_va(info->count_buffer->bo) +
8777 info->count_buffer->offset + info->count_buffer_offset;
8778 uint64_t workaround_cond_va = 0;
8779
8780 if (num_views > 1)
8781 ace_predication_size += num_views * 3; /* SET_SH_REG size (view index SGPR) */
8782
8783 if (count_va)
8784 radv_cs_add_buffer(ws, cmd_buffer->gang.cs, info->count_buffer->bo);
8785
8786 if (cmd_buffer->device->physical_device->rad_info.has_taskmesh_indirect0_bug && count_va) {
8787 /* MEC firmware bug workaround.
8788 * When the count buffer contains zero, DISPATCH_TASKMESH_INDIRECT_MULTI_ACE hangs.
8789 * - We must ensure that DISPATCH_TASKMESH_INDIRECT_MULTI_ACE
8790 * is only executed when the count buffer contains non-zero.
8791 * - Furthermore, we must also ensure that each DISPATCH_TASKMESH_GFX packet
8792 * has a matching ACE packet.
8793 *
8794 * As a workaround:
8795 * - Reserve a dword in the upload buffer and initialize it to 1 for the workaround
8796 * - When count != 0, write 0 to the workaround BO and execute the indirect dispatch
8797 * - When workaround BO != 0 (count was 0), execute an empty direct dispatch
8798 */
8799
8800 uint32_t workaround_cond_init = 0;
8801 uint32_t workaround_cond_off;
8802 if (!radv_cmd_buffer_upload_data(cmd_buffer, 4, &workaround_cond_init, &workaround_cond_off))
8803 vk_command_buffer_set_error(&cmd_buffer->vk, VK_ERROR_OUT_OF_HOST_MEMORY);
8804
8805 workaround_cond_va = radv_buffer_get_va(cmd_buffer->upload.upload_bo) + workaround_cond_off;
8806
8807 radeon_emit(ace_cs, PKT3(PKT3_COPY_DATA, 4, 0));
8808 radeon_emit(ace_cs,
8809 COPY_DATA_SRC_SEL(COPY_DATA_IMM) | COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) | COPY_DATA_WR_CONFIRM);
8810 radeon_emit(ace_cs, 1);
8811 radeon_emit(ace_cs, 0);
8812 radeon_emit(ace_cs, workaround_cond_va);
8813 radeon_emit(ace_cs, workaround_cond_va >> 32);
8814
8815 /* 2x COND_EXEC + 1x COPY_DATA + Nx DISPATCH_TASKMESH_DIRECT_ACE */
8816 ace_predication_size += 2 * 5 + 6 + 6 * num_views;
8817 }
8818
8819 radv_cs_add_buffer(ws, cmd_buffer->gang.cs, info->indirect->bo);
8820 radv_cs_emit_compute_predication(&cmd_buffer->state, cmd_buffer->gang.cs, cmd_buffer->mec_inv_pred_va,
8821 &cmd_buffer->mec_inv_pred_emitted, ace_predication_size);
8822
8823 if (workaround_cond_va) {
8824 radeon_emit(ace_cs, PKT3(PKT3_COND_EXEC, 3, 0));
8825 radeon_emit(ace_cs, count_va);
8826 radeon_emit(ace_cs, count_va >> 32);
8827 radeon_emit(ace_cs, 0);
8828 radeon_emit(ace_cs, 6 + 11 * num_views); /* 1x COPY_DATA + Nx DISPATCH_TASKMESH_INDIRECT_MULTI_ACE */
8829
8830 radeon_emit(ace_cs, PKT3(PKT3_COPY_DATA, 4, 0));
8831 radeon_emit(ace_cs,
8832 COPY_DATA_SRC_SEL(COPY_DATA_IMM) | COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) | COPY_DATA_WR_CONFIRM);
8833 radeon_emit(ace_cs, 0);
8834 radeon_emit(ace_cs, 0);
8835 radeon_emit(ace_cs, workaround_cond_va);
8836 radeon_emit(ace_cs, workaround_cond_va >> 32);
8837 }
8838
8839 if (!view_mask) {
8840 radv_cs_emit_dispatch_taskmesh_indirect_multi_ace_packet(cmd_buffer, va, info->count, count_va, info->stride);
8841 radv_cs_emit_dispatch_taskmesh_gfx_packet(cmd_buffer);
8842 } else {
8843 u_foreach_bit (view, view_mask) {
8844 radv_emit_view_index(cmd_buffer, view);
8845 radv_cs_emit_dispatch_taskmesh_indirect_multi_ace_packet(cmd_buffer, va, info->count, count_va, info->stride);
8846 radv_cs_emit_dispatch_taskmesh_gfx_packet(cmd_buffer);
8847 }
8848 }
8849
8850 if (workaround_cond_va) {
8851 radeon_emit(ace_cs, PKT3(PKT3_COND_EXEC, 3, 0));
8852 radeon_emit(ace_cs, workaround_cond_va);
8853 radeon_emit(ace_cs, workaround_cond_va >> 32);
8854 radeon_emit(ace_cs, 0);
8855 radeon_emit(ace_cs, 6 * num_views); /* Nx DISPATCH_TASKMESH_DIRECT_ACE */
8856
8857 for (unsigned v = 0; v < num_views; ++v) {
8858 radv_cs_emit_dispatch_taskmesh_direct_ace_packet(cmd_buffer, 0, 0, 0);
8859 }
8860 }
8861 }
8862
8863 static void
8864 radv_emit_indirect_draw_packets(struct radv_cmd_buffer *cmd_buffer, const struct radv_draw_info *info)
8865 {
8866 const struct radv_cmd_state *state = &cmd_buffer->state;
8867 struct radeon_winsys *ws = cmd_buffer->device->ws;
8868 struct radeon_cmdbuf *cs = cmd_buffer->cs;
8869 const uint64_t va = radv_buffer_get_va(info->indirect->bo) + info->indirect->offset + info->indirect_offset;
8870 const uint64_t count_va = info->count_buffer ? radv_buffer_get_va(info->count_buffer->bo) +
8871 info->count_buffer->offset + info->count_buffer_offset
8872 : 0;
8873
8874 radv_cs_add_buffer(ws, cs, info->indirect->bo);
8875
8876 radeon_emit(cs, PKT3(PKT3_SET_BASE, 2, 0));
8877 radeon_emit(cs, 1);
8878 radeon_emit(cs, va);
8879 radeon_emit(cs, va >> 32);
8880
8881 if (info->count_buffer) {
8882 radv_cs_add_buffer(ws, cs, info->count_buffer->bo);
8883 }
8884
8885 if (!state->render.view_mask) {
8886 radv_cs_emit_indirect_draw_packet(cmd_buffer, info->indexed, info->count, count_va, info->stride);
8887 } else {
8888 u_foreach_bit (i, state->render.view_mask) {
8889 radv_emit_view_index(cmd_buffer, i);
8890
8891 radv_cs_emit_indirect_draw_packet(cmd_buffer, info->indexed, info->count, count_va, info->stride);
8892 }
8893 }
8894 }
8895
8896 static uint64_t
8897 radv_get_needed_dynamic_states(struct radv_cmd_buffer *cmd_buffer)
8898 {
8899 uint64_t dynamic_states = RADV_DYNAMIC_ALL;
8900
8901 if (cmd_buffer->state.graphics_pipeline)
8902 return cmd_buffer->state.graphics_pipeline->needed_dynamic_state;
8903
8904 /* Clear unnecessary dynamic states for shader objects. */
8905 if (!cmd_buffer->state.shaders[MESA_SHADER_TESS_CTRL])
8906 dynamic_states &= ~(RADV_DYNAMIC_PATCH_CONTROL_POINTS | RADV_DYNAMIC_TESS_DOMAIN_ORIGIN);
8907
8908 if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX10_3) {
8909 if (cmd_buffer->state.shaders[MESA_SHADER_MESH])
8910 dynamic_states &= ~(RADV_DYNAMIC_VERTEX_INPUT | RADV_DYNAMIC_VERTEX_INPUT_BINDING_STRIDE |
8911 RADV_DYNAMIC_PRIMITIVE_RESTART_ENABLE | RADV_DYNAMIC_PRIMITIVE_TOPOLOGY);
8912 } else {
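      /* VRS is only supported on GFX10.3+, so the fragment shading rate state is never
       * needed on older generations.
       */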
8913 dynamic_states &= ~RADV_DYNAMIC_FRAGMENT_SHADING_RATE;
8914 }
8915
8916 return dynamic_states;
8917 }
8918
8919 /*
8920 * Vega and Raven have a bug which triggers if there are multiple context
8921 * register contexts active at the same time with different scissor values.
8922 *
8923 * There are two possible workarounds:
8924 * 1) Wait for PS_PARTIAL_FLUSH every time the scissor is changed. That way
8925 * there is only ever 1 active set of scissor values at the same time.
8926 *
8927 * 2) Whenever the hardware switches contexts we have to set the scissor
8928 * registers again even if it is a noop. That way the new context gets
8929 * the correct scissor values.
8930 *
8931 * This implements option 2. radv_need_late_scissor_emission needs to
8932 * return true on affected HW if radv_emit_all_graphics_states sets
8933 * any context registers.
8934 */
8935 static bool
8936 radv_need_late_scissor_emission(struct radv_cmd_buffer *cmd_buffer, const struct radv_draw_info *info)
8937 {
8938 if (cmd_buffer->state.context_roll_without_scissor_emitted || info->strmout_buffer)
8939 return true;
8940
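/* used_states ends up with every non-dynamic dirty bit set, plus only the dynamic-state
 * bits that the currently bound pipeline/shaders actually consume.
 */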
8941 uint64_t used_states = radv_get_needed_dynamic_states(cmd_buffer) | ~RADV_CMD_DIRTY_DYNAMIC_ALL;
8942
8943 /* Index, vertex and streamout buffers don't change context regs.
8944 * We assume that any other dirty flag causes context rolls.
8945 */
8946 used_states &= ~(RADV_CMD_DIRTY_INDEX_BUFFER | RADV_CMD_DIRTY_VERTEX_BUFFER | RADV_CMD_DIRTY_DYNAMIC_VERTEX_INPUT |
8947 RADV_CMD_DIRTY_STREAMOUT_BUFFER);
8948
8949 return cmd_buffer->state.dirty & used_states;
8950 }
8951
8952 ALWAYS_INLINE static uint32_t
8953 radv_get_ngg_culling_settings(struct radv_cmd_buffer *cmd_buffer, bool vp_y_inverted)
8954 {
8955 const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
8956
8957 /* Disable shader culling entirely when conservative overestimate is used.
8958 * The face culling algorithm can delete very tiny triangles (even if unintended).
8959 */
8960 if (d->vk.rs.conservative_mode == VK_CONSERVATIVE_RASTERIZATION_MODE_OVERESTIMATE_EXT)
8961 return radv_nggc_none;
8962
8963 /* With graphics pipeline library, NGG culling is unconditionally compiled into shaders
8964 * because we don't know the primitive topology at compile time, so we should
8965 * disable it dynamically for points or lines.
8966 */
8967 const unsigned num_vertices_per_prim = radv_conv_prim_to_gs_out(d->vk.ia.primitive_topology, true) + 1;
8968 if (num_vertices_per_prim != 3)
8969 return radv_nggc_none;
8970
8971 /* Cull every triangle when rasterizer discard is enabled. */
8972 if (d->vk.rs.rasterizer_discard_enable)
8973 return radv_nggc_front_face | radv_nggc_back_face;
8974
8975 uint32_t nggc_settings = radv_nggc_none;
8976
8977 /* The culling code needs to know whether face is CW or CCW. */
8978 bool ccw = d->vk.rs.front_face == VK_FRONT_FACE_COUNTER_CLOCKWISE;
8979
8980 /* Take inverted viewport into account. */
8981 ccw ^= vp_y_inverted;
8982
8983 if (ccw)
8984 nggc_settings |= radv_nggc_face_is_ccw;
8985
8986 /* Face culling settings. */
8987 if (d->vk.rs.cull_mode & VK_CULL_MODE_FRONT_BIT)
8988 nggc_settings |= radv_nggc_front_face;
8989 if (d->vk.rs.cull_mode & VK_CULL_MODE_BACK_BIT)
8990 nggc_settings |= radv_nggc_back_face;
8991
8992 /* Small primitive culling assumes a sample position at (0.5, 0.5)
8993 * so don't enable it with user sample locations.
8994 */
8995 if (!d->vk.ms.sample_locations_enable) {
8996 nggc_settings |= radv_nggc_small_primitives;
8997
8998 /* small_prim_precision = num_samples / 2^subpixel_bits
8999 * num_samples is also always a power of two, so the small prim precision can only be
9000 * a power of two between 2^-2 and 2^-6, therefore it's enough to remember the exponent.
9001 */
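/* For example, with 4x MSAA: small_prim_precision_log2 = log2(4) - log2(256) = -6, i.e. a
 * precision of 2^-6, and the two's-complement exponent is packed into bits [31:24] of the
 * culling settings below. Note that subpixel_bits here holds the quantization value
 * (256 = 2^8), not a bit count.
 */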
9002 unsigned rasterization_samples = radv_get_rasterization_samples(cmd_buffer);
9003 unsigned subpixel_bits = 256;
9004 int32_t small_prim_precision_log2 = util_logbase2(rasterization_samples) - util_logbase2(subpixel_bits);
9005 nggc_settings |= ((uint32_t)small_prim_precision_log2 << 24u);
9006 }
9007
9008 return nggc_settings;
9009 }
9010
9011 static void
9012 radv_emit_ngg_culling_state(struct radv_cmd_buffer *cmd_buffer)
9013 {
9014 const struct radv_shader *last_vgt_shader = cmd_buffer->state.last_vgt_shader;
9015 const uint32_t base_reg = last_vgt_shader->info.user_data_0;
9016
9017 /* Get viewport transform. */
9018 float vp_scale[2], vp_translate[2];
9019 memcpy(vp_scale, cmd_buffer->state.dynamic.hw_vp.xform[0].scale, 2 * sizeof(float));
9020 memcpy(vp_translate, cmd_buffer->state.dynamic.hw_vp.xform[0].translate, 2 * sizeof(float));
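/* The viewport Y axis counts as inverted when NDC +1 maps below NDC -1 on screen, which is
 * equivalent to a negative Y scale; this flips the winding seen by the culling code.
 */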
9021 bool vp_y_inverted = (-vp_scale[1] + vp_translate[1]) > (vp_scale[1] + vp_translate[1]);
9022
9023 /* Get current culling settings. */
9024 uint32_t nggc_settings = radv_get_ngg_culling_settings(cmd_buffer, vp_y_inverted);
9025
9026 if (cmd_buffer->state.dirty &
9027 (RADV_CMD_DIRTY_PIPELINE | RADV_CMD_DIRTY_DYNAMIC_VIEWPORT | RADV_CMD_DIRTY_DYNAMIC_RASTERIZATION_SAMPLES)) {
9028 /* Correction for inverted Y */
9029 if (vp_y_inverted) {
9030 vp_scale[1] = -vp_scale[1];
9031 vp_translate[1] = -vp_translate[1];
9032 }
9033
9034 /* Correction for number of samples per pixel. */
9035 for (unsigned i = 0; i < 2; ++i) {
9036 vp_scale[i] *= (float)cmd_buffer->state.dynamic.vk.ms.rasterization_samples;
9037 vp_translate[i] *= (float)cmd_buffer->state.dynamic.vk.ms.rasterization_samples;
9038 }
9039
9040 uint32_t vp_reg_values[4] = {fui(vp_scale[0]), fui(vp_scale[1]), fui(vp_translate[0]), fui(vp_translate[1])};
9041 const int8_t vp_sgpr_idx = radv_get_user_sgpr(last_vgt_shader, AC_UD_NGG_VIEWPORT)->sgpr_idx;
9042 assert(vp_sgpr_idx != -1);
9043 radeon_set_sh_reg_seq(cmd_buffer->cs, base_reg + vp_sgpr_idx * 4, 4);
9044 radeon_emit_array(cmd_buffer->cs, vp_reg_values, 4);
9045 }
9046
9047 const int8_t nggc_sgpr_idx = radv_get_user_sgpr(last_vgt_shader, AC_UD_NGG_CULLING_SETTINGS)->sgpr_idx;
9048 assert(nggc_sgpr_idx != -1);
9049
9050 radeon_set_sh_reg(cmd_buffer->cs, base_reg + nggc_sgpr_idx * 4, nggc_settings);
9051 }
9052
9053 static void
9054 radv_emit_fs_state(struct radv_cmd_buffer *cmd_buffer)
9055 {
9056 const struct radv_shader *ps = cmd_buffer->state.shaders[MESA_SHADER_FRAGMENT];
9057 const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
9058 const struct radv_userdata_info *loc;
9059
9060 if (!ps)
9061 return;
9062
9063 loc = radv_get_user_sgpr(ps, AC_UD_PS_STATE);
9064 if (loc->sgpr_idx == -1)
9065 return;
9066 assert(loc->num_sgprs == 1);
9067
9068 const unsigned rasterization_samples = radv_get_rasterization_samples(cmd_buffer);
9069 const unsigned ps_iter_samples = radv_get_ps_iter_samples(cmd_buffer);
9070 const uint16_t ps_iter_mask = ac_get_ps_iter_mask(ps_iter_samples);
9071 const unsigned rast_prim = radv_get_rasterization_prim(cmd_buffer);
9072 const uint32_t base_reg = ps->info.user_data_0;
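/* Pack the dynamic rasterization state into a single 32-bit user SGPR, presumably so the
 * fragment shader can read it at draw time and these states can stay dynamic without a
 * shader recompile.
 */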
9073 const unsigned ps_state = SET_SGPR_FIELD(PS_STATE_NUM_SAMPLES, rasterization_samples) |
9074 SET_SGPR_FIELD(PS_STATE_PS_ITER_MASK, ps_iter_mask) |
9075 SET_SGPR_FIELD(PS_STATE_LINE_RAST_MODE, d->vk.rs.line.mode) |
9076 SET_SGPR_FIELD(PS_STATE_RAST_PRIM, rast_prim);
9077
9078 radeon_set_sh_reg(cmd_buffer->cs, base_reg + loc->sgpr_idx * 4, ps_state);
9079 }
9080
9081 static void
9082 radv_emit_db_shader_control(struct radv_cmd_buffer *cmd_buffer)
9083 {
9084 const struct radeon_info *rad_info = &cmd_buffer->device->physical_device->rad_info;
9085 const struct radv_shader *ps = cmd_buffer->state.shaders[MESA_SHADER_FRAGMENT];
9086 const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
9087 const bool uses_ds_feedback_loop =
9088 !!(d->feedback_loop_aspects & (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT));
9089 const unsigned rasterization_samples = radv_get_rasterization_samples(cmd_buffer);
9090
9091 uint32_t db_shader_control;
9092
9093 if (ps) {
9094 db_shader_control = ps->info.ps.db_shader_control;
9095 } else {
9096 db_shader_control = S_02880C_CONSERVATIVE_Z_EXPORT(V_02880C_EXPORT_ANY_Z) |
9097 S_02880C_Z_ORDER(V_02880C_EARLY_Z_THEN_LATE_Z) |
9098 S_02880C_DUAL_QUAD_DISABLE(rad_info->has_rbplus && !rad_info->rbplus_allowed);
9099 }
9100
9101 /* When a depth/stencil attachment is used inside feedback loops, use LATE_Z to make sure shader invocations read the
9102 * correct value.
9103 * Also apply the bug workaround for smoothing (overrasterization) on GFX6.
9104 */
9105 if (uses_ds_feedback_loop ||
9106 (rad_info->gfx_level == GFX6 && d->vk.rs.line.mode == VK_LINE_RASTERIZATION_MODE_RECTANGULAR_SMOOTH_KHR))
9107 db_shader_control = (db_shader_control & C_02880C_Z_ORDER) | S_02880C_Z_ORDER(V_02880C_LATE_Z);
9108
9109 if (ps && ps->info.ps.pops) {
9110 /* POPS_OVERLAP_NUM_SAMPLES (OVERRIDE_INTRINSIC_RATE on GFX11, must always be enabled for POPS) controls the
9111 * interlock granularity.
9112 * PixelInterlock: 1x.
9113 * SampleInterlock: MSAA_EXPOSED_SAMPLES (much faster at common edges of adjacent primitives with MSAA).
9114 */
9115 if (rad_info->gfx_level >= GFX11) {
9116 db_shader_control |= S_02880C_OVERRIDE_INTRINSIC_RATE_ENABLE(1);
9117 if (ps->info.ps.pops_is_per_sample)
9118 db_shader_control |= S_02880C_OVERRIDE_INTRINSIC_RATE(util_logbase2(rasterization_samples));
9119 } else {
9120 if (ps->info.ps.pops_is_per_sample)
9121 db_shader_control |= S_02880C_POPS_OVERLAP_NUM_SAMPLES(util_logbase2(rasterization_samples));
9122
9123 if (rad_info->has_pops_missed_overlap_bug) {
9124 radeon_set_context_reg(cmd_buffer->cs, R_028060_DB_DFSM_CONTROL,
9125 S_028060_PUNCHOUT_MODE(V_028060_FORCE_OFF) |
9126 S_028060_POPS_DRAIN_PS_ON_OVERLAP(rasterization_samples >= 8));
9127 }
9128 }
9129 } else if (rad_info->has_export_conflict_bug && rasterization_samples == 1) {
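/* Export conflict workaround (as implied by the condition above): with 1x rasterization
 * and at least one blended color attachment with a non-zero write mask, force an intrinsic
 * rate override.
 */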
9130 for (uint32_t i = 0; i < MAX_RTS; i++) {
9131 if (d->vk.cb.attachments[i].write_mask && d->vk.cb.attachments[i].blend_enable) {
9132 db_shader_control |= S_02880C_OVERRIDE_INTRINSIC_RATE_ENABLE(1) | S_02880C_OVERRIDE_INTRINSIC_RATE(2);
9133 break;
9134 }
9135 }
9136 }
9137
9138 if (db_shader_control != cmd_buffer->state.last_db_shader_control) {
9139 radeon_set_context_reg(cmd_buffer->cs, R_02880C_DB_SHADER_CONTROL, db_shader_control);
9140
9141 cmd_buffer->state.last_db_shader_control = db_shader_control;
9142 }
9143
9144 cmd_buffer->state.dirty &= ~RADV_CMD_DIRTY_DB_SHADER_CONTROL;
9145 }
9146
9147 static void
9148 radv_emit_streamout_enable_state(struct radv_cmd_buffer *cmd_buffer)
9149 {
9150 const struct radv_streamout_state *so = &cmd_buffer->state.streamout;
9151 const bool streamout_enabled = radv_is_streamout_enabled(cmd_buffer);
9152 uint32_t enabled_stream_buffers_mask = 0;
9153
9154 if (streamout_enabled && cmd_buffer->state.last_vgt_shader) {
9155 const struct radv_shader_info *info = &cmd_buffer->state.last_vgt_shader->info;
9156
9157 enabled_stream_buffers_mask = info->so.enabled_stream_buffers_mask;
9158
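/* Legacy (non-NGG) streamout needs the per-buffer vertex stride programmed in
 * VGT_STRMOUT_VTX_STRIDE_n for every enabled target.
 */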
9159 if (!cmd_buffer->device->physical_device->use_ngg_streamout) {
9160 u_foreach_bit (i, so->enabled_mask) {
9161 radeon_set_context_reg(cmd_buffer->cs, R_028AD4_VGT_STRMOUT_VTX_STRIDE_0 + 16 * i, info->so.strides[i]);
9162 }
9163 }
9164 }
9165
9166 radeon_set_context_reg_seq(cmd_buffer->cs, R_028B94_VGT_STRMOUT_CONFIG, 2);
9167 radeon_emit(cmd_buffer->cs, S_028B94_STREAMOUT_0_EN(streamout_enabled) | S_028B94_RAST_STREAM(0) |
9168 S_028B94_STREAMOUT_1_EN(streamout_enabled) |
9169 S_028B94_STREAMOUT_2_EN(streamout_enabled) |
9170 S_028B94_STREAMOUT_3_EN(streamout_enabled));
9171 radeon_emit(cmd_buffer->cs, so->hw_enabled_mask & enabled_stream_buffers_mask);
9172
9173 cmd_buffer->state.dirty &= ~RADV_CMD_DIRTY_STREAMOUT_ENABLE;
9174 }
9175
9176 static gl_shader_stage
9177 radv_cmdbuf_get_last_vgt_api_stage(const struct radv_cmd_buffer *cmd_buffer)
9178 {
9179 if (cmd_buffer->state.active_stages & VK_SHADER_STAGE_MESH_BIT_EXT)
9180 return MESA_SHADER_MESH;
9181
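/* Otherwise pick the highest enabled pre-rasterization stage (VS, TCS, TES or GS). */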
9182 return util_last_bit(cmd_buffer->state.active_stages & BITFIELD_MASK(MESA_SHADER_FRAGMENT)) - 1;
9183 }
9184
9185 static void
9186 radv_emit_graphics_shaders(struct radv_cmd_buffer *cmd_buffer)
9187 {
9188 const gl_shader_stage last_vgt_api_stage = radv_cmdbuf_get_last_vgt_api_stage(cmd_buffer);
9189 const struct radv_shader *last_vgt_shader = cmd_buffer->state.shaders[last_vgt_api_stage];
9190 struct radv_device *device = cmd_buffer->device;
9191 struct radeon_cmdbuf *cs = cmd_buffer->cs;
9192
9193 radv_foreach_stage(s, cmd_buffer->state.active_stages & RADV_GRAPHICS_STAGE_BITS)
9194 {
9195 struct radv_shader_object *shader_obj = cmd_buffer->state.shader_objs[s];
9196
9197 switch (s) {
9198 case MESA_SHADER_VERTEX: {
9199 const struct radv_shader *vs = cmd_buffer->state.shaders[MESA_SHADER_VERTEX];
9200 struct radv_shader *next_stage = NULL;
9201
9202 if (vs->info.merged_shader_compiled_separately) {
9203 assert(vs->info.next_stage == MESA_SHADER_TESS_CTRL || vs->info.next_stage == MESA_SHADER_GEOMETRY);
9204 next_stage = cmd_buffer->state.shaders[vs->info.next_stage];
9205 }
9206
9207 radv_emit_vertex_shader(device, cs, cs, vs, next_stage);
9208 break;
9209 }
9210 case MESA_SHADER_TESS_CTRL:
9211 radv_emit_tess_ctrl_shader(device, cs, cmd_buffer->state.shaders[MESA_SHADER_TESS_CTRL]);
9212 break;
9213 case MESA_SHADER_TESS_EVAL: {
9214 const struct radv_shader *tes = cmd_buffer->state.shaders[MESA_SHADER_TESS_EVAL];
9215 struct radv_shader *gs = NULL;
9216
9217 if (tes->info.merged_shader_compiled_separately) {
9218 assert(tes->info.next_stage == MESA_SHADER_GEOMETRY);
9219 gs = cmd_buffer->state.shaders[MESA_SHADER_GEOMETRY];
9220 }
9221
9222 radv_emit_tess_eval_shader(device, cs, cs, tes, gs);
9223 break;
9224 }
9225 case MESA_SHADER_GEOMETRY: {
9226 struct radv_shader *es = cmd_buffer->state.shaders[MESA_SHADER_TESS_EVAL]
9227 ? cmd_buffer->state.shaders[MESA_SHADER_TESS_EVAL]
9228 : cmd_buffer->state.shaders[MESA_SHADER_VERTEX];
9229
9230 radv_emit_geometry_shader(device, cs, cs, cmd_buffer->state.shaders[MESA_SHADER_GEOMETRY], es,
9231 shader_obj->gs.copy_shader);
9232 break;
9233 }
9234 case MESA_SHADER_FRAGMENT:
9235 radv_emit_fragment_shader(device, cs, cs, cmd_buffer->state.shaders[MESA_SHADER_FRAGMENT]);
9236 radv_emit_ps_inputs(device, cs, last_vgt_shader, cmd_buffer->state.shaders[MESA_SHADER_FRAGMENT]);
9237 break;
9238 case MESA_SHADER_MESH:
9239 radv_emit_mesh_shader(device, cs, cs, cmd_buffer->state.shaders[MESA_SHADER_MESH]);
9240 break;
9241 case MESA_SHADER_TASK:
9242 radv_emit_compute_shader(device->physical_device, cmd_buffer->gang.cs,
9243 cmd_buffer->state.shaders[MESA_SHADER_TASK]);
9244 break;
9245 default:
9246 unreachable("invalid bind stage");
9247 }
9248 }
9249
9250 /* Emit graphics states related to shaders. */
9251 struct radv_vgt_shader_key vgt_shader_cfg_key = {
9252 .tess = !!cmd_buffer->state.shaders[MESA_SHADER_TESS_CTRL],
9253 .gs = !!cmd_buffer->state.shaders[MESA_SHADER_GEOMETRY],
9254 .ngg = last_vgt_shader->info.is_ngg,
9255 .ngg_passthrough = last_vgt_shader->info.is_ngg_passthrough,
9256 .ngg_streamout = last_vgt_shader->info.is_ngg && last_vgt_shader->info.so.num_outputs > 0,
9257 };
9258
9259 if (cmd_buffer->state.shaders[MESA_SHADER_MESH]) {
9260 vgt_shader_cfg_key.mesh = 1;
9261 vgt_shader_cfg_key.mesh_scratch_ring = cmd_buffer->state.shaders[MESA_SHADER_MESH]->info.ms.needs_ms_scratch_ring;
9262 }
9263
9264 radv_emit_vgt_gs_mode(device, cs, last_vgt_shader);
9265 radv_emit_vgt_vertex_reuse(device, cs, radv_get_shader(cmd_buffer->state.shaders, MESA_SHADER_TESS_EVAL));
9266 radv_emit_vgt_shader_config(device, cs, &vgt_shader_cfg_key);
9267 radv_emit_vgt_gs_out(device, cs, radv_get_rasterization_prim(cmd_buffer));
9268
9269 if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX10_3) {
9270 gfx103_emit_vgt_draw_payload_cntl(cs, cmd_buffer->state.shaders[MESA_SHADER_MESH], false);
9271 gfx103_emit_vrs_state(device, cs, NULL, false, false, false);
9272 }
9273
9274 cmd_buffer->state.dirty &= ~RADV_CMD_DIRTY_GRAPHICS_SHADERS;
9275 }
9276
9277 static void
9278 radv_emit_all_graphics_states(struct radv_cmd_buffer *cmd_buffer, const struct radv_draw_info *info)
9279 {
9280 const struct radv_device *device = cmd_buffer->device;
9281 struct radv_shader_part *tcs_epilog = NULL, *ps_epilog = NULL;
9282
9283 if (cmd_buffer->state.shaders[MESA_SHADER_FRAGMENT] &&
9284 cmd_buffer->state.shaders[MESA_SHADER_FRAGMENT]->info.has_epilog) {
9285 if ((cmd_buffer->state.emitted_graphics_pipeline != cmd_buffer->state.graphics_pipeline ||
9286 (cmd_buffer->state.dirty &
9287 (RADV_CMD_DIRTY_DYNAMIC_COLOR_WRITE_MASK | RADV_CMD_DIRTY_DYNAMIC_COLOR_BLEND_ENABLE |
9288 RADV_CMD_DIRTY_DYNAMIC_ALPHA_TO_COVERAGE_ENABLE | RADV_CMD_DIRTY_DYNAMIC_COLOR_BLEND_EQUATION |
9289 RADV_CMD_DIRTY_GRAPHICS_SHADERS)))) {
9290 ps_epilog = lookup_ps_epilog(cmd_buffer);
9291 if (!ps_epilog) {
9292 vk_command_buffer_set_error(&cmd_buffer->vk, VK_ERROR_OUT_OF_HOST_MEMORY);
9293 return;
9294 }
9295
9296 cmd_buffer->state.col_format_non_compacted = ps_epilog->spi_shader_col_format;
9297
9298 bool need_null_export_workaround = radv_needs_null_export_workaround(
9299 device, cmd_buffer->state.shaders[MESA_SHADER_FRAGMENT], cmd_buffer->state.custom_blend_mode);
9300
9301 if (need_null_export_workaround && !cmd_buffer->state.col_format_non_compacted)
9302 cmd_buffer->state.col_format_non_compacted = V_028714_SPI_SHADER_32_R;
9303 if (device->physical_device->rad_info.rbplus_allowed)
9304 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_RBPLUS;
9305 }
9306 }
9307
9308 if (cmd_buffer->state.shaders[MESA_SHADER_TESS_CTRL] &&
9309 cmd_buffer->state.shaders[MESA_SHADER_TESS_CTRL]->info.has_epilog) {
9310 tcs_epilog = lookup_tcs_epilog(cmd_buffer);
9311 if (!tcs_epilog) {
9312 vk_command_buffer_set_error(&cmd_buffer->vk, VK_ERROR_OUT_OF_HOST_MEMORY);
9313 return;
9314 }
9315 }
9316
9317 /* Determine whether GFX9 late scissor workaround should be applied based on:
9318 * 1. radv_need_late_scissor_emission
9319 * 2. any dirty dynamic flags that may cause context rolls
9320 */
9321 const bool late_scissor_emission = cmd_buffer->device->physical_device->rad_info.has_gfx9_scissor_bug
9322 ? radv_need_late_scissor_emission(cmd_buffer, info)
9323 : false;
9324
9325 if (cmd_buffer->state.dirty & RADV_CMD_DIRTY_RBPLUS)
9326 radv_emit_rbplus_state(cmd_buffer);
9327
9328 if (cmd_buffer->state.dirty & RADV_CMD_DIRTY_SHADER_QUERY)
9329 radv_flush_shader_query_state(cmd_buffer);
9330
9331 if (cmd_buffer->state.dirty & RADV_CMD_DIRTY_OCCLUSION_QUERY)
9332 radv_flush_occlusion_query_state(cmd_buffer);
9333
9334 if ((cmd_buffer->state.dirty &
9335 (RADV_CMD_DIRTY_PIPELINE | RADV_CMD_DIRTY_DYNAMIC_CULL_MODE | RADV_CMD_DIRTY_DYNAMIC_FRONT_FACE |
9336 RADV_CMD_DIRTY_DYNAMIC_RASTERIZER_DISCARD_ENABLE | RADV_CMD_DIRTY_DYNAMIC_VIEWPORT |
9337 RADV_CMD_DIRTY_DYNAMIC_CONSERVATIVE_RAST_MODE | RADV_CMD_DIRTY_DYNAMIC_RASTERIZATION_SAMPLES |
9338 RADV_CMD_DIRTY_DYNAMIC_PRIMITIVE_TOPOLOGY | RADV_CMD_DIRTY_DYNAMIC_SAMPLE_LOCATIONS_ENABLE)) &&
9339 cmd_buffer->state.has_nggc)
9340 radv_emit_ngg_culling_state(cmd_buffer);
9341
9342 if (cmd_buffer->state.dirty &
9343 (RADV_CMD_DIRTY_FRAMEBUFFER | RADV_CMD_DIRTY_DYNAMIC_COLOR_WRITE_MASK |
9344 RADV_CMD_DIRTY_DYNAMIC_RASTERIZATION_SAMPLES | RADV_CMD_DIRTY_DYNAMIC_LINE_RASTERIZATION_MODE))
9345 radv_emit_binning_state(cmd_buffer);
9346
9347 if (cmd_buffer->state.dirty & RADV_CMD_DIRTY_PIPELINE) {
9348 radv_emit_graphics_pipeline(cmd_buffer);
9349 } else if (cmd_buffer->state.dirty & RADV_CMD_DIRTY_GRAPHICS_SHADERS) {
9350 radv_emit_graphics_shaders(cmd_buffer);
9351 }
9352
9353 if (ps_epilog)
9354 radv_emit_ps_epilog_state(cmd_buffer, ps_epilog);
9355
9356 if (tcs_epilog)
9357 radv_emit_tcs_epilog_state(cmd_buffer, tcs_epilog);
9358
9359 if (cmd_buffer->state.dirty & RADV_CMD_DIRTY_FRAMEBUFFER)
9360 radv_emit_framebuffer_state(cmd_buffer);
9361
9362 if (cmd_buffer->state.dirty & RADV_CMD_DIRTY_GUARDBAND)
9363 radv_emit_guardband_state(cmd_buffer);
9364
9365 if (cmd_buffer->state.dirty &
9366 (RADV_CMD_DIRTY_DB_SHADER_CONTROL | RADV_CMD_DIRTY_DYNAMIC_COLOR_WRITE_MASK |
9367 RADV_CMD_DIRTY_DYNAMIC_COLOR_BLEND_ENABLE | RADV_CMD_DIRTY_DYNAMIC_RASTERIZATION_SAMPLES |
9368 RADV_CMD_DIRTY_DYNAMIC_LINE_RASTERIZATION_MODE | RADV_CMD_DIRTY_DYNAMIC_ATTACHMENT_FEEDBACK_LOOP_ENABLE))
9369 radv_emit_db_shader_control(cmd_buffer);
9370
9371 if (info->indexed && info->indirect && cmd_buffer->state.dirty & RADV_CMD_DIRTY_INDEX_BUFFER)
9372 radv_emit_index_buffer(cmd_buffer);
9373
9374 if (cmd_buffer->state.dirty & RADV_CMD_DIRTY_STREAMOUT_ENABLE)
9375 radv_emit_streamout_enable_state(cmd_buffer);
9376
9377 const uint64_t dynamic_states = cmd_buffer->state.dirty & radv_get_needed_dynamic_states(cmd_buffer);
9378
9379 if (dynamic_states) {
9380 radv_cmd_buffer_flush_dynamic_state(cmd_buffer, dynamic_states);
9381
9382 if (dynamic_states &
9383 (RADV_CMD_DIRTY_DYNAMIC_RASTERIZATION_SAMPLES | RADV_CMD_DIRTY_DYNAMIC_LINE_RASTERIZATION_MODE))
9384 radv_emit_fs_state(cmd_buffer);
9385 }
9386
9387 radv_emit_draw_registers(cmd_buffer, info);
9388
9389 if (late_scissor_emission) {
9390 radv_emit_scissor(cmd_buffer);
9391 cmd_buffer->state.context_roll_without_scissor_emitted = false;
9392 }
9393 }
9394
9395 static void
9396 radv_bind_graphics_shaders(struct radv_cmd_buffer *cmd_buffer)
9397 {
9398 const struct radv_device *device = cmd_buffer->device;
9399 uint32_t push_constant_size = 0, dynamic_offset_count = 0;
9400 bool need_indirect_descriptor_sets = false;
9401
9402 for (unsigned s = 0; s <= MESA_SHADER_MESH; s++) {
9403 const struct radv_shader_object *shader_obj = cmd_buffer->state.shader_objs[s];
9404 struct radv_shader *shader = NULL;
9405
9406 if (s == MESA_SHADER_COMPUTE)
9407 continue;
9408
9409 if (!shader_obj) {
9410 radv_bind_shader(cmd_buffer, NULL, s);
9411 continue;
9412 }
9413
9414 /* Select shader variants. */
9415 if (s == MESA_SHADER_VERTEX && (cmd_buffer->state.shader_objs[MESA_SHADER_TESS_CTRL] ||
9416 cmd_buffer->state.shader_objs[MESA_SHADER_GEOMETRY])) {
9417 if (cmd_buffer->state.shader_objs[MESA_SHADER_TESS_CTRL]) {
9418 shader = shader_obj->as_ls.shader;
9419 } else {
9420 shader = shader_obj->as_es.shader;
9421 }
9422 } else if (s == MESA_SHADER_TESS_EVAL && cmd_buffer->state.shader_objs[MESA_SHADER_GEOMETRY]) {
9423 shader = shader_obj->as_es.shader;
9424 } else {
9425 shader = shader_obj->shader;
9426 }
9427
9428 radv_bind_shader(cmd_buffer, shader, s);
9429 if (!shader)
9430 continue;
9431
9432 radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, shader->bo);
9433
9434 /* Compute push constants/indirect descriptors state. */
9435 need_indirect_descriptor_sets |= radv_get_user_sgpr(shader, AC_UD_INDIRECT_DESCRIPTOR_SETS)->sgpr_idx != -1;
9436 push_constant_size += shader_obj->push_constant_size;
9437 dynamic_offset_count += shader_obj->dynamic_offset_count;
9438 }
9439
9440 /* Determine the last VGT shader. */
9441 const gl_shader_stage last_vgt_api_stage = radv_cmdbuf_get_last_vgt_api_stage(cmd_buffer);
9442
9443 assume(last_vgt_api_stage != MESA_SHADER_NONE);
9444 cmd_buffer->state.last_vgt_shader = cmd_buffer->state.shaders[last_vgt_api_stage];
9445
9446 cmd_buffer->state.gs_copy_shader = cmd_buffer->state.shader_objs[MESA_SHADER_GEOMETRY]
9447 ? cmd_buffer->state.shader_objs[MESA_SHADER_GEOMETRY]->gs.copy_shader
9448 : NULL;
9449 if (cmd_buffer->state.gs_copy_shader) {
9450 radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, cmd_buffer->state.gs_copy_shader->bo);
9451 }
9452
9453 /* Determine the rasterized primitive. */
9454 if (cmd_buffer->state.active_stages &
9455 (VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT | VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT |
9456 VK_SHADER_STAGE_GEOMETRY_BIT | VK_SHADER_STAGE_MESH_BIT_EXT)) {
9457 cmd_buffer->state.rast_prim = radv_get_vgt_gs_out(cmd_buffer->state.shaders, 0);
9458 }
9459
9460 const struct radv_shader *vs = radv_get_shader(cmd_buffer->state.shaders, MESA_SHADER_VERTEX);
9461 if (vs) {
9462 /* Re-emit the VS prolog when a new vertex shader is bound. */
9463 if (vs->info.vs.has_prolog) {
9464 cmd_buffer->state.emitted_vs_prolog = NULL;
9465 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_VERTEX_INPUT;
9466 }
9467
9468 /* Re-emit the vertex buffer descriptors because they are really tied to the pipeline. */
9469 if (vs->info.vs.vb_desc_usage_mask) {
9470 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_VERTEX_BUFFER;
9471 }
9472 }
9473
9474 /* Update push constants/indirect descriptors state. */
9475 struct radv_descriptor_state *descriptors_state =
9476 radv_get_descriptors_state(cmd_buffer, VK_PIPELINE_BIND_POINT_GRAPHICS);
9477 struct radv_push_constant_state *pc_state = &cmd_buffer->push_constant_state[VK_PIPELINE_BIND_POINT_GRAPHICS];
9478
9479 descriptors_state->need_indirect_descriptor_sets = need_indirect_descriptor_sets;
9480 pc_state->size = push_constant_size;
9481 pc_state->dynamic_offset_count = dynamic_offset_count;
9482
9483 if (device->physical_device->rad_info.gfx_level <= GFX9) {
9484 cmd_buffer->state.ia_multi_vgt_param = radv_compute_ia_multi_vgt_param(device, cmd_buffer->state.shaders);
9485 }
9486
9487 if (cmd_buffer->state.active_stages &
9488 (VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT | VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT)) {
9489 cmd_buffer->state.uses_dynamic_patch_control_points = true;
9490 }
9491
9492 cmd_buffer->state.uses_dynamic_vertex_binding_stride = true;
9493 }
9494
9495 /* MUST inline this function to avoid massive perf loss in drawoverhead */
9496 ALWAYS_INLINE static bool
9497 radv_before_draw(struct radv_cmd_buffer *cmd_buffer, const struct radv_draw_info *info, uint32_t drawCount, bool dgc)
9498 {
9499 const bool has_prefetch = cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX7;
9500
9501 ASSERTED const unsigned cdw_max =
9502 radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 4096 + 128 * (drawCount - 1));
9503
9504 if (likely(!info->indirect)) {
9505 /* GFX6-GFX7 treat instance_count==0 as instance_count==1. There is
9506 * no workaround for indirect draws, but we can at least skip
9507 * direct draws.
9508 */
9509 if (unlikely(!info->instance_count))
9510 return false;
9511
9512 /* Handle count == 0. */
9513 if (unlikely(!info->count && !info->strmout_buffer))
9514 return false;
9515 }
9516
9517 if (!info->indexed && cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX7) {
9518 /* On GFX7 and later, non-indexed draws overwrite VGT_INDEX_TYPE,
9519 * so the state must be re-emitted before the next indexed
9520 * draw.
9521 */
9522 cmd_buffer->state.last_index_type = -1;
9523 }
9524
9525 if (cmd_buffer->state.dirty & RADV_CMD_DIRTY_GRAPHICS_SHADERS) {
9526 radv_bind_graphics_shaders(cmd_buffer);
9527 }
9528
9529 /* Use optimal packet order based on whether we need to sync the
9530 * pipeline.
9531 */
9532 if (cmd_buffer->state.flush_bits & (RADV_CMD_FLAG_FLUSH_AND_INV_CB | RADV_CMD_FLAG_FLUSH_AND_INV_DB |
9533 RADV_CMD_FLAG_PS_PARTIAL_FLUSH | RADV_CMD_FLAG_CS_PARTIAL_FLUSH)) {
9534 /* If we have to wait for idle, set all states first, so that
9535 * all SET packets are processed in parallel with previous draw
9536 * calls. Then upload descriptors, set shader pointers, and
9537 * draw, and prefetch at the end. This ensures that the time
9538 * the CUs are idle is very short. (there are only SET_SH
9539 * packets between the wait and the draw)
9540 */
9541 radv_emit_all_graphics_states(cmd_buffer, info);
9542 radv_emit_cache_flush(cmd_buffer);
9543 /* <-- CUs are idle here --> */
9544
9545 radv_upload_graphics_shader_descriptors(cmd_buffer);
9546 } else {
9547 const bool need_prefetch = has_prefetch && cmd_buffer->state.prefetch_L2_mask;
9548
9549 /* If we don't wait for idle, start prefetches first, then set
9550 * states, and draw at the end.
9551 */
9552 radv_emit_cache_flush(cmd_buffer);
9553
9554 if (need_prefetch) {
9555 /* Only prefetch the vertex shader and VBO descriptors
9556 * in order to start the draw as soon as possible.
9557 */
9558 radv_emit_prefetch_L2(cmd_buffer, true);
9559 }
9560
9561 radv_upload_graphics_shader_descriptors(cmd_buffer);
9562
9563 radv_emit_all_graphics_states(cmd_buffer, info);
9564 }
9565
9566 if (!dgc)
9567 radv_describe_draw(cmd_buffer);
9568 if (likely(!info->indirect)) {
9569 struct radv_cmd_state *state = &cmd_buffer->state;
9570 struct radeon_cmdbuf *cs = cmd_buffer->cs;
9571 assert(state->vtx_base_sgpr);
9572 if (state->last_num_instances != info->instance_count) {
9573 radeon_emit(cs, PKT3(PKT3_NUM_INSTANCES, 0, false));
9574 radeon_emit(cs, info->instance_count);
9575 state->last_num_instances = info->instance_count;
9576 }
9577 }
9578 assert(cmd_buffer->cs->cdw <= cdw_max);
9579
9580 return true;
9581 }
9582
9583 ALWAYS_INLINE static bool
9584 radv_before_taskmesh_draw(struct radv_cmd_buffer *cmd_buffer, const struct radv_draw_info *info, uint32_t drawCount,
9585 bool dgc)
9586 {
9587 /* For direct draws, this makes sure we don't draw anything.
9588 * For indirect draws, this is necessary to prevent a GPU hang (on MEC version < 100).
9589 */
9590 if (unlikely(!info->count))
9591 return false;
9592
9593 if (cmd_buffer->state.dirty & RADV_CMD_DIRTY_GRAPHICS_SHADERS) {
9594 radv_bind_graphics_shaders(cmd_buffer);
9595 }
9596
9597 struct radeon_cmdbuf *ace_cs = cmd_buffer->gang.cs;
9598 struct radv_shader *task_shader = cmd_buffer->state.shaders[MESA_SHADER_TASK];
9599
9600 assert(!task_shader || ace_cs);
9601
9602 const VkShaderStageFlags stages =
9603 VK_SHADER_STAGE_MESH_BIT_EXT | VK_SHADER_STAGE_FRAGMENT_BIT | (task_shader ? VK_SHADER_STAGE_TASK_BIT_EXT : 0);
9604 const bool need_task_semaphore = task_shader && radv_flush_gang_leader_semaphore(cmd_buffer);
9605
9606 ASSERTED const unsigned cdw_max =
9607 radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 4096 + 128 * (drawCount - 1));
9608 ASSERTED const unsigned ace_cdw_max =
9609 !ace_cs ? 0 : radeon_check_space(cmd_buffer->device->ws, ace_cs, 4096 + 128 * (drawCount - 1));
9610
9611 radv_emit_all_graphics_states(cmd_buffer, info);
9612
9613 radv_emit_cache_flush(cmd_buffer);
9614
9615 if (task_shader) {
9616 radv_gang_cache_flush(cmd_buffer);
9617
9618 if (need_task_semaphore) {
9619 radv_wait_gang_leader(cmd_buffer);
9620 }
9621 }
9622
9623 radv_flush_descriptors(cmd_buffer, stages, VK_PIPELINE_BIND_POINT_GRAPHICS);
9624
9625 const VkShaderStageFlags pc_stages = radv_must_flush_constants(cmd_buffer, stages, VK_PIPELINE_BIND_POINT_GRAPHICS);
9626 if (pc_stages)
9627 radv_flush_constants(cmd_buffer, pc_stages, VK_PIPELINE_BIND_POINT_GRAPHICS);
9628
9629 if (!dgc)
9630 radv_describe_draw(cmd_buffer);
9631 if (likely(!info->indirect)) {
9632 struct radv_cmd_state *state = &cmd_buffer->state;
9633 if (unlikely(state->last_num_instances != 1)) {
9634 struct radeon_cmdbuf *cs = cmd_buffer->cs;
9635 radeon_emit(cs, PKT3(PKT3_NUM_INSTANCES, 0, false));
9636 radeon_emit(cs, 1);
9637 state->last_num_instances = 1;
9638 }
9639 }
9640
9641 assert(cmd_buffer->cs->cdw <= cdw_max);
9642 assert(!ace_cs || ace_cs->cdw <= ace_cdw_max);
9643
9644 cmd_buffer->state.last_index_type = -1;
9645
9646 return true;
9647 }
9648
9649 ALWAYS_INLINE static void
9650 radv_after_draw(struct radv_cmd_buffer *cmd_buffer, bool dgc)
9651 {
9652 const struct radeon_info *rad_info = &cmd_buffer->device->physical_device->rad_info;
9653 bool has_prefetch = cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX7;
9654 /* Start prefetches after the draw has been started. Both will
9655 * run in parallel, but starting the draw first is more
9656 * important.
9657 */
9658 if (has_prefetch && cmd_buffer->state.prefetch_L2_mask) {
9659 radv_emit_prefetch_L2(cmd_buffer, false);
9660 }
9661
9662 /* Workaround for a VGT hang when streamout is enabled.
9663 * It must be done after drawing.
9664 */
9665 if (radv_is_streamout_enabled(cmd_buffer) &&
9666 (rad_info->family == CHIP_HAWAII || rad_info->family == CHIP_TONGA || rad_info->family == CHIP_FIJI)) {
9667 cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_VGT_STREAMOUT_SYNC;
9668 }
9669
9670 radv_cmd_buffer_after_draw(cmd_buffer, RADV_CMD_FLAG_PS_PARTIAL_FLUSH, dgc);
9671 }
9672
9673 VKAPI_ATTR void VKAPI_CALL
9674 radv_CmdDraw(VkCommandBuffer commandBuffer, uint32_t vertexCount, uint32_t instanceCount, uint32_t firstVertex,
9675 uint32_t firstInstance)
9676 {
9677 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
9678 struct radv_draw_info info;
9679
9680 info.count = vertexCount;
9681 info.instance_count = instanceCount;
9682 info.first_instance = firstInstance;
9683 info.strmout_buffer = NULL;
9684 info.indirect = NULL;
9685 info.indexed = false;
9686
9687 if (!radv_before_draw(cmd_buffer, &info, 1, false))
9688 return;
9689 const VkMultiDrawInfoEXT minfo = {firstVertex, vertexCount};
9690 radv_emit_direct_draw_packets(cmd_buffer, &info, 1, &minfo, 0, 0);
9691 radv_after_draw(cmd_buffer, false);
9692 }
9693
9694 VKAPI_ATTR void VKAPI_CALL
9695 radv_CmdDrawMultiEXT(VkCommandBuffer commandBuffer, uint32_t drawCount, const VkMultiDrawInfoEXT *pVertexInfo,
9696 uint32_t instanceCount, uint32_t firstInstance, uint32_t stride)
9697 {
9698 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
9699 struct radv_draw_info info;
9700
9701 if (!drawCount)
9702 return;
9703
9704 info.count = pVertexInfo->vertexCount;
9705 info.instance_count = instanceCount;
9706 info.first_instance = firstInstance;
9707 info.strmout_buffer = NULL;
9708 info.indirect = NULL;
9709 info.indexed = false;
9710
9711 if (!radv_before_draw(cmd_buffer, &info, drawCount, false))
9712 return;
9713 radv_emit_direct_draw_packets(cmd_buffer, &info, drawCount, pVertexInfo, 0, stride);
9714 radv_after_draw(cmd_buffer, false);
9715 }
9716
9717 VKAPI_ATTR void VKAPI_CALL
9718 radv_CmdDrawIndexed(VkCommandBuffer commandBuffer, uint32_t indexCount, uint32_t instanceCount, uint32_t firstIndex,
9719 int32_t vertexOffset, uint32_t firstInstance)
9720 {
9721 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
9722 struct radv_draw_info info;
9723
9724 info.indexed = true;
9725 info.count = indexCount;
9726 info.instance_count = instanceCount;
9727 info.first_instance = firstInstance;
9728 info.strmout_buffer = NULL;
9729 info.indirect = NULL;
9730
9731 if (!radv_before_draw(cmd_buffer, &info, 1, false))
9732 return;
9733 const VkMultiDrawIndexedInfoEXT minfo = {firstIndex, indexCount, vertexOffset};
9734 radv_emit_draw_packets_indexed(cmd_buffer, &info, 1, &minfo, 0, NULL);
9735 radv_after_draw(cmd_buffer, false);
9736 }
9737
9738 VKAPI_ATTR void VKAPI_CALL
9739 radv_CmdDrawMultiIndexedEXT(VkCommandBuffer commandBuffer, uint32_t drawCount,
9740 const VkMultiDrawIndexedInfoEXT *pIndexInfo, uint32_t instanceCount, uint32_t firstInstance,
9741 uint32_t stride, const int32_t *pVertexOffset)
9742 {
9743 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
9744 struct radv_draw_info info;
9745
9746 if (!drawCount)
9747 return;
9748
9749 const VkMultiDrawIndexedInfoEXT *minfo = pIndexInfo;
9750 info.indexed = true;
9751 info.count = minfo->indexCount;
9752 info.instance_count = instanceCount;
9753 info.first_instance = firstInstance;
9754 info.strmout_buffer = NULL;
9755 info.indirect = NULL;
9756
9757 if (!radv_before_draw(cmd_buffer, &info, drawCount, false))
9758 return;
9759 radv_emit_draw_packets_indexed(cmd_buffer, &info, drawCount, pIndexInfo, stride, pVertexOffset);
9760 radv_after_draw(cmd_buffer, false);
9761 }
9762
9763 VKAPI_ATTR void VKAPI_CALL
9764 radv_CmdDrawIndirect(VkCommandBuffer commandBuffer, VkBuffer _buffer, VkDeviceSize offset, uint32_t drawCount,
9765 uint32_t stride)
9766 {
9767 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
9768 RADV_FROM_HANDLE(radv_buffer, buffer, _buffer);
9769 struct radv_draw_info info;
9770
9771 info.count = drawCount;
9772 info.indirect = buffer;
9773 info.indirect_offset = offset;
9774 info.stride = stride;
9775 info.strmout_buffer = NULL;
9776 info.count_buffer = NULL;
9777 info.indexed = false;
9778 info.instance_count = 0;
9779
9780 if (!radv_before_draw(cmd_buffer, &info, 1, false))
9781 return;
9782 radv_emit_indirect_draw_packets(cmd_buffer, &info);
9783 radv_after_draw(cmd_buffer, false);
9784 }
9785
9786 VKAPI_ATTR void VKAPI_CALL
9787 radv_CmdDrawIndexedIndirect(VkCommandBuffer commandBuffer, VkBuffer _buffer, VkDeviceSize offset, uint32_t drawCount,
9788 uint32_t stride)
9789 {
9790 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
9791 RADV_FROM_HANDLE(radv_buffer, buffer, _buffer);
9792 struct radv_draw_info info;
9793
9794 info.indexed = true;
9795 info.count = drawCount;
9796 info.indirect = buffer;
9797 info.indirect_offset = offset;
9798 info.stride = stride;
9799 info.count_buffer = NULL;
9800 info.strmout_buffer = NULL;
9801 info.instance_count = 0;
9802
9803 if (!radv_before_draw(cmd_buffer, &info, 1, false))
9804 return;
9805 radv_emit_indirect_draw_packets(cmd_buffer, &info);
9806 radv_after_draw(cmd_buffer, false);
9807 }
9808
9809 VKAPI_ATTR void VKAPI_CALL
9810 radv_CmdDrawIndirectCount(VkCommandBuffer commandBuffer, VkBuffer _buffer, VkDeviceSize offset, VkBuffer _countBuffer,
9811 VkDeviceSize countBufferOffset, uint32_t maxDrawCount, uint32_t stride)
9812 {
9813 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
9814 RADV_FROM_HANDLE(radv_buffer, buffer, _buffer);
9815 RADV_FROM_HANDLE(radv_buffer, count_buffer, _countBuffer);
9816 struct radv_draw_info info;
9817
9818 info.count = maxDrawCount;
9819 info.indirect = buffer;
9820 info.indirect_offset = offset;
9821 info.count_buffer = count_buffer;
9822 info.count_buffer_offset = countBufferOffset;
9823 info.stride = stride;
9824 info.strmout_buffer = NULL;
9825 info.indexed = false;
9826 info.instance_count = 0;
9827
9828 if (!radv_before_draw(cmd_buffer, &info, 1, false))
9829 return;
9830 radv_emit_indirect_draw_packets(cmd_buffer, &info);
9831 radv_after_draw(cmd_buffer, false);
9832 }
9833
9834 VKAPI_ATTR void VKAPI_CALL
9835 radv_CmdDrawIndexedIndirectCount(VkCommandBuffer commandBuffer, VkBuffer _buffer, VkDeviceSize offset,
9836 VkBuffer _countBuffer, VkDeviceSize countBufferOffset, uint32_t maxDrawCount,
9837 uint32_t stride)
9838 {
9839 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
9840 RADV_FROM_HANDLE(radv_buffer, buffer, _buffer);
9841 RADV_FROM_HANDLE(radv_buffer, count_buffer, _countBuffer);
9842 struct radv_draw_info info;
9843
9844 info.indexed = true;
9845 info.count = maxDrawCount;
9846 info.indirect = buffer;
9847 info.indirect_offset = offset;
9848 info.count_buffer = count_buffer;
9849 info.count_buffer_offset = countBufferOffset;
9850 info.stride = stride;
9851 info.strmout_buffer = NULL;
9852 info.instance_count = 0;
9853
9854 if (!radv_before_draw(cmd_buffer, &info, 1, false))
9855 return;
9856 radv_emit_indirect_draw_packets(cmd_buffer, &info);
9857 radv_after_draw(cmd_buffer, false);
9858 }
9859
9860 VKAPI_ATTR void VKAPI_CALL
9861 radv_CmdDrawMeshTasksEXT(VkCommandBuffer commandBuffer, uint32_t x, uint32_t y, uint32_t z)
9862 {
9863 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
9864 struct radv_draw_info info;
9865
9866 info.count = x * y * z;
9867 info.instance_count = 1;
9868 info.first_instance = 0;
9869 info.stride = 0;
9870 info.indexed = false;
9871 info.strmout_buffer = NULL;
9872 info.count_buffer = NULL;
9873 info.indirect = NULL;
9874
9875 if (!radv_before_taskmesh_draw(cmd_buffer, &info, 1, false))
9876 return;
9877
9878 if (radv_cmdbuf_has_stage(cmd_buffer, MESA_SHADER_TASK)) {
9879 radv_emit_direct_taskmesh_draw_packets(cmd_buffer, x, y, z);
9880 } else {
9881 radv_emit_direct_mesh_draw_packet(cmd_buffer, x, y, z);
9882 }
9883
9884 radv_after_draw(cmd_buffer, false);
9885 }
9886
9887 VKAPI_ATTR void VKAPI_CALL
9888 radv_CmdDrawMeshTasksIndirectEXT(VkCommandBuffer commandBuffer, VkBuffer _buffer, VkDeviceSize offset,
9889 uint32_t drawCount, uint32_t stride)
9890 {
9891 if (!drawCount)
9892 return;
9893
9894 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
9895 RADV_FROM_HANDLE(radv_buffer, buffer, _buffer);
9896
9897 struct radv_draw_info info;
9898
9899 info.indirect = buffer;
9900 info.indirect_offset = offset;
9901 info.stride = stride;
9902 info.count = drawCount;
9903 info.strmout_buffer = NULL;
9904 info.count_buffer = NULL;
9905 info.indexed = false;
9906 info.instance_count = 0;
9907
9908 if (!radv_before_taskmesh_draw(cmd_buffer, &info, drawCount, false))
9909 return;
9910
9911 if (radv_cmdbuf_has_stage(cmd_buffer, MESA_SHADER_TASK)) {
9912 radv_emit_indirect_taskmesh_draw_packets(cmd_buffer, &info);
9913 } else {
9914 radv_emit_indirect_mesh_draw_packets(cmd_buffer, &info);
9915 }
9916
9917 radv_after_draw(cmd_buffer, false);
9918 }
9919
9920 VKAPI_ATTR void VKAPI_CALL
9921 radv_CmdDrawMeshTasksIndirectCountEXT(VkCommandBuffer commandBuffer, VkBuffer _buffer, VkDeviceSize offset,
9922 VkBuffer _countBuffer, VkDeviceSize countBufferOffset, uint32_t maxDrawCount,
9923 uint32_t stride)
9924 {
9925
9926 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
9927 RADV_FROM_HANDLE(radv_buffer, buffer, _buffer);
9928 RADV_FROM_HANDLE(radv_buffer, count_buffer, _countBuffer);
9929
9930 struct radv_draw_info info;
9931
9932 info.indirect = buffer;
9933 info.indirect_offset = offset;
9934 info.stride = stride;
9935 info.count = maxDrawCount;
9936 info.strmout_buffer = NULL;
9937 info.count_buffer = count_buffer;
9938 info.count_buffer_offset = countBufferOffset;
9939 info.indexed = false;
9940 info.instance_count = 0;
9941
9942 if (!radv_before_taskmesh_draw(cmd_buffer, &info, maxDrawCount, false))
9943 return;
9944
9945 if (radv_cmdbuf_has_stage(cmd_buffer, MESA_SHADER_TASK)) {
9946 radv_emit_indirect_taskmesh_draw_packets(cmd_buffer, &info);
9947 } else {
9948 radv_emit_indirect_mesh_draw_packets(cmd_buffer, &info);
9949 }
9950
9951 radv_after_draw(cmd_buffer, false);
9952 }
9953
9954 /* TODO: Use these functions with the normal dispatch path. */
9955 static void radv_dgc_before_dispatch(struct radv_cmd_buffer *cmd_buffer);
9956 static void radv_dgc_after_dispatch(struct radv_cmd_buffer *cmd_buffer);
9957
9958 VKAPI_ATTR void VKAPI_CALL
9959 radv_CmdExecuteGeneratedCommandsNV(VkCommandBuffer commandBuffer, VkBool32 isPreprocessed,
9960 const VkGeneratedCommandsInfoNV *pGeneratedCommandsInfo)
9961 {
9962 VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
9963 VK_FROM_HANDLE(radv_indirect_command_layout, layout, pGeneratedCommandsInfo->indirectCommandsLayout);
9964 VK_FROM_HANDLE(radv_pipeline, pipeline, pGeneratedCommandsInfo->pipeline);
9965 VK_FROM_HANDLE(radv_buffer, prep_buffer, pGeneratedCommandsInfo->preprocessBuffer);
9966 const bool compute = layout->pipeline_bind_point == VK_PIPELINE_BIND_POINT_COMPUTE;
9967 const bool use_predication = radv_use_dgc_predication(cmd_buffer, pGeneratedCommandsInfo);
9968 const struct radv_device *device = cmd_buffer->device;
9969
9970 /* Secondary command buffers are needed for the full extension but can't use
9971 * PKT3_INDIRECT_BUFFER.
9972 */
9973 assert(cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_PRIMARY);
9974
9975 if (use_predication) {
9976 VK_FROM_HANDLE(radv_buffer, seq_count_buffer, pGeneratedCommandsInfo->sequencesCountBuffer);
9977 const uint64_t va = radv_buffer_get_va(seq_count_buffer->bo) + seq_count_buffer->offset +
9978 pGeneratedCommandsInfo->sequencesCountOffset;
9979
9980 radv_begin_conditional_rendering(cmd_buffer, va, true);
9981 }
9982
9983 if (!radv_dgc_can_preprocess(layout, pipeline)) {
9984 const bool old_predicating = cmd_buffer->state.predicating;
9985
9986 if (cmd_buffer->qf == RADV_QUEUE_COMPUTE && cmd_buffer->state.predicating) {
9987 /* Suspend conditional rendering when the DGC execute is called on the compute queue to
9988 * generate a cmdbuf which skips dispatches when necessary. This is because the
9989 * compute queue lacks IB2 support, which means it's not possible to skip the cmdbuf entirely.
9990 */
9991 cmd_buffer->state.predicating = false;
9992 }
9993
9994 radv_prepare_dgc(cmd_buffer, pGeneratedCommandsInfo, old_predicating);
9995
9996 if (cmd_buffer->qf == RADV_QUEUE_COMPUTE) {
9997 cmd_buffer->state.predicating = old_predicating;
9998 }
9999
10000 cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_CS_PARTIAL_FLUSH | RADV_CMD_FLAG_INV_VCACHE | RADV_CMD_FLAG_INV_L2;
10001 }
10002
10003 if (compute) {
10004 radv_dgc_before_dispatch(cmd_buffer);
10005 } else {
10006 struct radv_graphics_pipeline *graphics_pipeline = radv_pipeline_to_graphics(pipeline);
10007 struct radv_draw_info info;
10008
10009 info.count = pGeneratedCommandsInfo->sequencesCount;
10010 info.indirect = prep_buffer; /* We're not really going to use it this way, but it's a good signal
10011 that this is not a direct draw. */
10012 info.indirect_offset = 0;
10013 info.stride = 0;
10014 info.strmout_buffer = NULL;
10015 info.count_buffer = NULL;
10016 info.indexed = layout->indexed;
10017 info.instance_count = 0;
10018
10019 if (radv_pipeline_has_stage(graphics_pipeline, MESA_SHADER_MESH)) {
10020 if (!radv_before_taskmesh_draw(cmd_buffer, &info, 1, true))
10021 return;
10022 } else {
10023 if (!radv_before_draw(cmd_buffer, &info, 1, true))
10024 return;
10025 }
10026 }
10027
10028 uint32_t cmdbuf_size = radv_get_indirect_cmdbuf_size(pGeneratedCommandsInfo);
10029 struct radeon_winsys_bo *ib_bo = prep_buffer->bo;
10030 const uint64_t ib_offset = prep_buffer->offset + pGeneratedCommandsInfo->preprocessOffset;
10031 const uint32_t view_mask = cmd_buffer->state.render.view_mask;
10032
10033 if (!radv_cmd_buffer_uses_mec(cmd_buffer)) {
10034 radeon_emit(cmd_buffer->cs, PKT3(PKT3_PFP_SYNC_ME, 0, cmd_buffer->state.predicating));
10035 radeon_emit(cmd_buffer->cs, 0);
10036 }
10037
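/* For graphics with multiview, the generated command buffer is executed once per view,
 * updating the view index user SGPR before each execution.
 */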
10038 if (compute || !view_mask) {
10039 device->ws->cs_execute_ib(cmd_buffer->cs, ib_bo, ib_offset, cmdbuf_size >> 2, cmd_buffer->state.predicating);
10040 } else {
10041 u_foreach_bit (view, view_mask) {
10042 radv_emit_view_index(cmd_buffer, view);
10043
10044 device->ws->cs_execute_ib(cmd_buffer->cs, ib_bo, ib_offset, cmdbuf_size >> 2, cmd_buffer->state.predicating);
10045 }
10046 }
10047
10048 if (compute) {
10049 cmd_buffer->push_constant_stages |= VK_SHADER_STAGE_COMPUTE_BIT;
10050
10051 radv_dgc_after_dispatch(cmd_buffer);
10052 } else {
10053 struct radv_graphics_pipeline *graphics_pipeline = radv_pipeline_to_graphics(pipeline);
10054
10055 if (layout->binds_index_buffer) {
10056 cmd_buffer->state.last_index_type = -1;
10057 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_INDEX_BUFFER;
10058 }
10059
10060 if (layout->bind_vbo_mask)
10061 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_VERTEX_BUFFER;
10062
10063 cmd_buffer->push_constant_stages |= graphics_pipeline->active_stages;
10064
10065 if (!layout->indexed && cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX7) {
10066 /* On GFX7 and later, non-indexed draws overwrite VGT_INDEX_TYPE, so the state must be
10067 * re-emitted before the next indexed draw.
10068 */
10069 cmd_buffer->state.last_index_type = -1;
10070 }
10071
10072 cmd_buffer->state.last_num_instances = -1;
10073 cmd_buffer->state.last_vertex_offset_valid = false;
10074 cmd_buffer->state.last_first_instance = -1;
10075 cmd_buffer->state.last_drawid = -1;
10076
10077 radv_after_draw(cmd_buffer, true);
10078 }
10079
10080 if (use_predication) {
10081 radv_end_conditional_rendering(cmd_buffer);
10082 }
10083 }
10084
10085 static void
10086 radv_emit_dispatch_packets(struct radv_cmd_buffer *cmd_buffer, const struct radv_shader *compute_shader,
10087 const struct radv_dispatch_info *info)
10088 {
10089 unsigned dispatch_initiator = cmd_buffer->device->dispatch_initiator;
10090 struct radeon_winsys *ws = cmd_buffer->device->ws;
10091 bool predicating = cmd_buffer->state.predicating;
10092 struct radeon_cmdbuf *cs = cmd_buffer->cs;
10093 const struct radv_userdata_info *loc = radv_get_user_sgpr(compute_shader, AC_UD_CS_GRID_SIZE);
10094
10095 radv_describe_dispatch(cmd_buffer, info);
10096
10097 ASSERTED unsigned cdw_max = radeon_check_space(ws, cs, 30);
10098
10099 if (compute_shader->info.wave_size == 32) {
10100 assert(cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX10);
10101 dispatch_initiator |= S_00B800_CS_W32_EN(1);
10102 }
10103
10104 if (info->ordered)
10105 dispatch_initiator &= ~S_00B800_ORDER_MODE(1);
10106
10107 if (info->va) {
10108 if (info->indirect)
10109 radv_cs_add_buffer(ws, cs, info->indirect);
10110
10111 if (info->unaligned) {
10112 radeon_set_sh_reg_seq(cs, R_00B81C_COMPUTE_NUM_THREAD_X, 3);
10113 radeon_emit(cs, S_00B81C_NUM_THREAD_FULL(compute_shader->info.cs.block_size[0]));
10114 radeon_emit(cs, S_00B81C_NUM_THREAD_FULL(compute_shader->info.cs.block_size[1]));
10115 radeon_emit(cs, S_00B81C_NUM_THREAD_FULL(compute_shader->info.cs.block_size[2]));
10116
10117 dispatch_initiator |= S_00B800_USE_THREAD_DIMENSIONS(1);
10118 }
10119
10120 if (loc->sgpr_idx != -1) {
10121 unsigned reg = R_00B900_COMPUTE_USER_DATA_0 + loc->sgpr_idx * 4;
10122
10123 if (cmd_buffer->device->load_grid_size_from_user_sgpr) {
10124 assert(cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX10_3);
10125 radeon_emit(cs, PKT3(PKT3_LOAD_SH_REG_INDEX, 3, 0));
10126 radeon_emit(cs, info->va);
10127 radeon_emit(cs, info->va >> 32);
10128 radeon_emit(cs, (reg - SI_SH_REG_OFFSET) >> 2);
10129 radeon_emit(cs, 3);
10130 } else {
10131 radv_emit_shader_pointer(cmd_buffer->device, cmd_buffer->cs, reg, info->va, true);
10132 }
10133 }
10134
10135 if (radv_cmd_buffer_uses_mec(cmd_buffer)) {
10136 uint64_t indirect_va = info->va;
10137 const bool needs_align32_workaround =
10138 cmd_buffer->device->physical_device->rad_info.has_async_compute_align32_bug &&
10139 cmd_buffer->qf == RADV_QUEUE_COMPUTE && !radv_is_aligned(indirect_va, 32);
10140 const unsigned ace_predication_size =
10141 4 /* DISPATCH_INDIRECT */ + (needs_align32_workaround ? 6 * 3 /* 3x COPY_DATA */ : 0);
10142
10143 radv_cs_emit_compute_predication(&cmd_buffer->state, cs, cmd_buffer->mec_inv_pred_va,
10144 &cmd_buffer->mec_inv_pred_emitted, ace_predication_size);
10145
10146 if (needs_align32_workaround) {
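/* The indirect arguments (a 12-byte VkDispatchIndirectCommand) are copied into a fresh
 * 32-byte-aligned upload allocation with three COPY_DATA packets, so the MEC
 * DISPATCH_INDIRECT below sees an aligned address.
 */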
10147 const uint64_t unaligned_va = indirect_va;
10148 UNUSED void *ptr;
10149 uint32_t offset;
10150
10151 if (!radv_cmd_buffer_upload_alloc_aligned(cmd_buffer, sizeof(VkDispatchIndirectCommand), 32, &offset, &ptr))
10152 return;
10153
10154 indirect_va = radv_buffer_get_va(cmd_buffer->upload.upload_bo) + offset;
10155
10156 for (uint32_t i = 0; i < 3; i++) {
10157 const uint64_t src_va = unaligned_va + i * 4;
10158 const uint64_t dst_va = indirect_va + i * 4;
10159
10160 radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
10161 radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_SRC_MEM) | COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) |
10162 COPY_DATA_WR_CONFIRM);
10163 radeon_emit(cs, src_va);
10164 radeon_emit(cs, src_va >> 32);
10165 radeon_emit(cs, dst_va);
10166 radeon_emit(cs, dst_va >> 32);
10167 }
10168 }
10169
10170 radeon_emit(cs, PKT3(PKT3_DISPATCH_INDIRECT, 2, 0) | PKT3_SHADER_TYPE_S(1));
10171 radeon_emit(cs, indirect_va);
10172 radeon_emit(cs, indirect_va >> 32);
10173 radeon_emit(cs, dispatch_initiator);
10174 } else {
10175 radeon_emit(cs, PKT3(PKT3_SET_BASE, 2, 0) | PKT3_SHADER_TYPE_S(1));
10176 radeon_emit(cs, 1);
10177 radeon_emit(cs, info->va);
10178 radeon_emit(cs, info->va >> 32);
10179
10180 radeon_emit(cs, PKT3(PKT3_DISPATCH_INDIRECT, 1, predicating) | PKT3_SHADER_TYPE_S(1));
10181 radeon_emit(cs, 0);
10182 radeon_emit(cs, dispatch_initiator);
10183 }
10184 } else {
10185 const unsigned *cs_block_size = compute_shader->info.cs.block_size;
10186 unsigned blocks[3] = {info->blocks[0], info->blocks[1], info->blocks[2]};
10187 unsigned offsets[3] = {info->offsets[0], info->offsets[1], info->offsets[2]};
10188
10189 if (info->unaligned) {
10190 unsigned remainder[3];
10191
10192 /* If aligned, these should be an entire block size,
10193 * not 0.
10194 */
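/* Worked example: blocks[0] = 100 threads with cs_block_size[0] = 64 gives
 * remainder[0] = 100 + 64 - 128 = 36 and blocks[0] = DIV_ROUND_UP(100, 64) = 2,
 * i.e. one full 64-thread group plus one partial 36-thread group.
 */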
10195 remainder[0] = blocks[0] + cs_block_size[0] - align_u32_npot(blocks[0], cs_block_size[0]);
10196 remainder[1] = blocks[1] + cs_block_size[1] - align_u32_npot(blocks[1], cs_block_size[1]);
10197 remainder[2] = blocks[2] + cs_block_size[2] - align_u32_npot(blocks[2], cs_block_size[2]);
10198
10199 blocks[0] = DIV_ROUND_UP(blocks[0], cs_block_size[0]);
10200 blocks[1] = DIV_ROUND_UP(blocks[1], cs_block_size[1]);
10201 blocks[2] = DIV_ROUND_UP(blocks[2], cs_block_size[2]);
10202
10203 for (unsigned i = 0; i < 3; ++i) {
10204 assert(offsets[i] % cs_block_size[i] == 0);
10205 offsets[i] /= cs_block_size[i];
10206 }
10207
10208 radeon_set_sh_reg_seq(cs, R_00B81C_COMPUTE_NUM_THREAD_X, 3);
10209 radeon_emit(cs, S_00B81C_NUM_THREAD_FULL(cs_block_size[0]) | S_00B81C_NUM_THREAD_PARTIAL(remainder[0]));
10210 radeon_emit(cs, S_00B81C_NUM_THREAD_FULL(cs_block_size[1]) | S_00B81C_NUM_THREAD_PARTIAL(remainder[1]));
10211 radeon_emit(cs, S_00B81C_NUM_THREAD_FULL(cs_block_size[2]) | S_00B81C_NUM_THREAD_PARTIAL(remainder[2]));
10212
10213 dispatch_initiator |= S_00B800_PARTIAL_TG_EN(1);
10214 }
10215
10216 if (loc->sgpr_idx != -1) {
10217 if (cmd_buffer->device->load_grid_size_from_user_sgpr) {
10218 assert(loc->num_sgprs == 3);
10219
10220 radeon_set_sh_reg_seq(cs, R_00B900_COMPUTE_USER_DATA_0 + loc->sgpr_idx * 4, 3);
10221 radeon_emit(cs, blocks[0]);
10222 radeon_emit(cs, blocks[1]);
10223 radeon_emit(cs, blocks[2]);
10224 } else {
10225 uint32_t offset;
10226 if (!radv_cmd_buffer_upload_data(cmd_buffer, 12, blocks, &offset))
10227 return;
10228
10229 uint64_t va = radv_buffer_get_va(cmd_buffer->upload.upload_bo) + offset;
10230 radv_emit_shader_pointer(cmd_buffer->device, cmd_buffer->cs,
10231 R_00B900_COMPUTE_USER_DATA_0 + loc->sgpr_idx * 4, va, true);
10232 }
10233 }
10234
10235 if (offsets[0] || offsets[1] || offsets[2]) {
10236 radeon_set_sh_reg_seq(cs, R_00B810_COMPUTE_START_X, 3);
10237 radeon_emit(cs, offsets[0]);
10238 radeon_emit(cs, offsets[1]);
10239 radeon_emit(cs, offsets[2]);
10240
10241 /* The blocks in the packet are not counts but end values. */
10242 for (unsigned i = 0; i < 3; ++i)
10243 blocks[i] += offsets[i];
10244 } else {
10245 dispatch_initiator |= S_00B800_FORCE_START_AT_000(1);
10246 }
10247
10248 if (radv_cmd_buffer_uses_mec(cmd_buffer)) {
10249 radv_cs_emit_compute_predication(&cmd_buffer->state, cs, cmd_buffer->mec_inv_pred_va,
10250 &cmd_buffer->mec_inv_pred_emitted, 5 /* DISPATCH_DIRECT size */);
10251 predicating = false;
10252 }
10253
10254 if (cmd_buffer->device->physical_device->rad_info.has_async_compute_threadgroup_bug &&
10255 cmd_buffer->qf == RADV_QUEUE_COMPUTE) {
10256 for (unsigned i = 0; i < 3; i++) {
10257 if (info->unaligned) {
10258 /* info->blocks is already in thread dimensions for unaligned dispatches. */
10259 blocks[i] = info->blocks[i];
10260 } else {
10261 /* Force the async compute dispatch to be in "thread" dim mode to workaround a hw bug. */
10262 blocks[i] *= cs_block_size[i];
10263 }
10264
10265 dispatch_initiator |= S_00B800_USE_THREAD_DIMENSIONS(1);
10266 }
10267 }
10268
10269 radeon_emit(cs, PKT3(PKT3_DISPATCH_DIRECT, 3, predicating) | PKT3_SHADER_TYPE_S(1));
10270 radeon_emit(cs, blocks[0]);
10271 radeon_emit(cs, blocks[1]);
10272 radeon_emit(cs, blocks[2]);
10273 radeon_emit(cs, dispatch_initiator);
10274 }
10275
10276 assert(cmd_buffer->cs->cdw <= cdw_max);
10277 }
10278
10279 static void
10280 radv_upload_compute_shader_descriptors(struct radv_cmd_buffer *cmd_buffer, VkPipelineBindPoint bind_point)
10281 {
10282 radv_flush_descriptors(cmd_buffer, VK_SHADER_STAGE_COMPUTE_BIT, bind_point);
10283 const VkShaderStageFlags stages =
10284 bind_point == VK_PIPELINE_BIND_POINT_RAY_TRACING_KHR ? RADV_RT_STAGE_BITS : VK_SHADER_STAGE_COMPUTE_BIT;
10285 const VkShaderStageFlags pc_stages = radv_must_flush_constants(cmd_buffer, stages, bind_point);
10286 if (pc_stages)
10287 radv_flush_constants(cmd_buffer, pc_stages, bind_point);
10288 }
10289
10290 static void
10291 radv_dispatch(struct radv_cmd_buffer *cmd_buffer, const struct radv_dispatch_info *info,
10292 struct radv_compute_pipeline *pipeline, struct radv_shader *compute_shader,
10293 VkPipelineBindPoint bind_point)
10294 {
10295 bool has_prefetch = cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX7;
10296 bool pipeline_is_dirty = pipeline != cmd_buffer->state.emitted_compute_pipeline;
10297
10298 if (compute_shader->info.cs.regalloc_hang_bug)
10299 cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_PS_PARTIAL_FLUSH | RADV_CMD_FLAG_CS_PARTIAL_FLUSH;
10300
10301 if (cmd_buffer->state.flush_bits & (RADV_CMD_FLAG_FLUSH_AND_INV_CB | RADV_CMD_FLAG_FLUSH_AND_INV_DB |
10302 RADV_CMD_FLAG_PS_PARTIAL_FLUSH | RADV_CMD_FLAG_CS_PARTIAL_FLUSH)) {
10303 /* If we have to wait for idle, set all states first, so that
10304 * all SET packets are processed in parallel with previous draw
10305 * calls. Then upload descriptors, set shader pointers, and
10306 * dispatch, and prefetch at the end. This ensures that the
10307 * time the CUs are idle is very short. (there are only SET_SH
10308 * packets between the wait and the draw)
10309 */
10310 radv_emit_compute_pipeline(cmd_buffer, pipeline);
10311 radv_emit_cache_flush(cmd_buffer);
10312 /* <-- CUs are idle here --> */
10313
10314 radv_upload_compute_shader_descriptors(cmd_buffer, bind_point);
10315
10316 radv_emit_dispatch_packets(cmd_buffer, compute_shader, info);
10317 /* <-- CUs are busy here --> */
10318
10319 /* Start prefetches after the dispatch has been started. Both
10320 * will run in parallel, but starting the dispatch first is
10321 * more important.
10322 */
10323 if (has_prefetch && pipeline_is_dirty) {
10324 radv_emit_shader_prefetch(cmd_buffer, compute_shader);
10325 }
10326 } else {
10327 /* If we don't wait for idle, start prefetches first, then set
10328 * states, and dispatch at the end.
10329 */
10330 radv_emit_cache_flush(cmd_buffer);
10331
10332 if (has_prefetch && pipeline_is_dirty) {
10333 radv_emit_shader_prefetch(cmd_buffer, compute_shader);
10334 }
10335
10336 radv_upload_compute_shader_descriptors(cmd_buffer, bind_point);
10337
10338 radv_emit_compute_pipeline(cmd_buffer, pipeline);
10339 radv_emit_dispatch_packets(cmd_buffer, compute_shader, info);
10340 }
10341
10342 if (pipeline_is_dirty) {
10343 /* Raytracing uses compute shaders but has separate bind points and pipelines.
10344 * So if we set compute userdata & shader registers we should dirty the raytracing
10345 * ones and the other way around.
10346 *
10347 * We only need to do this when the pipeline is dirty because when we switch between
10348 * the two we always need to switch pipelines.
10349 */
10350 radv_mark_descriptor_sets_dirty(cmd_buffer, bind_point == VK_PIPELINE_BIND_POINT_COMPUTE
10351 ? VK_PIPELINE_BIND_POINT_RAY_TRACING_KHR
10352 : VK_PIPELINE_BIND_POINT_COMPUTE);
10353 }
10354
10355 if (compute_shader->info.cs.regalloc_hang_bug)
10356 cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_CS_PARTIAL_FLUSH;
10357
10358 radv_cmd_buffer_after_draw(cmd_buffer, RADV_CMD_FLAG_CS_PARTIAL_FLUSH, false);
10359 }
10360
10361 static void
10362 radv_dgc_before_dispatch(struct radv_cmd_buffer *cmd_buffer)
10363 {
10364 struct radv_compute_pipeline *pipeline = cmd_buffer->state.compute_pipeline;
10365 struct radv_shader *compute_shader = cmd_buffer->state.shaders[MESA_SHADER_COMPUTE];
10366 bool pipeline_is_dirty = pipeline != cmd_buffer->state.emitted_compute_pipeline;
10367
10368 /* We will have run the DGC patch shaders before, so we can assume that there is something to
10369 * flush. Apart from that, this is just radv_dispatch split in two: one pre-dispatch part and
10370 * one post-dispatch part. */
10371
10372 if (compute_shader->info.cs.regalloc_hang_bug)
10373 cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_PS_PARTIAL_FLUSH | RADV_CMD_FLAG_CS_PARTIAL_FLUSH;
10374
10375 radv_emit_compute_pipeline(cmd_buffer, pipeline);
10376 radv_emit_cache_flush(cmd_buffer);
10377
10378 radv_upload_compute_shader_descriptors(cmd_buffer, VK_PIPELINE_BIND_POINT_COMPUTE);
10379
10380 if (pipeline_is_dirty) {
10381 const bool has_prefetch = cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX7;
10382
10383 if (has_prefetch)
10384 radv_emit_shader_prefetch(cmd_buffer, compute_shader);
10385
10386 /* Raytracing uses compute shaders but has separate bind points and pipelines.
10387 * So if we set compute userdata & shader registers we should dirty the raytracing
10388 * ones and the other way around.
10389 *
10390 * We only need to do this when the pipeline is dirty because when we switch between
10391 * the two we always need to switch pipelines.
10392 */
10393 radv_mark_descriptor_sets_dirty(cmd_buffer, VK_PIPELINE_BIND_POINT_RAY_TRACING_KHR);
10394 }
10395 }
10396
10397 static void
10398 radv_dgc_after_dispatch(struct radv_cmd_buffer *cmd_buffer)
10399 {
10400 struct radv_shader *compute_shader = cmd_buffer->state.shaders[MESA_SHADER_COMPUTE];
10401
10402 if (compute_shader->info.cs.regalloc_hang_bug)
10403 cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_CS_PARTIAL_FLUSH;
10404
10405 radv_cmd_buffer_after_draw(cmd_buffer, RADV_CMD_FLAG_CS_PARTIAL_FLUSH, true);
10406 }
10407
10408 void
10409 radv_compute_dispatch(struct radv_cmd_buffer *cmd_buffer, const struct radv_dispatch_info *info)
10410 {
10411 radv_dispatch(cmd_buffer, info, cmd_buffer->state.compute_pipeline, cmd_buffer->state.shaders[MESA_SHADER_COMPUTE],
10412 VK_PIPELINE_BIND_POINT_COMPUTE);
10413 }
10414
10415 VKAPI_ATTR void VKAPI_CALL
10416 radv_CmdDispatchBase(VkCommandBuffer commandBuffer, uint32_t base_x, uint32_t base_y, uint32_t base_z, uint32_t x,
10417 uint32_t y, uint32_t z)
10418 {
10419 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
10420 struct radv_dispatch_info info = {0};
10421
10422 info.blocks[0] = x;
10423 info.blocks[1] = y;
10424 info.blocks[2] = z;
10425
10426 info.offsets[0] = base_x;
10427 info.offsets[1] = base_y;
10428 info.offsets[2] = base_z;
10429 radv_compute_dispatch(cmd_buffer, &info);
10430 }
10431
10432 VKAPI_ATTR void VKAPI_CALL
10433 radv_CmdDispatchIndirect(VkCommandBuffer commandBuffer, VkBuffer _buffer, VkDeviceSize offset)
10434 {
10435 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
10436 RADV_FROM_HANDLE(radv_buffer, buffer, _buffer);
10437 struct radv_dispatch_info info = {0};
10438
10439 info.indirect = buffer->bo;
10440 info.va = radv_buffer_get_va(buffer->bo) + buffer->offset + offset;
10441
10442 radv_compute_dispatch(cmd_buffer, &info);
10443 }
10444
10445 void
10446 radv_unaligned_dispatch(struct radv_cmd_buffer *cmd_buffer, uint32_t x, uint32_t y, uint32_t z)
10447 {
10448 struct radv_dispatch_info info = {0};
10449
10450 info.blocks[0] = x;
10451 info.blocks[1] = y;
10452 info.blocks[2] = z;
10453 info.unaligned = 1;
10454
10455 radv_compute_dispatch(cmd_buffer, &info);
10456 }
10457
10458 void
10459 radv_indirect_dispatch(struct radv_cmd_buffer *cmd_buffer, struct radeon_winsys_bo *bo, uint64_t va)
10460 {
10461 struct radv_dispatch_info info = {0};
10462
10463 info.indirect = bo;
10464 info.va = va;
10465
10466 radv_compute_dispatch(cmd_buffer, &info);
10467 }
10468
10469 static void
10470 radv_trace_trace_rays(struct radv_cmd_buffer *cmd_buffer, const VkTraceRaysIndirectCommand2KHR *cmd,
10471 uint64_t indirect_va)
10472 {
10473 if (!cmd || indirect_va)
10474 return;
10475
10476 struct radv_rra_ray_history_data *data = malloc(sizeof(struct radv_rra_ray_history_data));
10477 if (!data)
10478 return;
10479
10480 uint32_t width = DIV_ROUND_UP(cmd->width, cmd_buffer->device->rra_trace.ray_history_resolution_scale);
10481 uint32_t height = DIV_ROUND_UP(cmd->height, cmd_buffer->device->rra_trace.ray_history_resolution_scale);
10482 uint32_t depth = DIV_ROUND_UP(cmd->depth, cmd_buffer->device->rra_trace.ray_history_resolution_scale);
10483
10484 struct radv_rra_ray_history_counter counter = {
10485 .dispatch_size = {width, height, depth},
10486 .hit_shader_count = cmd->hitShaderBindingTableSize / cmd->hitShaderBindingTableStride,
10487 .miss_shader_count = cmd->missShaderBindingTableSize / cmd->missShaderBindingTableStride,
10488 .shader_count = cmd_buffer->state.rt_pipeline->stage_count,
10489 .pipeline_api_hash = cmd_buffer->state.rt_pipeline->base.base.pipeline_hash,
10490 .mode = 1,
10491 .stride = sizeof(uint32_t),
10492 .data_size = 0,
10493 .ray_id_begin = 0,
10494 .ray_id_end = 0xFFFFFFFF,
10495 .pipeline_type = RADV_RRA_PIPELINE_RAY_TRACING,
10496 };
10497
10498 struct radv_rra_ray_history_dispatch_size dispatch_size = {
10499 .size = {width, height, depth},
10500 };
10501
10502 struct radv_rra_ray_history_traversal_flags traversal_flags = {0};
10503
10504 data->metadata = (struct radv_rra_ray_history_metadata){
10505 .counter_info.type = RADV_RRA_COUNTER_INFO,
10506 .counter_info.size = sizeof(struct radv_rra_ray_history_counter),
10507 .counter = counter,
10508
10509 .dispatch_size_info.type = RADV_RRA_DISPATCH_SIZE,
10510 .dispatch_size_info.size = sizeof(struct radv_rra_ray_history_dispatch_size),
10511 .dispatch_size = dispatch_size,
10512
10513 .traversal_flags_info.type = RADV_RRA_TRAVERSAL_FLAGS,
10514 .traversal_flags_info.size = sizeof(struct radv_rra_ray_history_traversal_flags),
10515 .traversal_flags = traversal_flags,
10516 };
10517
10518 uint32_t dispatch_index = util_dynarray_num_elements(&cmd_buffer->ray_history, struct radv_rra_ray_history_data *)
10519 << 16;
10520
10521 util_dynarray_append(&cmd_buffer->ray_history, struct radv_rra_ray_history_data *, data);
10522
10523 cmd_buffer->state.flush_bits |=
10524 RADV_CMD_FLAG_INV_SCACHE | RADV_CMD_FLAG_CS_PARTIAL_FLUSH |
10525 radv_src_access_flush(cmd_buffer, VK_ACCESS_2_SHADER_READ_BIT | VK_ACCESS_2_SHADER_WRITE_BIT, NULL) |
10526 radv_dst_access_flush(cmd_buffer, VK_ACCESS_2_SHADER_READ_BIT | VK_ACCESS_2_SHADER_WRITE_BIT, NULL);
10527
10528 radv_update_buffer_cp(
10529 cmd_buffer,
10530 cmd_buffer->device->rra_trace.ray_history_addr + offsetof(struct radv_ray_history_header, dispatch_index),
10531 &dispatch_index, sizeof(dispatch_index));
10532 }
10533
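/* How the ray launch dimensions and SBT are provided:
 * - direct: launch size is known at record time (vkCmdTraceRaysKHR),
 * - indirect: launch size is read from GPU memory (vkCmdTraceRaysIndirectKHR),
 * - indirect2: both the SBT and the launch size live in GPU memory
 *   (vkCmdTraceRaysIndirect2KHR).
 */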
10534 enum radv_rt_mode {
10535 radv_rt_mode_direct,
10536 radv_rt_mode_indirect,
10537 radv_rt_mode_indirect2,
10538 };
10539
10540 static void
10541 radv_upload_trace_rays_params(struct radv_cmd_buffer *cmd_buffer, VkTraceRaysIndirectCommand2KHR *tables,
10542 enum radv_rt_mode mode, uint64_t *launch_size_va, uint64_t *sbt_va)
10543 {
10544 uint32_t upload_size = mode == radv_rt_mode_direct ? sizeof(VkTraceRaysIndirectCommand2KHR)
10545 : offsetof(VkTraceRaysIndirectCommand2KHR, width);
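/* For direct mode the whole VkTraceRaysIndirectCommand2KHR (including
 * width/height/depth) is uploaded; for indirect mode only the SBT part up to,
 * but not including, the width field is uploaded because the launch size comes
 * from the indirect buffer.
 */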
10546
10547 uint32_t offset;
10548 if (!radv_cmd_buffer_upload_data(cmd_buffer, upload_size, tables, &offset))
10549 return;
10550
10551 uint64_t upload_va = radv_buffer_get_va(cmd_buffer->upload.upload_bo) + offset;
10552
10553 if (mode == radv_rt_mode_direct)
10554 *launch_size_va = upload_va + offsetof(VkTraceRaysIndirectCommand2KHR, width);
10555 if (sbt_va)
10556 *sbt_va = upload_va;
10557 }
10558
10559 static void
10560 radv_trace_rays(struct radv_cmd_buffer *cmd_buffer, VkTraceRaysIndirectCommand2KHR *tables, uint64_t indirect_va,
10561 enum radv_rt_mode mode)
10562 {
10563 if (cmd_buffer->device->instance->debug_flags & RADV_DEBUG_NO_RT)
10564 return;
10565
10566 if (unlikely(cmd_buffer->device->rra_trace.ray_history_buffer))
10567 radv_trace_trace_rays(cmd_buffer, tables, indirect_va);
10568
10569 struct radv_compute_pipeline *pipeline = &cmd_buffer->state.rt_pipeline->base;
10570 struct radv_shader *rt_prolog = cmd_buffer->state.rt_prolog;
10571 uint32_t base_reg = rt_prolog->info.user_data_0;
10572
10573 /* Reserve scratch for stacks manually since it is not handled by the compute path. */
10574 uint32_t scratch_bytes_per_wave = rt_prolog->config.scratch_bytes_per_wave;
10575 uint32_t wave_size = rt_prolog->info.wave_size;
10576
10577 /* The hardware register is specified as a multiple of 64 or 256 DWORDS. */
10578 unsigned scratch_alloc_granule = cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX11 ? 256 : 1024;
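/* The granule is in bytes: 256 bytes (64 dwords) on GFX11+, 1024 bytes (256 dwords) on older chips. */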
10579 scratch_bytes_per_wave += align(cmd_buffer->state.rt_stack_size * wave_size, scratch_alloc_granule);
10580
10581 cmd_buffer->compute_scratch_size_per_wave_needed =
10582 MAX2(cmd_buffer->compute_scratch_size_per_wave_needed, scratch_bytes_per_wave);
10583
10584 /* Since the workgroup size is 8x4 (or 8x8), 1D dispatches can only fill 8 threads per wave at most. To increase
10585 * occupancy, it's beneficial to convert to a 2D dispatch in these cases. */
10586 if (tables && tables->height == 1 && tables->width >= cmd_buffer->state.rt_prolog->info.cs.block_size[0])
10587 tables->height = ACO_RT_CONVERTED_2D_LAUNCH_SIZE;
10588
10589 struct radv_dispatch_info info = {0};
10590 info.unaligned = true;
10591
10592 uint64_t launch_size_va = 0;
10593 uint64_t sbt_va = 0;
10594
10595 if (mode != radv_rt_mode_indirect2) {
10596 launch_size_va = indirect_va;
10597 radv_upload_trace_rays_params(cmd_buffer, tables, mode, &launch_size_va, &sbt_va);
10598 } else {
10599 launch_size_va = indirect_va + offsetof(VkTraceRaysIndirectCommand2KHR, width);
10600 sbt_va = indirect_va;
10601 }
10602
10603 uint32_t remaining_ray_count = 0;
10604
10605 if (mode == radv_rt_mode_direct) {
10606 info.blocks[0] = tables->width;
10607 info.blocks[1] = tables->height;
10608 info.blocks[2] = tables->depth;
10609
10610 if (tables->height == ACO_RT_CONVERTED_2D_LAUNCH_SIZE) {
10611 /* We need the ray count for the 2D dispatch to be a multiple of the y block size for the division to work, and
10612 * a multiple of the x block size because the invocation offset must be a multiple of the block size when
10613 * dispatching the remaining rays. Fortunately, the x block size is itself a multiple of the y block size, so
10614 * we only need to ensure that the ray count is a multiple of the x block size. */
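/* Hypothetical example (the actual block sizes depend on the prolog): with an x
 * block size of 32 and a y block size of 4, width = 1000 leaves
 * remaining_ray_count = 1000 % 32 = 8 and ray_count = 992, giving a 248 x 4
 * converted launch; the 8 leftover rays are dispatched separately below.
 */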
10615 remaining_ray_count = tables->width % rt_prolog->info.cs.block_size[0];
10616
10617 uint32_t ray_count = tables->width - remaining_ray_count;
10618 info.blocks[0] = ray_count / rt_prolog->info.cs.block_size[1];
10619 info.blocks[1] = rt_prolog->info.cs.block_size[1];
10620 }
10621 } else
10622 info.va = launch_size_va;
10623
10624 ASSERTED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 15);
10625
10626 const struct radv_userdata_info *desc_loc = radv_get_user_sgpr(rt_prolog, AC_UD_CS_SBT_DESCRIPTORS);
10627 if (desc_loc->sgpr_idx != -1) {
10628 radv_emit_shader_pointer(cmd_buffer->device, cmd_buffer->cs, base_reg + desc_loc->sgpr_idx * 4, sbt_va, true);
10629 }
10630
10631 const struct radv_userdata_info *size_loc = radv_get_user_sgpr(rt_prolog, AC_UD_CS_RAY_LAUNCH_SIZE_ADDR);
10632 if (size_loc->sgpr_idx != -1) {
10633 radv_emit_shader_pointer(cmd_buffer->device, cmd_buffer->cs, base_reg + size_loc->sgpr_idx * 4, launch_size_va,
10634 true);
10635 }
10636
10637 const struct radv_userdata_info *base_loc = radv_get_user_sgpr(rt_prolog, AC_UD_CS_RAY_DYNAMIC_CALLABLE_STACK_BASE);
10638 if (base_loc->sgpr_idx != -1) {
10639 const struct radv_shader_info *cs_info = &rt_prolog->info;
10640 radeon_set_sh_reg(cmd_buffer->cs, R_00B900_COMPUTE_USER_DATA_0 + base_loc->sgpr_idx * 4,
10641 rt_prolog->config.scratch_bytes_per_wave / cs_info->wave_size);
10642 }
10643
10644 const struct radv_userdata_info *shader_loc = radv_get_user_sgpr(rt_prolog, AC_UD_CS_TRAVERSAL_SHADER_ADDR);
10645 struct radv_shader *traversal_shader = cmd_buffer->state.shaders[MESA_SHADER_INTERSECTION];
10646 if (shader_loc->sgpr_idx != -1 && traversal_shader) {
10647 uint64_t traversal_va = traversal_shader->va | radv_rt_priority_traversal;
10648 radv_emit_shader_pointer(cmd_buffer->device, cmd_buffer->cs, base_reg + shader_loc->sgpr_idx * 4, traversal_va,
10649 true);
10650 }
10651
10652 assert(cmd_buffer->cs->cdw <= cdw_max);
10653
10654 radv_dispatch(cmd_buffer, &info, pipeline, rt_prolog, VK_PIPELINE_BIND_POINT_RAY_TRACING_KHR);
10655
10656 if (remaining_ray_count) {
10657 info.blocks[0] = remaining_ray_count;
10658 info.blocks[1] = 1;
10659 info.offsets[0] = tables->width - remaining_ray_count;
10660
10661 /* Reset the ray launch size so the prolog doesn't think this is a converted dispatch */
10662 tables->height = 1;
10663 radv_upload_trace_rays_params(cmd_buffer, tables, mode, &launch_size_va, NULL);
10664 if (size_loc->sgpr_idx != -1) {
10665 radv_emit_shader_pointer(cmd_buffer->device, cmd_buffer->cs, base_reg + size_loc->sgpr_idx * 4, launch_size_va,
10666 true);
10667 }
10668
10669 radv_dispatch(cmd_buffer, &info, pipeline, rt_prolog, VK_PIPELINE_BIND_POINT_RAY_TRACING_KHR);
10670 }
10671 }
10672
10673 VKAPI_ATTR void VKAPI_CALL
10674 radv_CmdTraceRaysKHR(VkCommandBuffer commandBuffer, const VkStridedDeviceAddressRegionKHR *pRaygenShaderBindingTable,
10675 const VkStridedDeviceAddressRegionKHR *pMissShaderBindingTable,
10676 const VkStridedDeviceAddressRegionKHR *pHitShaderBindingTable,
10677 const VkStridedDeviceAddressRegionKHR *pCallableShaderBindingTable, uint32_t width,
10678 uint32_t height, uint32_t depth)
10679 {
10680 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
10681
10682 VkTraceRaysIndirectCommand2KHR tables = {
10683 .raygenShaderRecordAddress = pRaygenShaderBindingTable->deviceAddress,
10684 .raygenShaderRecordSize = pRaygenShaderBindingTable->size,
10685 .missShaderBindingTableAddress = pMissShaderBindingTable->deviceAddress,
10686 .missShaderBindingTableSize = pMissShaderBindingTable->size,
10687 .missShaderBindingTableStride = pMissShaderBindingTable->stride,
10688 .hitShaderBindingTableAddress = pHitShaderBindingTable->deviceAddress,
10689 .hitShaderBindingTableSize = pHitShaderBindingTable->size,
10690 .hitShaderBindingTableStride = pHitShaderBindingTable->stride,
10691 .callableShaderBindingTableAddress = pCallableShaderBindingTable->deviceAddress,
10692 .callableShaderBindingTableSize = pCallableShaderBindingTable->size,
10693 .callableShaderBindingTableStride = pCallableShaderBindingTable->stride,
10694 .width = width,
10695 .height = height,
10696 .depth = depth,
10697 };
10698
10699 radv_trace_rays(cmd_buffer, &tables, 0, radv_rt_mode_direct);
10700 }
10701
10702 VKAPI_ATTR void VKAPI_CALL
10703 radv_CmdTraceRaysIndirectKHR(VkCommandBuffer commandBuffer,
10704 const VkStridedDeviceAddressRegionKHR *pRaygenShaderBindingTable,
10705 const VkStridedDeviceAddressRegionKHR *pMissShaderBindingTable,
10706 const VkStridedDeviceAddressRegionKHR *pHitShaderBindingTable,
10707 const VkStridedDeviceAddressRegionKHR *pCallableShaderBindingTable,
10708 VkDeviceAddress indirectDeviceAddress)
10709 {
10710 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
10711
10712 assert(cmd_buffer->device->use_global_bo_list);
10713
10714 VkTraceRaysIndirectCommand2KHR tables = {
10715 .raygenShaderRecordAddress = pRaygenShaderBindingTable->deviceAddress,
10716 .raygenShaderRecordSize = pRaygenShaderBindingTable->size,
10717 .missShaderBindingTableAddress = pMissShaderBindingTable->deviceAddress,
10718 .missShaderBindingTableSize = pMissShaderBindingTable->size,
10719 .missShaderBindingTableStride = pMissShaderBindingTable->stride,
10720 .hitShaderBindingTableAddress = pHitShaderBindingTable->deviceAddress,
10721 .hitShaderBindingTableSize = pHitShaderBindingTable->size,
10722 .hitShaderBindingTableStride = pHitShaderBindingTable->stride,
10723 .callableShaderBindingTableAddress = pCallableShaderBindingTable->deviceAddress,
10724 .callableShaderBindingTableSize = pCallableShaderBindingTable->size,
10725 .callableShaderBindingTableStride = pCallableShaderBindingTable->stride,
10726 };
10727
10728 radv_trace_rays(cmd_buffer, &tables, indirectDeviceAddress, radv_rt_mode_indirect);
10729 }
10730
10731 VKAPI_ATTR void VKAPI_CALL
10732 radv_CmdTraceRaysIndirect2KHR(VkCommandBuffer commandBuffer, VkDeviceAddress indirectDeviceAddress)
10733 {
10734 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
10735
10736 assert(cmd_buffer->device->use_global_bo_list);
10737
10738 radv_trace_rays(cmd_buffer, NULL, indirectDeviceAddress, radv_rt_mode_indirect2);
10739 }
10740
10741 VKAPI_ATTR void VKAPI_CALL
10742 radv_CmdSetRayTracingPipelineStackSizeKHR(VkCommandBuffer commandBuffer, uint32_t size)
10743 {
10744 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
10745 cmd_buffer->state.rt_stack_size = size;
10746 }
10747
10748 /*
10749 * For HTILE we have the following interesting clear words:
10750 * 0xfffff30f: Uncompressed, full depth range, for depth+stencil HTILE
10751 * 0xfffc000f: Uncompressed, full depth range, for depth only HTILE.
10752 * 0xfffffff0: Clear depth to 1.0
10753 * 0x00000000: Clear depth to 0.0
10754 */
10755 static void
10756 radv_initialize_htile(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image,
10757 const VkImageSubresourceRange *range)
10758 {
10759 struct radv_cmd_state *state = &cmd_buffer->state;
10760 uint32_t htile_value = radv_get_htile_initial_value(cmd_buffer->device, image);
10761 VkClearDepthStencilValue value = {0};
10762 struct radv_barrier_data barrier = {0};
10763
10764 barrier.layout_transitions.init_mask_ram = 1;
10765 radv_describe_layout_transition(cmd_buffer, &barrier);
10766
10767 /* When transitioning from the LAYOUT_UNDEFINED layout, not everyone is consistent
10768 * in considering previous rendering work for WAW hazards. */
10769 state->flush_bits |= radv_src_access_flush(cmd_buffer, VK_ACCESS_2_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT, image);
10770
10771 if (image->planes[0].surface.has_stencil &&
10772 !(range->aspectMask == (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT))) {
10773 /* Flush caches before performing a separate aspect initialization because it's a
10774 * read-modify-write operation.
10775 */
10776 state->flush_bits |= radv_dst_access_flush(cmd_buffer, VK_ACCESS_2_SHADER_READ_BIT, image);
10777 }
10778
10779 state->flush_bits |= radv_clear_htile(cmd_buffer, image, range, htile_value);
10780
10781 radv_set_ds_clear_metadata(cmd_buffer, image, range, value, range->aspectMask);
10782
10783 if (radv_image_is_tc_compat_htile(image) && (range->aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT)) {
10784 /* Initialize the TC-compat metadata value to 0 because
10785 * DB_Z_INFO.RANGE_PRECISION is set to 1 by default, and we only
10786 * have to conditionally update its value when performing
10787 * a fast depth clear.
10788 */
10789 radv_set_tc_compat_zrange_metadata(cmd_buffer, image, range, 0);
10790 }
10791 }
10792
10793 static void
10794 radv_handle_depth_image_transition(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image,
10795 VkImageLayout src_layout, VkImageLayout dst_layout, unsigned src_queue_mask,
10796 unsigned dst_queue_mask, const VkImageSubresourceRange *range,
10797 struct radv_sample_locations_state *sample_locs)
10798 {
10799 struct radv_device *device = cmd_buffer->device;
10800
10801 if (!radv_htile_enabled(image, range->baseMipLevel))
10802 return;
10803
10804 if (src_layout == VK_IMAGE_LAYOUT_UNDEFINED) {
10805 radv_initialize_htile(cmd_buffer, image, range);
10806 } else if (!radv_layout_is_htile_compressed(device, image, src_layout, src_queue_mask) &&
10807 radv_layout_is_htile_compressed(device, image, dst_layout, dst_queue_mask)) {
10808 radv_initialize_htile(cmd_buffer, image, range);
10809 } else if (radv_layout_is_htile_compressed(device, image, src_layout, src_queue_mask) &&
10810 !radv_layout_is_htile_compressed(device, image, dst_layout, dst_queue_mask)) {
10811 cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB | RADV_CMD_FLAG_FLUSH_AND_INV_DB_META;
10812
10813 radv_expand_depth_stencil(cmd_buffer, image, range, sample_locs);
10814
10815 cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB | RADV_CMD_FLAG_FLUSH_AND_INV_DB_META;
10816 }
10817 }
10818
10819 static uint32_t
10820 radv_init_cmask(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image, const VkImageSubresourceRange *range,
10821 uint32_t value)
10822 {
10823 struct radv_barrier_data barrier = {0};
10824
10825 barrier.layout_transitions.init_mask_ram = 1;
10826 radv_describe_layout_transition(cmd_buffer, &barrier);
10827
10828 return radv_clear_cmask(cmd_buffer, image, range, value);
10829 }
10830
10831 uint32_t
10832 radv_init_fmask(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image, const VkImageSubresourceRange *range)
10833 {
10834 static const uint32_t fmask_clear_values[4] = {0x00000000, 0x02020202, 0xE4E4E4E4, 0x76543210};
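/* These values appear to encode the identity ("fully expanded") fragment mapping
 * for each sample count, e.g. 0xE4 = 0b11100100 stores 3,2,1,0 in 2-bit fields
 * for 4 samples and 0x76543210 is the 4-bit-per-sample equivalent for 8 samples.
 */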
10835 uint32_t log2_samples = util_logbase2(image->vk.samples);
10836 uint32_t value = fmask_clear_values[log2_samples];
10837 struct radv_barrier_data barrier = {0};
10838
10839 barrier.layout_transitions.init_mask_ram = 1;
10840 radv_describe_layout_transition(cmd_buffer, &barrier);
10841
10842 return radv_clear_fmask(cmd_buffer, image, range, value);
10843 }
10844
10845 uint32_t
10846 radv_init_dcc(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image, const VkImageSubresourceRange *range,
10847 uint32_t value)
10848 {
10849 struct radv_barrier_data barrier = {0};
10850 uint32_t flush_bits = 0;
10851 unsigned size = 0;
10852
10853 barrier.layout_transitions.init_mask_ram = 1;
10854 radv_describe_layout_transition(cmd_buffer, &barrier);
10855
10856 flush_bits |= radv_clear_dcc(cmd_buffer, image, range, value);
10857
10858 if (cmd_buffer->device->physical_device->rad_info.gfx_level == GFX8) {
10859 /* When DCC is enabled with mipmaps, some levels might not
10860 * support fast clears and we have to initialize them as "fully
10861 * expanded".
10862 */
10863 /* Compute the size of all fast clearable DCC levels. */
10864 for (unsigned i = 0; i < image->planes[0].surface.num_meta_levels; i++) {
10865 struct legacy_surf_dcc_level *dcc_level = &image->planes[0].surface.u.legacy.color.dcc_level[i];
10866 unsigned dcc_fast_clear_size = dcc_level->dcc_slice_fast_clear_size * image->vk.array_layers;
10867
10868 if (!dcc_fast_clear_size)
10869 break;
10870
10871 size = dcc_level->dcc_offset + dcc_fast_clear_size;
10872 }
10873
10874 /* Initialize the mipmap levels without DCC. */
10875 if (size != image->planes[0].surface.meta_size) {
10876 flush_bits |= radv_fill_buffer(cmd_buffer, image, image->bindings[0].bo,
10877 radv_buffer_get_va(image->bindings[0].bo) + image->bindings[0].offset +
10878 image->planes[0].surface.meta_offset + size,
10879 image->planes[0].surface.meta_size - size, 0xffffffff);
10880 }
10881 }
10882
10883 return flush_bits;
10884 }
10885
10886 /**
10887 * Initialize DCC/FMASK/CMASK metadata for a color image.
10888 */
10889 static void
10890 radv_init_color_image_metadata(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image, VkImageLayout src_layout,
10891 VkImageLayout dst_layout, unsigned src_queue_mask, unsigned dst_queue_mask,
10892 const VkImageSubresourceRange *range)
10893 {
10894 uint32_t flush_bits = 0;
10895
10896 /* When transitioning from the LAYOUT_UNDEFINED layout, not everyone is
10897 * consistent in considering previous rendering work for WAW hazards.
10898 */
10899 cmd_buffer->state.flush_bits |= radv_src_access_flush(cmd_buffer, VK_ACCESS_2_COLOR_ATTACHMENT_WRITE_BIT, image);
10900
10901 if (radv_image_has_cmask(image)) {
10902 static const uint32_t cmask_clear_values[4] = {0xffffffff, 0xdddddddd, 0xeeeeeeee, 0xffffffff};
10903 uint32_t log2_samples = util_logbase2(image->vk.samples);
10904
10905 flush_bits |= radv_init_cmask(cmd_buffer, image, range, cmask_clear_values[log2_samples]);
10906 }
10907
10908 if (radv_image_has_fmask(image)) {
10909 flush_bits |= radv_init_fmask(cmd_buffer, image, range);
10910 }
10911
10912 if (radv_dcc_enabled(image, range->baseMipLevel)) {
10913 uint32_t value = 0xffffffffu; /* Fully expanded mode. */
10914
10915 if (radv_layout_dcc_compressed(cmd_buffer->device, image, range->baseMipLevel, dst_layout, dst_queue_mask)) {
10916 value = 0u;
10917 }
10918
10919 flush_bits |= radv_init_dcc(cmd_buffer, image, range, value);
10920 }
10921
10922 if (radv_image_has_cmask(image) || radv_dcc_enabled(image, range->baseMipLevel)) {
10923 radv_update_fce_metadata(cmd_buffer, image, range, false);
10924
10925 uint32_t color_values[2] = {0};
10926 radv_set_color_clear_metadata(cmd_buffer, image, range, color_values);
10927 }
10928
10929 cmd_buffer->state.flush_bits |= flush_bits;
10930 }
10931
10932 static void
10933 radv_retile_transition(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image, VkImageLayout src_layout,
10934 VkImageLayout dst_layout, unsigned dst_queue_mask)
10935 {
10936 /* If the image is read-only, we don't have to retile DCC because it can't change. */
10937 if (!(image->vk.usage & RADV_IMAGE_USAGE_WRITE_BITS))
10938 return;
10939
10940 if (src_layout != VK_IMAGE_LAYOUT_PRESENT_SRC_KHR &&
10941 (dst_layout == VK_IMAGE_LAYOUT_PRESENT_SRC_KHR || (dst_queue_mask & (1u << RADV_QUEUE_FOREIGN))))
10942 radv_retile_dcc(cmd_buffer, image);
10943 }
10944
10945 static bool
10946 radv_image_need_retile(const struct radv_cmd_buffer *cmd_buffer, const struct radv_image *image)
10947 {
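/* DCC retiling is only needed when the displayable DCC surface is separate from
 * the main DCC metadata (display_dcc_offset != meta_offset) and we are not on
 * the transfer queue.
 */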
10948 return cmd_buffer->qf != RADV_QUEUE_TRANSFER && image->planes[0].surface.display_dcc_offset &&
10949 image->planes[0].surface.display_dcc_offset != image->planes[0].surface.meta_offset;
10950 }
10951
10952 /**
10953 * Handle color image transitions for DCC/FMASK/CMASK.
10954 */
10955 static void
10956 radv_handle_color_image_transition(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image,
10957 VkImageLayout src_layout, VkImageLayout dst_layout, unsigned src_queue_mask,
10958 unsigned dst_queue_mask, const VkImageSubresourceRange *range)
10959 {
10960 bool dcc_decompressed = false, fast_clear_flushed = false;
10961
10962 if (!radv_image_has_cmask(image) && !radv_image_has_fmask(image) && !radv_dcc_enabled(image, range->baseMipLevel))
10963 return;
10964
10965 if (src_layout == VK_IMAGE_LAYOUT_UNDEFINED) {
10966 radv_init_color_image_metadata(cmd_buffer, image, src_layout, dst_layout, src_queue_mask, dst_queue_mask, range);
10967
10968 if (radv_image_need_retile(cmd_buffer, image))
10969 radv_retile_transition(cmd_buffer, image, src_layout, dst_layout, dst_queue_mask);
10970 return;
10971 }
10972
10973 if (radv_dcc_enabled(image, range->baseMipLevel)) {
10974 if (src_layout == VK_IMAGE_LAYOUT_PREINITIALIZED) {
10975 cmd_buffer->state.flush_bits |= radv_init_dcc(cmd_buffer, image, range, 0xffffffffu);
10976 } else if (radv_layout_dcc_compressed(cmd_buffer->device, image, range->baseMipLevel, src_layout,
10977 src_queue_mask) &&
10978 !radv_layout_dcc_compressed(cmd_buffer->device, image, range->baseMipLevel, dst_layout,
10979 dst_queue_mask)) {
10980 radv_decompress_dcc(cmd_buffer, image, range);
10981 dcc_decompressed = true;
10982 } else if (radv_layout_can_fast_clear(cmd_buffer->device, image, range->baseMipLevel, src_layout,
10983 src_queue_mask) &&
10984 !radv_layout_can_fast_clear(cmd_buffer->device, image, range->baseMipLevel, dst_layout,
10985 dst_queue_mask)) {
10986 radv_fast_clear_flush_image_inplace(cmd_buffer, image, range);
10987 fast_clear_flushed = true;
10988 }
10989
10990 if (radv_image_need_retile(cmd_buffer, image))
10991 radv_retile_transition(cmd_buffer, image, src_layout, dst_layout, dst_queue_mask);
10992 } else if (radv_image_has_cmask(image) || radv_image_has_fmask(image)) {
10993 if (radv_layout_can_fast_clear(cmd_buffer->device, image, range->baseMipLevel, src_layout, src_queue_mask) &&
10994 !radv_layout_can_fast_clear(cmd_buffer->device, image, range->baseMipLevel, dst_layout, dst_queue_mask)) {
10995 radv_fast_clear_flush_image_inplace(cmd_buffer, image, range);
10996 fast_clear_flushed = true;
10997 }
10998 }
10999
11000 /* MSAA color decompress. */
11001 const enum radv_fmask_compression src_fmask_comp =
11002 radv_layout_fmask_compression(cmd_buffer->device, image, src_layout, src_queue_mask);
11003 const enum radv_fmask_compression dst_fmask_comp =
11004 radv_layout_fmask_compression(cmd_buffer->device, image, dst_layout, dst_queue_mask);
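/* Only proceed when the destination layout supports less FMASK compression than
 * the source layout; otherwise there is nothing to decompress.
 */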
11005 if (src_fmask_comp <= dst_fmask_comp)
11006 return;
11007
11008 if (src_fmask_comp == RADV_FMASK_COMPRESSION_FULL) {
11009 if (radv_dcc_enabled(image, range->baseMipLevel) && !radv_image_use_dcc_image_stores(cmd_buffer->device, image) &&
11010 !dcc_decompressed) {
11011 /* A DCC decompress is required before expanding FMASK
11012 * when DCC stores aren't supported to avoid being in
11013 * a state where DCC is compressed and the main
11014 * surface is uncompressed.
11015 */
11016 radv_decompress_dcc(cmd_buffer, image, range);
11017 } else if (!fast_clear_flushed) {
11018 /* A FMASK decompress is required before expanding
11019 * FMASK.
11020 */
11021 radv_fast_clear_flush_image_inplace(cmd_buffer, image, range);
11022 }
11023 }
11024
11025 if (dst_fmask_comp == RADV_FMASK_COMPRESSION_NONE) {
11026 struct radv_barrier_data barrier = {0};
11027 barrier.layout_transitions.fmask_color_expand = 1;
11028 radv_describe_layout_transition(cmd_buffer, &barrier);
11029
11030 radv_expand_fmask_image_inplace(cmd_buffer, image, range);
11031 }
11032 }
11033
11034 static void
11035 radv_handle_image_transition(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image, VkImageLayout src_layout,
11036 VkImageLayout dst_layout, uint32_t src_family_index, uint32_t dst_family_index,
11037 const VkImageSubresourceRange *range, struct radv_sample_locations_state *sample_locs)
11038 {
11039 enum radv_queue_family src_qf = vk_queue_to_radv(cmd_buffer->device->physical_device, src_family_index);
11040 enum radv_queue_family dst_qf = vk_queue_to_radv(cmd_buffer->device->physical_device, dst_family_index);
11041 if (image->exclusive && src_family_index != dst_family_index) {
11042 /* This is an acquire or a release operation and there will be
11043 * a corresponding release/acquire. Do the transition in the
11044 * most flexible queue. */
11045
11046 assert(src_qf == cmd_buffer->qf || dst_qf == cmd_buffer->qf);
11047
11048 if (src_family_index == VK_QUEUE_FAMILY_EXTERNAL || src_family_index == VK_QUEUE_FAMILY_FOREIGN_EXT)
11049 return;
11050
11051 if (cmd_buffer->qf == RADV_QUEUE_TRANSFER)
11052 return;
11053
11054 if (cmd_buffer->qf == RADV_QUEUE_COMPUTE && (src_qf == RADV_QUEUE_GENERAL || dst_qf == RADV_QUEUE_GENERAL))
11055 return;
11056 }
11057
11058 unsigned src_queue_mask = radv_image_queue_family_mask(image, src_qf, cmd_buffer->qf);
11059 unsigned dst_queue_mask = radv_image_queue_family_mask(image, dst_qf, cmd_buffer->qf);
11060
11061 if (src_layout == dst_layout && src_queue_mask == dst_queue_mask)
11062 return;
11063
11064 if (image->vk.aspects & VK_IMAGE_ASPECT_DEPTH_BIT) {
11065 radv_handle_depth_image_transition(cmd_buffer, image, src_layout, dst_layout, src_queue_mask, dst_queue_mask,
11066 range, sample_locs);
11067 } else {
11068 radv_handle_color_image_transition(cmd_buffer, image, src_layout, dst_layout, src_queue_mask, dst_queue_mask,
11069 range);
11070 }
11071 }
11072
11073 static void
11074 radv_cp_dma_wait_for_stages(struct radv_cmd_buffer *cmd_buffer, VkPipelineStageFlags2 stage_mask)
11075 {
11076 /* Make sure CP DMA is idle because the driver might have performed a DMA operation for copying a
11077 * buffer (or a MSAA image using FMASK). Note that updating a buffer is considered a clear
11078 * operation but it might also use a CP DMA copy in some rare situations. Other operations using
11079 * a CP DMA clear are implicitly synchronized (see CP_DMA_SYNC).
11080 */
11081 if (stage_mask &
11082 (VK_PIPELINE_STAGE_2_COPY_BIT | VK_PIPELINE_STAGE_2_CLEAR_BIT | VK_PIPELINE_STAGE_2_ALL_TRANSFER_BIT |
11083 VK_PIPELINE_STAGE_2_BOTTOM_OF_PIPE_BIT | VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT))
11084 radv_cp_dma_wait_for_idle(cmd_buffer);
11085 }
11086
11087 static void
11088 radv_barrier(struct radv_cmd_buffer *cmd_buffer, const VkDependencyInfo *dep_info, enum rgp_barrier_reason reason)
11089 {
11090 enum radv_cmd_flush_bits src_flush_bits = 0;
11091 enum radv_cmd_flush_bits dst_flush_bits = 0;
11092 VkPipelineStageFlags2 src_stage_mask = 0;
11093 VkPipelineStageFlags2 dst_stage_mask = 0;
11094
11095 if (cmd_buffer->state.render.active)
11096 radv_mark_noncoherent_rb(cmd_buffer);
11097
11098 radv_describe_barrier_start(cmd_buffer, reason);
11099
11100 for (uint32_t i = 0; i < dep_info->memoryBarrierCount; i++) {
11101 src_stage_mask |= dep_info->pMemoryBarriers[i].srcStageMask;
11102 src_flush_bits |= radv_src_access_flush(cmd_buffer, dep_info->pMemoryBarriers[i].srcAccessMask, NULL);
11103 dst_stage_mask |= dep_info->pMemoryBarriers[i].dstStageMask;
11104 dst_flush_bits |= radv_dst_access_flush(cmd_buffer, dep_info->pMemoryBarriers[i].dstAccessMask, NULL);
11105 }
11106
11107 for (uint32_t i = 0; i < dep_info->bufferMemoryBarrierCount; i++) {
11108 src_stage_mask |= dep_info->pBufferMemoryBarriers[i].srcStageMask;
11109 src_flush_bits |= radv_src_access_flush(cmd_buffer, dep_info->pBufferMemoryBarriers[i].srcAccessMask, NULL);
11110 dst_stage_mask |= dep_info->pBufferMemoryBarriers[i].dstStageMask;
11111 dst_flush_bits |= radv_dst_access_flush(cmd_buffer, dep_info->pBufferMemoryBarriers[i].dstAccessMask, NULL);
11112 }
11113
11114 for (uint32_t i = 0; i < dep_info->imageMemoryBarrierCount; i++) {
11115 RADV_FROM_HANDLE(radv_image, image, dep_info->pImageMemoryBarriers[i].image);
11116
11117 src_stage_mask |= dep_info->pImageMemoryBarriers[i].srcStageMask;
11118 src_flush_bits |= radv_src_access_flush(cmd_buffer, dep_info->pImageMemoryBarriers[i].srcAccessMask, image);
11119 dst_stage_mask |= dep_info->pImageMemoryBarriers[i].dstStageMask;
11120 dst_flush_bits |= radv_dst_access_flush(cmd_buffer, dep_info->pImageMemoryBarriers[i].dstAccessMask, image);
11121 }
11122
11123 /* The Vulkan spec 1.1.98 says:
11124 *
11125 * "An execution dependency with only
11126 * VK_PIPELINE_STAGE_2_BOTTOM_OF_PIPE_BIT in the destination stage mask
11127 * will only prevent that stage from executing in subsequently
11128 * submitted commands. As this stage does not perform any actual
11129 * execution, this is not observable - in effect, it does not delay
11130 * processing of subsequent commands. Similarly an execution dependency
11131 * with only VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT in the source stage mask
11132 * will effectively not wait for any prior commands to complete."
11133 */
11134 if (dst_stage_mask != VK_PIPELINE_STAGE_2_BOTTOM_OF_PIPE_BIT)
11135 radv_stage_flush(cmd_buffer, src_stage_mask);
11136 cmd_buffer->state.flush_bits |= src_flush_bits;
11137
11138 radv_gang_barrier(cmd_buffer, src_stage_mask, 0);
11139
11140 for (uint32_t i = 0; i < dep_info->imageMemoryBarrierCount; i++) {
11141 RADV_FROM_HANDLE(radv_image, image, dep_info->pImageMemoryBarriers[i].image);
11142
11143 const struct VkSampleLocationsInfoEXT *sample_locs_info =
11144 vk_find_struct_const(dep_info->pImageMemoryBarriers[i].pNext, SAMPLE_LOCATIONS_INFO_EXT);
11145 struct radv_sample_locations_state sample_locations;
11146
11147 if (sample_locs_info) {
11148 assert(image->vk.create_flags & VK_IMAGE_CREATE_SAMPLE_LOCATIONS_COMPATIBLE_DEPTH_BIT_EXT);
11149 sample_locations.per_pixel = sample_locs_info->sampleLocationsPerPixel;
11150 sample_locations.grid_size = sample_locs_info->sampleLocationGridSize;
11151 sample_locations.count = sample_locs_info->sampleLocationsCount;
11152 typed_memcpy(&sample_locations.locations[0], sample_locs_info->pSampleLocations,
11153 sample_locs_info->sampleLocationsCount);
11154 }
11155
11156 radv_handle_image_transition(
11157 cmd_buffer, image, dep_info->pImageMemoryBarriers[i].oldLayout, dep_info->pImageMemoryBarriers[i].newLayout,
11158 dep_info->pImageMemoryBarriers[i].srcQueueFamilyIndex, dep_info->pImageMemoryBarriers[i].dstQueueFamilyIndex,
11159 &dep_info->pImageMemoryBarriers[i].subresourceRange, sample_locs_info ? &sample_locations : NULL);
11160 }
11161
11162 radv_gang_barrier(cmd_buffer, 0, dst_stage_mask);
11163
11164 if (cmd_buffer->qf == RADV_QUEUE_TRANSFER) {
11165 /* SDMA NOP packet waits for all pending SDMA operations to complete.
11166 * Note that GFX9+ is supposed to have RAW dependency tracking, but it's buggy
11167 * so we can't rely on it for now.
11168 */
11169 radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 1);
11170 radeon_emit(cmd_buffer->cs, SDMA_PACKET(SDMA_OPCODE_NOP, 0, 0));
11171 } else {
11172 const bool is_gfx_or_ace = cmd_buffer->qf == RADV_QUEUE_GENERAL || cmd_buffer->qf == RADV_QUEUE_COMPUTE;
11173 if (is_gfx_or_ace)
11174 radv_cp_dma_wait_for_stages(cmd_buffer, src_stage_mask);
11175 }
11176
11177 cmd_buffer->state.flush_bits |= dst_flush_bits;
11178
11179 radv_describe_barrier_end(cmd_buffer);
11180 }
11181
11182 VKAPI_ATTR void VKAPI_CALL
11183 radv_CmdPipelineBarrier2(VkCommandBuffer commandBuffer, const VkDependencyInfo *pDependencyInfo)
11184 {
11185 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
11186 enum rgp_barrier_reason barrier_reason;
11187
11188 if (cmd_buffer->vk.runtime_rp_barrier) {
11189 barrier_reason = RGP_BARRIER_EXTERNAL_RENDER_PASS_SYNC;
11190 } else {
11191 barrier_reason = RGP_BARRIER_EXTERNAL_CMD_PIPELINE_BARRIER;
11192 }
11193
11194 radv_barrier(cmd_buffer, pDependencyInfo, barrier_reason);
11195 }
11196
11197 static void
11198 write_event(struct radv_cmd_buffer *cmd_buffer, struct radv_event *event, VkPipelineStageFlags2 stageMask,
11199 unsigned value)
11200 {
11201 struct radeon_cmdbuf *cs = cmd_buffer->cs;
11202 uint64_t va = radv_buffer_get_va(event->bo);
11203
11204 if (cmd_buffer->qf == RADV_QUEUE_VIDEO_DEC)
11205 return;
11206
11207 radv_emit_cache_flush(cmd_buffer);
11208
11209 radv_cs_add_buffer(cmd_buffer->device->ws, cs, event->bo);
11210
11211 ASSERTED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cs, 28);
11212
11213 if (stageMask & (VK_PIPELINE_STAGE_2_COPY_BIT | VK_PIPELINE_STAGE_2_RESOLVE_BIT | VK_PIPELINE_STAGE_2_BLIT_BIT |
11214 VK_PIPELINE_STAGE_2_CLEAR_BIT)) {
11215 /* Be conservative for now. */
11216 stageMask |= VK_PIPELINE_STAGE_2_ALL_TRANSFER_BIT;
11217 }
11218
11219 /* Flags that only require a top-of-pipe event. */
11220 VkPipelineStageFlags2 top_of_pipe_flags = VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT;
11221
11222 /* Flags that only require a post-index-fetch event. */
11223 VkPipelineStageFlags2 post_index_fetch_flags =
11224 top_of_pipe_flags | VK_PIPELINE_STAGE_2_DRAW_INDIRECT_BIT | VK_PIPELINE_STAGE_2_VERTEX_INPUT_BIT;
11225
11226 /* Flags that only require signaling post PS. */
11227 VkPipelineStageFlags2 post_ps_flags =
11228 post_index_fetch_flags | VK_PIPELINE_STAGE_2_VERTEX_SHADER_BIT |
11229 VK_PIPELINE_STAGE_2_TESSELLATION_CONTROL_SHADER_BIT | VK_PIPELINE_STAGE_2_TESSELLATION_EVALUATION_SHADER_BIT |
11230 VK_PIPELINE_STAGE_2_GEOMETRY_SHADER_BIT | VK_PIPELINE_STAGE_2_MESH_SHADER_BIT_EXT |
11231 VK_PIPELINE_STAGE_2_TRANSFORM_FEEDBACK_BIT_EXT | VK_PIPELINE_STAGE_2_PRE_RASTERIZATION_SHADERS_BIT |
11232 VK_PIPELINE_STAGE_2_FRAGMENT_SHADING_RATE_ATTACHMENT_BIT_KHR | VK_PIPELINE_STAGE_2_EARLY_FRAGMENT_TESTS_BIT |
11233 VK_PIPELINE_STAGE_2_FRAGMENT_SHADER_BIT;
11234
11235 /* Flags that only require signaling post CS. */
11236 VkPipelineStageFlags2 post_cs_flags = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT;
11237
11238 radv_cp_dma_wait_for_stages(cmd_buffer, stageMask);
11239
11240 if (!(stageMask & ~top_of_pipe_flags)) {
11241 /* Just need to sync the PFP engine. */
11242 radv_write_data(cmd_buffer, V_370_PFP, va, 1, &value, false);
11243 } else if (!(stageMask & ~post_index_fetch_flags)) {
11244 /* Sync ME because PFP reads index and indirect buffers. */
11245 radv_write_data(cmd_buffer, V_370_ME, va, 1, &value, false);
11246 } else {
11247 unsigned event_type;
11248
11249 if (!(stageMask & ~post_ps_flags)) {
11250 /* Sync previous fragment shaders. */
11251 event_type = V_028A90_PS_DONE;
11252 } else if (!(stageMask & ~post_cs_flags)) {
11253 /* Sync previous compute shaders. */
11254 event_type = V_028A90_CS_DONE;
11255 } else {
11256 /* Otherwise, sync all prior GPU work. */
11257 event_type = V_028A90_BOTTOM_OF_PIPE_TS;
11258 }
11259
11260 radv_cs_emit_write_event_eop(cs, cmd_buffer->device->physical_device->rad_info.gfx_level, cmd_buffer->qf,
11261 event_type, 0, EOP_DST_SEL_MEM, EOP_DATA_SEL_VALUE_32BIT, va, value,
11262 cmd_buffer->gfx9_eop_bug_va);
11263 }
11264
11265 assert(cmd_buffer->cs->cdw <= cdw_max);
11266 }
11267
11268 VKAPI_ATTR void VKAPI_CALL
11269 radv_CmdSetEvent2(VkCommandBuffer commandBuffer, VkEvent _event, const VkDependencyInfo *pDependencyInfo)
11270 {
11271 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
11272 RADV_FROM_HANDLE(radv_event, event, _event);
11273 VkPipelineStageFlags2 src_stage_mask = 0;
11274
11275 for (uint32_t i = 0; i < pDependencyInfo->memoryBarrierCount; i++)
11276 src_stage_mask |= pDependencyInfo->pMemoryBarriers[i].srcStageMask;
11277 for (uint32_t i = 0; i < pDependencyInfo->bufferMemoryBarrierCount; i++)
11278 src_stage_mask |= pDependencyInfo->pBufferMemoryBarriers[i].srcStageMask;
11279 for (uint32_t i = 0; i < pDependencyInfo->imageMemoryBarrierCount; i++)
11280 src_stage_mask |= pDependencyInfo->pImageMemoryBarriers[i].srcStageMask;
11281
11282 write_event(cmd_buffer, event, src_stage_mask, 1);
11283 }
11284
11285 VKAPI_ATTR void VKAPI_CALL
11286 radv_CmdResetEvent2(VkCommandBuffer commandBuffer, VkEvent _event, VkPipelineStageFlags2 stageMask)
11287 {
11288 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
11289 RADV_FROM_HANDLE(radv_event, event, _event);
11290
11291 write_event(cmd_buffer, event, stageMask, 0);
11292 }
11293
11294 VKAPI_ATTR void VKAPI_CALL
11295 radv_CmdWaitEvents2(VkCommandBuffer commandBuffer, uint32_t eventCount, const VkEvent *pEvents,
11296 const VkDependencyInfo *pDependencyInfos)
11297 {
11298 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
11299 struct radeon_cmdbuf *cs = cmd_buffer->cs;
11300
11301 if (cmd_buffer->qf == RADV_QUEUE_VIDEO_DEC)
11302 return;
11303
11304 for (unsigned i = 0; i < eventCount; ++i) {
11305 RADV_FROM_HANDLE(radv_event, event, pEvents[i]);
11306 uint64_t va = radv_buffer_get_va(event->bo);
11307
11308 radv_cs_add_buffer(cmd_buffer->device->ws, cs, event->bo);
11309
11310 ASSERTED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cs, 7);
11311
11312 radv_cp_wait_mem(cs, cmd_buffer->qf, WAIT_REG_MEM_EQUAL, va, 1, 0xffffffff);
11313 assert(cmd_buffer->cs->cdw <= cdw_max);
11314 }
11315
11316 radv_barrier(cmd_buffer, pDependencyInfos, RGP_BARRIER_EXTERNAL_CMD_WAIT_EVENTS);
11317 }
11318
11319 void
11320 radv_begin_conditional_rendering(struct radv_cmd_buffer *cmd_buffer, uint64_t va, bool draw_visible)
11321 {
11322 struct radeon_cmdbuf *cs = cmd_buffer->cs;
11323 unsigned pred_op = PREDICATION_OP_BOOL32;
11324
11325 radv_emit_cache_flush(cmd_buffer);
11326
11327 if (cmd_buffer->qf == RADV_QUEUE_GENERAL && !cmd_buffer->device->physical_device->rad_info.has_32bit_predication) {
11328 uint64_t pred_value = 0, pred_va;
11329 unsigned pred_offset;
11330
11331 /* From the Vulkan spec 1.1.107:
11332 *
11333 * "If the 32-bit value at offset in buffer memory is zero,
11334 * then the rendering commands are discarded, otherwise they
11335 * are executed as normal. If the value of the predicate in
11336 * buffer memory changes while conditional rendering is
11337 * active, the rendering commands may be discarded in an
11338 * implementation-dependent way. Some implementations may
11339 * latch the value of the predicate upon beginning conditional
11340 * rendering while others may read it before every rendering
11341 * command."
11342 *
11343 * But, the AMD hardware treats the predicate as a 64-bit
11344 * value which means we need a workaround in the driver.
11345 * Luckily, we are not required to support the case where the value
11346 * changes while predication is active.
11347 *
11348 * The workaround is as follows:
11349 * 1) allocate a 64-bit value in the upload BO and initialize it
11350 * to 0
11351 * 2) copy the 32-bit predicate value to the upload BO
11352 * 3) use the new allocated VA address for predication
11353 *
11354 * Based on the conditionalrender demo, it's faster to do the
11355 * COPY_DATA in ME (+ sync PFP) instead of PFP.
11356 */
11357 radv_cmd_buffer_upload_data(cmd_buffer, 8, &pred_value, &pred_offset);
11358
11359 pred_va = radv_buffer_get_va(cmd_buffer->upload.upload_bo) + pred_offset;
11360
11361 radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 8);
11362
11363 radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
11364 radeon_emit(cs,
11365 COPY_DATA_SRC_SEL(COPY_DATA_SRC_MEM) | COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) | COPY_DATA_WR_CONFIRM);
11366 radeon_emit(cs, va);
11367 radeon_emit(cs, va >> 32);
11368 radeon_emit(cs, pred_va);
11369 radeon_emit(cs, pred_va >> 32);
11370
11371 radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0));
11372 radeon_emit(cs, 0);
11373
11374 va = pred_va;
11375 pred_op = PREDICATION_OP_BOOL64;
11376 }
11377
11378 /* MEC doesn't support predication, we emulate it elsewhere. */
11379 if (!radv_cmd_buffer_uses_mec(cmd_buffer)) {
11380 radv_emit_set_predication_state(cmd_buffer, draw_visible, pred_op, va);
11381 }
11382
11383 /* Store conditional rendering user info. */
11384 cmd_buffer->state.predicating = true;
11385 cmd_buffer->state.predication_type = draw_visible;
11386 cmd_buffer->state.predication_op = pred_op;
11387 cmd_buffer->state.predication_va = va;
11388 cmd_buffer->mec_inv_pred_emitted = false;
11389 }
11390
11391 void
11392 radv_end_conditional_rendering(struct radv_cmd_buffer *cmd_buffer)
11393 {
11394 /* MEC doesn't support predication, no need to emit anything here. */
11395 if (!radv_cmd_buffer_uses_mec(cmd_buffer)) {
11396 radv_emit_set_predication_state(cmd_buffer, false, 0, 0);
11397 }
11398
11399 /* Reset conditional rendering user info. */
11400 cmd_buffer->state.predicating = false;
11401 cmd_buffer->state.predication_type = -1;
11402 cmd_buffer->state.predication_op = 0;
11403 cmd_buffer->state.predication_va = 0;
11404 cmd_buffer->mec_inv_pred_emitted = false;
11405 }
11406
11407 /* VK_EXT_conditional_rendering */
11408 VKAPI_ATTR void VKAPI_CALL
11409 radv_CmdBeginConditionalRenderingEXT(VkCommandBuffer commandBuffer,
11410 const VkConditionalRenderingBeginInfoEXT *pConditionalRenderingBegin)
11411 {
11412 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
11413 RADV_FROM_HANDLE(radv_buffer, buffer, pConditionalRenderingBegin->buffer);
11414 bool draw_visible = true;
11415 uint64_t va;
11416
11417 va = radv_buffer_get_va(buffer->bo) + buffer->offset + pConditionalRenderingBegin->offset;
11418
11419 /* By default, if the 32-bit value at offset in buffer memory is zero,
11420 * then the rendering commands are discarded, otherwise they are
11421 * executed as normal. If the inverted flag is set, all commands are
11422 * discarded if the value is non-zero.
11423 */
11424 if (pConditionalRenderingBegin->flags & VK_CONDITIONAL_RENDERING_INVERTED_BIT_EXT) {
11425 draw_visible = false;
11426 }
11427
11428 radv_begin_conditional_rendering(cmd_buffer, va, draw_visible);
11429 }
11430
11431 VKAPI_ATTR void VKAPI_CALL
11432 radv_CmdEndConditionalRenderingEXT(VkCommandBuffer commandBuffer)
11433 {
11434 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
11435
11436 radv_end_conditional_rendering(cmd_buffer);
11437 }
11438
11439 /* VK_EXT_transform_feedback */
11440 VKAPI_ATTR void VKAPI_CALL
11441 radv_CmdBindTransformFeedbackBuffersEXT(VkCommandBuffer commandBuffer, uint32_t firstBinding, uint32_t bindingCount,
11442 const VkBuffer *pBuffers, const VkDeviceSize *pOffsets,
11443 const VkDeviceSize *pSizes)
11444 {
11445 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
11446 struct radv_streamout_binding *sb = cmd_buffer->streamout_bindings;
11447 uint8_t enabled_mask = 0;
11448
11449 assert(firstBinding + bindingCount <= MAX_SO_BUFFERS);
11450 for (uint32_t i = 0; i < bindingCount; i++) {
11451 uint32_t idx = firstBinding + i;
11452
11453 sb[idx].buffer = radv_buffer_from_handle(pBuffers[i]);
11454 sb[idx].offset = pOffsets[i];
11455
11456 if (!pSizes || pSizes[i] == VK_WHOLE_SIZE) {
11457 sb[idx].size = sb[idx].buffer->vk.size - sb[idx].offset;
11458 } else {
11459 sb[idx].size = pSizes[i];
11460 }
11461
11462 radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, sb[idx].buffer->bo);
11463
11464 enabled_mask |= 1 << idx;
11465 }
11466
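/* Remember which streamout slots have a buffer bound; Begin/EndTransformFeedback
 * iterate over this mask.
 */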
11467 cmd_buffer->state.streamout.enabled_mask |= enabled_mask;
11468
11469 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_STREAMOUT_BUFFER;
11470 }
11471
11472 static void
11473 radv_set_streamout_enable(struct radv_cmd_buffer *cmd_buffer, bool enable)
11474 {
11475 struct radv_streamout_state *so = &cmd_buffer->state.streamout;
11476 bool old_streamout_enabled = radv_is_streamout_enabled(cmd_buffer);
11477 uint32_t old_hw_enabled_mask = so->hw_enabled_mask;
11478
11479 so->streamout_enabled = enable;
11480
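/* Build the hardware enable mask by replicating the per-buffer mask into one
 * 4-bit group for each of the 4 streamout streams.
 */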
11481 so->hw_enabled_mask =
11482 so->enabled_mask | (so->enabled_mask << 4) | (so->enabled_mask << 8) | (so->enabled_mask << 12);
11483
11484 if (!cmd_buffer->device->physical_device->use_ngg_streamout &&
11485 ((old_streamout_enabled != radv_is_streamout_enabled(cmd_buffer)) ||
11486 (old_hw_enabled_mask != so->hw_enabled_mask)))
11487 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_STREAMOUT_ENABLE;
11488
11489 if (cmd_buffer->device->physical_device->use_ngg_streamout) {
11490 /* Re-emit streamout descriptors because with NGG streamout, a buffer size of 0 acts as a
11491 * disable bit, and this is needed when streamout needs to be ignored in shaders.
11492 */
11493 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_SHADER_QUERY | RADV_CMD_DIRTY_STREAMOUT_BUFFER;
11494 }
11495 }
11496
11497 static void
11498 radv_flush_vgt_streamout(struct radv_cmd_buffer *cmd_buffer)
11499 {
11500 struct radeon_cmdbuf *cs = cmd_buffer->cs;
11501 unsigned reg_strmout_cntl;
11502
11503 ASSERTED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 14);
11504
11505 /* The register is at different places on different ASICs. */
11506 if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX9) {
11507 reg_strmout_cntl = R_0300FC_CP_STRMOUT_CNTL;
11508 radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 3, 0));
11509 radeon_emit(cs, S_370_DST_SEL(V_370_MEM_MAPPED_REGISTER) | S_370_ENGINE_SEL(V_370_ME));
11510 radeon_emit(cs, R_0300FC_CP_STRMOUT_CNTL >> 2);
11511 radeon_emit(cs, 0);
11512 radeon_emit(cs, 0);
11513 } else if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX7) {
11514 reg_strmout_cntl = R_0300FC_CP_STRMOUT_CNTL;
11515 radeon_set_uconfig_reg(cs, reg_strmout_cntl, 0);
11516 } else {
11517 reg_strmout_cntl = R_0084FC_CP_STRMOUT_CNTL;
11518 radeon_set_config_reg(cs, reg_strmout_cntl, 0);
11519 }
11520
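/* Flush streamout and wait until the CP sets OFFSET_UPDATE_DONE in
 * CP_STRMOUT_CNTL.
 */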
11521 radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
11522 radeon_emit(cs, EVENT_TYPE(V_028A90_SO_VGTSTREAMOUT_FLUSH) | EVENT_INDEX(0));
11523
11524 radeon_emit(cs, PKT3(PKT3_WAIT_REG_MEM, 5, 0));
11525 radeon_emit(cs, WAIT_REG_MEM_EQUAL); /* wait until the register is equal to the reference value */
11526 radeon_emit(cs, reg_strmout_cntl >> 2); /* register */
11527 radeon_emit(cs, 0);
11528 radeon_emit(cs, S_0084FC_OFFSET_UPDATE_DONE(1)); /* reference value */
11529 radeon_emit(cs, S_0084FC_OFFSET_UPDATE_DONE(1)); /* mask */
11530 radeon_emit(cs, 4); /* poll interval */
11531
11532 assert(cs->cdw <= cdw_max);
11533 }
11534
11535 VKAPI_ATTR void VKAPI_CALL
11536 radv_CmdBeginTransformFeedbackEXT(VkCommandBuffer commandBuffer, uint32_t firstCounterBuffer,
11537 uint32_t counterBufferCount, const VkBuffer *pCounterBuffers,
11538 const VkDeviceSize *pCounterBufferOffsets)
11539 {
11540 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
11541 struct radv_streamout_binding *sb = cmd_buffer->streamout_bindings;
11542 struct radv_streamout_state *so = &cmd_buffer->state.streamout;
11543 struct radeon_cmdbuf *cs = cmd_buffer->cs;
11544
11545 assert(firstCounterBuffer + counterBufferCount <= MAX_SO_BUFFERS);
11546 if (!cmd_buffer->device->physical_device->use_ngg_streamout)
11547 radv_flush_vgt_streamout(cmd_buffer);
11548
11549 ASSERTED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, MAX_SO_BUFFERS * 10);
11550
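/* For each enabled streamout buffer, either append by loading the offset from
 * the matching counter buffer or start from the beginning of the buffer.
 */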
11551 u_foreach_bit (i, so->enabled_mask) {
11552 int32_t counter_buffer_idx = i - firstCounterBuffer;
11553 if (counter_buffer_idx >= 0 && counter_buffer_idx >= counterBufferCount)
11554 counter_buffer_idx = -1;
11555
11556 bool append = counter_buffer_idx >= 0 && pCounterBuffers && pCounterBuffers[counter_buffer_idx];
11557 uint64_t va = 0;
11558
11559 if (append) {
11560 RADV_FROM_HANDLE(radv_buffer, buffer, pCounterBuffers[counter_buffer_idx]);
11561 uint64_t counter_buffer_offset = 0;
11562
11563 if (pCounterBufferOffsets)
11564 counter_buffer_offset = pCounterBufferOffsets[counter_buffer_idx];
11565
11566 va += radv_buffer_get_va(buffer->bo);
11567 va += buffer->offset + counter_buffer_offset;
11568
11569 radv_cs_add_buffer(cmd_buffer->device->ws, cs, buffer->bo);
11570 }
11571
11572 if (cmd_buffer->device->physical_device->use_ngg_streamout) {
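/* With NGG streamout, the dwords-written counter lives in a GDS register:
 * load it from the counter buffer, or reset it to 0 when not appending.
 */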
11573 if (append) {
11574 radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
11575 radeon_emit(cs,
11576 COPY_DATA_SRC_SEL(COPY_DATA_SRC_MEM) | COPY_DATA_DST_SEL(COPY_DATA_REG) | COPY_DATA_WR_CONFIRM);
11577 radeon_emit(cs, va);
11578 radeon_emit(cs, va >> 32);
11579 radeon_emit(cs, (R_031088_GDS_STRMOUT_DWORDS_WRITTEN_0 >> 2) + i);
11580 radeon_emit(cs, 0);
11581 } else {
11582 /* The PKT3 CAM bit workaround seems needed for initializing this GDS register to zero. */
11583 radeon_set_perfctr_reg(cmd_buffer->device->physical_device->rad_info.gfx_level, cmd_buffer->qf, cs,
11584 R_031088_GDS_STRMOUT_DWORDS_WRITTEN_0 + i * 4, 0);
11585 }
11586 } else {
11587 /* AMD GCN binds streamout buffers as shader resources.
11588 * VGT only counts primitives and tells the shader through
11589 * SGPRs what to do.
11590 */
11591 radeon_set_context_reg(cs, R_028AD0_VGT_STRMOUT_BUFFER_SIZE_0 + 16 * i, sb[i].size >> 2);
11592
11593 cmd_buffer->state.context_roll_without_scissor_emitted = true;
11594
11595 if (append) {
11596 radeon_emit(cs, PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0));
11597 radeon_emit(cs, STRMOUT_SELECT_BUFFER(i) | STRMOUT_DATA_TYPE(1) | /* offset in bytes */
11598 STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_FROM_MEM)); /* control */
11599 radeon_emit(cs, 0); /* unused */
11600 radeon_emit(cs, 0); /* unused */
11601 radeon_emit(cs, va); /* src address lo */
11602 radeon_emit(cs, va >> 32); /* src address hi */
11603 } else {
11604 /* Start from the beginning. */
11605 radeon_emit(cs, PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0));
11606 radeon_emit(cs, STRMOUT_SELECT_BUFFER(i) | STRMOUT_DATA_TYPE(1) | /* offset in bytes */
11607 STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_FROM_PACKET)); /* control */
11608 radeon_emit(cs, 0); /* unused */
11609 radeon_emit(cs, 0); /* unused */
11610 radeon_emit(cs, 0); /* unused */
11611 radeon_emit(cs, 0); /* unused */
11612 }
11613 }
11614 }
11615
11616 assert(cs->cdw <= cdw_max);
11617
11618 radv_set_streamout_enable(cmd_buffer, true);
11619
11620 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_STREAMOUT_ENABLE;
11621 }
11622
11623 VKAPI_ATTR void VKAPI_CALL
11624 radv_CmdEndTransformFeedbackEXT(VkCommandBuffer commandBuffer, uint32_t firstCounterBuffer, uint32_t counterBufferCount,
11625 const VkBuffer *pCounterBuffers, const VkDeviceSize *pCounterBufferOffsets)
11626 {
11627 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
11628 struct radv_streamout_state *so = &cmd_buffer->state.streamout;
11629 struct radeon_cmdbuf *cs = cmd_buffer->cs;
11630
11631 assert(firstCounterBuffer + counterBufferCount <= MAX_SO_BUFFERS);
11632
11633 if (cmd_buffer->device->physical_device->use_ngg_streamout) {
11634 /* Wait for streamout to finish before reading GDS_STRMOUT registers. */
11635 cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_VS_PARTIAL_FLUSH;
11636 radv_emit_cache_flush(cmd_buffer);
11637 } else {
11638 radv_flush_vgt_streamout(cmd_buffer);
11639 }
11640
11641 ASSERTED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, MAX_SO_BUFFERS * 12);
11642
11643 u_foreach_bit (i, so->enabled_mask) {
11644 int32_t counter_buffer_idx = i - firstCounterBuffer;
11645 if (counter_buffer_idx >= 0 && counter_buffer_idx >= counterBufferCount)
11646 counter_buffer_idx = -1;
11647
11648 bool append = counter_buffer_idx >= 0 && pCounterBuffers && pCounterBuffers[counter_buffer_idx];
11649 uint64_t va = 0;
11650
11651 if (append) {
11652 RADV_FROM_HANDLE(radv_buffer, buffer, pCounterBuffers[counter_buffer_idx]);
11653 uint64_t counter_buffer_offset = 0;
11654
11655 if (pCounterBufferOffsets)
11656 counter_buffer_offset = pCounterBufferOffsets[counter_buffer_idx];
11657
11658 va += radv_buffer_get_va(buffer->bo);
11659 va += buffer->offset + counter_buffer_offset;
11660
11661 radv_cs_add_buffer(cmd_buffer->device->ws, cs, buffer->bo);
11662 }
11663
11664 if (cmd_buffer->device->physical_device->use_ngg_streamout) {
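/* With NGG streamout, copy the final dwords-written value from the GDS
 * register back into the counter buffer, if one was provided.
 */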
11665 if (append) {
11666 radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
11667 radeon_emit(cs,
11668 COPY_DATA_SRC_SEL(COPY_DATA_REG) | COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) | COPY_DATA_WR_CONFIRM);
11669 radeon_emit(cs, (R_031088_GDS_STRMOUT_DWORDS_WRITTEN_0 >> 2) + i);
11670 radeon_emit(cs, 0);
11671 radeon_emit(cs, va);
11672 radeon_emit(cs, va >> 32);
11673 }
11674 } else {
11675 if (append) {
11676 radeon_emit(cs, PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0));
11677 radeon_emit(cs, STRMOUT_SELECT_BUFFER(i) | STRMOUT_DATA_TYPE(1) | /* offset in bytes */
11678 STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_NONE) |
11679 STRMOUT_STORE_BUFFER_FILLED_SIZE); /* control */
11680 radeon_emit(cs, va); /* dst address lo */
11681 radeon_emit(cs, va >> 32); /* dst address hi */
11682 radeon_emit(cs, 0); /* unused */
11683 radeon_emit(cs, 0); /* unused */
11684 }
11685
11686 /* Deactivate transform feedback by zeroing the buffer size.
11687 * The counters (primitives generated, primitives emitted) may
11688 * be enabled even if there is no buffer bound. This ensures
11689 * that the primitives-emitted query won't increment.
11690 */
11691 radeon_set_context_reg(cs, R_028AD0_VGT_STRMOUT_BUFFER_SIZE_0 + 16 * i, 0);
11692
11693 cmd_buffer->state.context_roll_without_scissor_emitted = true;
11694 }
11695 }
11696
11697 assert(cmd_buffer->cs->cdw <= cdw_max);
11698
11699 radv_set_streamout_enable(cmd_buffer, false);
11700 }
11701
11702 static void
11703 radv_emit_strmout_buffer(struct radv_cmd_buffer *cmd_buffer, const struct radv_draw_info *draw_info)
11704 {
11705 const enum amd_gfx_level gfx_level = cmd_buffer->device->physical_device->rad_info.gfx_level;
11706 uint64_t va = radv_buffer_get_va(draw_info->strmout_buffer->bo);
11707 struct radeon_cmdbuf *cs = cmd_buffer->cs;
11708
11709 va += draw_info->strmout_buffer->offset + draw_info->strmout_buffer_offset;
11710
11711 radeon_set_context_reg(cs, R_028B30_VGT_STRMOUT_DRAW_OPAQUE_VERTEX_STRIDE, draw_info->stride);
11712
11713 if (gfx_level >= GFX10) {
11714 /* Emitting a COPY_DATA packet should be enough because RADV doesn't support preemption
11715 * (shadow memory), but for unknown reasons it can lead to GPU hangs on GFX10+.
11716 */
11717 radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0));
11718 radeon_emit(cs, 0);
11719
11720 radeon_emit(cs, PKT3(PKT3_LOAD_CONTEXT_REG_INDEX, 3, 0));
11721 radeon_emit(cs, va);
11722 radeon_emit(cs, va >> 32);
11723 radeon_emit(cs, (R_028B2C_VGT_STRMOUT_DRAW_OPAQUE_BUFFER_FILLED_SIZE - SI_CONTEXT_REG_OFFSET) >> 2);
11724 radeon_emit(cs, 1); /* 1 DWORD */
11725 } else {
11726 radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
11727 radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_SRC_MEM) | COPY_DATA_DST_SEL(COPY_DATA_REG) | COPY_DATA_WR_CONFIRM);
11728 radeon_emit(cs, va);
11729 radeon_emit(cs, va >> 32);
11730 radeon_emit(cs, R_028B2C_VGT_STRMOUT_DRAW_OPAQUE_BUFFER_FILLED_SIZE >> 2);
11731 radeon_emit(cs, 0); /* unused */
11732 }
11733
11734 radv_cs_add_buffer(cmd_buffer->device->ws, cs, draw_info->strmout_buffer->bo);
11735 }
11736
11737 VKAPI_ATTR void VKAPI_CALL
11738 radv_CmdDrawIndirectByteCountEXT(VkCommandBuffer commandBuffer, uint32_t instanceCount, uint32_t firstInstance,
11739 VkBuffer _counterBuffer, VkDeviceSize counterBufferOffset, uint32_t counterOffset,
11740 uint32_t vertexStride)
11741 {
11742 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
11743 RADV_FROM_HANDLE(radv_buffer, counterBuffer, _counterBuffer);
11744 struct radv_draw_info info;
11745
11746 info.count = 0;
11747 info.instance_count = instanceCount;
11748 info.first_instance = firstInstance;
11749 info.strmout_buffer = counterBuffer;
11750 info.strmout_buffer_offset = counterBufferOffset;
11751 info.stride = vertexStride;
11752 info.indexed = false;
11753 info.indirect = NULL;
11754
11755 if (!radv_before_draw(cmd_buffer, &info, 1, false))
11756 return;
11757 struct VkMultiDrawInfoEXT minfo = {0, 0};
11758 radv_emit_strmout_buffer(cmd_buffer, &info);
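/* USE_OPAQUE makes the hardware derive the vertex count from the streamout
 * buffer-filled size and the vertex stride programmed above.
 */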
11759 radv_emit_direct_draw_packets(cmd_buffer, &info, 1, &minfo, S_0287F0_USE_OPAQUE(1), 0);
11760 radv_after_draw(cmd_buffer, false);
11761 }
11762
11763 /* VK_AMD_buffer_marker */
11764 VKAPI_ATTR void VKAPI_CALL
11765 radv_CmdWriteBufferMarker2AMD(VkCommandBuffer commandBuffer, VkPipelineStageFlags2 stage, VkBuffer dstBuffer,
11766 VkDeviceSize dstOffset, uint32_t marker)
11767 {
11768 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
11769 RADV_FROM_HANDLE(radv_buffer, buffer, dstBuffer);
11770 struct radeon_cmdbuf *cs = cmd_buffer->cs;
11771 const uint64_t va = radv_buffer_get_va(buffer->bo) + buffer->offset + dstOffset;
11772
11773 if (cmd_buffer->qf == RADV_QUEUE_TRANSFER) {
11774 radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 4);
11775 radeon_emit(cs, SDMA_PACKET(SDMA_OPCODE_FENCE, 0, SDMA_FENCE_MTYPE_UC));
11776 radeon_emit(cs, va);
11777 radeon_emit(cs, va >> 32);
11778 radeon_emit(cs, marker);
11779 return;
11780 }
11781
11782 radv_emit_cache_flush(cmd_buffer);
11783
11784 ASSERTED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 12);
11785
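/* If only top-of-pipe is requested, write the marker immediately with
 * COPY_DATA; otherwise write it from a bottom-of-pipe timestamp event.
 */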
11786 if (!(stage & ~VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT)) {
11787 radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
11788 radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_IMM) | COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) | COPY_DATA_WR_CONFIRM);
11789 radeon_emit(cs, marker);
11790 radeon_emit(cs, 0);
11791 radeon_emit(cs, va);
11792 radeon_emit(cs, va >> 32);
11793 } else {
11794 radv_cs_emit_write_event_eop(cs, cmd_buffer->device->physical_device->rad_info.gfx_level, cmd_buffer->qf,
11795 V_028A90_BOTTOM_OF_PIPE_TS, 0, EOP_DST_SEL_MEM, EOP_DATA_SEL_VALUE_32BIT, va, marker,
11796 cmd_buffer->gfx9_eop_bug_va);
11797 }
11798
11799 assert(cmd_buffer->cs->cdw <= cdw_max);
11800 }
11801
11802 VKAPI_ATTR void VKAPI_CALL
11803 radv_CmdBindPipelineShaderGroupNV(VkCommandBuffer commandBuffer, VkPipelineBindPoint pipelineBindPoint,
11804 VkPipeline pipeline, uint32_t groupIndex)
11805 {
11806 fprintf(stderr, "radv: unimplemented vkCmdBindPipelineShaderGroupNV\n");
11807 abort();
11808 }
11809
11810 /* VK_NV_device_generated_commands_compute */
11811 VKAPI_ATTR void VKAPI_CALL
11812 radv_CmdUpdatePipelineIndirectBufferNV(VkCommandBuffer commandBuffer, VkPipelineBindPoint pipelineBindPoint,
11813 VkPipeline pipeline)
11814 {
11815 unreachable("radv: unimplemented vkCmdUpdatePipelineIndirectBufferNV");
11816 }
11817
11818 /* VK_EXT_descriptor_buffer */
11819 VKAPI_ATTR void VKAPI_CALL
11820 radv_CmdBindDescriptorBuffersEXT(VkCommandBuffer commandBuffer, uint32_t bufferCount,
11821 const VkDescriptorBufferBindingInfoEXT *pBindingInfos)
11822 {
11823 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
11824
11825 for (uint32_t i = 0; i < bufferCount; i++) {
11826 cmd_buffer->descriptor_buffers[i] = pBindingInfos[i].address;
11827 }
11828 }
11829
11830 static void
11831 radv_set_descriptor_buffer_offsets(struct radv_cmd_buffer *cmd_buffer,
11832 const VkSetDescriptorBufferOffsetsInfoEXT *pSetDescriptorBufferOffsetsInfo,
11833 VkPipelineBindPoint bind_point)
11834 {
11835 struct radv_descriptor_state *descriptors_state = radv_get_descriptors_state(cmd_buffer, bind_point);
11836
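/* Record the descriptor buffer address for each set and unbind any regular
 * descriptor set previously bound at that index.
 */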
11837 for (unsigned i = 0; i < pSetDescriptorBufferOffsetsInfo->setCount; i++) {
11838 const uint32_t buffer_idx = pSetDescriptorBufferOffsetsInfo->pBufferIndices[i];
11839 const uint64_t offset = pSetDescriptorBufferOffsetsInfo->pOffsets[i];
11840 unsigned idx = i + pSetDescriptorBufferOffsetsInfo->firstSet;
11841
11842 descriptors_state->descriptor_buffers[idx] = cmd_buffer->descriptor_buffers[buffer_idx] + offset;
11843
11844 radv_set_descriptor_set(cmd_buffer, bind_point, NULL, idx);
11845 }
11846 }
11847
11848 VKAPI_ATTR void VKAPI_CALL
11849 radv_CmdSetDescriptorBufferOffsets2EXT(VkCommandBuffer commandBuffer,
11850 const VkSetDescriptorBufferOffsetsInfoEXT *pSetDescriptorBufferOffsetsInfo)
11851 {
11852 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
11853
11854 if (pSetDescriptorBufferOffsetsInfo->stageFlags & VK_SHADER_STAGE_COMPUTE_BIT) {
11855 radv_set_descriptor_buffer_offsets(cmd_buffer, pSetDescriptorBufferOffsetsInfo, VK_PIPELINE_BIND_POINT_COMPUTE);
11856 }
11857
11858 if (pSetDescriptorBufferOffsetsInfo->stageFlags & RADV_GRAPHICS_STAGE_BITS) {
11859 radv_set_descriptor_buffer_offsets(cmd_buffer, pSetDescriptorBufferOffsetsInfo, VK_PIPELINE_BIND_POINT_GRAPHICS);
11860 }
11861
11862 if (pSetDescriptorBufferOffsetsInfo->stageFlags & RADV_RT_STAGE_BITS) {
11863 radv_set_descriptor_buffer_offsets(cmd_buffer, pSetDescriptorBufferOffsetsInfo,
11864 VK_PIPELINE_BIND_POINT_RAY_TRACING_KHR);
11865 }
11866 }
11867
11868 VKAPI_ATTR void VKAPI_CALL
11869 radv_CmdBindDescriptorBufferEmbeddedSamplers2EXT(
11870 VkCommandBuffer commandBuffer,
11871 const VkBindDescriptorBufferEmbeddedSamplersInfoEXT *pBindDescriptorBufferEmbeddedSamplersInfo)
11872 {
11873 /* This is a no-op because embedded samplers are inlined at compile time. */
11874 }
11875
11876 /* VK_EXT_shader_object */
11877 static void
11878 radv_reset_pipeline_state(struct radv_cmd_buffer *cmd_buffer, VkPipelineBindPoint pipelineBindPoint)
11879 {
11880 switch (pipelineBindPoint) {
11881 case VK_PIPELINE_BIND_POINT_COMPUTE:
11882 if (cmd_buffer->state.compute_pipeline) {
11883 radv_bind_shader(cmd_buffer, NULL, MESA_SHADER_COMPUTE);
11884 cmd_buffer->state.compute_pipeline = NULL;
11885 }
11886 if (cmd_buffer->state.emitted_compute_pipeline) {
11887 cmd_buffer->state.emitted_compute_pipeline = NULL;
11888 }
11889 break;
11890 case VK_PIPELINE_BIND_POINT_GRAPHICS:
11891 if (cmd_buffer->state.graphics_pipeline) {
11892 radv_foreach_stage(s, cmd_buffer->state.graphics_pipeline->active_stages)
11893 {
11894 radv_bind_shader(cmd_buffer, NULL, s);
11895 }
11896 cmd_buffer->state.graphics_pipeline = NULL;
11897
11898 cmd_buffer->state.gs_copy_shader = NULL;
11899 cmd_buffer->state.last_vgt_shader = NULL;
11900 cmd_buffer->state.has_nggc = false;
11901 cmd_buffer->state.emitted_vs_prolog = NULL;
11902 cmd_buffer->state.col_format_non_compacted = 0;
11903 cmd_buffer->state.ms.sample_shading_enable = false;
11904 cmd_buffer->state.ms.min_sample_shading = 1.0f;
11905 cmd_buffer->state.rast_prim = 0;
11906 cmd_buffer->state.uses_out_of_order_rast = false;
11907 cmd_buffer->state.uses_vrs_attachment = false;
11908 cmd_buffer->state.uses_dynamic_vertex_binding_stride = false;
11909 }
11910 if (cmd_buffer->state.emitted_graphics_pipeline) {
11911 radv_bind_custom_blend_mode(cmd_buffer, 0);
11912
11913 if (cmd_buffer->state.db_render_control) {
11914 cmd_buffer->state.db_render_control = 0;
11915 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_FRAMEBUFFER;
11916 }
11917
11918 cmd_buffer->state.emitted_graphics_pipeline = NULL;
11919 }
11920 break;
11921 default:
11922 break;
11923 }
11924
11925 cmd_buffer->state.dirty &= ~RADV_CMD_DIRTY_PIPELINE;
11926 }
11927
11928 static void
11929 radv_bind_compute_shader(struct radv_cmd_buffer *cmd_buffer, struct radv_shader_object *shader_obj)
11930 {
11931 struct radv_shader *shader = shader_obj ? shader_obj->shader : NULL;
11932 const struct radv_device *device = cmd_buffer->device;
11933 struct radeon_cmdbuf *cs = cmd_buffer->cs;
11934
11935 radv_bind_shader(cmd_buffer, shader, MESA_SHADER_COMPUTE);
11936
11937 if (!shader_obj)
11938 return;
11939
11940 ASSERTED const unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 128);
11941
11942 radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, shader->bo);
11943
11944 radv_emit_compute_shader(device->physical_device, cs, shader);
11945
11946 /* Update push constants/indirect descriptors state. */
11947 struct radv_descriptor_state *descriptors_state =
11948 radv_get_descriptors_state(cmd_buffer, VK_PIPELINE_BIND_POINT_COMPUTE);
11949 struct radv_push_constant_state *pc_state = &cmd_buffer->push_constant_state[VK_PIPELINE_BIND_POINT_COMPUTE];
11950
11951 descriptors_state->need_indirect_descriptor_sets =
11952 radv_get_user_sgpr(shader, AC_UD_INDIRECT_DESCRIPTOR_SETS)->sgpr_idx != -1;
11953 pc_state->size = shader_obj->push_constant_size;
11954 pc_state->dynamic_offset_count = shader_obj->dynamic_offset_count;
11955
11956 assert(cmd_buffer->cs->cdw <= cdw_max);
11957 }
11958
11959 VKAPI_ATTR void VKAPI_CALL
11960 radv_CmdBindShadersEXT(VkCommandBuffer commandBuffer, uint32_t stageCount, const VkShaderStageFlagBits *pStages,
11961 const VkShaderEXT *pShaders)
11962 {
11963 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
11964 VkShaderStageFlagBits bound_stages = 0;
11965
11966 for (uint32_t i = 0; i < stageCount; i++) {
11967 const gl_shader_stage stage = vk_to_mesa_shader_stage(pStages[i]);
11968
11969 if (!pShaders) {
11970 cmd_buffer->state.shader_objs[stage] = NULL;
11971 continue;
11972 }
11973
11974 RADV_FROM_HANDLE(radv_shader_object, shader_obj, pShaders[i]);
11975
11976 cmd_buffer->state.shader_objs[stage] = shader_obj;
11977
11978 bound_stages |= pStages[i];
11979 }
11980
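/* Compute shader objects are bound and emitted right away; graphics stages
 * are deferred to draw time (see below).
 */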
11981 if (bound_stages & VK_SHADER_STAGE_COMPUTE_BIT) {
11982 radv_reset_pipeline_state(cmd_buffer, VK_PIPELINE_BIND_POINT_COMPUTE);
11983 radv_mark_descriptor_sets_dirty(cmd_buffer, VK_PIPELINE_BIND_POINT_COMPUTE);
11984
11985 radv_bind_compute_shader(cmd_buffer, cmd_buffer->state.shader_objs[MESA_SHADER_COMPUTE]);
11986 }
11987
11988 if (bound_stages & RADV_GRAPHICS_STAGE_BITS) {
11989 radv_reset_pipeline_state(cmd_buffer, VK_PIPELINE_BIND_POINT_GRAPHICS);
11990 radv_mark_descriptor_sets_dirty(cmd_buffer, VK_PIPELINE_BIND_POINT_GRAPHICS);
11991
11992 /* Graphics shaders are handled at draw time because of shader variants. */
11993 }
11994
11995 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_GRAPHICS_SHADERS;
11996 }
11997
11998 VKAPI_ATTR void VKAPI_CALL
11999 radv_CmdSetCoverageModulationModeNV(VkCommandBuffer commandBuffer, VkCoverageModulationModeNV coverageModulationMode)
12000 {
12001 unreachable("Not supported by RADV.");
12002 }
12003
12004 VKAPI_ATTR void VKAPI_CALL
12005 radv_CmdSetCoverageModulationTableEnableNV(VkCommandBuffer commandBuffer, VkBool32 coverageModulationTableEnable)
12006 {
12007 unreachable("Not supported by RADV.");
12008 }
12009
12010 VKAPI_ATTR void VKAPI_CALL
12011 radv_CmdSetCoverageModulationTableNV(VkCommandBuffer commandBuffer, uint32_t coverageModulationTableCount,
12012 const float *pCoverageModulationTable)
12013 {
12014 unreachable("Not supported by RADV.");
12015 }
12016
12017 VKAPI_ATTR void VKAPI_CALL
12018 radv_CmdSetCoverageReductionModeNV(VkCommandBuffer commandBuffer, VkCoverageReductionModeNV coverageReductionMode)
12019 {
12020 unreachable("Not supported by RADV.");
12021 }
12022
12023 VKAPI_ATTR void VKAPI_CALL
12024 radv_CmdSetCoverageToColorEnableNV(VkCommandBuffer commandBuffer, VkBool32 coverageToColorEnable)
12025 {
12026 unreachable("Not supported by RADV.");
12027 }
12028
12029 VKAPI_ATTR void VKAPI_CALL
12030 radv_CmdSetCoverageToColorLocationNV(VkCommandBuffer commandBuffer, uint32_t coverageToColorLocation)
12031 {
12032 unreachable("Not supported by RADV.");
12033 }
12034
12035 VKAPI_ATTR void VKAPI_CALL
12036 radv_CmdSetRepresentativeFragmentTestEnableNV(VkCommandBuffer commandBuffer, VkBool32 representativeFragmentTestEnable)
12037 {
12038 unreachable("Not supported by RADV.");
12039 }
12040
12041 VKAPI_ATTR void VKAPI_CALL
12042 radv_CmdSetShadingRateImageEnableNV(VkCommandBuffer commandBuffer, VkBool32 shadingRateImageEnable)
12043 {
12044 unreachable("Not supported by RADV.");
12045 }
12046
12047 VKAPI_ATTR void VKAPI_CALL
12048 radv_CmdSetViewportSwizzleNV(VkCommandBuffer commandBuffer, uint32_t firstViewport, uint32_t viewportCount,
12049 const VkViewportSwizzleNV *pViewportSwizzles)
12050 {
12051 unreachable("Not supported by RADV.");
12052 }
12053
12054 VKAPI_ATTR void VKAPI_CALL
12055 radv_CmdSetViewportWScalingEnableNV(VkCommandBuffer commandBuffer, VkBool32 viewportWScalingEnable)
12056 {
12057 unreachable("Not supported by RADV.");
12058 }
12059