1 /*
2 * Copyright © 2016 Red Hat.
3 * Copyright © 2016 Bas Nieuwenhuizen
4 *
5 * based in part on anv driver which is:
6 * Copyright © 2015 Intel Corporation
7 *
8 * Permission is hereby granted, free of charge, to any person obtaining a
9 * copy of this software and associated documentation files (the "Software"),
10 * to deal in the Software without restriction, including without limitation
11 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
12 * and/or sell copies of the Software, and to permit persons to whom the
13 * Software is furnished to do so, subject to the following conditions:
14 *
15 * The above copyright notice and this permission notice (including the next
16 * paragraph) shall be included in all copies or substantial portions of the
17 * Software.
18 *
19 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
20 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
22 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
24 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
25 * IN THE SOFTWARE.
26 */
27
28 #include "radv_cs.h"
29 #include "radv_debug.h"
30 #include "radv_meta.h"
31 #include "radv_private.h"
32 #include "radv_radeon_winsys.h"
33 #include "radv_shader.h"
34 #include "sid.h"
35 #include "vk_format.h"
36 #include "vk_util.h"
37 #include "vk_enum_defines.h"
38 #include "vk_common_entrypoints.h"
39
40 #include "ac_debug.h"
41 #include "ac_shader_args.h"
42
43 #include "util/fast_idiv_by_const.h"
44
/* Bitmask selecting which GPU resources to prefetch before a draw.
 * Each shader stage has its own bit so only the stages actually bound by
 * the current pipeline need to be prefetched.
 */
enum {
   RADV_PREFETCH_VBO_DESCRIPTORS = (1 << 0), /* vertex buffer descriptors */
   RADV_PREFETCH_VS = (1 << 1),              /* vertex shader */
   RADV_PREFETCH_TCS = (1 << 2),             /* tess control shader */
   RADV_PREFETCH_TES = (1 << 3),             /* tess eval shader */
   RADV_PREFETCH_GS = (1 << 4),              /* geometry shader */
   RADV_PREFETCH_PS = (1 << 5),              /* pixel/fragment shader */
   RADV_PREFETCH_MS = (1 << 6),              /* mesh shader */
   /* All shader stages (note: does NOT include the VBO descriptors). */
   RADV_PREFETCH_SHADERS = (RADV_PREFETCH_VS | RADV_PREFETCH_TCS | RADV_PREFETCH_TES |
                            RADV_PREFETCH_GS | RADV_PREFETCH_PS | RADV_PREFETCH_MS)
};
56
/* Forward declarations; definitions appear later in this file. */
static void radv_handle_image_transition(struct radv_cmd_buffer *cmd_buffer,
                                         struct radv_image *image, VkImageLayout src_layout,
                                         bool src_render_loop, VkImageLayout dst_layout,
                                         bool dst_render_loop, uint32_t src_family_index,
                                         uint32_t dst_family_index, const VkImageSubresourceRange *range,
                                         struct radv_sample_locations_state *sample_locs);

static void radv_set_rt_stack_size(struct radv_cmd_buffer *cmd_buffer, uint32_t size);
65
/* Values used for dynamic state that has never been set on a command buffer.
 * These match the Vulkan defaults (identity-ish values: full write masks,
 * depth bounds [0,1], 1x1 shading rate with KEEP combiners, all color
 * writes enabled). Fields not listed are zero-initialized.
 */
const struct radv_dynamic_state default_dynamic_state = {
   .viewport =
      {
         .count = 0,
      },
   .scissor =
      {
         .count = 0,
      },
   .line_width = 1.0f,
   .depth_bias =
      {
         .bias = 0.0f,
         .clamp = 0.0f,
         .slope = 0.0f,
      },
   .blend_constants = {0.0f, 0.0f, 0.0f, 0.0f},
   .depth_bounds =
      {
         .min = 0.0f,
         .max = 1.0f,
      },
   .stencil_compare_mask =
      {
         .front = ~0u,
         .back = ~0u,
      },
   .stencil_write_mask =
      {
         .front = ~0u,
         .back = ~0u,
      },
   .stencil_reference =
      {
         .front = 0u,
         .back = 0u,
      },
   .line_stipple =
      {
         .factor = 0u,
         .pattern = 0u,
      },
   .cull_mode = 0u,
   .front_face = 0u,
   .primitive_topology = 0u,
   .fragment_shading_rate =
      {
         .size = {1u, 1u},
         .combiner_ops = {VK_FRAGMENT_SHADING_RATE_COMBINER_OP_KEEP_KHR,
                          VK_FRAGMENT_SHADING_RATE_COMBINER_OP_KEEP_KHR},
      },
   .depth_bias_enable = 0u,
   .primitive_restart_enable = 0u,
   .rasterizer_discard_enable = 0u,
   .logic_op = 0u,
   /* One enable bit per color attachment. */
   .color_write_enable = 0xffffffffu,
};
123
/* Copy the dynamic state from 'src' (typically a pipeline's baked dynamic
 * state) into the command buffer's current dynamic state.
 *
 * Only the pieces selected by src->mask are copied, and a piece is marked
 * dirty (for re-emission into the command stream) only when its value
 * actually changed — redundant state binds are filtered out here.
 */
static void
radv_bind_dynamic_state(struct radv_cmd_buffer *cmd_buffer, const struct radv_dynamic_state *src)
{
   struct radv_dynamic_state *dest = &cmd_buffer->state.dynamic;
   uint64_t copy_mask = src->mask;
   uint64_t dest_mask = 0;

   /* Counts are copied unconditionally so the memcmp-based comparisons
    * below compare the right number of elements next time around. */
   dest->discard_rectangle.count = src->discard_rectangle.count;
   dest->sample_location.count = src->sample_location.count;

   if (copy_mask & RADV_DYNAMIC_VIEWPORT) {
      if (dest->viewport.count != src->viewport.count) {
         dest->viewport.count = src->viewport.count;
         dest_mask |= RADV_DYNAMIC_VIEWPORT;
      }

      if (memcmp(&dest->viewport.viewports, &src->viewport.viewports,
                 src->viewport.count * sizeof(VkViewport))) {
         typed_memcpy(dest->viewport.viewports, src->viewport.viewports, src->viewport.count);
         /* The precomputed viewport transforms travel with the viewports. */
         typed_memcpy(dest->viewport.xform, src->viewport.xform, src->viewport.count);
         dest_mask |= RADV_DYNAMIC_VIEWPORT;
      }
   }

   if (copy_mask & RADV_DYNAMIC_SCISSOR) {
      if (dest->scissor.count != src->scissor.count) {
         dest->scissor.count = src->scissor.count;
         dest_mask |= RADV_DYNAMIC_SCISSOR;
      }

      if (memcmp(&dest->scissor.scissors, &src->scissor.scissors,
                 src->scissor.count * sizeof(VkRect2D))) {
         typed_memcpy(dest->scissor.scissors, src->scissor.scissors, src->scissor.count);
         dest_mask |= RADV_DYNAMIC_SCISSOR;
      }
   }

   if (copy_mask & RADV_DYNAMIC_LINE_WIDTH) {
      if (dest->line_width != src->line_width) {
         dest->line_width = src->line_width;
         dest_mask |= RADV_DYNAMIC_LINE_WIDTH;
      }
   }

   if (copy_mask & RADV_DYNAMIC_DEPTH_BIAS) {
      if (memcmp(&dest->depth_bias, &src->depth_bias, sizeof(src->depth_bias))) {
         dest->depth_bias = src->depth_bias;
         dest_mask |= RADV_DYNAMIC_DEPTH_BIAS;
      }
   }

   if (copy_mask & RADV_DYNAMIC_BLEND_CONSTANTS) {
      if (memcmp(&dest->blend_constants, &src->blend_constants, sizeof(src->blend_constants))) {
         typed_memcpy(dest->blend_constants, src->blend_constants, 4);
         dest_mask |= RADV_DYNAMIC_BLEND_CONSTANTS;
      }
   }

   if (copy_mask & RADV_DYNAMIC_DEPTH_BOUNDS) {
      if (memcmp(&dest->depth_bounds, &src->depth_bounds, sizeof(src->depth_bounds))) {
         dest->depth_bounds = src->depth_bounds;
         dest_mask |= RADV_DYNAMIC_DEPTH_BOUNDS;
      }
   }

   if (copy_mask & RADV_DYNAMIC_STENCIL_COMPARE_MASK) {
      if (memcmp(&dest->stencil_compare_mask, &src->stencil_compare_mask,
                 sizeof(src->stencil_compare_mask))) {
         dest->stencil_compare_mask = src->stencil_compare_mask;
         dest_mask |= RADV_DYNAMIC_STENCIL_COMPARE_MASK;
      }
   }

   if (copy_mask & RADV_DYNAMIC_STENCIL_WRITE_MASK) {
      if (memcmp(&dest->stencil_write_mask, &src->stencil_write_mask,
                 sizeof(src->stencil_write_mask))) {
         dest->stencil_write_mask = src->stencil_write_mask;
         dest_mask |= RADV_DYNAMIC_STENCIL_WRITE_MASK;
      }
   }

   if (copy_mask & RADV_DYNAMIC_STENCIL_REFERENCE) {
      if (memcmp(&dest->stencil_reference, &src->stencil_reference,
                 sizeof(src->stencil_reference))) {
         dest->stencil_reference = src->stencil_reference;
         dest_mask |= RADV_DYNAMIC_STENCIL_REFERENCE;
      }
   }

   if (copy_mask & RADV_DYNAMIC_DISCARD_RECTANGLE) {
      if (memcmp(&dest->discard_rectangle.rectangles, &src->discard_rectangle.rectangles,
                 src->discard_rectangle.count * sizeof(VkRect2D))) {
         typed_memcpy(dest->discard_rectangle.rectangles, src->discard_rectangle.rectangles,
                      src->discard_rectangle.count);
         dest_mask |= RADV_DYNAMIC_DISCARD_RECTANGLE;
      }
   }

   if (copy_mask & RADV_DYNAMIC_SAMPLE_LOCATIONS) {
      if (dest->sample_location.per_pixel != src->sample_location.per_pixel ||
          dest->sample_location.grid_size.width != src->sample_location.grid_size.width ||
          dest->sample_location.grid_size.height != src->sample_location.grid_size.height ||
          memcmp(&dest->sample_location.locations, &src->sample_location.locations,
                 src->sample_location.count * sizeof(VkSampleLocationEXT))) {
         dest->sample_location.per_pixel = src->sample_location.per_pixel;
         dest->sample_location.grid_size = src->sample_location.grid_size;
         typed_memcpy(dest->sample_location.locations, src->sample_location.locations,
                      src->sample_location.count);
         dest_mask |= RADV_DYNAMIC_SAMPLE_LOCATIONS;
      }
   }

   if (copy_mask & RADV_DYNAMIC_LINE_STIPPLE) {
      if (memcmp(&dest->line_stipple, &src->line_stipple, sizeof(src->line_stipple))) {
         dest->line_stipple = src->line_stipple;
         dest_mask |= RADV_DYNAMIC_LINE_STIPPLE;
      }
   }

   if (copy_mask & RADV_DYNAMIC_CULL_MODE) {
      if (dest->cull_mode != src->cull_mode) {
         dest->cull_mode = src->cull_mode;
         dest_mask |= RADV_DYNAMIC_CULL_MODE;
      }
   }

   if (copy_mask & RADV_DYNAMIC_FRONT_FACE) {
      if (dest->front_face != src->front_face) {
         dest->front_face = src->front_face;
         dest_mask |= RADV_DYNAMIC_FRONT_FACE;
      }
   }

   if (copy_mask & RADV_DYNAMIC_PRIMITIVE_TOPOLOGY) {
      if (dest->primitive_topology != src->primitive_topology) {
         dest->primitive_topology = src->primitive_topology;
         dest_mask |= RADV_DYNAMIC_PRIMITIVE_TOPOLOGY;
      }
   }

   if (copy_mask & RADV_DYNAMIC_DEPTH_TEST_ENABLE) {
      if (dest->depth_test_enable != src->depth_test_enable) {
         dest->depth_test_enable = src->depth_test_enable;
         dest_mask |= RADV_DYNAMIC_DEPTH_TEST_ENABLE;
      }
   }

   if (copy_mask & RADV_DYNAMIC_DEPTH_WRITE_ENABLE) {
      if (dest->depth_write_enable != src->depth_write_enable) {
         dest->depth_write_enable = src->depth_write_enable;
         dest_mask |= RADV_DYNAMIC_DEPTH_WRITE_ENABLE;
      }
   }

   if (copy_mask & RADV_DYNAMIC_DEPTH_COMPARE_OP) {
      if (dest->depth_compare_op != src->depth_compare_op) {
         dest->depth_compare_op = src->depth_compare_op;
         dest_mask |= RADV_DYNAMIC_DEPTH_COMPARE_OP;
      }
   }

   if (copy_mask & RADV_DYNAMIC_DEPTH_BOUNDS_TEST_ENABLE) {
      if (dest->depth_bounds_test_enable != src->depth_bounds_test_enable) {
         dest->depth_bounds_test_enable = src->depth_bounds_test_enable;
         dest_mask |= RADV_DYNAMIC_DEPTH_BOUNDS_TEST_ENABLE;
      }
   }

   if (copy_mask & RADV_DYNAMIC_STENCIL_TEST_ENABLE) {
      if (dest->stencil_test_enable != src->stencil_test_enable) {
         dest->stencil_test_enable = src->stencil_test_enable;
         dest_mask |= RADV_DYNAMIC_STENCIL_TEST_ENABLE;
      }
   }

   if (copy_mask & RADV_DYNAMIC_STENCIL_OP) {
      if (memcmp(&dest->stencil_op, &src->stencil_op, sizeof(src->stencil_op))) {
         dest->stencil_op = src->stencil_op;
         dest_mask |= RADV_DYNAMIC_STENCIL_OP;
      }
   }

   if (copy_mask & RADV_DYNAMIC_FRAGMENT_SHADING_RATE) {
      if (memcmp(&dest->fragment_shading_rate, &src->fragment_shading_rate,
                 sizeof(src->fragment_shading_rate))) {
         dest->fragment_shading_rate = src->fragment_shading_rate;
         dest_mask |= RADV_DYNAMIC_FRAGMENT_SHADING_RATE;
      }
   }

   if (copy_mask & RADV_DYNAMIC_DEPTH_BIAS_ENABLE) {
      if (dest->depth_bias_enable != src->depth_bias_enable) {
         dest->depth_bias_enable = src->depth_bias_enable;
         dest_mask |= RADV_DYNAMIC_DEPTH_BIAS_ENABLE;
      }
   }

   if (copy_mask & RADV_DYNAMIC_PRIMITIVE_RESTART_ENABLE) {
      if (dest->primitive_restart_enable != src->primitive_restart_enable) {
         dest->primitive_restart_enable = src->primitive_restart_enable;
         dest_mask |= RADV_DYNAMIC_PRIMITIVE_RESTART_ENABLE;
      }
   }

   if (copy_mask & RADV_DYNAMIC_RASTERIZER_DISCARD_ENABLE) {
      if (dest->rasterizer_discard_enable != src->rasterizer_discard_enable) {
         dest->rasterizer_discard_enable = src->rasterizer_discard_enable;
         dest_mask |= RADV_DYNAMIC_RASTERIZER_DISCARD_ENABLE;
      }
   }

   if (copy_mask & RADV_DYNAMIC_LOGIC_OP) {
      if (dest->logic_op != src->logic_op) {
         dest->logic_op = src->logic_op;
         dest_mask |= RADV_DYNAMIC_LOGIC_OP;
      }
   }

   if (copy_mask & RADV_DYNAMIC_COLOR_WRITE_ENABLE) {
      if (dest->color_write_enable != src->color_write_enable) {
         dest->color_write_enable = src->color_write_enable;
         dest_mask |= RADV_DYNAMIC_COLOR_WRITE_ENABLE;
      }
   }

   /* Only the state that actually changed gets re-emitted. */
   cmd_buffer->state.dirty |= dest_mask;
}
351
352 bool
radv_cmd_buffer_uses_mec(struct radv_cmd_buffer * cmd_buffer)353 radv_cmd_buffer_uses_mec(struct radv_cmd_buffer *cmd_buffer)
354 {
355 return cmd_buffer->qf == RADV_QUEUE_COMPUTE &&
356 cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX7;
357 }
358
359 enum amd_ip_type
radv_queue_family_to_ring(struct radv_physical_device * physical_device,enum radv_queue_family f)360 radv_queue_family_to_ring(struct radv_physical_device *physical_device,
361 enum radv_queue_family f)
362 {
363 switch (f) {
364 case RADV_QUEUE_GENERAL:
365 return AMD_IP_GFX;
366 case RADV_QUEUE_COMPUTE:
367 return AMD_IP_COMPUTE;
368 case RADV_QUEUE_TRANSFER:
369 return AMD_IP_SDMA;
370 default:
371 unreachable("Unknown queue family");
372 }
373 }
374
/* Emit a PKT3_WRITE_DATA packet that writes 'count' dwords from 'data' to
 * GPU memory at 'va'. 'engine_sel' picks the CP engine that performs the
 * write (e.g. V_370_ME or V_370_PFP); WR_CONFIRM makes the CP wait for
 * write confirmation before executing subsequent packets.
 */
static void
radv_emit_write_data_packet(struct radv_cmd_buffer *cmd_buffer, unsigned engine_sel, uint64_t va,
                            unsigned count, const uint32_t *data)
{
   struct radeon_cmdbuf *cs = cmd_buffer->cs;

   /* 4 dwords of packet header/address + the payload. */
   radeon_check_space(cmd_buffer->device->ws, cs, 4 + count);

   radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 2 + count, 0));
   radeon_emit(cs, S_370_DST_SEL(V_370_MEM) | S_370_WR_CONFIRM(1) | S_370_ENGINE_SEL(engine_sel));
   radeon_emit(cs, va);       /* address, low dword */
   radeon_emit(cs, va >> 32); /* address, high dword */
   radeon_emit_array(cs, data, count);
}
389
/* Zero 'size' bytes of GPU memory at 'va' using a WRITE_DATA packet.
 *
 * The packet payload is expressed in dwords (size / 4), so a size that is
 * not a multiple of 4 would silently clear fewer bytes than requested —
 * assert on that instead. All current callers pass 8 or 16 * num_db.
 */
static void
radv_emit_clear_data(struct radv_cmd_buffer *cmd_buffer, unsigned engine_sel, uint64_t va,
                     unsigned size)
{
   assert(size % 4 == 0);

   /* Sizes here are small and bounded, so a stack buffer is fine. */
   uint32_t *zeroes = alloca(size);
   memset(zeroes, 0, size);
   radv_emit_write_data_packet(cmd_buffer, engine_sel, va, size / 4, zeroes);
}
398
/* Tear down a command buffer: release all upload BOs, the CS objects, the
 * push-descriptor storage for every bind point, and finally the struct
 * itself. The order matters: BOs and CSes are released before the base
 * object/allocation they hang off is finished/freed.
 */
static void
radv_destroy_cmd_buffer(struct radv_cmd_buffer *cmd_buffer)
{
   list_del(&cmd_buffer->pool_link);

   util_dynarray_fini(&cmd_buffer->cached_vertex_formats);

   /* Free every upload BO retired by radv_cmd_buffer_resize_upload_buf(). */
   list_for_each_entry_safe(struct radv_cmd_buffer_upload, up, &cmd_buffer->upload.list, list)
   {
      cmd_buffer->device->ws->buffer_destroy(cmd_buffer->device->ws, up->upload_bo);
      list_del(&up->list);
      free(up);
   }

   if (cmd_buffer->upload.upload_bo)
      cmd_buffer->device->ws->buffer_destroy(cmd_buffer->device->ws, cmd_buffer->upload.upload_bo);

   /* Destroy a render pass the cmd buffer created internally (own_render_pass
    * — presumably for dynamic rendering; set elsewhere in this file). */
   if (cmd_buffer->state.own_render_pass) {
      radv_DestroyRenderPass(radv_device_to_handle(cmd_buffer->device),
                             radv_render_pass_to_handle(cmd_buffer->state.pass), NULL);
      cmd_buffer->state.own_render_pass = false;
   }

   if (cmd_buffer->cs)
      cmd_buffer->device->ws->cs_destroy(cmd_buffer->cs);
   if (cmd_buffer->ace_internal.cs)
      cmd_buffer->device->ws->cs_destroy(cmd_buffer->ace_internal.cs);

   /* Release the push descriptor set of each bind point. */
   for (unsigned i = 0; i < MAX_BIND_POINTS; i++) {
      struct radv_descriptor_set_header *set = &cmd_buffer->descriptors[i].push_set.set;
      free(set->mapped_ptr);
      if (set->layout)
         vk_descriptor_set_layout_unref(&cmd_buffer->device->vk, &set->layout->vk);
      vk_object_base_finish(&set->base);
   }

   vk_object_base_finish(&cmd_buffer->meta_push_descriptors.base);

   vk_command_buffer_finish(&cmd_buffer->vk);
   vk_free(&cmd_buffer->pool->vk.alloc, cmd_buffer);
}
440
441 static VkResult
radv_create_cmd_buffer(struct radv_device * device,struct radv_cmd_pool * pool,VkCommandBufferLevel level,VkCommandBuffer * pCommandBuffer)442 radv_create_cmd_buffer(struct radv_device *device, struct radv_cmd_pool *pool,
443 VkCommandBufferLevel level, VkCommandBuffer *pCommandBuffer)
444 {
445 struct radv_cmd_buffer *cmd_buffer;
446 unsigned ring;
447 cmd_buffer = vk_zalloc(&pool->vk.alloc, sizeof(*cmd_buffer), 8,
448 VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
449 if (cmd_buffer == NULL)
450 return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
451
452 VkResult result =
453 vk_command_buffer_init(&cmd_buffer->vk, &pool->vk, level);
454 if (result != VK_SUCCESS) {
455 vk_free(&cmd_buffer->pool->vk.alloc, cmd_buffer);
456 return result;
457 }
458
459 cmd_buffer->device = device;
460 cmd_buffer->pool = pool;
461
462 list_addtail(&cmd_buffer->pool_link, &pool->cmd_buffers);
463 cmd_buffer->qf = vk_queue_to_radv(device->physical_device, pool->vk.queue_family_index);
464
465 ring = radv_queue_family_to_ring(device->physical_device, cmd_buffer->qf);
466
467 cmd_buffer->cs = device->ws->cs_create(device->ws, ring);
468 if (!cmd_buffer->cs) {
469 radv_destroy_cmd_buffer(cmd_buffer);
470 return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
471 }
472
473 vk_object_base_init(&device->vk, &cmd_buffer->meta_push_descriptors.base,
474 VK_OBJECT_TYPE_DESCRIPTOR_SET);
475
476 util_dynarray_init(&cmd_buffer->cached_vertex_formats, NULL);
477
478 for (unsigned i = 0; i < MAX_BIND_POINTS; i++)
479 vk_object_base_init(&device->vk, &cmd_buffer->descriptors[i].push_set.set.base,
480 VK_OBJECT_TYPE_DESCRIPTOR_SET);
481
482 *pCommandBuffer = radv_cmd_buffer_to_handle(cmd_buffer);
483
484 list_inithead(&cmd_buffer->upload.list);
485
486 return VK_SUCCESS;
487 }
488
489 static VkResult
radv_reset_cmd_buffer(struct radv_cmd_buffer * cmd_buffer)490 radv_reset_cmd_buffer(struct radv_cmd_buffer *cmd_buffer)
491 {
492 vk_command_buffer_reset(&cmd_buffer->vk);
493
494 cmd_buffer->device->ws->cs_reset(cmd_buffer->cs);
495 if (cmd_buffer->ace_internal.cs)
496 cmd_buffer->device->ws->cs_reset(cmd_buffer->ace_internal.cs);
497
498 list_for_each_entry_safe(struct radv_cmd_buffer_upload, up, &cmd_buffer->upload.list, list)
499 {
500 cmd_buffer->device->ws->buffer_destroy(cmd_buffer->device->ws, up->upload_bo);
501 list_del(&up->list);
502 free(up);
503 }
504
505 if (cmd_buffer->state.own_render_pass) {
506 radv_DestroyRenderPass(radv_device_to_handle(cmd_buffer->device),
507 radv_render_pass_to_handle(cmd_buffer->state.pass), NULL);
508 cmd_buffer->state.own_render_pass = false;
509 }
510
511 cmd_buffer->push_constant_stages = 0;
512 cmd_buffer->scratch_size_per_wave_needed = 0;
513 cmd_buffer->scratch_waves_wanted = 0;
514 cmd_buffer->compute_scratch_size_per_wave_needed = 0;
515 cmd_buffer->compute_scratch_waves_wanted = 0;
516 cmd_buffer->esgs_ring_size_needed = 0;
517 cmd_buffer->gsvs_ring_size_needed = 0;
518 cmd_buffer->tess_rings_needed = false;
519 cmd_buffer->task_rings_needed = false;
520 cmd_buffer->mesh_scratch_ring_needed = false;
521 cmd_buffer->gds_needed = false;
522 cmd_buffer->gds_oa_needed = false;
523 cmd_buffer->sample_positions_needed = false;
524 cmd_buffer->ace_internal.sem.gfx2ace_value = 0;
525 cmd_buffer->ace_internal.sem.emitted_gfx2ace_value = 0;
526 cmd_buffer->ace_internal.sem.va = 0;
527
528 if (cmd_buffer->upload.upload_bo)
529 radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, cmd_buffer->upload.upload_bo);
530 cmd_buffer->upload.offset = 0;
531
532 cmd_buffer->record_result = VK_SUCCESS;
533
534 memset(cmd_buffer->vertex_binding_buffers, 0, sizeof(struct radv_buffer *) * cmd_buffer->used_vertex_bindings);
535 cmd_buffer->used_vertex_bindings = 0;
536
537 for (unsigned i = 0; i < MAX_BIND_POINTS; i++) {
538 cmd_buffer->descriptors[i].dirty = 0;
539 cmd_buffer->descriptors[i].valid = 0;
540 cmd_buffer->descriptors[i].push_dirty = false;
541 }
542
543 if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX7) {
544 uint32_t pred_value = 0;
545 uint32_t pred_offset;
546 if (!radv_cmd_buffer_upload_data(cmd_buffer, 4, &pred_value, &pred_offset))
547 cmd_buffer->record_result = VK_ERROR_OUT_OF_HOST_MEMORY;
548
549 cmd_buffer->mec_inv_pred_emitted = false;
550 cmd_buffer->mec_inv_pred_va = radv_buffer_get_va(cmd_buffer->upload.upload_bo) + pred_offset;
551 }
552
553 if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX9 &&
554 cmd_buffer->qf == RADV_QUEUE_GENERAL) {
555 unsigned num_db = cmd_buffer->device->physical_device->rad_info.max_render_backends;
556 unsigned fence_offset, eop_bug_offset;
557 void *fence_ptr;
558
559 radv_cmd_buffer_upload_alloc(cmd_buffer, 8, &fence_offset, &fence_ptr);
560 memset(fence_ptr, 0, 8);
561
562 cmd_buffer->gfx9_fence_va = radv_buffer_get_va(cmd_buffer->upload.upload_bo);
563 cmd_buffer->gfx9_fence_va += fence_offset;
564
565 radv_emit_clear_data(cmd_buffer, V_370_PFP, cmd_buffer->gfx9_fence_va, 8);
566
567 if (cmd_buffer->device->physical_device->rad_info.gfx_level == GFX9) {
568 /* Allocate a buffer for the EOP bug on GFX9. */
569 radv_cmd_buffer_upload_alloc(cmd_buffer, 16 * num_db, &eop_bug_offset, &fence_ptr);
570 memset(fence_ptr, 0, 16 * num_db);
571 cmd_buffer->gfx9_eop_bug_va = radv_buffer_get_va(cmd_buffer->upload.upload_bo);
572 cmd_buffer->gfx9_eop_bug_va += eop_bug_offset;
573
574 radv_emit_clear_data(cmd_buffer, V_370_PFP, cmd_buffer->gfx9_eop_bug_va, 16 * num_db);
575 }
576 }
577
578 cmd_buffer->status = RADV_CMD_BUFFER_STATUS_INITIAL;
579
580 return cmd_buffer->record_result;
581 }
582
583 static bool
radv_cmd_buffer_resize_upload_buf(struct radv_cmd_buffer * cmd_buffer,uint64_t min_needed)584 radv_cmd_buffer_resize_upload_buf(struct radv_cmd_buffer *cmd_buffer, uint64_t min_needed)
585 {
586 uint64_t new_size;
587 struct radeon_winsys_bo *bo = NULL;
588 struct radv_cmd_buffer_upload *upload;
589 struct radv_device *device = cmd_buffer->device;
590
591 new_size = MAX2(min_needed, 16 * 1024);
592 new_size = MAX2(new_size, 2 * cmd_buffer->upload.size);
593
594 VkResult result =
595 device->ws->buffer_create(device->ws, new_size, 4096, device->ws->cs_domain(device->ws),
596 RADEON_FLAG_CPU_ACCESS | RADEON_FLAG_NO_INTERPROCESS_SHARING |
597 RADEON_FLAG_32BIT | RADEON_FLAG_GTT_WC,
598 RADV_BO_PRIORITY_UPLOAD_BUFFER, 0, &bo);
599
600 if (result != VK_SUCCESS) {
601 cmd_buffer->record_result = result;
602 return false;
603 }
604
605 radv_cs_add_buffer(device->ws, cmd_buffer->cs, bo);
606 if (cmd_buffer->upload.upload_bo) {
607 upload = malloc(sizeof(*upload));
608
609 if (!upload) {
610 cmd_buffer->record_result = VK_ERROR_OUT_OF_HOST_MEMORY;
611 device->ws->buffer_destroy(device->ws, bo);
612 return false;
613 }
614
615 memcpy(upload, &cmd_buffer->upload, sizeof(*upload));
616 list_add(&upload->list, &cmd_buffer->upload.list);
617 }
618
619 cmd_buffer->upload.upload_bo = bo;
620 cmd_buffer->upload.size = new_size;
621 cmd_buffer->upload.offset = 0;
622 cmd_buffer->upload.map = device->ws->buffer_map(cmd_buffer->upload.upload_bo);
623
624 if (!cmd_buffer->upload.map) {
625 cmd_buffer->record_result = VK_ERROR_OUT_OF_DEVICE_MEMORY;
626 return false;
627 }
628
629 return true;
630 }
631
632 bool
radv_cmd_buffer_upload_alloc(struct radv_cmd_buffer * cmd_buffer,unsigned size,unsigned * out_offset,void ** ptr)633 radv_cmd_buffer_upload_alloc(struct radv_cmd_buffer *cmd_buffer, unsigned size,
634 unsigned *out_offset, void **ptr)
635 {
636 assert(size % 4 == 0);
637
638 struct radeon_info *rad_info = &cmd_buffer->device->physical_device->rad_info;
639
640 /* Align to the scalar cache line size if it results in this allocation
641 * being placed in less of them.
642 */
643 unsigned offset = cmd_buffer->upload.offset;
644 unsigned line_size = rad_info->gfx_level >= GFX10 ? 64 : 32;
645 unsigned gap = align(offset, line_size) - offset;
646 if ((size & (line_size - 1)) > gap)
647 offset = align(offset, line_size);
648
649 if (offset + size > cmd_buffer->upload.size) {
650 if (!radv_cmd_buffer_resize_upload_buf(cmd_buffer, size))
651 return false;
652 offset = 0;
653 }
654
655 *out_offset = offset;
656 *ptr = cmd_buffer->upload.map + offset;
657
658 cmd_buffer->upload.offset = offset + size;
659 return true;
660 }
661
/* Copy 'size' bytes of 'data' into freshly sub-allocated upload space and
 * return the BO-relative offset via *out_offset. Returns false when the
 * underlying allocation fails. */
bool
radv_cmd_buffer_upload_data(struct radv_cmd_buffer *cmd_buffer, unsigned size, const void *data,
                            unsigned *out_offset)
{
   void *dst = NULL;

   if (!radv_cmd_buffer_upload_alloc(cmd_buffer, size, out_offset, &dst))
      return false;

   assert(dst);
   memcpy(dst, data, size);

   return true;
}
675
/* Write an incrementing trace id into the device trace BO and a matching
 * NOP trace-point packet into the CS. After a GPU hang, the value left in
 * the trace BO identifies the last trace point that executed.
 */
void
radv_cmd_buffer_trace_emit(struct radv_cmd_buffer *cmd_buffer)
{
   struct radv_device *device = cmd_buffer->device;
   struct radeon_cmdbuf *cs = cmd_buffer->cs;
   uint64_t va;

   va = radv_buffer_get_va(device->trace_bo);
   /* Secondary command buffers use the second dword of the trace BO so they
    * don't clobber the primary's trace id. */
   if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY)
      va += 4;

   ++cmd_buffer->state.trace_id;
   radv_emit_write_data_packet(cmd_buffer, V_370_ME, va, 1, &cmd_buffer->state.trace_id);

   radeon_check_space(cmd_buffer->device->ws, cs, 2);

   /* NOP packet carrying the trace id, visible in CS dumps. */
   radeon_emit(cs, PKT3(PKT3_NOP, 0, 0));
   radeon_emit(cs, AC_ENCODE_TRACE_POINT(cmd_buffer->state.trace_id));
}
695
/* Apply the effects of a pipeline barrier to the internal ACE (compute)
 * cmdbuf that runs task shaders: propagate cache-flush bits from the main
 * cmdbuf, add a CS stage flush when the barrier's source stages can touch
 * task-shader work, and bump the GFX->ACE semaphore when the destination
 * stages must wait for GFX.
 */
static void
radv_ace_internal_barrier(struct radv_cmd_buffer *cmd_buffer, VkPipelineStageFlags2 src_stage_mask,
                          VkPipelineStageFlags2 dst_stage_mask)
{
   /* Update flush bits from the main cmdbuf, except the stage flush. */
   cmd_buffer->ace_internal.flush_bits |=
      cmd_buffer->state.flush_bits & RADV_CMD_FLUSH_ALL_COMPUTE & ~RADV_CMD_FLAG_CS_PARTIAL_FLUSH;

   /* Add stage flush only when necessary. */
   if (src_stage_mask &
       (VK_PIPELINE_STAGE_2_TASK_SHADER_BIT_NV | VK_PIPELINE_STAGE_2_TRANSFER_BIT |
        VK_PIPELINE_STAGE_2_BOTTOM_OF_PIPE_BIT | VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT))
      cmd_buffer->ace_internal.flush_bits |= RADV_CMD_FLAG_CS_PARTIAL_FLUSH;

   /* Block task shaders when we have to wait for CP DMA on the GFX cmdbuf. */
   if (src_stage_mask &
       (VK_PIPELINE_STAGE_2_COPY_BIT | VK_PIPELINE_STAGE_2_CLEAR_BIT |
        VK_PIPELINE_STAGE_2_ALL_TRANSFER_BIT | VK_PIPELINE_STAGE_2_BOTTOM_OF_PIPE_BIT |
        VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT))
      dst_stage_mask |= cmd_buffer->state.dma_is_busy ? VK_PIPELINE_STAGE_2_TASK_SHADER_BIT_NV : 0;

   /* Increment the GFX/ACE semaphore when task shaders are blocked.
    * The actual wait/signal packets are emitted lazily by
    * radv_flush_gfx2ace_semaphore()/radv_wait_gfx2ace_semaphore(). */
   if (dst_stage_mask &
       (VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT_KHR | VK_PIPELINE_STAGE_2_DRAW_INDIRECT_BIT |
        VK_PIPELINE_STAGE_2_TASK_SHADER_BIT_NV))
      cmd_buffer->ace_internal.sem.gfx2ace_value++;
}
723
724 static void
radv_ace_internal_cache_flush(struct radv_cmd_buffer * cmd_buffer)725 radv_ace_internal_cache_flush(struct radv_cmd_buffer *cmd_buffer)
726 {
727 struct radeon_cmdbuf *ace_cs = cmd_buffer->ace_internal.cs;
728 const uint32_t flush_bits = cmd_buffer->ace_internal.flush_bits;
729 enum rgp_flush_bits sqtt_flush_bits = 0;
730
731 si_cs_emit_cache_flush(ace_cs, cmd_buffer->device->physical_device->rad_info.gfx_level, NULL, 0,
732 true, flush_bits, &sqtt_flush_bits, 0);
733
734 cmd_buffer->ace_internal.flush_bits = 0;
735 }
736
737 static uint64_t
radv_ace_internal_sem_create(struct radv_cmd_buffer * cmd_buffer)738 radv_ace_internal_sem_create(struct radv_cmd_buffer *cmd_buffer)
739 {
740 /* DWORD 0: GFX->ACE semaphore (GFX blocks ACE, ie. ACE waits for GFX)
741 * DWORD 1: ACE->GFX semaphore
742 */
743 uint64_t sem_init = 0;
744 uint32_t va_off = 0;
745 if (!radv_cmd_buffer_upload_data(cmd_buffer, sizeof(uint64_t), &sem_init, &va_off)) {
746 cmd_buffer->record_result = VK_ERROR_OUT_OF_HOST_MEMORY;
747 return 0;
748 }
749
750 return radv_buffer_get_va(cmd_buffer->upload.upload_bo) + va_off;
751 }
752
753 static bool
radv_ace_internal_sem_dirty(const struct radv_cmd_buffer * cmd_buffer)754 radv_ace_internal_sem_dirty(const struct radv_cmd_buffer *cmd_buffer)
755 {
756 return cmd_buffer->ace_internal.sem.gfx2ace_value !=
757 cmd_buffer->ace_internal.sem.emitted_gfx2ace_value;
758 }
759
/* Emit the GFX->ACE semaphore signal on the main (GFX) cmdbuf if the
 * target value changed since the last emission. Lazily creates the
 * semaphore on first use. Returns true when a signal was emitted.
 */
ALWAYS_INLINE static bool
radv_flush_gfx2ace_semaphore(struct radv_cmd_buffer *cmd_buffer)
{
   if (!radv_ace_internal_sem_dirty(cmd_buffer))
      return false;

   if (!cmd_buffer->ace_internal.sem.va) {
      cmd_buffer->ace_internal.sem.va = radv_ace_internal_sem_create(cmd_buffer);
      /* Allocation failure: record_result was set by sem_create. */
      if (!cmd_buffer->ace_internal.sem.va)
         return false;
   }

   /* GFX writes a value to the semaphore which ACE can wait for.*/
   si_cs_emit_write_event_eop(
      cmd_buffer->cs, cmd_buffer->device->physical_device->rad_info.gfx_level,
      radv_cmd_buffer_uses_mec(cmd_buffer), V_028A90_BOTTOM_OF_PIPE_TS, 0, EOP_DST_SEL_MEM,
      EOP_DATA_SEL_VALUE_32BIT, cmd_buffer->ace_internal.sem.va,
      cmd_buffer->ace_internal.sem.gfx2ace_value, cmd_buffer->gfx9_eop_bug_va);

   cmd_buffer->ace_internal.sem.emitted_gfx2ace_value = cmd_buffer->ace_internal.sem.gfx2ace_value;
   return true;
}
782
/* Emit a WAIT_REG_MEM on the internal ACE cmdbuf that blocks until the
 * GFX->ACE semaphore reaches the current target value. The semaphore must
 * already exist (see radv_flush_gfx2ace_semaphore()).
 */
ALWAYS_INLINE static void
radv_wait_gfx2ace_semaphore(struct radv_cmd_buffer *cmd_buffer)
{
   assert(cmd_buffer->ace_internal.sem.va);
   struct radeon_cmdbuf *ace_cs = cmd_buffer->ace_internal.cs;
   radeon_check_space(cmd_buffer->device->ws, ace_cs, 7);

   /* ACE waits for the semaphore which GFX wrote. */
   radv_cp_wait_mem(ace_cs, WAIT_REG_MEM_GREATER_OR_EQUAL, cmd_buffer->ace_internal.sem.va,
                    cmd_buffer->ace_internal.sem.gfx2ace_value, 0xffffffff);
}
794
795 static struct radeon_cmdbuf *
radv_ace_internal_create(struct radv_cmd_buffer * cmd_buffer)796 radv_ace_internal_create(struct radv_cmd_buffer *cmd_buffer)
797 {
798 assert(!cmd_buffer->ace_internal.cs);
799 struct radv_device *device = cmd_buffer->device;
800 struct radeon_cmdbuf *ace_cs = device->ws->cs_create(device->ws, AMD_IP_COMPUTE);
801
802 if (!ace_cs) {
803 cmd_buffer->record_result = VK_ERROR_OUT_OF_HOST_MEMORY;
804 }
805
806 return ace_cs;
807 }
808
/* Finalize the internal ACE cmdbuf at the end of recording: flush pending
 * caches, perform the ACE->GFX handshake that lets GFX safely reset the
 * semaphore pair, and finalize the CS. The handshake order below must not
 * be changed — GFX may only clear the semaphores after ACE signalled it is
 * done waiting on them.
 */
static VkResult
radv_ace_internal_finalize(struct radv_cmd_buffer *cmd_buffer)
{
   assert(cmd_buffer->ace_internal.cs);
   struct radv_device *device = cmd_buffer->device;
   struct radeon_cmdbuf *ace_cs = cmd_buffer->ace_internal.cs;

   /* Emit pending cache flush. */
   radv_ace_internal_cache_flush(cmd_buffer);

   /* Clear the ACE semaphore if it exists.
    * This is necessary in case the same cmd buffer is submitted again in the future.
    */
   if (cmd_buffer->ace_internal.sem.va) {
      struct radeon_cmdbuf *main_cs = cmd_buffer->cs;
      uint64_t gfx2ace_va = cmd_buffer->ace_internal.sem.va;
      uint64_t ace2gfx_va = cmd_buffer->ace_internal.sem.va + 4;

      /* ACE: write 1 to the ACE->GFX semaphore. */
      si_cs_emit_write_event_eop(ace_cs, cmd_buffer->device->physical_device->rad_info.gfx_level,
                                 true, V_028A90_BOTTOM_OF_PIPE_TS, 0, EOP_DST_SEL_MEM,
                                 EOP_DATA_SEL_VALUE_32BIT, ace2gfx_va, 1,
                                 cmd_buffer->gfx9_eop_bug_va);

      /* Wait for ACE to finish, otherwise we may risk writing 0 to the semaphore
       * when ACE is still waiting for it. This may not happen in practice, but
       * better safe than sorry.
       */
      radv_cp_wait_mem(main_cs, WAIT_REG_MEM_GREATER_OR_EQUAL, ace2gfx_va, 1, 0xffffffff);

      /* GFX: clear GFX->ACE and ACE->GFX semaphores. */
      radv_emit_clear_data(cmd_buffer, V_370_ME, gfx2ace_va, 8);
   }

   /* The ACE CS references the same BOs as the main CS. */
   device->ws->cs_add_buffers(ace_cs, cmd_buffer->cs);
   return device->ws->cs_finalize(ace_cs);
}
846
/* Work emitted after every draw/dispatch: an SQTT marker when thread
 * tracing is active, a full engine idle when RADV_DEBUG=syncshaders is
 * set, and a trace point when the hang-debug trace BO exists.
 */
static void
radv_cmd_buffer_after_draw(struct radv_cmd_buffer *cmd_buffer, enum radv_cmd_flush_bits flags)
{
   if (unlikely(cmd_buffer->device->thread_trace.bo)) {
      radeon_emit(cmd_buffer->cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
      radeon_emit(cmd_buffer->cs, EVENT_TYPE(V_028A90_THREAD_TRACE_MARKER) | EVENT_INDEX(0));
   }

   if (cmd_buffer->device->instance->debug_flags & RADV_DEBUG_SYNC_SHADERS) {
      enum rgp_flush_bits sqtt_flush_bits = 0;
      /* The caller must request a PS or CS partial flush. */
      assert(flags & (RADV_CMD_FLAG_PS_PARTIAL_FLUSH | RADV_CMD_FLAG_CS_PARTIAL_FLUSH));

      radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 4);

      /* Force wait for graphics or compute engines to be idle. */
      si_cs_emit_cache_flush(cmd_buffer->cs,
                             cmd_buffer->device->physical_device->rad_info.gfx_level,
                             &cmd_buffer->gfx9_fence_idx, cmd_buffer->gfx9_fence_va,
                             radv_cmd_buffer_uses_mec(cmd_buffer), flags, &sqtt_flush_bits,
                             cmd_buffer->gfx9_eop_bug_va);

      if (cmd_buffer->state.graphics_pipeline && (flags & RADV_CMD_FLAG_PS_PARTIAL_FLUSH) &&
          radv_pipeline_has_stage(cmd_buffer->state.graphics_pipeline, MESA_SHADER_TASK)) {
         /* Force wait for compute engines to be idle on the internal cmdbuf. */
         si_cs_emit_cache_flush(cmd_buffer->ace_internal.cs,
                                cmd_buffer->device->physical_device->rad_info.gfx_level, NULL, 0,
                                true, RADV_CMD_FLAG_CS_PARTIAL_FLUSH, &sqtt_flush_bits, 0);
      }
   }

   if (unlikely(cmd_buffer->device->trace_bo))
      radv_cmd_buffer_trace_emit(cmd_buffer);
}
880
881 static void
radv_save_pipeline(struct radv_cmd_buffer * cmd_buffer,struct radv_pipeline * pipeline)882 radv_save_pipeline(struct radv_cmd_buffer *cmd_buffer, struct radv_pipeline *pipeline)
883 {
884 struct radv_device *device = cmd_buffer->device;
885 enum amd_ip_type ring;
886 uint32_t data[2];
887 uint64_t va;
888
889 va = radv_buffer_get_va(device->trace_bo);
890
891 ring = radv_queue_family_to_ring(device->physical_device, cmd_buffer->qf);
892
893 switch (ring) {
894 case AMD_IP_GFX:
895 va += 8;
896 break;
897 case AMD_IP_COMPUTE:
898 va += 16;
899 break;
900 default:
901 assert(!"invalid IP type");
902 }
903
904 uint64_t pipeline_address = (uintptr_t)pipeline;
905 data[0] = pipeline_address;
906 data[1] = pipeline_address >> 32;
907
908 radv_emit_write_data_packet(cmd_buffer, V_370_ME, va, 2, data);
909 }
910
911 static void
radv_save_vertex_descriptors(struct radv_cmd_buffer * cmd_buffer,uint64_t vb_ptr)912 radv_save_vertex_descriptors(struct radv_cmd_buffer *cmd_buffer, uint64_t vb_ptr)
913 {
914 struct radv_device *device = cmd_buffer->device;
915 uint32_t data[2];
916 uint64_t va;
917
918 va = radv_buffer_get_va(device->trace_bo);
919 va += 24;
920
921 data[0] = vb_ptr;
922 data[1] = vb_ptr >> 32;
923
924 radv_emit_write_data_packet(cmd_buffer, V_370_ME, va, 2, data);
925 }
926
927 static void
radv_save_vs_prolog(struct radv_cmd_buffer * cmd_buffer,const struct radv_shader_part * prolog)928 radv_save_vs_prolog(struct radv_cmd_buffer *cmd_buffer, const struct radv_shader_part *prolog)
929 {
930 struct radv_device *device = cmd_buffer->device;
931 uint32_t data[2];
932 uint64_t va;
933
934 va = radv_buffer_get_va(device->trace_bo);
935 va += 32;
936
937 uint64_t prolog_address = (uintptr_t)prolog;
938 data[0] = prolog_address;
939 data[1] = prolog_address >> 32;
940
941 radv_emit_write_data_packet(cmd_buffer, V_370_ME, va, 2, data);
942 }
943
944 void
radv_set_descriptor_set(struct radv_cmd_buffer * cmd_buffer,VkPipelineBindPoint bind_point,struct radv_descriptor_set * set,unsigned idx)945 radv_set_descriptor_set(struct radv_cmd_buffer *cmd_buffer, VkPipelineBindPoint bind_point,
946 struct radv_descriptor_set *set, unsigned idx)
947 {
948 struct radv_descriptor_state *descriptors_state =
949 radv_get_descriptors_state(cmd_buffer, bind_point);
950
951 descriptors_state->sets[idx] = set;
952
953 descriptors_state->valid |= (1u << idx); /* active descriptors */
954 descriptors_state->dirty |= (1u << idx);
955 }
956
957 static void
radv_save_descriptors(struct radv_cmd_buffer * cmd_buffer,VkPipelineBindPoint bind_point)958 radv_save_descriptors(struct radv_cmd_buffer *cmd_buffer, VkPipelineBindPoint bind_point)
959 {
960 struct radv_descriptor_state *descriptors_state =
961 radv_get_descriptors_state(cmd_buffer, bind_point);
962 struct radv_device *device = cmd_buffer->device;
963 uint32_t data[MAX_SETS * 2] = {0};
964 uint64_t va;
965 va = radv_buffer_get_va(device->trace_bo) + 40;
966
967 u_foreach_bit(i, descriptors_state->valid)
968 {
969 struct radv_descriptor_set *set = descriptors_state->sets[i];
970 data[i * 2] = (uint64_t)(uintptr_t)set;
971 data[i * 2 + 1] = (uint64_t)(uintptr_t)set >> 32;
972 }
973
974 radv_emit_write_data_packet(cmd_buffer, V_370_ME, va, MAX_SETS * 2, data);
975 }
976
977 struct radv_userdata_info *
radv_lookup_user_sgpr(struct radv_pipeline * pipeline,gl_shader_stage stage,int idx)978 radv_lookup_user_sgpr(struct radv_pipeline *pipeline, gl_shader_stage stage, int idx)
979 {
980 struct radv_shader *shader = radv_get_shader(pipeline, stage);
981 return &shader->info.user_sgprs_locs.shader_data[idx];
982 }
983
984 static void
radv_emit_userdata_address(struct radv_device * device,struct radeon_cmdbuf * cs,struct radv_pipeline * pipeline,gl_shader_stage stage,int idx,uint64_t va)985 radv_emit_userdata_address(struct radv_device *device, struct radeon_cmdbuf *cs,
986 struct radv_pipeline *pipeline, gl_shader_stage stage, int idx,
987 uint64_t va)
988 {
989 struct radv_userdata_info *loc = radv_lookup_user_sgpr(pipeline, stage, idx);
990 uint32_t base_reg = pipeline->user_data_0[stage];
991 if (loc->sgpr_idx == -1)
992 return;
993
994 assert(loc->num_sgprs == 1);
995
996 radv_emit_shader_pointer(device, cs, base_reg + loc->sgpr_idx * 4, va, false);
997 }
998
999 static void
radv_emit_descriptor_pointers(struct radv_device * device,struct radeon_cmdbuf * cs,struct radv_pipeline * pipeline,struct radv_descriptor_state * descriptors_state,gl_shader_stage stage)1000 radv_emit_descriptor_pointers(struct radv_device *device, struct radeon_cmdbuf *cs,
1001 struct radv_pipeline *pipeline,
1002 struct radv_descriptor_state *descriptors_state,
1003 gl_shader_stage stage)
1004 {
1005 uint32_t sh_base = pipeline->user_data_0[stage];
1006 struct radv_userdata_locations *locs = &pipeline->shaders[stage]->info.user_sgprs_locs;
1007 unsigned mask = locs->descriptor_sets_enabled;
1008
1009 mask &= descriptors_state->dirty & descriptors_state->valid;
1010
1011 while (mask) {
1012 int start, count;
1013
1014 u_bit_scan_consecutive_range(&mask, &start, &count);
1015
1016 struct radv_userdata_info *loc = &locs->descriptor_sets[start];
1017 unsigned sh_offset = sh_base + loc->sgpr_idx * 4;
1018
1019 radv_emit_shader_pointer_head(cs, sh_offset, count, true);
1020 for (int i = 0; i < count; i++) {
1021 struct radv_descriptor_set *set = descriptors_state->sets[start + i];
1022
1023 radv_emit_shader_pointer_body(device, cs, set->header.va, true);
1024 }
1025 }
1026 }
1027
1028 /**
1029 * Convert the user sample locations to hardware sample locations (the values
1030 * that will be emitted by PA_SC_AA_SAMPLE_LOCS_PIXEL_*).
1031 */
1032 static void
radv_convert_user_sample_locs(struct radv_sample_locations_state * state,uint32_t x,uint32_t y,VkOffset2D * sample_locs)1033 radv_convert_user_sample_locs(struct radv_sample_locations_state *state, uint32_t x, uint32_t y,
1034 VkOffset2D *sample_locs)
1035 {
1036 uint32_t x_offset = x % state->grid_size.width;
1037 uint32_t y_offset = y % state->grid_size.height;
1038 uint32_t num_samples = (uint32_t)state->per_pixel;
1039 VkSampleLocationEXT *user_locs;
1040 uint32_t pixel_offset;
1041
1042 pixel_offset = (x_offset + y_offset * state->grid_size.width) * num_samples;
1043
1044 assert(pixel_offset <= MAX_SAMPLE_LOCATIONS);
1045 user_locs = &state->locations[pixel_offset];
1046
1047 for (uint32_t i = 0; i < num_samples; i++) {
1048 float shifted_pos_x = user_locs[i].x - 0.5;
1049 float shifted_pos_y = user_locs[i].y - 0.5;
1050
1051 int32_t scaled_pos_x = floorf(shifted_pos_x * 16);
1052 int32_t scaled_pos_y = floorf(shifted_pos_y * 16);
1053
1054 sample_locs[i].x = CLAMP(scaled_pos_x, -8, 7);
1055 sample_locs[i].y = CLAMP(scaled_pos_y, -8, 7);
1056 }
1057 }
1058
1059 /**
1060 * Compute the PA_SC_AA_SAMPLE_LOCS_PIXEL_* mask based on hardware sample
1061 * locations.
1062 */
1063 static void
radv_compute_sample_locs_pixel(uint32_t num_samples,VkOffset2D * sample_locs,uint32_t * sample_locs_pixel)1064 radv_compute_sample_locs_pixel(uint32_t num_samples, VkOffset2D *sample_locs,
1065 uint32_t *sample_locs_pixel)
1066 {
1067 for (uint32_t i = 0; i < num_samples; i++) {
1068 uint32_t sample_reg_idx = i / 4;
1069 uint32_t sample_loc_idx = i % 4;
1070 int32_t pos_x = sample_locs[i].x;
1071 int32_t pos_y = sample_locs[i].y;
1072
1073 uint32_t shift_x = 8 * sample_loc_idx;
1074 uint32_t shift_y = shift_x + 4;
1075
1076 sample_locs_pixel[sample_reg_idx] |= (pos_x & 0xf) << shift_x;
1077 sample_locs_pixel[sample_reg_idx] |= (pos_y & 0xf) << shift_y;
1078 }
1079 }
1080
1081 /**
1082 * Compute the PA_SC_CENTROID_PRIORITY_* mask based on the top left hardware
1083 * sample locations.
1084 */
1085 static uint64_t
radv_compute_centroid_priority(struct radv_cmd_buffer * cmd_buffer,VkOffset2D * sample_locs,uint32_t num_samples)1086 radv_compute_centroid_priority(struct radv_cmd_buffer *cmd_buffer, VkOffset2D *sample_locs,
1087 uint32_t num_samples)
1088 {
1089 uint32_t *centroid_priorities = alloca(num_samples * sizeof(*centroid_priorities));
1090 uint32_t sample_mask = num_samples - 1;
1091 uint32_t *distances = alloca(num_samples * sizeof(*distances));
1092 uint64_t centroid_priority = 0;
1093
1094 /* Compute the distances from center for each sample. */
1095 for (int i = 0; i < num_samples; i++) {
1096 distances[i] = (sample_locs[i].x * sample_locs[i].x) + (sample_locs[i].y * sample_locs[i].y);
1097 }
1098
1099 /* Compute the centroid priorities by looking at the distances array. */
1100 for (int i = 0; i < num_samples; i++) {
1101 uint32_t min_idx = 0;
1102
1103 for (int j = 1; j < num_samples; j++) {
1104 if (distances[j] < distances[min_idx])
1105 min_idx = j;
1106 }
1107
1108 centroid_priorities[i] = min_idx;
1109 distances[min_idx] = 0xffffffff;
1110 }
1111
1112 /* Compute the final centroid priority. */
1113 for (int i = 0; i < 8; i++) {
1114 centroid_priority |= centroid_priorities[i & sample_mask] << (i * 4);
1115 }
1116
1117 return centroid_priority << 32 | centroid_priority;
1118 }
1119
1120 /**
1121 * Emit the sample locations that are specified with VK_EXT_sample_locations.
1122 */
1123 static void
radv_emit_sample_locations(struct radv_cmd_buffer * cmd_buffer)1124 radv_emit_sample_locations(struct radv_cmd_buffer *cmd_buffer)
1125 {
1126 struct radv_sample_locations_state *sample_location = &cmd_buffer->state.dynamic.sample_location;
1127 uint32_t num_samples = (uint32_t)sample_location->per_pixel;
1128 struct radeon_cmdbuf *cs = cmd_buffer->cs;
1129 uint32_t sample_locs_pixel[4][2] = {0};
1130 VkOffset2D sample_locs[4][8]; /* 8 is the max. sample count supported */
1131 uint32_t max_sample_dist = 0;
1132 uint64_t centroid_priority;
1133
1134 if (!cmd_buffer->state.dynamic.sample_location.count)
1135 return;
1136
1137 /* Convert the user sample locations to hardware sample locations. */
1138 radv_convert_user_sample_locs(sample_location, 0, 0, sample_locs[0]);
1139 radv_convert_user_sample_locs(sample_location, 1, 0, sample_locs[1]);
1140 radv_convert_user_sample_locs(sample_location, 0, 1, sample_locs[2]);
1141 radv_convert_user_sample_locs(sample_location, 1, 1, sample_locs[3]);
1142
1143 /* Compute the PA_SC_AA_SAMPLE_LOCS_PIXEL_* mask. */
1144 for (uint32_t i = 0; i < 4; i++) {
1145 radv_compute_sample_locs_pixel(num_samples, sample_locs[i], sample_locs_pixel[i]);
1146 }
1147
1148 /* Compute the PA_SC_CENTROID_PRIORITY_* mask. */
1149 centroid_priority = radv_compute_centroid_priority(cmd_buffer, sample_locs[0], num_samples);
1150
1151 /* Compute the maximum sample distance from the specified locations. */
1152 for (unsigned i = 0; i < 4; ++i) {
1153 for (uint32_t j = 0; j < num_samples; j++) {
1154 VkOffset2D offset = sample_locs[i][j];
1155 max_sample_dist = MAX2(max_sample_dist, MAX2(abs(offset.x), abs(offset.y)));
1156 }
1157 }
1158
1159 /* Emit the specified user sample locations. */
1160 switch (num_samples) {
1161 case 2:
1162 case 4:
1163 radeon_set_context_reg(cs, R_028BF8_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_0,
1164 sample_locs_pixel[0][0]);
1165 radeon_set_context_reg(cs, R_028C08_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_0,
1166 sample_locs_pixel[1][0]);
1167 radeon_set_context_reg(cs, R_028C18_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_0,
1168 sample_locs_pixel[2][0]);
1169 radeon_set_context_reg(cs, R_028C28_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_0,
1170 sample_locs_pixel[3][0]);
1171 break;
1172 case 8:
1173 radeon_set_context_reg(cs, R_028BF8_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_0,
1174 sample_locs_pixel[0][0]);
1175 radeon_set_context_reg(cs, R_028C08_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_0,
1176 sample_locs_pixel[1][0]);
1177 radeon_set_context_reg(cs, R_028C18_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_0,
1178 sample_locs_pixel[2][0]);
1179 radeon_set_context_reg(cs, R_028C28_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_0,
1180 sample_locs_pixel[3][0]);
1181 radeon_set_context_reg(cs, R_028BFC_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_1,
1182 sample_locs_pixel[0][1]);
1183 radeon_set_context_reg(cs, R_028C0C_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_1,
1184 sample_locs_pixel[1][1]);
1185 radeon_set_context_reg(cs, R_028C1C_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_1,
1186 sample_locs_pixel[2][1]);
1187 radeon_set_context_reg(cs, R_028C2C_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_1,
1188 sample_locs_pixel[3][1]);
1189 break;
1190 default:
1191 unreachable("invalid number of samples");
1192 }
1193
1194 /* Emit the maximum sample distance and the centroid priority. */
1195 radeon_set_context_reg_rmw(cs, R_028BE0_PA_SC_AA_CONFIG,
1196 S_028BE0_MAX_SAMPLE_DIST(max_sample_dist), ~C_028BE0_MAX_SAMPLE_DIST);
1197
1198 radeon_set_context_reg_seq(cs, R_028BD4_PA_SC_CENTROID_PRIORITY_0, 2);
1199 radeon_emit(cs, centroid_priority);
1200 radeon_emit(cs, centroid_priority >> 32);
1201
1202 cmd_buffer->state.context_roll_without_scissor_emitted = true;
1203 }
1204
1205 static void
radv_emit_inline_push_consts(struct radv_device * device,struct radeon_cmdbuf * cs,struct radv_pipeline * pipeline,gl_shader_stage stage,int idx,uint32_t * values)1206 radv_emit_inline_push_consts(struct radv_device *device, struct radeon_cmdbuf *cs,
1207 struct radv_pipeline *pipeline, gl_shader_stage stage, int idx,
1208 uint32_t *values)
1209 {
1210 struct radv_userdata_info *loc = radv_lookup_user_sgpr(pipeline, stage, idx);
1211 uint32_t base_reg = pipeline->user_data_0[stage];
1212 if (loc->sgpr_idx == -1)
1213 return;
1214
1215 radeon_check_space(device->ws, cs, 2 + loc->num_sgprs);
1216
1217 radeon_set_sh_reg_seq(cs, base_reg + loc->sgpr_idx * 4, loc->num_sgprs);
1218 radeon_emit_array(cs, values, loc->num_sgprs);
1219 }
1220
1221 static void
radv_update_multisample_state(struct radv_cmd_buffer * cmd_buffer,struct radv_graphics_pipeline * pipeline)1222 radv_update_multisample_state(struct radv_cmd_buffer *cmd_buffer,
1223 struct radv_graphics_pipeline *pipeline)
1224 {
1225 int num_samples = pipeline->ms.num_samples;
1226 struct radv_graphics_pipeline *old_pipeline = cmd_buffer->state.emitted_graphics_pipeline;
1227
1228 if (pipeline->base.shaders[MESA_SHADER_FRAGMENT]->info.ps.needs_sample_positions)
1229 cmd_buffer->sample_positions_needed = true;
1230
1231 if (old_pipeline && num_samples == old_pipeline->ms.num_samples)
1232 return;
1233
1234 radv_emit_default_sample_locations(cmd_buffer->cs, num_samples);
1235
1236 cmd_buffer->state.context_roll_without_scissor_emitted = true;
1237 }
1238
/* Re-emit PA_SC_BINNER_CNTL_0 when the primitive binning configuration
 * changes between pipelines. Binning only exists on GFX9+.
 */
static void
radv_update_binning_state(struct radv_cmd_buffer *cmd_buffer,
                          struct radv_graphics_pipeline *pipeline)
{
   const struct radv_graphics_pipeline *old_pipeline = cmd_buffer->state.emitted_graphics_pipeline;

   if (pipeline->base.device->physical_device->rad_info.gfx_level < GFX9)
      return;

   /* Skip the register write when the binning config is unchanged. */
   if (old_pipeline &&
       old_pipeline->binning.pa_sc_binner_cntl_0 ==
          pipeline->binning.pa_sc_binner_cntl_0)
      return;

   bool binning_flush = false;
   /* These chips require a flush when the binning mode transitions; request
    * it via FLUSH_ON_BINNING_TRANSITION below. With no previously emitted
    * pipeline we conservatively flush.
    */
   if (cmd_buffer->device->physical_device->rad_info.family == CHIP_VEGA12 ||
       cmd_buffer->device->physical_device->rad_info.family == CHIP_VEGA20 ||
       cmd_buffer->device->physical_device->rad_info.family == CHIP_RAVEN2 ||
       cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX10) {
      binning_flush = !old_pipeline ||
                      G_028C44_BINNING_MODE(old_pipeline->binning.pa_sc_binner_cntl_0) !=
                         G_028C44_BINNING_MODE(pipeline->binning.pa_sc_binner_cntl_0);
   }

   radeon_set_context_reg(cmd_buffer->cs, R_028C44_PA_SC_BINNER_CNTL_0,
                          pipeline->binning.pa_sc_binner_cntl_0 |
                             S_028C44_FLUSH_ON_BINNING_TRANSITION(!!binning_flush));

   /* A context register was written without touching the scissor. */
   cmd_buffer->state.context_roll_without_scissor_emitted = true;
}
1269
1270 static void
radv_emit_shader_prefetch(struct radv_cmd_buffer * cmd_buffer,struct radv_shader * shader)1271 radv_emit_shader_prefetch(struct radv_cmd_buffer *cmd_buffer, struct radv_shader *shader)
1272 {
1273 uint64_t va;
1274
1275 if (!shader)
1276 return;
1277
1278 va = radv_shader_get_va(shader);
1279
1280 si_cp_dma_prefetch(cmd_buffer, va, shader->code_size);
1281 }
1282
/* Prefetch shader binaries and the vertex buffer descriptors into L2
 * according to the pending prefetch mask. With first_stage_only, only what
 * is needed to start the draw is prefetched so it can be kicked off early;
 * the remaining stages are prefetched on a later call.
 */
static void
radv_emit_prefetch_L2(struct radv_cmd_buffer *cmd_buffer,
                      struct radv_graphics_pipeline *pipeline, bool first_stage_only)
{
   struct radv_cmd_state *state = &cmd_buffer->state;
   uint32_t mask = state->prefetch_L2_mask;

   /* Fast prefetch path for starting draws as soon as possible. */
   if (first_stage_only)
      mask &= RADV_PREFETCH_VS | RADV_PREFETCH_VBO_DESCRIPTORS | RADV_PREFETCH_MS;

   if (mask & RADV_PREFETCH_VS)
      radv_emit_shader_prefetch(cmd_buffer, pipeline->base.shaders[MESA_SHADER_VERTEX]);

   if (mask & RADV_PREFETCH_MS)
      radv_emit_shader_prefetch(cmd_buffer, pipeline->base.shaders[MESA_SHADER_MESH]);

   if (mask & RADV_PREFETCH_VBO_DESCRIPTORS)
      si_cp_dma_prefetch(cmd_buffer, state->vb_va, pipeline->vb_desc_alloc_size);

   if (mask & RADV_PREFETCH_TCS)
      radv_emit_shader_prefetch(cmd_buffer, pipeline->base.shaders[MESA_SHADER_TESS_CTRL]);

   if (mask & RADV_PREFETCH_TES)
      radv_emit_shader_prefetch(cmd_buffer, pipeline->base.shaders[MESA_SHADER_TESS_EVAL]);

   if (mask & RADV_PREFETCH_GS) {
      radv_emit_shader_prefetch(cmd_buffer, pipeline->base.shaders[MESA_SHADER_GEOMETRY]);
      /* The GS copy shader is a separate binary when present. */
      if (radv_pipeline_has_gs_copy_shader(&pipeline->base))
         radv_emit_shader_prefetch(cmd_buffer, pipeline->base.gs_copy_shader);
   }

   if (mask & RADV_PREFETCH_PS)
      radv_emit_shader_prefetch(cmd_buffer, pipeline->base.shaders[MESA_SHADER_FRAGMENT]);

   /* Clear only the bits handled in this call. */
   state->prefetch_L2_mask &= ~mask;
}
1320
/* Emit the RB+ (render backend plus) blend optimization registers
 * (SX_PS_DOWNCONVERT, SX_BLEND_OPT_EPSILON, SX_BLEND_OPT_CONTROL) derived
 * from the bound color attachments and the pipeline's color export formats.
 * Only emitted when the register values actually change.
 */
static void
radv_emit_rbplus_state(struct radv_cmd_buffer *cmd_buffer)
{
   if (!cmd_buffer->device->physical_device->rad_info.rbplus_allowed)
      return;

   struct radv_graphics_pipeline *pipeline = cmd_buffer->state.graphics_pipeline;
   const struct radv_subpass *subpass = cmd_buffer->state.subpass;

   unsigned sx_ps_downconvert = 0;
   unsigned sx_blend_opt_epsilon = 0;
   unsigned sx_blend_opt_control = 0;

   for (unsigned i = 0; i < subpass->color_count; ++i) {
      unsigned format, swap;
      bool has_alpha, has_rgb;
      if (subpass->color_attachments[i].attachment == VK_ATTACHMENT_UNUSED) {
         /* We don't set the DISABLE bits, because the HW can't have holes,
          * so the SPI color format is set to 32-bit 1-component. */
         sx_ps_downconvert |= V_028754_SX_RT_EXPORT_32_R << (i * 4);
         continue;
      }

      int idx = subpass->color_attachments[i].attachment;
      if (cmd_buffer->state.attachments) {
         /* Read the HW format/swap from the attachment's CB registers. */
         struct radv_color_buffer_info *cb = &cmd_buffer->state.attachments[idx].cb;

         format = cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX11
                     ? G_028C70_FORMAT_GFX11(cb->cb_color_info)
                     : G_028C70_FORMAT_GFX6(cb->cb_color_info);
         swap = G_028C70_COMP_SWAP(cb->cb_color_info);
         has_alpha = cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX11
                        ? !G_028C74_FORCE_DST_ALPHA_1_GFX11(cb->cb_color_attrib)
                        : !G_028C74_FORCE_DST_ALPHA_1_GFX6(cb->cb_color_attrib);
      } else {
         /* No attachment state yet; derive from the render pass format. */
         VkFormat fmt = cmd_buffer->state.pass->attachments[idx].format;
         format = radv_translate_colorformat(fmt);
         swap = radv_translate_colorswap(fmt, false);
         has_alpha = vk_format_description(fmt)->swizzle[3] != PIPE_SWIZZLE_1;
      }

      uint32_t spi_format = (pipeline->col_format >> (i * 4)) & 0xf;
      uint32_t colormask = (pipeline->cb_target_mask >> (i * 4)) & 0xf;

      /* Single-channel formats store either the color or the alpha. */
      if (format == V_028C70_COLOR_8 || format == V_028C70_COLOR_16 || format == V_028C70_COLOR_32)
         has_rgb = !has_alpha;
      else
         has_rgb = true;

      /* Check the colormask and export format. */
      if (!(colormask & 0x7))
         has_rgb = false;
      if (!(colormask & 0x8))
         has_alpha = false;

      if (spi_format == V_028714_SPI_SHADER_ZERO) {
         has_rgb = false;
         has_alpha = false;
      }

      /* The HW doesn't quite blend correctly with rgb9e5 if we disable the alpha
       * optimization, even though it has no alpha. */
      if (has_rgb && format == V_028C70_COLOR_5_9_9_9)
         has_alpha = true;

      /* Disable value checking for disabled channels. */
      if (!has_rgb)
         sx_blend_opt_control |= S_02875C_MRT0_COLOR_OPT_DISABLE(1) << (i * 4);
      if (!has_alpha)
         sx_blend_opt_control |= S_02875C_MRT0_ALPHA_OPT_DISABLE(1) << (i * 4);

      /* Enable down-conversion for 32bpp and smaller formats. */
      switch (format) {
      case V_028C70_COLOR_8:
      case V_028C70_COLOR_8_8:
      case V_028C70_COLOR_8_8_8_8:
         /* For 1 and 2-channel formats, use the superset thereof. */
         if (spi_format == V_028714_SPI_SHADER_FP16_ABGR ||
             spi_format == V_028714_SPI_SHADER_UINT16_ABGR ||
             spi_format == V_028714_SPI_SHADER_SINT16_ABGR) {
            sx_ps_downconvert |= V_028754_SX_RT_EXPORT_8_8_8_8 << (i * 4);
            sx_blend_opt_epsilon |= V_028758_8BIT_FORMAT << (i * 4);
         }
         break;

      case V_028C70_COLOR_5_6_5:
         if (spi_format == V_028714_SPI_SHADER_FP16_ABGR) {
            sx_ps_downconvert |= V_028754_SX_RT_EXPORT_5_6_5 << (i * 4);
            sx_blend_opt_epsilon |= V_028758_6BIT_FORMAT << (i * 4);
         }
         break;

      case V_028C70_COLOR_1_5_5_5:
         if (spi_format == V_028714_SPI_SHADER_FP16_ABGR) {
            sx_ps_downconvert |= V_028754_SX_RT_EXPORT_1_5_5_5 << (i * 4);
            sx_blend_opt_epsilon |= V_028758_5BIT_FORMAT << (i * 4);
         }
         break;

      case V_028C70_COLOR_4_4_4_4:
         if (spi_format == V_028714_SPI_SHADER_FP16_ABGR) {
            sx_ps_downconvert |= V_028754_SX_RT_EXPORT_4_4_4_4 << (i * 4);
            sx_blend_opt_epsilon |= V_028758_4BIT_FORMAT << (i * 4);
         }
         break;

      case V_028C70_COLOR_32:
         if (swap == V_028C70_SWAP_STD && spi_format == V_028714_SPI_SHADER_32_R)
            sx_ps_downconvert |= V_028754_SX_RT_EXPORT_32_R << (i * 4);
         else if (swap == V_028C70_SWAP_ALT_REV && spi_format == V_028714_SPI_SHADER_32_AR)
            sx_ps_downconvert |= V_028754_SX_RT_EXPORT_32_A << (i * 4);
         break;

      case V_028C70_COLOR_16:
      case V_028C70_COLOR_16_16:
         /* For 1-channel formats, use the superset thereof. */
         if (spi_format == V_028714_SPI_SHADER_UNORM16_ABGR ||
             spi_format == V_028714_SPI_SHADER_SNORM16_ABGR ||
             spi_format == V_028714_SPI_SHADER_UINT16_ABGR ||
             spi_format == V_028714_SPI_SHADER_SINT16_ABGR) {
            if (swap == V_028C70_SWAP_STD || swap == V_028C70_SWAP_STD_REV)
               sx_ps_downconvert |= V_028754_SX_RT_EXPORT_16_16_GR << (i * 4);
            else
               sx_ps_downconvert |= V_028754_SX_RT_EXPORT_16_16_AR << (i * 4);
         }
         break;

      case V_028C70_COLOR_10_11_11:
         if (spi_format == V_028714_SPI_SHADER_FP16_ABGR)
            sx_ps_downconvert |= V_028754_SX_RT_EXPORT_10_11_11 << (i * 4);
         break;

      case V_028C70_COLOR_2_10_10_10:
         if (spi_format == V_028714_SPI_SHADER_FP16_ABGR) {
            sx_ps_downconvert |= V_028754_SX_RT_EXPORT_2_10_10_10 << (i * 4);
            sx_blend_opt_epsilon |= V_028758_10BIT_FORMAT << (i * 4);
         }
         break;
      case V_028C70_COLOR_5_9_9_9:
         if (spi_format == V_028714_SPI_SHADER_FP16_ABGR)
            sx_ps_downconvert |= V_028754_SX_RT_EXPORT_9_9_9_E5 << (i * 4);
         break;
      }
   }

   /* Do not set the DISABLE bits for the unused attachments, as that
    * breaks dual source blending in SkQP and does not seem to improve
    * performance. */

   /* Skip the register writes when nothing changed. */
   if (sx_ps_downconvert == cmd_buffer->state.last_sx_ps_downconvert &&
       sx_blend_opt_epsilon == cmd_buffer->state.last_sx_blend_opt_epsilon &&
       sx_blend_opt_control == cmd_buffer->state.last_sx_blend_opt_control)
      return;

   radeon_set_context_reg_seq(cmd_buffer->cs, R_028754_SX_PS_DOWNCONVERT, 3);
   radeon_emit(cmd_buffer->cs, sx_ps_downconvert);
   radeon_emit(cmd_buffer->cs, sx_blend_opt_epsilon);
   radeon_emit(cmd_buffer->cs, sx_blend_opt_control);

   cmd_buffer->state.context_roll_without_scissor_emitted = true;

   /* Cache the emitted values to detect changes on the next call. */
   cmd_buffer->state.last_sx_ps_downconvert = sx_ps_downconvert;
   cmd_buffer->state.last_sx_blend_opt_epsilon = sx_blend_opt_epsilon;
   cmd_buffer->state.last_sx_blend_opt_control = sx_blend_opt_control;
}
1486
1487 static void
radv_emit_batch_break_on_new_ps(struct radv_cmd_buffer * cmd_buffer)1488 radv_emit_batch_break_on_new_ps(struct radv_cmd_buffer *cmd_buffer)
1489 {
1490 if (!cmd_buffer->device->pbb_allowed)
1491 return;
1492
1493 struct radv_binning_settings settings =
1494 radv_get_binning_settings(cmd_buffer->device->physical_device);
1495 bool break_for_new_ps =
1496 (!cmd_buffer->state.emitted_graphics_pipeline ||
1497 cmd_buffer->state.emitted_graphics_pipeline->base.shaders[MESA_SHADER_FRAGMENT] !=
1498 cmd_buffer->state.graphics_pipeline->base.shaders[MESA_SHADER_FRAGMENT]) &&
1499 (settings.context_states_per_bin > 1 || settings.persistent_states_per_bin > 1);
1500 bool break_for_new_cb_target_mask =
1501 (cmd_buffer->state.dirty & RADV_CMD_DIRTY_DYNAMIC_COLOR_WRITE_ENABLE) &&
1502 settings.context_states_per_bin > 1;
1503
1504 if (!break_for_new_ps && !break_for_new_cb_target_mask)
1505 return;
1506
1507 radeon_emit(cmd_buffer->cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
1508 radeon_emit(cmd_buffer->cs, EVENT_TYPE(V_028A90_BREAK_BATCH) | EVENT_INDEX(0));
1509 }
1510
/* Emit the bound graphics pipeline's pre-built command streams and mark any
 * dynamic state dirty whose packed register state differs from the
 * previously emitted pipeline.
 */
static void
radv_emit_graphics_pipeline(struct radv_cmd_buffer *cmd_buffer)
{
   struct radv_graphics_pipeline *pipeline = cmd_buffer->state.graphics_pipeline;

   /* Already emitted; nothing to do. */
   if (cmd_buffer->state.emitted_graphics_pipeline == pipeline)
      return;

   radv_update_multisample_state(cmd_buffer, pipeline);
   radv_update_binning_state(cmd_buffer, pipeline);

   /* Track the worst-case scratch requirements across all pipelines. */
   cmd_buffer->scratch_size_per_wave_needed =
      MAX2(cmd_buffer->scratch_size_per_wave_needed, pipeline->base.scratch_bytes_per_wave);
   cmd_buffer->scratch_waves_wanted = MAX2(cmd_buffer->scratch_waves_wanted, pipeline->base.max_waves);

   /* Viewport depends on the depth remap/clamp mode. */
   if (!cmd_buffer->state.emitted_graphics_pipeline ||
       cmd_buffer->state.emitted_graphics_pipeline->negative_one_to_one != pipeline->negative_one_to_one ||
       cmd_buffer->state.emitted_graphics_pipeline->depth_clamp_mode != pipeline->depth_clamp_mode)
      cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_VIEWPORT;

   /* Scissor depends on the rasterized primitive class and line width. */
   if (!cmd_buffer->state.emitted_graphics_pipeline ||
       radv_rast_prim_is_points_or_lines(cmd_buffer->state.emitted_graphics_pipeline->rast_prim) != radv_rast_prim_is_points_or_lines(pipeline->rast_prim) ||
       cmd_buffer->state.emitted_graphics_pipeline->line_width != pipeline->line_width)
      cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_SCISSOR;

   /* PA_SU_SC_MODE_CNTL packs cull mode, front face and depth bias enables. */
   if (!cmd_buffer->state.emitted_graphics_pipeline ||
       cmd_buffer->state.emitted_graphics_pipeline->pa_su_sc_mode_cntl != pipeline->pa_su_sc_mode_cntl)
      cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_CULL_MODE |
                                 RADV_CMD_DIRTY_DYNAMIC_FRONT_FACE |
                                 RADV_CMD_DIRTY_DYNAMIC_DEPTH_BIAS;

   if (!cmd_buffer->state.emitted_graphics_pipeline ||
       cmd_buffer->state.emitted_graphics_pipeline->pa_cl_clip_cntl != pipeline->pa_cl_clip_cntl)
      cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_RASTERIZER_DISCARD_ENABLE;

   if (!cmd_buffer->state.emitted_graphics_pipeline ||
       cmd_buffer->state.emitted_graphics_pipeline->cb_color_control != pipeline->cb_color_control)
      cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_LOGIC_OP;

   /* First pipeline in the cmdbuf: emit everything that has no previous
    * state to compare against.
    */
   if (!cmd_buffer->state.emitted_graphics_pipeline)
      cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_PRIMITIVE_TOPOLOGY |
                                 RADV_CMD_DIRTY_DYNAMIC_DEPTH_BIAS |
                                 RADV_CMD_DIRTY_DYNAMIC_DEPTH_BOUNDS |
                                 RADV_CMD_DIRTY_DYNAMIC_PRIMITIVE_RESTART_ENABLE;

   /* DB_DEPTH_CONTROL packs all depth/stencil test enables and ops. */
   if (!cmd_buffer->state.emitted_graphics_pipeline ||
       cmd_buffer->state.emitted_graphics_pipeline->db_depth_control != pipeline->db_depth_control)
      cmd_buffer->state.dirty |=
         RADV_CMD_DIRTY_DYNAMIC_DEPTH_TEST_ENABLE | RADV_CMD_DIRTY_DYNAMIC_DEPTH_WRITE_ENABLE |
         RADV_CMD_DIRTY_DYNAMIC_DEPTH_COMPARE_OP | RADV_CMD_DIRTY_DYNAMIC_DEPTH_BOUNDS_TEST_ENABLE |
         RADV_CMD_DIRTY_DYNAMIC_STENCIL_TEST_ENABLE | RADV_CMD_DIRTY_DYNAMIC_STENCIL_OP;

   if (!cmd_buffer->state.emitted_graphics_pipeline)
      cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_STENCIL_OP;

   if (!cmd_buffer->state.emitted_graphics_pipeline ||
       cmd_buffer->state.emitted_graphics_pipeline->cb_target_mask != pipeline->cb_target_mask) {
      cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_COLOR_WRITE_ENABLE;
   }

   /* Replay the pipeline's pre-built SH register stream. */
   radeon_emit_array(cmd_buffer->cs, pipeline->base.cs.buf, pipeline->base.cs.cdw);

   if (pipeline->has_ngg_culling &&
       pipeline->last_vgt_api_stage != MESA_SHADER_GEOMETRY &&
       !cmd_buffer->state.last_nggc_settings) {
      /* The already emitted RSRC2 contains the LDS required for NGG culling.
       * Culling is currently disabled, so re-emit RSRC2 to reduce LDS usage.
       * API GS always needs LDS, so this isn't useful there.
       */
      struct radv_shader *v = pipeline->base.shaders[pipeline->last_vgt_api_stage];
      radeon_set_sh_reg(cmd_buffer->cs, R_00B22C_SPI_SHADER_PGM_RSRC2_GS,
                        (v->config.rsrc2 & C_00B22C_LDS_SIZE) |
                        S_00B22C_LDS_SIZE(v->info.num_lds_blocks_when_not_culling));
   }

   /* Only replay the context register stream when it actually differs from
    * the previously emitted one (checked via size, hash, then contents), to
    * avoid needless context rolls.
    */
   if (!cmd_buffer->state.emitted_graphics_pipeline ||
       cmd_buffer->state.emitted_graphics_pipeline->base.ctx_cs.cdw != pipeline->base.ctx_cs.cdw ||
       cmd_buffer->state.emitted_graphics_pipeline->base.ctx_cs_hash != pipeline->base.ctx_cs_hash ||
       memcmp(cmd_buffer->state.emitted_graphics_pipeline->base.ctx_cs.buf, pipeline->base.ctx_cs.buf,
              pipeline->base.ctx_cs.cdw * 4)) {
      radeon_emit_array(cmd_buffer->cs, pipeline->base.ctx_cs.buf, pipeline->base.ctx_cs.cdw);
      cmd_buffer->state.context_roll_without_scissor_emitted = true;
   }

   radv_emit_batch_break_on_new_ps(cmd_buffer);

   /* Keep the pipeline's shader memory resident for this submission. */
   radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, pipeline->base.slab_bo);

   if (unlikely(cmd_buffer->device->trace_bo))
      radv_save_pipeline(cmd_buffer, &pipeline->base);

   cmd_buffer->state.emitted_graphics_pipeline = pipeline;

   cmd_buffer->state.dirty &= ~RADV_CMD_DIRTY_PIPELINE;
}
1606
/* Emit the viewport transforms (PA_CL_VPORT_*SCALE/OFFSET) and the depth
 * ranges (PA_SC_VPORT_ZMIN/ZMAX) for all dynamic viewports.
 */
static void
radv_emit_viewport(struct radv_cmd_buffer *cmd_buffer)
{
   const struct radv_graphics_pipeline *pipeline = cmd_buffer->state.graphics_pipeline;
   const struct radv_viewport_state *viewport = &cmd_buffer->state.dynamic.viewport;
   int i;
   const unsigned count = viewport->count;

   assert(count);
   /* Six registers per viewport: x/y/z scale and translate. */
   radeon_set_context_reg_seq(cmd_buffer->cs, R_02843C_PA_CL_VPORT_XSCALE, count * 6);

   for (i = 0; i < count; i++) {
      radeon_emit(cmd_buffer->cs, fui(viewport->xform[i].scale[0]));
      radeon_emit(cmd_buffer->cs, fui(viewport->xform[i].translate[0]));
      radeon_emit(cmd_buffer->cs, fui(viewport->xform[i].scale[1]));
      radeon_emit(cmd_buffer->cs, fui(viewport->xform[i].translate[1]));

      double scale_z, translate_z;
      if (pipeline->negative_one_to_one) {
         /* Remap the [-1, 1] clip-space depth range to the hardware's [0, 1]
          * (VK_EXT_depth_clip_control).
          */
         scale_z = viewport->xform[i].scale[2] * 0.5f;
         translate_z = (viewport->xform[i].translate[2] + viewport->viewports[i].maxDepth) * 0.5f;
      } else {
         scale_z = viewport->xform[i].scale[2];
         translate_z = viewport->xform[i].translate[2];

      }
      radeon_emit(cmd_buffer->cs, fui(scale_z));
      radeon_emit(cmd_buffer->cs, fui(translate_z));
   }

   radeon_set_context_reg_seq(cmd_buffer->cs, R_0282D0_PA_SC_VPORT_ZMIN_0, count * 2);
   for (i = 0; i < count; i++) {
      float zmin, zmax;

      if (pipeline->depth_clamp_mode == RADV_DEPTH_CLAMP_MODE_ZERO_TO_ONE) {
         /* Clamp to the full [0, 1] range regardless of the viewport depth
          * bounds.
          */
         zmin = 0.0f;
         zmax = 1.0f;
      } else {
         /* minDepth may be greater than maxDepth (reversed-Z); the hardware
          * range must still be ordered.
          */
         zmin = MIN2(viewport->viewports[i].minDepth, viewport->viewports[i].maxDepth);
         zmax = MAX2(viewport->viewports[i].minDepth, viewport->viewports[i].maxDepth);
      }

      radeon_emit(cmd_buffer->cs, fui(zmin));
      radeon_emit(cmd_buffer->cs, fui(zmax));
   }
}
1653
void
radv_write_scissors(struct radv_cmd_buffer *cmd_buffer, struct radeon_cmdbuf *cs)
{
   /* Write the dynamic scissor rectangles into 'cs'.  The rasterized
    * primitive type is passed along too (presumably for guardband sizing —
    * confirm in si_write_scissors).
    */
   struct radv_graphics_pipeline *pipeline = cmd_buffer->state.graphics_pipeline;
   uint32_t count = cmd_buffer->state.dynamic.scissor.count;
   unsigned rast_prim;

   if (!(pipeline->dynamic_states & RADV_DYNAMIC_PRIMITIVE_TOPOLOGY) ||
       (pipeline->active_stages & (VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT |
                                   VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT |
                                   VK_SHADER_STAGE_GEOMETRY_BIT |
                                   VK_SHADER_STAGE_MESH_BIT_NV))) {
      /* Ignore dynamic primitive topology for TES/GS/MS stages. */
      rast_prim = pipeline->rast_prim;
   } else {
      rast_prim = si_conv_prim_to_gs_out(cmd_buffer->state.dynamic.primitive_topology);
   }

   si_write_scissors(cs, 0, count, cmd_buffer->state.dynamic.scissor.scissors,
                     cmd_buffer->state.dynamic.viewport.viewports, rast_prim,
                     cmd_buffer->state.dynamic.line_width);
}
1676
1677 static void
radv_emit_scissor(struct radv_cmd_buffer * cmd_buffer)1678 radv_emit_scissor(struct radv_cmd_buffer *cmd_buffer)
1679 {
1680 radv_write_scissors(cmd_buffer, cmd_buffer->cs);
1681
1682 cmd_buffer->state.context_roll_without_scissor_emitted = false;
1683 }
1684
1685 static void
radv_emit_discard_rectangle(struct radv_cmd_buffer * cmd_buffer)1686 radv_emit_discard_rectangle(struct radv_cmd_buffer *cmd_buffer)
1687 {
1688 if (!cmd_buffer->state.dynamic.discard_rectangle.count)
1689 return;
1690
1691 radeon_set_context_reg_seq(cmd_buffer->cs, R_028210_PA_SC_CLIPRECT_0_TL,
1692 cmd_buffer->state.dynamic.discard_rectangle.count * 2);
1693 for (unsigned i = 0; i < cmd_buffer->state.dynamic.discard_rectangle.count; ++i) {
1694 VkRect2D rect = cmd_buffer->state.dynamic.discard_rectangle.rectangles[i];
1695 radeon_emit(cmd_buffer->cs, S_028210_TL_X(rect.offset.x) | S_028210_TL_Y(rect.offset.y));
1696 radeon_emit(cmd_buffer->cs, S_028214_BR_X(rect.offset.x + rect.extent.width) |
1697 S_028214_BR_Y(rect.offset.y + rect.extent.height));
1698 }
1699 }
1700
1701 static void
radv_emit_line_width(struct radv_cmd_buffer * cmd_buffer)1702 radv_emit_line_width(struct radv_cmd_buffer *cmd_buffer)
1703 {
1704 unsigned width = cmd_buffer->state.dynamic.line_width * 8;
1705
1706 radeon_set_context_reg(cmd_buffer->cs, R_028A08_PA_SU_LINE_CNTL,
1707 S_028A08_WIDTH(CLAMP(width, 0, 0xFFFF)));
1708 }
1709
1710 static void
radv_emit_blend_constants(struct radv_cmd_buffer * cmd_buffer)1711 radv_emit_blend_constants(struct radv_cmd_buffer *cmd_buffer)
1712 {
1713 struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
1714
1715 radeon_set_context_reg_seq(cmd_buffer->cs, R_028414_CB_BLEND_RED, 4);
1716 radeon_emit_array(cmd_buffer->cs, (uint32_t *)d->blend_constants, 4);
1717 }
1718
1719 static void
radv_emit_stencil(struct radv_cmd_buffer * cmd_buffer)1720 radv_emit_stencil(struct radv_cmd_buffer *cmd_buffer)
1721 {
1722 struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
1723
1724 radeon_set_context_reg_seq(cmd_buffer->cs, R_028430_DB_STENCILREFMASK, 2);
1725 radeon_emit(cmd_buffer->cs, S_028430_STENCILTESTVAL(d->stencil_reference.front) |
1726 S_028430_STENCILMASK(d->stencil_compare_mask.front) |
1727 S_028430_STENCILWRITEMASK(d->stencil_write_mask.front) |
1728 S_028430_STENCILOPVAL(1));
1729 radeon_emit(cmd_buffer->cs, S_028434_STENCILTESTVAL_BF(d->stencil_reference.back) |
1730 S_028434_STENCILMASK_BF(d->stencil_compare_mask.back) |
1731 S_028434_STENCILWRITEMASK_BF(d->stencil_write_mask.back) |
1732 S_028434_STENCILOPVAL_BF(1));
1733 }
1734
1735 static void
radv_emit_depth_bounds(struct radv_cmd_buffer * cmd_buffer)1736 radv_emit_depth_bounds(struct radv_cmd_buffer *cmd_buffer)
1737 {
1738 struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
1739
1740 radeon_set_context_reg_seq(cmd_buffer->cs, R_028020_DB_DEPTH_BOUNDS_MIN, 2);
1741 radeon_emit(cmd_buffer->cs, fui(d->depth_bounds.min));
1742 radeon_emit(cmd_buffer->cs, fui(d->depth_bounds.max));
1743 }
1744
1745 static void
radv_emit_depth_bias(struct radv_cmd_buffer * cmd_buffer)1746 radv_emit_depth_bias(struct radv_cmd_buffer *cmd_buffer)
1747 {
1748 struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
1749 unsigned slope = fui(d->depth_bias.slope * 16.0f);
1750
1751 radeon_set_context_reg_seq(cmd_buffer->cs, R_028B7C_PA_SU_POLY_OFFSET_CLAMP, 5);
1752 radeon_emit(cmd_buffer->cs, fui(d->depth_bias.clamp)); /* CLAMP */
1753 radeon_emit(cmd_buffer->cs, slope); /* FRONT SCALE */
1754 radeon_emit(cmd_buffer->cs, fui(d->depth_bias.bias)); /* FRONT OFFSET */
1755 radeon_emit(cmd_buffer->cs, slope); /* BACK SCALE */
1756 radeon_emit(cmd_buffer->cs, fui(d->depth_bias.bias)); /* BACK OFFSET */
1757 }
1758
1759 static void
radv_emit_line_stipple(struct radv_cmd_buffer * cmd_buffer)1760 radv_emit_line_stipple(struct radv_cmd_buffer *cmd_buffer)
1761 {
1762 struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
1763 uint32_t auto_reset_cntl = 1;
1764
1765 if (d->primitive_topology == V_008958_DI_PT_LINESTRIP)
1766 auto_reset_cntl = 2;
1767
1768 radeon_set_context_reg(cmd_buffer->cs, R_028A0C_PA_SC_LINE_STIPPLE,
1769 S_028A0C_LINE_PATTERN(d->line_stipple.pattern) |
1770 S_028A0C_REPEAT_COUNT(d->line_stipple.factor - 1) |
1771 S_028A0C_AUTO_RESET_CNTL(auto_reset_cntl));
1772 }
1773
1774 uint32_t
radv_get_pa_su_sc_mode_cntl(const struct radv_cmd_buffer * cmd_buffer)1775 radv_get_pa_su_sc_mode_cntl(const struct radv_cmd_buffer *cmd_buffer)
1776 {
1777 unsigned pa_su_sc_mode_cntl = cmd_buffer->state.graphics_pipeline->pa_su_sc_mode_cntl;
1778 const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
1779
1780 pa_su_sc_mode_cntl &= C_028814_CULL_FRONT &
1781 C_028814_CULL_BACK &
1782 C_028814_FACE &
1783 C_028814_POLY_OFFSET_FRONT_ENABLE &
1784 C_028814_POLY_OFFSET_BACK_ENABLE &
1785 C_028814_POLY_OFFSET_PARA_ENABLE;
1786
1787 pa_su_sc_mode_cntl |= S_028814_CULL_FRONT(!!(d->cull_mode & VK_CULL_MODE_FRONT_BIT)) |
1788 S_028814_CULL_BACK(!!(d->cull_mode & VK_CULL_MODE_BACK_BIT)) |
1789 S_028814_FACE(d->front_face) |
1790 S_028814_POLY_OFFSET_FRONT_ENABLE(d->depth_bias_enable) |
1791 S_028814_POLY_OFFSET_BACK_ENABLE(d->depth_bias_enable) |
1792 S_028814_POLY_OFFSET_PARA_ENABLE(d->depth_bias_enable);
1793 return pa_su_sc_mode_cntl;
1794 }
1795
1796 static void
radv_emit_culling(struct radv_cmd_buffer * cmd_buffer,uint64_t states)1797 radv_emit_culling(struct radv_cmd_buffer *cmd_buffer, uint64_t states)
1798 {
1799 unsigned pa_su_sc_mode_cntl = radv_get_pa_su_sc_mode_cntl(cmd_buffer);
1800
1801 radeon_set_context_reg(cmd_buffer->cs, R_028814_PA_SU_SC_MODE_CNTL, pa_su_sc_mode_cntl);
1802 }
1803
1804 static void
radv_emit_primitive_topology(struct radv_cmd_buffer * cmd_buffer)1805 radv_emit_primitive_topology(struct radv_cmd_buffer *cmd_buffer)
1806 {
1807 struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
1808
1809 assert(!cmd_buffer->state.mesh_shading);
1810
1811 if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX7) {
1812 radeon_set_uconfig_reg_idx(cmd_buffer->device->physical_device, cmd_buffer->cs,
1813 R_030908_VGT_PRIMITIVE_TYPE, 1, d->primitive_topology);
1814 } else {
1815 radeon_set_config_reg(cmd_buffer->cs, R_008958_VGT_PRIMITIVE_TYPE, d->primitive_topology);
1816 }
1817 }
1818
static void
radv_emit_depth_control(struct radv_cmd_buffer *cmd_buffer, uint64_t states)
{
   /* Re-emit DB_DEPTH_CONTROL: take the pipeline's static value, clear every
    * field that dynamic state controls, then re-apply the dynamic values.
    * NOTE(review): 'states' is unused here — kept for handler-signature
    * uniformity, presumably; confirm against the dispatch site.
    */
   unsigned db_depth_control = cmd_buffer->state.graphics_pipeline->db_depth_control;
   struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;

   /* Clear the dynamically-controlled fields. */
   db_depth_control &= C_028800_Z_ENABLE &
                       C_028800_Z_WRITE_ENABLE &
                       C_028800_ZFUNC &
                       C_028800_DEPTH_BOUNDS_ENABLE &
                       C_028800_STENCIL_ENABLE &
                       C_028800_BACKFACE_ENABLE &
                       C_028800_STENCILFUNC &
                       C_028800_STENCILFUNC_BF;

   /* BACKFACE_ENABLE follows the stencil-test enable. */
   db_depth_control |= S_028800_Z_ENABLE(d->depth_test_enable ? 1 : 0) |
                       S_028800_Z_WRITE_ENABLE(d->depth_write_enable ? 1 : 0) |
                       S_028800_ZFUNC(d->depth_compare_op) |
                       S_028800_DEPTH_BOUNDS_ENABLE(d->depth_bounds_test_enable ? 1 : 0) |
                       S_028800_STENCIL_ENABLE(d->stencil_test_enable ? 1 : 0) |
                       S_028800_BACKFACE_ENABLE(d->stencil_test_enable ? 1 : 0) |
                       S_028800_STENCILFUNC(d->stencil_op.front.compare_op) |
                       S_028800_STENCILFUNC_BF(d->stencil_op.back.compare_op);

   radeon_set_context_reg(cmd_buffer->cs, R_028800_DB_DEPTH_CONTROL, db_depth_control);
}
1845
1846 static void
radv_emit_stencil_control(struct radv_cmd_buffer * cmd_buffer)1847 radv_emit_stencil_control(struct radv_cmd_buffer *cmd_buffer)
1848 {
1849 struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
1850
1851 radeon_set_context_reg(
1852 cmd_buffer->cs, R_02842C_DB_STENCIL_CONTROL,
1853 S_02842C_STENCILFAIL(si_translate_stencil_op(d->stencil_op.front.fail_op)) |
1854 S_02842C_STENCILZPASS(si_translate_stencil_op(d->stencil_op.front.pass_op)) |
1855 S_02842C_STENCILZFAIL(si_translate_stencil_op(d->stencil_op.front.depth_fail_op)) |
1856 S_02842C_STENCILFAIL_BF(si_translate_stencil_op(d->stencil_op.back.fail_op)) |
1857 S_02842C_STENCILZPASS_BF(si_translate_stencil_op(d->stencil_op.back.pass_op)) |
1858 S_02842C_STENCILZFAIL_BF(si_translate_stencil_op(d->stencil_op.back.depth_fail_op)));
1859 }
1860
static void
radv_emit_fragment_shading_rate(struct radv_cmd_buffer *cmd_buffer)
{
   /* Emit the per-draw VRS rate (GE_VRS_RATE) and the combiner configuration
    * (PA_CL_VRS_CNTL).  Only reached on GFX10.3+ (asserted below).
    */
   struct radv_graphics_pipeline *pipeline = cmd_buffer->state.graphics_pipeline;
   const struct radv_subpass *subpass = cmd_buffer->state.subpass;
   struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
   /* Hardware encodes the rate as log2, capped at 2x2 (so 0 or 1 per axis). */
   uint32_t rate_x = MIN2(2, d->fragment_shading_rate.size.width) - 1;
   uint32_t rate_y = MIN2(2, d->fragment_shading_rate.size.height) - 1;
   uint32_t pa_cl_vrs_cntl = pipeline->vrs.pa_cl_vrs_cntl;
   /* combiner_ops[0]: pipeline (primitive) combiner; [1]: attachment (HTILE). */
   uint32_t pipeline_comb_mode = d->fragment_shading_rate.combiner_ops[0];
   uint32_t htile_comb_mode = d->fragment_shading_rate.combiner_ops[1];

   assert(cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX10_3);

   if (subpass && !subpass->vrs_attachment) {
      /* When the current subpass has no VRS attachment, the VRS rates are expected to be 1x1, so we
       * can cheat by tweaking the different combiner modes.
       */
      switch (htile_comb_mode) {
      case VK_FRAGMENT_SHADING_RATE_COMBINER_OP_MIN_KHR:
         /* The result of min(A, 1x1) is always 1x1. */
         FALLTHROUGH;
      case VK_FRAGMENT_SHADING_RATE_COMBINER_OP_REPLACE_KHR:
         /* Force the per-draw VRS rate to 1x1. */
         rate_x = rate_y = 0;

         /* As the result of min(A, 1x1) or replace(A, 1x1) are always 1x1, set the vertex rate
          * combiner mode as passthrough.
          */
         pipeline_comb_mode = V_028848_VRS_COMB_MODE_PASSTHRU;
         break;
      case VK_FRAGMENT_SHADING_RATE_COMBINER_OP_MAX_KHR:
         /* The result of max(A, 1x1) is always A. */
         FALLTHROUGH;
      case VK_FRAGMENT_SHADING_RATE_COMBINER_OP_KEEP_KHR:
         /* Nothing to do here because the SAMPLE_ITER combiner mode should already be passthrough. */
         break;
      default:
         break;
      }
   }

   /* Emit per-draw VRS rate which is the first combiner. */
   radeon_set_uconfig_reg(cmd_buffer->cs, R_03098C_GE_VRS_RATE,
                          S_03098C_RATE_X(rate_x) | S_03098C_RATE_Y(rate_y));

   /* VERTEX_RATE_COMBINER_MODE controls the combiner mode between the
    * draw rate and the vertex rate.
    */
   if (cmd_buffer->state.mesh_shading) {
      /* Mesh shaders output the rate per primitive, not per vertex, so the
       * dynamic combiner applies to the primitive rate instead.
       */
      pa_cl_vrs_cntl |= S_028848_VERTEX_RATE_COMBINER_MODE(V_028848_VRS_COMB_MODE_PASSTHRU) |
                        S_028848_PRIMITIVE_RATE_COMBINER_MODE(pipeline_comb_mode);
   } else {
      pa_cl_vrs_cntl |= S_028848_VERTEX_RATE_COMBINER_MODE(pipeline_comb_mode) |
                        S_028848_PRIMITIVE_RATE_COMBINER_MODE(V_028848_VRS_COMB_MODE_PASSTHRU);
   }

   /* HTILE_RATE_COMBINER_MODE controls the combiner mode between the primitive rate and the HTILE
    * rate.
    */
   pa_cl_vrs_cntl |= S_028848_HTILE_RATE_COMBINER_MODE(htile_comb_mode);

   radeon_set_context_reg(cmd_buffer->cs, R_028848_PA_CL_VRS_CNTL, pa_cl_vrs_cntl);
}
1925
static void
radv_emit_primitive_restart_enable(struct radv_cmd_buffer *cmd_buffer)
{
   /* Program the primitive-restart enable.  The register kept the same
    * function across generations but moved register spaces: uconfig on
    * GFX9+ (renamed GE_* on GFX11), context space before that.
    */
   struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;

   if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX11) {
      radeon_set_uconfig_reg(cmd_buffer->cs, R_03092C_GE_MULTI_PRIM_IB_RESET_EN,
                             d->primitive_restart_enable);
   } else if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX9) {
      radeon_set_uconfig_reg(cmd_buffer->cs, R_03092C_VGT_MULTI_PRIM_IB_RESET_EN,
                             d->primitive_restart_enable);
   } else {
      radeon_set_context_reg(cmd_buffer->cs, R_028A94_VGT_MULTI_PRIM_IB_RESET_EN,
                             d->primitive_restart_enable);
   }
}
1942
1943 static void
radv_emit_rasterizer_discard_enable(struct radv_cmd_buffer * cmd_buffer)1944 radv_emit_rasterizer_discard_enable(struct radv_cmd_buffer *cmd_buffer)
1945 {
1946 unsigned pa_cl_clip_cntl = cmd_buffer->state.graphics_pipeline->pa_cl_clip_cntl;
1947 struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
1948
1949 pa_cl_clip_cntl &= C_028810_DX_RASTERIZATION_KILL;
1950 pa_cl_clip_cntl |= S_028810_DX_RASTERIZATION_KILL(d->rasterizer_discard_enable);
1951
1952 radeon_set_context_reg(cmd_buffer->cs, R_028810_PA_CL_CLIP_CNTL, pa_cl_clip_cntl);
1953 }
1954
1955 static void
radv_emit_logic_op(struct radv_cmd_buffer * cmd_buffer)1956 radv_emit_logic_op(struct radv_cmd_buffer *cmd_buffer)
1957 {
1958 unsigned cb_color_control = cmd_buffer->state.graphics_pipeline->cb_color_control;
1959 struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
1960
1961 cb_color_control &= C_028808_ROP3;
1962 cb_color_control |= S_028808_ROP3(d->logic_op);
1963
1964 radeon_set_context_reg(cmd_buffer->cs, R_028808_CB_COLOR_CONTROL, cb_color_control);
1965 }
1966
1967 static void
radv_emit_color_write_enable(struct radv_cmd_buffer * cmd_buffer)1968 radv_emit_color_write_enable(struct radv_cmd_buffer *cmd_buffer)
1969 {
1970 struct radv_graphics_pipeline *pipeline = cmd_buffer->state.graphics_pipeline;
1971 struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
1972
1973 radeon_set_context_reg(cmd_buffer->cs, R_028238_CB_TARGET_MASK,
1974 pipeline->cb_target_mask & d->color_write_enable);
1975 }
1976
/* Emit the CB_COLOR* register state for color attachment 'index', adjusting
 * the DCC/FMASK enables for the current layout, and mark the DCC metadata
 * dirty when drawing with DCC enabled.  Register layout differs per
 * generation, hence the four emission paths below.
 */
static void
radv_emit_fb_color_state(struct radv_cmd_buffer *cmd_buffer, int index,
                         struct radv_color_buffer_info *cb, struct radv_image_view *iview,
                         VkImageLayout layout, bool in_render_loop)
{
   bool is_vi = cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX8;
   uint32_t cb_fdcc_control = cb->cb_dcc_control;
   uint32_t cb_color_info = cb->cb_color_info;
   struct radv_image *image = iview->image;

   /* Disable DCC when the layout doesn't allow compressed access.  GFX11
    * carries the enable in FDCC_CONTROL, older chips in COLOR_INFO.
    */
   if (!radv_layout_dcc_compressed(
          cmd_buffer->device, image, iview->vk.base_mip_level, layout, in_render_loop,
          radv_image_queue_family_mask(image, cmd_buffer->qf,
                                       cmd_buffer->qf))) {
      if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX11) {
         cb_fdcc_control &= C_028C78_FDCC_ENABLE;
      } else {
         cb_color_info &= C_028C70_DCC_ENABLE;
      }
   }

   /* Likewise disable FMASK compression for incompatible layouts. */
   if (!radv_layout_fmask_compressed(
          cmd_buffer->device, image, layout,
          radv_image_queue_family_mask(image, cmd_buffer->qf,
                                       cmd_buffer->qf))) {
      cb_color_info &= C_028C70_COMPRESSION;
   }

   if (radv_image_is_tc_compat_cmask(image) && (radv_is_fmask_decompress_pipeline(cmd_buffer) ||
                                                radv_is_dcc_decompress_pipeline(cmd_buffer))) {
      /* If this bit is set, the FMASK decompression operation
       * doesn't occur (DCC_COMPRESS also implies FMASK_DECOMPRESS).
       */
      cb_color_info &= C_028C70_FMASK_COMPRESS_1FRAG_ONLY;
   }

   if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX11) {
      /* GFX11: VIEW/INFO/ATTRIB/FDCC_CONTROL are contiguous; the remaining
       * registers (base addresses, extended attribs) are scattered.
       */
      radeon_set_context_reg_seq(cmd_buffer->cs, R_028C6C_CB_COLOR0_VIEW + index * 0x3c, 4);
      radeon_emit(cmd_buffer->cs, cb->cb_color_view);   /* CB_COLOR0_VIEW */
      radeon_emit(cmd_buffer->cs, cb->cb_color_info);   /* CB_COLOR0_INFO */
      radeon_emit(cmd_buffer->cs, cb->cb_color_attrib); /* CB_COLOR0_ATTRIB */
      radeon_emit(cmd_buffer->cs, cb_fdcc_control);     /* CB_COLOR0_FDCC_CONTROL */

      radeon_set_context_reg(cmd_buffer->cs, R_028C60_CB_COLOR0_BASE + index * 0x3c, cb->cb_color_base);
      radeon_set_context_reg(cmd_buffer->cs, R_028E40_CB_COLOR0_BASE_EXT + index * 4, cb->cb_color_base >> 32);
      radeon_set_context_reg(cmd_buffer->cs, R_028C94_CB_COLOR0_DCC_BASE + index * 0x3c, cb->cb_dcc_base);
      radeon_set_context_reg(cmd_buffer->cs, R_028EA0_CB_COLOR0_DCC_BASE_EXT + index * 4, cb->cb_dcc_base >> 32);
      radeon_set_context_reg(cmd_buffer->cs, R_028EC0_CB_COLOR0_ATTRIB2 + index * 4, cb->cb_color_attrib2);
      radeon_set_context_reg(cmd_buffer->cs, R_028EE0_CB_COLOR0_ATTRIB3 + index * 4, cb->cb_color_attrib3);
   } else if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX10) {
      /* GFX10: high address bits live in separate *_EXT registers; the
       * slots that held them on GFX9 are written as 0.
       */
      radeon_set_context_reg_seq(cmd_buffer->cs, R_028C60_CB_COLOR0_BASE + index * 0x3c, 11);
      radeon_emit(cmd_buffer->cs, cb->cb_color_base);
      radeon_emit(cmd_buffer->cs, 0);
      radeon_emit(cmd_buffer->cs, 0);
      radeon_emit(cmd_buffer->cs, cb->cb_color_view);
      radeon_emit(cmd_buffer->cs, cb_color_info);
      radeon_emit(cmd_buffer->cs, cb->cb_color_attrib);
      radeon_emit(cmd_buffer->cs, cb->cb_dcc_control);
      radeon_emit(cmd_buffer->cs, cb->cb_color_cmask);
      radeon_emit(cmd_buffer->cs, 0);
      radeon_emit(cmd_buffer->cs, cb->cb_color_fmask);
      radeon_emit(cmd_buffer->cs, 0);

      radeon_set_context_reg(cmd_buffer->cs, R_028C94_CB_COLOR0_DCC_BASE + index * 0x3c, cb->cb_dcc_base);

      radeon_set_context_reg(cmd_buffer->cs, R_028E40_CB_COLOR0_BASE_EXT + index * 4,
                             cb->cb_color_base >> 32);
      radeon_set_context_reg(cmd_buffer->cs, R_028E60_CB_COLOR0_CMASK_BASE_EXT + index * 4,
                             cb->cb_color_cmask >> 32);
      radeon_set_context_reg(cmd_buffer->cs, R_028E80_CB_COLOR0_FMASK_BASE_EXT + index * 4,
                             cb->cb_color_fmask >> 32);
      radeon_set_context_reg(cmd_buffer->cs, R_028EA0_CB_COLOR0_DCC_BASE_EXT + index * 4,
                             cb->cb_dcc_base >> 32);
      radeon_set_context_reg(cmd_buffer->cs, R_028EC0_CB_COLOR0_ATTRIB2 + index * 4,
                             cb->cb_color_attrib2);
      radeon_set_context_reg(cmd_buffer->cs, R_028EE0_CB_COLOR0_ATTRIB3 + index * 4,
                             cb->cb_color_attrib3);
   } else if (cmd_buffer->device->physical_device->rad_info.gfx_level == GFX9) {
      /* GFX9: high address bits are packed into BASE_EXT fields interleaved
       * with the low dwords in the same 11-register run.
       */
      radeon_set_context_reg_seq(cmd_buffer->cs, R_028C60_CB_COLOR0_BASE + index * 0x3c, 11);
      radeon_emit(cmd_buffer->cs, cb->cb_color_base);
      radeon_emit(cmd_buffer->cs, S_028C64_BASE_256B(cb->cb_color_base >> 32));
      radeon_emit(cmd_buffer->cs, cb->cb_color_attrib2);
      radeon_emit(cmd_buffer->cs, cb->cb_color_view);
      radeon_emit(cmd_buffer->cs, cb_color_info);
      radeon_emit(cmd_buffer->cs, cb->cb_color_attrib);
      radeon_emit(cmd_buffer->cs, cb->cb_dcc_control);
      radeon_emit(cmd_buffer->cs, cb->cb_color_cmask);
      radeon_emit(cmd_buffer->cs, S_028C80_BASE_256B(cb->cb_color_cmask >> 32));
      radeon_emit(cmd_buffer->cs, cb->cb_color_fmask);
      radeon_emit(cmd_buffer->cs, S_028C88_BASE_256B(cb->cb_color_fmask >> 32));

      radeon_set_context_reg_seq(cmd_buffer->cs, R_028C94_CB_COLOR0_DCC_BASE + index * 0x3c, 2);
      radeon_emit(cmd_buffer->cs, cb->cb_dcc_base);
      radeon_emit(cmd_buffer->cs, S_028C98_BASE_256B(cb->cb_dcc_base >> 32));

      radeon_set_context_reg(cmd_buffer->cs, R_0287A0_CB_MRT0_EPITCH + index * 4,
                             cb->cb_mrt_epitch);
   } else {
      /* GFX6-8: 32-bit addresses only; pitch/slice registers instead of the
       * GFX9+ attrib layout.
       */
      radeon_set_context_reg_seq(cmd_buffer->cs, R_028C60_CB_COLOR0_BASE + index * 0x3c, 11);
      radeon_emit(cmd_buffer->cs, cb->cb_color_base);
      radeon_emit(cmd_buffer->cs, cb->cb_color_pitch);
      radeon_emit(cmd_buffer->cs, cb->cb_color_slice);
      radeon_emit(cmd_buffer->cs, cb->cb_color_view);
      radeon_emit(cmd_buffer->cs, cb_color_info);
      radeon_emit(cmd_buffer->cs, cb->cb_color_attrib);
      radeon_emit(cmd_buffer->cs, cb->cb_dcc_control);
      radeon_emit(cmd_buffer->cs, cb->cb_color_cmask);
      radeon_emit(cmd_buffer->cs, cb->cb_color_cmask_slice);
      radeon_emit(cmd_buffer->cs, cb->cb_color_fmask);
      radeon_emit(cmd_buffer->cs, cb->cb_color_fmask_slice);

      if (is_vi) { /* DCC BASE */
         radeon_set_context_reg(cmd_buffer->cs, R_028C94_CB_COLOR0_DCC_BASE + index * 0x3c,
                                cb->cb_dcc_base);
      }
   }

   if (G_028C70_DCC_ENABLE(cb_color_info)) {
      /* Drawing with DCC enabled also compresses colorbuffers. */
      VkImageSubresourceRange range = {
         .aspectMask = iview->vk.aspects,
         .baseMipLevel = iview->vk.base_mip_level,
         .levelCount = iview->vk.level_count,
         .baseArrayLayer = iview->vk.base_array_layer,
         .layerCount = iview->vk.layer_count,
      };

      radv_update_dcc_metadata(cmd_buffer, image, &range, true);
   }
}
2107
/* Re-emit DB_Z_INFO with the correct ZRANGE_PRECISION value to work around
 * the TC-compat zrange hardware bug.  No-op on chips without the bug or for
 * non-TC-compat-HTILE images.  When 'requires_cond_exec' is set, the write
 * is guarded by a COND_EXEC on the image's zrange metadata so it only takes
 * effect when the last fast clear requires it.
 */
static void
radv_update_zrange_precision(struct radv_cmd_buffer *cmd_buffer, struct radv_ds_buffer_info *ds,
                             const struct radv_image_view *iview, VkImageLayout layout,
                             bool in_render_loop, bool requires_cond_exec)
{
   const struct radv_image *image = iview->image;
   uint32_t db_z_info = ds->db_z_info;
   uint32_t db_z_info_reg;

   if (!cmd_buffer->device->physical_device->rad_info.has_tc_compat_zrange_bug ||
       !radv_image_is_tc_compat_htile(image))
      return;

   /* Mirror the TILE_SURFACE_ENABLE adjustment done when the DS state was
    * emitted, so the re-emitted DB_Z_INFO matches.
    */
   if (!radv_layout_is_htile_compressed(
          cmd_buffer->device, image, layout, in_render_loop,
          radv_image_queue_family_mask(image, cmd_buffer->qf,
                                       cmd_buffer->qf))) {
      db_z_info &= C_028040_TILE_SURFACE_ENABLE;
   }

   db_z_info &= C_028040_ZRANGE_PRECISION;

   /* DB_Z_INFO sits at a different offset on GFX9. */
   if (cmd_buffer->device->physical_device->rad_info.gfx_level == GFX9) {
      db_z_info_reg = R_028038_DB_Z_INFO;
   } else {
      db_z_info_reg = R_028040_DB_Z_INFO;
   }

   /* When we don't know the last fast clear value we need to emit a
    * conditional packet that will eventually skip the following
    * SET_CONTEXT_REG packet.
    */
   if (requires_cond_exec) {
      uint64_t va = radv_get_tc_compat_zrange_va(image, iview->vk.base_mip_level);

      /* COND_EXEC: skip the next 3 dwords (the SET_CONTEXT_REG below) when
       * the dword at 'va' is zero.
       */
      radeon_emit(cmd_buffer->cs, PKT3(PKT3_COND_EXEC, 3, 0));
      radeon_emit(cmd_buffer->cs, va);
      radeon_emit(cmd_buffer->cs, va >> 32);
      radeon_emit(cmd_buffer->cs, 0);
      radeon_emit(cmd_buffer->cs, 3); /* SET_CONTEXT_REG size */
   }

   radeon_set_context_reg(cmd_buffer->cs, db_z_info_reg, db_z_info);
}
2152
/* Emit the depth/stencil buffer register state (DB_*), adjusting the HTILE
 * enables for the current layout.  Like the color path, the register layout
 * differs per generation (GFX10+/GFX9/GFX6-8).
 */
static void
radv_emit_fb_ds_state(struct radv_cmd_buffer *cmd_buffer, struct radv_ds_buffer_info *ds,
                      struct radv_image_view *iview, VkImageLayout layout, bool in_render_loop)
{
   const struct radv_image *image = iview->image;
   uint32_t db_z_info = ds->db_z_info;
   uint32_t db_stencil_info = ds->db_stencil_info;
   uint32_t db_htile_surface = ds->db_htile_surface;

   /* Disable HTILE access when the layout doesn't allow compressed access. */
   if (!radv_layout_is_htile_compressed(
          cmd_buffer->device, image, layout, in_render_loop,
          radv_image_queue_family_mask(image, cmd_buffer->qf,
                                       cmd_buffer->qf))) {
      db_z_info &= C_028040_TILE_SURFACE_ENABLE;
      db_stencil_info |= S_028044_TILE_STENCIL_DISABLE(1);
   }

   /* Without a VRS attachment, HTILE must not use the VRS encoding. */
   if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX10_3 &&
       !cmd_buffer->state.subpass->vrs_attachment) {
      db_htile_surface &= C_028ABC_VRS_HTILE_ENCODING;
   }

   radeon_set_context_reg(cmd_buffer->cs, R_028008_DB_DEPTH_VIEW, ds->db_depth_view);
   radeon_set_context_reg(cmd_buffer->cs, R_028ABC_DB_HTILE_SURFACE, db_htile_surface);

   if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX10) {
      radeon_set_context_reg(cmd_buffer->cs, R_028014_DB_HTILE_DATA_BASE, ds->db_htile_data_base);
      radeon_set_context_reg(cmd_buffer->cs, R_02801C_DB_DEPTH_SIZE_XY, ds->db_depth_size);

      /* GFX11 dropped DB_DEPTH_INFO; on GFX10 it is emitted first with
       * RESOURCE_LEVEL set, then the same 6 registers follow on both.
       */
      if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX11) {
         radeon_set_context_reg_seq(cmd_buffer->cs, R_028040_DB_Z_INFO, 6);
      } else {
         radeon_set_context_reg_seq(cmd_buffer->cs, R_02803C_DB_DEPTH_INFO, 7);
         radeon_emit(cmd_buffer->cs, S_02803C_RESOURCE_LEVEL(1));
      }
      /* NOTE(review): write base registers receive the read base values —
       * presumably read/write bases are identical here; confirm against how
       * radv_ds_buffer_info is initialized.
       */
      radeon_emit(cmd_buffer->cs, db_z_info);
      radeon_emit(cmd_buffer->cs, db_stencil_info);
      radeon_emit(cmd_buffer->cs, ds->db_z_read_base);
      radeon_emit(cmd_buffer->cs, ds->db_stencil_read_base);
      radeon_emit(cmd_buffer->cs, ds->db_z_read_base);
      radeon_emit(cmd_buffer->cs, ds->db_stencil_read_base);

      /* High 32 bits of the Z/stencil/HTILE addresses. */
      radeon_set_context_reg_seq(cmd_buffer->cs, R_028068_DB_Z_READ_BASE_HI, 5);
      radeon_emit(cmd_buffer->cs, ds->db_z_read_base >> 32);
      radeon_emit(cmd_buffer->cs, ds->db_stencil_read_base >> 32);
      radeon_emit(cmd_buffer->cs, ds->db_z_read_base >> 32);
      radeon_emit(cmd_buffer->cs, ds->db_stencil_read_base >> 32);
      radeon_emit(cmd_buffer->cs, ds->db_htile_data_base >> 32);
   } else if (cmd_buffer->device->physical_device->rad_info.gfx_level == GFX9) {
      radeon_set_context_reg_seq(cmd_buffer->cs, R_028014_DB_HTILE_DATA_BASE, 3);
      radeon_emit(cmd_buffer->cs, ds->db_htile_data_base);
      radeon_emit(cmd_buffer->cs, S_028018_BASE_HI(ds->db_htile_data_base >> 32));
      radeon_emit(cmd_buffer->cs, ds->db_depth_size);

      radeon_set_context_reg_seq(cmd_buffer->cs, R_028038_DB_Z_INFO, 10);
      radeon_emit(cmd_buffer->cs, db_z_info);                                   /* DB_Z_INFO */
      radeon_emit(cmd_buffer->cs, db_stencil_info);                             /* DB_STENCIL_INFO */
      radeon_emit(cmd_buffer->cs, ds->db_z_read_base);                          /* DB_Z_READ_BASE */
      radeon_emit(cmd_buffer->cs,
                  S_028044_BASE_HI(ds->db_z_read_base >> 32));                  /* DB_Z_READ_BASE_HI */
      radeon_emit(cmd_buffer->cs, ds->db_stencil_read_base);                    /* DB_STENCIL_READ_BASE */
      radeon_emit(cmd_buffer->cs,
                  S_02804C_BASE_HI(ds->db_stencil_read_base >> 32));            /* DB_STENCIL_READ_BASE_HI */
      radeon_emit(cmd_buffer->cs, ds->db_z_write_base);                         /* DB_Z_WRITE_BASE */
      radeon_emit(cmd_buffer->cs,
                  S_028054_BASE_HI(ds->db_z_write_base >> 32));                 /* DB_Z_WRITE_BASE_HI */
      radeon_emit(cmd_buffer->cs, ds->db_stencil_write_base);                   /* DB_STENCIL_WRITE_BASE */
      radeon_emit(cmd_buffer->cs,
                  S_02805C_BASE_HI(ds->db_stencil_write_base >> 32));           /* DB_STENCIL_WRITE_BASE_HI */

      radeon_set_context_reg_seq(cmd_buffer->cs, R_028068_DB_Z_INFO2, 2);
      radeon_emit(cmd_buffer->cs, ds->db_z_info2);
      radeon_emit(cmd_buffer->cs, ds->db_stencil_info2);
   } else {
      /* GFX6-8: 32-bit addresses only. */
      radeon_set_context_reg(cmd_buffer->cs, R_028014_DB_HTILE_DATA_BASE, ds->db_htile_data_base);

      radeon_set_context_reg_seq(cmd_buffer->cs, R_02803C_DB_DEPTH_INFO, 9);
      radeon_emit(cmd_buffer->cs, ds->db_depth_info);         /* R_02803C_DB_DEPTH_INFO */
      radeon_emit(cmd_buffer->cs, db_z_info);                 /* R_028040_DB_Z_INFO */
      radeon_emit(cmd_buffer->cs, db_stencil_info);           /* R_028044_DB_STENCIL_INFO */
      radeon_emit(cmd_buffer->cs, ds->db_z_read_base);        /* R_028048_DB_Z_READ_BASE */
      radeon_emit(cmd_buffer->cs, ds->db_stencil_read_base);  /* R_02804C_DB_STENCIL_READ_BASE */
      radeon_emit(cmd_buffer->cs, ds->db_z_write_base);       /* R_028050_DB_Z_WRITE_BASE */
      radeon_emit(cmd_buffer->cs, ds->db_stencil_write_base); /* R_028054_DB_STENCIL_WRITE_BASE */
      radeon_emit(cmd_buffer->cs, ds->db_depth_size);         /* R_028058_DB_DEPTH_SIZE */
      radeon_emit(cmd_buffer->cs, ds->db_depth_slice);        /* R_02805C_DB_DEPTH_SLICE */
   }

   /* Update the ZRANGE_PRECISION value for the TC-compat bug. */
   radv_update_zrange_precision(cmd_buffer, ds, iview, layout, in_render_loop, true);

   radeon_set_context_reg(cmd_buffer->cs, R_028B78_PA_SU_POLY_OFFSET_DB_FMT_CNTL,
                          ds->pa_su_poly_offset_db_fmt_cntl);
}
2247
2248 /**
2249 * Update the fast clear depth/stencil values if the image is bound as a
2250 * depth/stencil buffer.
2251 */
2252 static void
radv_update_bound_fast_clear_ds(struct radv_cmd_buffer * cmd_buffer,const struct radv_image_view * iview,VkClearDepthStencilValue ds_clear_value,VkImageAspectFlags aspects)2253 radv_update_bound_fast_clear_ds(struct radv_cmd_buffer *cmd_buffer,
2254 const struct radv_image_view *iview,
2255 VkClearDepthStencilValue ds_clear_value, VkImageAspectFlags aspects)
2256 {
2257 const struct radv_subpass *subpass = cmd_buffer->state.subpass;
2258 const struct radv_image *image = iview->image;
2259 struct radeon_cmdbuf *cs = cmd_buffer->cs;
2260 uint32_t att_idx;
2261
2262 if (!cmd_buffer->state.attachments || !subpass)
2263 return;
2264
2265 if (!subpass->depth_stencil_attachment)
2266 return;
2267
2268 att_idx = subpass->depth_stencil_attachment->attachment;
2269 if (cmd_buffer->state.attachments[att_idx].iview->image != image)
2270 return;
2271
2272 if (aspects == (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT)) {
2273 radeon_set_context_reg_seq(cs, R_028028_DB_STENCIL_CLEAR, 2);
2274 radeon_emit(cs, ds_clear_value.stencil);
2275 radeon_emit(cs, fui(ds_clear_value.depth));
2276 } else if (aspects == VK_IMAGE_ASPECT_DEPTH_BIT) {
2277 radeon_set_context_reg(cs, R_02802C_DB_DEPTH_CLEAR, fui(ds_clear_value.depth));
2278 } else {
2279 assert(aspects == VK_IMAGE_ASPECT_STENCIL_BIT);
2280 radeon_set_context_reg(cs, R_028028_DB_STENCIL_CLEAR, ds_clear_value.stencil);
2281 }
2282
2283 /* Update the ZRANGE_PRECISION value for the TC-compat bug. This is
2284 * only needed when clearing Z to 0.0.
2285 */
2286 if ((aspects & VK_IMAGE_ASPECT_DEPTH_BIT) && ds_clear_value.depth == 0.0) {
2287 VkImageLayout layout = subpass->depth_stencil_attachment->layout;
2288 bool in_render_loop = subpass->depth_stencil_attachment->in_render_loop;
2289
2290 radv_update_zrange_precision(cmd_buffer, &cmd_buffer->state.attachments[att_idx].ds, iview,
2291 layout, in_render_loop, false);
2292 }
2293
2294 cmd_buffer->state.context_roll_without_scissor_emitted = true;
2295 }
2296
2297 /**
2298 * Set the clear depth/stencil values to the image's metadata.
2299 */
2300 static void
radv_set_ds_clear_metadata(struct radv_cmd_buffer * cmd_buffer,struct radv_image * image,const VkImageSubresourceRange * range,VkClearDepthStencilValue ds_clear_value,VkImageAspectFlags aspects)2301 radv_set_ds_clear_metadata(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image,
2302 const VkImageSubresourceRange *range,
2303 VkClearDepthStencilValue ds_clear_value, VkImageAspectFlags aspects)
2304 {
2305 struct radeon_cmdbuf *cs = cmd_buffer->cs;
2306 uint32_t level_count = radv_get_levelCount(image, range);
2307
2308 if (aspects == (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT)) {
2309 uint64_t va = radv_get_ds_clear_value_va(image, range->baseMipLevel);
2310
2311 /* Use the fastest way when both aspects are used. */
2312 radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 2 + 2 * level_count, cmd_buffer->state.predicating));
2313 radeon_emit(cs, S_370_DST_SEL(V_370_MEM) | S_370_WR_CONFIRM(1) | S_370_ENGINE_SEL(V_370_PFP));
2314 radeon_emit(cs, va);
2315 radeon_emit(cs, va >> 32);
2316
2317 for (uint32_t l = 0; l < level_count; l++) {
2318 radeon_emit(cs, ds_clear_value.stencil);
2319 radeon_emit(cs, fui(ds_clear_value.depth));
2320 }
2321 } else {
2322 /* Otherwise we need one WRITE_DATA packet per level. */
2323 for (uint32_t l = 0; l < level_count; l++) {
2324 uint64_t va = radv_get_ds_clear_value_va(image, range->baseMipLevel + l);
2325 unsigned value;
2326
2327 if (aspects == VK_IMAGE_ASPECT_DEPTH_BIT) {
2328 value = fui(ds_clear_value.depth);
2329 va += 4;
2330 } else {
2331 assert(aspects == VK_IMAGE_ASPECT_STENCIL_BIT);
2332 value = ds_clear_value.stencil;
2333 }
2334
2335 radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 3, cmd_buffer->state.predicating));
2336 radeon_emit(cs,
2337 S_370_DST_SEL(V_370_MEM) | S_370_WR_CONFIRM(1) | S_370_ENGINE_SEL(V_370_PFP));
2338 radeon_emit(cs, va);
2339 radeon_emit(cs, va >> 32);
2340 radeon_emit(cs, value);
2341 }
2342 }
2343 }
2344
2345 /**
2346 * Update the TC-compat metadata value for this image.
2347 */
2348 static void
radv_set_tc_compat_zrange_metadata(struct radv_cmd_buffer * cmd_buffer,struct radv_image * image,const VkImageSubresourceRange * range,uint32_t value)2349 radv_set_tc_compat_zrange_metadata(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image,
2350 const VkImageSubresourceRange *range, uint32_t value)
2351 {
2352 struct radeon_cmdbuf *cs = cmd_buffer->cs;
2353
2354 if (!cmd_buffer->device->physical_device->rad_info.has_tc_compat_zrange_bug)
2355 return;
2356
2357 uint64_t va = radv_get_tc_compat_zrange_va(image, range->baseMipLevel);
2358 uint32_t level_count = radv_get_levelCount(image, range);
2359
2360 radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 2 + level_count, cmd_buffer->state.predicating));
2361 radeon_emit(cs, S_370_DST_SEL(V_370_MEM) | S_370_WR_CONFIRM(1) | S_370_ENGINE_SEL(V_370_PFP));
2362 radeon_emit(cs, va);
2363 radeon_emit(cs, va >> 32);
2364
2365 for (uint32_t l = 0; l < level_count; l++)
2366 radeon_emit(cs, value);
2367 }
2368
2369 static void
radv_update_tc_compat_zrange_metadata(struct radv_cmd_buffer * cmd_buffer,const struct radv_image_view * iview,VkClearDepthStencilValue ds_clear_value)2370 radv_update_tc_compat_zrange_metadata(struct radv_cmd_buffer *cmd_buffer,
2371 const struct radv_image_view *iview,
2372 VkClearDepthStencilValue ds_clear_value)
2373 {
2374 VkImageSubresourceRange range = {
2375 .aspectMask = iview->vk.aspects,
2376 .baseMipLevel = iview->vk.base_mip_level,
2377 .levelCount = iview->vk.level_count,
2378 .baseArrayLayer = iview->vk.base_array_layer,
2379 .layerCount = iview->vk.layer_count,
2380 };
2381 uint32_t cond_val;
2382
2383 /* Conditionally set DB_Z_INFO.ZRANGE_PRECISION to 0 when the last
2384 * depth clear value is 0.0f.
2385 */
2386 cond_val = ds_clear_value.depth == 0.0f ? UINT_MAX : 0;
2387
2388 radv_set_tc_compat_zrange_metadata(cmd_buffer, iview->image, &range, cond_val);
2389 }
2390
2391 /**
2392 * Update the clear depth/stencil values for this image.
2393 */
2394 void
radv_update_ds_clear_metadata(struct radv_cmd_buffer * cmd_buffer,const struct radv_image_view * iview,VkClearDepthStencilValue ds_clear_value,VkImageAspectFlags aspects)2395 radv_update_ds_clear_metadata(struct radv_cmd_buffer *cmd_buffer,
2396 const struct radv_image_view *iview,
2397 VkClearDepthStencilValue ds_clear_value, VkImageAspectFlags aspects)
2398 {
2399 VkImageSubresourceRange range = {
2400 .aspectMask = iview->vk.aspects,
2401 .baseMipLevel = iview->vk.base_mip_level,
2402 .levelCount = iview->vk.level_count,
2403 .baseArrayLayer = iview->vk.base_array_layer,
2404 .layerCount = iview->vk.layer_count,
2405 };
2406 struct radv_image *image = iview->image;
2407
2408 assert(radv_htile_enabled(image, range.baseMipLevel));
2409
2410 radv_set_ds_clear_metadata(cmd_buffer, iview->image, &range, ds_clear_value, aspects);
2411
2412 if (radv_image_is_tc_compat_htile(image) && (aspects & VK_IMAGE_ASPECT_DEPTH_BIT)) {
2413 radv_update_tc_compat_zrange_metadata(cmd_buffer, iview, ds_clear_value);
2414 }
2415
2416 radv_update_bound_fast_clear_ds(cmd_buffer, iview, ds_clear_value, aspects);
2417 }
2418
2419 /**
2420 * Load the clear depth/stencil values from the image's metadata.
2421 */
2422 static void
radv_load_ds_clear_metadata(struct radv_cmd_buffer * cmd_buffer,const struct radv_image_view * iview)2423 radv_load_ds_clear_metadata(struct radv_cmd_buffer *cmd_buffer, const struct radv_image_view *iview)
2424 {
2425 struct radeon_cmdbuf *cs = cmd_buffer->cs;
2426 const struct radv_image *image = iview->image;
2427 VkImageAspectFlags aspects = vk_format_aspects(image->vk.format);
2428 uint64_t va = radv_get_ds_clear_value_va(image, iview->vk.base_mip_level);
2429 unsigned reg_offset = 0, reg_count = 0;
2430
2431 assert(radv_image_has_htile(image));
2432
2433 if (aspects & VK_IMAGE_ASPECT_STENCIL_BIT) {
2434 ++reg_count;
2435 } else {
2436 ++reg_offset;
2437 va += 4;
2438 }
2439 if (aspects & VK_IMAGE_ASPECT_DEPTH_BIT)
2440 ++reg_count;
2441
2442 uint32_t reg = R_028028_DB_STENCIL_CLEAR + 4 * reg_offset;
2443
2444 if (cmd_buffer->device->physical_device->rad_info.has_load_ctx_reg_pkt) {
2445 radeon_emit(cs, PKT3(PKT3_LOAD_CONTEXT_REG_INDEX, 3, 0));
2446 radeon_emit(cs, va);
2447 radeon_emit(cs, va >> 32);
2448 radeon_emit(cs, (reg - SI_CONTEXT_REG_OFFSET) >> 2);
2449 radeon_emit(cs, reg_count);
2450 } else {
2451 radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
2452 radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_SRC_MEM) | COPY_DATA_DST_SEL(COPY_DATA_REG) |
2453 (reg_count == 2 ? COPY_DATA_COUNT_SEL : 0));
2454 radeon_emit(cs, va);
2455 radeon_emit(cs, va >> 32);
2456 radeon_emit(cs, reg >> 2);
2457 radeon_emit(cs, 0);
2458
2459 radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0));
2460 radeon_emit(cs, 0);
2461 }
2462 }
2463
2464 /*
2465 * With DCC some colors don't require CMASK elimination before being
2466 * used as a texture. This sets a predicate value to determine if the
2467 * cmask eliminate is required.
2468 */
2469 void
radv_update_fce_metadata(struct radv_cmd_buffer * cmd_buffer,struct radv_image * image,const VkImageSubresourceRange * range,bool value)2470 radv_update_fce_metadata(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image,
2471 const VkImageSubresourceRange *range, bool value)
2472 {
2473 if (!image->fce_pred_offset)
2474 return;
2475
2476 uint64_t pred_val = value;
2477 uint64_t va = radv_image_get_fce_pred_va(image, range->baseMipLevel);
2478 uint32_t level_count = radv_get_levelCount(image, range);
2479 uint32_t count = 2 * level_count;
2480
2481 radeon_emit(cmd_buffer->cs, PKT3(PKT3_WRITE_DATA, 2 + count, 0));
2482 radeon_emit(cmd_buffer->cs,
2483 S_370_DST_SEL(V_370_MEM) | S_370_WR_CONFIRM(1) | S_370_ENGINE_SEL(V_370_PFP));
2484 radeon_emit(cmd_buffer->cs, va);
2485 radeon_emit(cmd_buffer->cs, va >> 32);
2486
2487 for (uint32_t l = 0; l < level_count; l++) {
2488 radeon_emit(cmd_buffer->cs, pred_val);
2489 radeon_emit(cmd_buffer->cs, pred_val >> 32);
2490 }
2491 }
2492
2493 /**
2494 * Update the DCC predicate to reflect the compression state.
2495 */
2496 void
radv_update_dcc_metadata(struct radv_cmd_buffer * cmd_buffer,struct radv_image * image,const VkImageSubresourceRange * range,bool value)2497 radv_update_dcc_metadata(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image,
2498 const VkImageSubresourceRange *range, bool value)
2499 {
2500 if (image->dcc_pred_offset == 0)
2501 return;
2502
2503 uint64_t pred_val = value;
2504 uint64_t va = radv_image_get_dcc_pred_va(image, range->baseMipLevel);
2505 uint32_t level_count = radv_get_levelCount(image, range);
2506 uint32_t count = 2 * level_count;
2507
2508 assert(radv_dcc_enabled(image, range->baseMipLevel));
2509
2510 radeon_emit(cmd_buffer->cs, PKT3(PKT3_WRITE_DATA, 2 + count, 0));
2511 radeon_emit(cmd_buffer->cs,
2512 S_370_DST_SEL(V_370_MEM) | S_370_WR_CONFIRM(1) | S_370_ENGINE_SEL(V_370_PFP));
2513 radeon_emit(cmd_buffer->cs, va);
2514 radeon_emit(cmd_buffer->cs, va >> 32);
2515
2516 for (uint32_t l = 0; l < level_count; l++) {
2517 radeon_emit(cmd_buffer->cs, pred_val);
2518 radeon_emit(cmd_buffer->cs, pred_val >> 32);
2519 }
2520 }
2521
2522 /**
2523 * Update the fast clear color values if the image is bound as a color buffer.
2524 */
2525 static void
radv_update_bound_fast_clear_color(struct radv_cmd_buffer * cmd_buffer,struct radv_image * image,int cb_idx,uint32_t color_values[2])2526 radv_update_bound_fast_clear_color(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image,
2527 int cb_idx, uint32_t color_values[2])
2528 {
2529 const struct radv_subpass *subpass = cmd_buffer->state.subpass;
2530 struct radeon_cmdbuf *cs = cmd_buffer->cs;
2531 uint32_t att_idx;
2532
2533 if (!cmd_buffer->state.attachments || !subpass)
2534 return;
2535
2536 att_idx = subpass->color_attachments[cb_idx].attachment;
2537 if (att_idx == VK_ATTACHMENT_UNUSED)
2538 return;
2539
2540 if (cmd_buffer->state.attachments[att_idx].iview->image != image)
2541 return;
2542
2543 radeon_set_context_reg_seq(cs, R_028C8C_CB_COLOR0_CLEAR_WORD0 + cb_idx * 0x3c, 2);
2544 radeon_emit(cs, color_values[0]);
2545 radeon_emit(cs, color_values[1]);
2546
2547 cmd_buffer->state.context_roll_without_scissor_emitted = true;
2548 }
2549
2550 /**
2551 * Set the clear color values to the image's metadata.
2552 */
2553 static void
radv_set_color_clear_metadata(struct radv_cmd_buffer * cmd_buffer,struct radv_image * image,const VkImageSubresourceRange * range,uint32_t color_values[2])2554 radv_set_color_clear_metadata(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image,
2555 const VkImageSubresourceRange *range, uint32_t color_values[2])
2556 {
2557 struct radeon_cmdbuf *cs = cmd_buffer->cs;
2558 uint32_t level_count = radv_get_levelCount(image, range);
2559 uint32_t count = 2 * level_count;
2560
2561 assert(radv_image_has_cmask(image) || radv_dcc_enabled(image, range->baseMipLevel));
2562
2563 if (radv_image_has_clear_value(image)) {
2564 uint64_t va = radv_image_get_fast_clear_va(image, range->baseMipLevel);
2565
2566 radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 2 + count, cmd_buffer->state.predicating));
2567 radeon_emit(cs, S_370_DST_SEL(V_370_MEM) | S_370_WR_CONFIRM(1) | S_370_ENGINE_SEL(V_370_PFP));
2568 radeon_emit(cs, va);
2569 radeon_emit(cs, va >> 32);
2570
2571 for (uint32_t l = 0; l < level_count; l++) {
2572 radeon_emit(cs, color_values[0]);
2573 radeon_emit(cs, color_values[1]);
2574 }
2575 } else {
2576 /* Some default value we can set in the update. */
2577 assert(color_values[0] == 0 && color_values[1] == 0);
2578 }
2579 }
2580
2581 /**
2582 * Update the clear color values for this image.
2583 */
2584 void
radv_update_color_clear_metadata(struct radv_cmd_buffer * cmd_buffer,const struct radv_image_view * iview,int cb_idx,uint32_t color_values[2])2585 radv_update_color_clear_metadata(struct radv_cmd_buffer *cmd_buffer,
2586 const struct radv_image_view *iview, int cb_idx,
2587 uint32_t color_values[2])
2588 {
2589 struct radv_image *image = iview->image;
2590 VkImageSubresourceRange range = {
2591 .aspectMask = iview->vk.aspects,
2592 .baseMipLevel = iview->vk.base_mip_level,
2593 .levelCount = iview->vk.level_count,
2594 .baseArrayLayer = iview->vk.base_array_layer,
2595 .layerCount = iview->vk.layer_count,
2596 };
2597
2598 assert(radv_image_has_cmask(image) || radv_dcc_enabled(image, iview->vk.base_mip_level));
2599
2600 /* Do not need to update the clear value for images that are fast cleared with the comp-to-single
2601 * mode because the hardware gets the value from the image directly.
2602 */
2603 if (iview->image->support_comp_to_single)
2604 return;
2605
2606 radv_set_color_clear_metadata(cmd_buffer, image, &range, color_values);
2607
2608 radv_update_bound_fast_clear_color(cmd_buffer, image, cb_idx, color_values);
2609 }
2610
2611 /**
2612 * Load the clear color values from the image's metadata.
2613 */
2614 static void
radv_load_color_clear_metadata(struct radv_cmd_buffer * cmd_buffer,struct radv_image_view * iview,int cb_idx)2615 radv_load_color_clear_metadata(struct radv_cmd_buffer *cmd_buffer, struct radv_image_view *iview,
2616 int cb_idx)
2617 {
2618 struct radeon_cmdbuf *cs = cmd_buffer->cs;
2619 struct radv_image *image = iview->image;
2620
2621 if (!radv_image_has_cmask(image) && !radv_dcc_enabled(image, iview->vk.base_mip_level))
2622 return;
2623
2624 if (iview->image->support_comp_to_single)
2625 return;
2626
2627 if (!radv_image_has_clear_value(image)) {
2628 uint32_t color_values[2] = {0, 0};
2629 radv_update_bound_fast_clear_color(cmd_buffer, image, cb_idx, color_values);
2630 return;
2631 }
2632
2633 uint64_t va = radv_image_get_fast_clear_va(image, iview->vk.base_mip_level);
2634 uint32_t reg = R_028C8C_CB_COLOR0_CLEAR_WORD0 + cb_idx * 0x3c;
2635
2636 if (cmd_buffer->device->physical_device->rad_info.has_load_ctx_reg_pkt) {
2637 radeon_emit(cs, PKT3(PKT3_LOAD_CONTEXT_REG_INDEX, 3, cmd_buffer->state.predicating));
2638 radeon_emit(cs, va);
2639 radeon_emit(cs, va >> 32);
2640 radeon_emit(cs, (reg - SI_CONTEXT_REG_OFFSET) >> 2);
2641 radeon_emit(cs, 2);
2642 } else {
2643 radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, cmd_buffer->state.predicating));
2644 radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_SRC_MEM) | COPY_DATA_DST_SEL(COPY_DATA_REG) |
2645 COPY_DATA_COUNT_SEL);
2646 radeon_emit(cs, va);
2647 radeon_emit(cs, va >> 32);
2648 radeon_emit(cs, reg >> 2);
2649 radeon_emit(cs, 0);
2650
2651 radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, cmd_buffer->state.predicating));
2652 radeon_emit(cs, 0);
2653 }
2654 }
2655
/* GFX9+ metadata cache flushing workaround. metadata cache coherency is
 * broken if the CB caches data of multiple mips of the same image at the
 * same time.
 *
 * Insert some flushes to avoid this.
 */
static void
radv_emit_fb_mip_change_flush(struct radv_cmd_buffer *cmd_buffer)
{
   struct vk_framebuffer *framebuffer = cmd_buffer->state.framebuffer;
   const struct radv_subpass *subpass = cmd_buffer->state.subpass;
   bool color_mip_changed = false;

   /* Entire workaround is not applicable before GFX9 */
   if (cmd_buffer->device->physical_device->rad_info.gfx_level < GFX9)
      return;

   if (!framebuffer)
      return;

   for (int i = 0; i < subpass->color_count; ++i) {
      int idx = subpass->color_attachments[i].attachment;
      if (idx == VK_ATTACHMENT_UNUSED)
         continue;

      struct radv_image_view *iview = cmd_buffer->state.attachments[idx].iview;

      /* A flush is needed when CB metadata or DCC is in play (for either the
       * previously tracked mip or the new one) and the bound mip level at
       * this slot actually changed. */
      if ((radv_image_has_CB_metadata(iview->image) ||
           radv_dcc_enabled(iview->image, iview->vk.base_mip_level) ||
           radv_dcc_enabled(iview->image, cmd_buffer->state.cb_mip[i])) &&
          cmd_buffer->state.cb_mip[i] != iview->vk.base_mip_level)
         color_mip_changed = true;

      /* Remember which mip is now bound at this CB slot. */
      cmd_buffer->state.cb_mip[i] = iview->vk.base_mip_level;
   }

   if (color_mip_changed) {
      cmd_buffer->state.flush_bits |=
         RADV_CMD_FLAG_FLUSH_AND_INV_CB | RADV_CMD_FLAG_FLUSH_AND_INV_CB_META;
   }
}
2697
2698 /* This function does the flushes for mip changes if the levels are not zero for
2699 * all render targets. This way we can assume at the start of the next cmd_buffer
2700 * that rendering to mip 0 doesn't need any flushes. As that is the most common
2701 * case that saves some flushes. */
2702 static void
radv_emit_mip_change_flush_default(struct radv_cmd_buffer * cmd_buffer)2703 radv_emit_mip_change_flush_default(struct radv_cmd_buffer *cmd_buffer)
2704 {
2705 /* Entire workaround is not applicable before GFX9 */
2706 if (cmd_buffer->device->physical_device->rad_info.gfx_level < GFX9)
2707 return;
2708
2709 bool need_color_mip_flush = false;
2710 for (unsigned i = 0; i < 8; ++i) {
2711 if (cmd_buffer->state.cb_mip[i]) {
2712 need_color_mip_flush = true;
2713 break;
2714 }
2715 }
2716
2717 if (need_color_mip_flush) {
2718 cmd_buffer->state.flush_bits |=
2719 RADV_CMD_FLAG_FLUSH_AND_INV_CB | RADV_CMD_FLAG_FLUSH_AND_INV_CB_META;
2720 }
2721
2722 memset(cmd_buffer->state.cb_mip, 0, sizeof(cmd_buffer->state.cb_mip));
2723 }
2724
2725 static struct radv_image *
radv_cmd_buffer_get_vrs_image(struct radv_cmd_buffer * cmd_buffer)2726 radv_cmd_buffer_get_vrs_image(struct radv_cmd_buffer *cmd_buffer)
2727 {
2728 struct radv_device *device = cmd_buffer->device;
2729
2730 if (!device->vrs.image) {
2731 VkResult result;
2732
2733 /* The global VRS state is initialized on-demand to avoid wasting VRAM. */
2734 result = radv_device_init_vrs_state(device);
2735 if (result != VK_SUCCESS) {
2736 cmd_buffer->record_result = result;
2737 return NULL;
2738 }
2739 }
2740
2741 return device->vrs.image;
2742 }
2743
/**
 * Emit the whole framebuffer state: per-target color-buffer registers and
 * clear-value loads, the depth/stencil surface (or the internal VRS HTILE
 * buffer, or an "invalid" DB when nothing is bound), the window scissor
 * extent, and the DCC control register.
 */
static void
radv_emit_framebuffer_state(struct radv_cmd_buffer *cmd_buffer)
{
   int i;
   struct vk_framebuffer *framebuffer = cmd_buffer->state.framebuffer;
   const struct radv_subpass *subpass = cmd_buffer->state.subpass;
   bool disable_constant_encode_ac01 = false;
   /* The "invalid color format" encoding differs between GFX11 and older. */
   unsigned color_invalid = cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX11
                               ? G_028C70_FORMAT_GFX11(V_028C70_COLOR_INVALID)
                               : G_028C70_FORMAT_GFX6(V_028C70_COLOR_INVALID);

   for (i = 0; i < subpass->color_count; ++i) {
      if (subpass->color_attachments[i].attachment == VK_ATTACHMENT_UNUSED) {
         radeon_set_context_reg(cmd_buffer->cs, R_028C70_CB_COLOR0_INFO + i * 0x3C, color_invalid);
         continue;
      }

      int idx = subpass->color_attachments[i].attachment;
      struct radv_image_view *iview = cmd_buffer->state.attachments[idx].iview;
      VkImageLayout layout = subpass->color_attachments[i].layout;
      bool in_render_loop = subpass->color_attachments[i].in_render_loop;

      radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, iview->image->bindings[0].bo);

      assert(iview->vk.aspects & (VK_IMAGE_ASPECT_COLOR_BIT | VK_IMAGE_ASPECT_PLANE_0_BIT |
                                  VK_IMAGE_ASPECT_PLANE_1_BIT | VK_IMAGE_ASPECT_PLANE_2_BIT));

      /* Disjoint images place each plane in its own BO; reference every BO
       * the view can touch so the kernel keeps them resident. */
      if (iview->image->disjoint && iview->vk.aspects == VK_IMAGE_ASPECT_COLOR_BIT) {
         for (uint32_t plane_id = 0; plane_id < iview->image->plane_count; plane_id++) {
            radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs,
                               iview->image->bindings[plane_id].bo);
         }
      } else {
         uint32_t plane_id = iview->image->disjoint ? iview->plane_id : 0;
         radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs,
                            iview->image->bindings[plane_id].bo);
      }

      radv_emit_fb_color_state(cmd_buffer, i, &cmd_buffer->state.attachments[idx].cb, iview, layout,
                               in_render_loop);

      radv_load_color_clear_metadata(cmd_buffer, iview, i);

      if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX9 &&
          iview->image->dcc_sign_reinterpret) {
         /* Disable constant encoding with the clear value of "1" with different DCC signedness
          * because the hardware will fill "1" instead of the clear value.
          */
         disable_constant_encode_ac01 = true;
      }
   }
   /* Invalidate any CB slots the previous subpass used beyond this one. */
   for (; i < cmd_buffer->state.last_subpass_color_count; i++) {
      radeon_set_context_reg(cmd_buffer->cs, R_028C70_CB_COLOR0_INFO + i * 0x3C, color_invalid);
   }
   cmd_buffer->state.last_subpass_color_count = subpass->color_count;

   if (subpass->depth_stencil_attachment) {
      int idx = subpass->depth_stencil_attachment->attachment;
      VkImageLayout layout = subpass->depth_stencil_attachment->layout;
      bool in_render_loop = subpass->depth_stencil_attachment->in_render_loop;
      struct radv_image_view *iview = cmd_buffer->state.attachments[idx].iview;
      radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs,
                         cmd_buffer->state.attachments[idx].iview->image->bindings[0].bo);

      radv_emit_fb_ds_state(cmd_buffer, &cmd_buffer->state.attachments[idx].ds, iview, layout,
                            in_render_loop);

      if (radv_layout_is_htile_compressed(
             cmd_buffer->device, iview->image, layout, in_render_loop,
             radv_image_queue_family_mask(iview->image, cmd_buffer->qf,
                                          cmd_buffer->qf))) {
         /* Only load the depth/stencil fast clear values when
          * compressed rendering is enabled.
          */
         radv_load_ds_clear_metadata(cmd_buffer, iview);
      }
   } else if (subpass->vrs_attachment && radv_cmd_buffer_get_vrs_image(cmd_buffer)) {
      /* When a subpass uses a VRS attachment without binding a depth/stencil attachment, we have to
       * bind our internal depth buffer that contains the VRS data as part of HTILE.
       */
      VkImageLayout layout = VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL;
      struct radv_buffer *htile_buffer = cmd_buffer->device->vrs.buffer;
      struct radv_image *image = cmd_buffer->device->vrs.image;
      struct radv_ds_buffer_info ds;
      struct radv_image_view iview;

      radv_image_view_init(&iview, cmd_buffer->device,
                           &(VkImageViewCreateInfo){
                              .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO,
                              .image = radv_image_to_handle(image),
                              .viewType = radv_meta_get_view_type(image),
                              .format = image->vk.format,
                              .subresourceRange =
                                 {
                                    .aspectMask = VK_IMAGE_ASPECT_DEPTH_BIT,
                                    .baseMipLevel = 0,
                                    .levelCount = 1,
                                    .baseArrayLayer = 0,
                                    .layerCount = 1,
                                 },
                           },
                           0, NULL);

      radv_initialise_vrs_surface(image, htile_buffer, &ds);

      radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, htile_buffer->bo);

      radv_emit_fb_ds_state(cmd_buffer, &ds, &iview, layout, false);

      radv_image_view_finish(&iview);
   } else {
      unsigned num_samples = 0;

      /* On GFX11, DB_Z_INFO.NUM_SAMPLES should always match the framebuffer samples. It affects
       * VRS and occlusion queries if depth and stencil are not bound.
       */
      if (cmd_buffer->device->physical_device->rad_info.gfx_level == GFX11)
         num_samples = util_logbase2(subpass->max_sample_count);

      /* GFX9 places DB_Z_INFO at a different register offset. */
      if (cmd_buffer->device->physical_device->rad_info.gfx_level == GFX9)
         radeon_set_context_reg_seq(cmd_buffer->cs, R_028038_DB_Z_INFO, 2);
      else
         radeon_set_context_reg_seq(cmd_buffer->cs, R_028040_DB_Z_INFO, 2);

      radeon_emit(cmd_buffer->cs, S_028040_FORMAT(V_028040_Z_INVALID) | /* DB_Z_INFO */
                                     S_028040_NUM_SAMPLES(num_samples));
      radeon_emit(cmd_buffer->cs, S_028044_FORMAT(V_028044_STENCIL_INVALID)); /* DB_STENCIL_INFO */
   }
   radeon_set_context_reg(cmd_buffer->cs, R_028208_PA_SC_WINDOW_SCISSOR_BR,
                          S_028208_BR_X(framebuffer->width) | S_028208_BR_Y(framebuffer->height));

   if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX8) {
      bool disable_constant_encode =
         cmd_buffer->device->physical_device->rad_info.has_dcc_constant_encode;
      enum amd_gfx_level gfx_level = cmd_buffer->device->physical_device->rad_info.gfx_level;
      uint8_t watermark = gfx_level >= GFX10 ? 6 : 4;

      if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX11) {
         radeon_set_context_reg(cmd_buffer->cs, R_028424_CB_FDCC_CONTROL,
                                S_028424_SAMPLE_MASK_TRACKER_WATERMARK(watermark));
      } else {
         radeon_set_context_reg(cmd_buffer->cs, R_028424_CB_DCC_CONTROL,
                                S_028424_OVERWRITE_COMBINER_MRT_SHARING_DISABLE(gfx_level <= GFX9) |
                                   S_028424_OVERWRITE_COMBINER_WATERMARK(watermark) |
                                   S_028424_DISABLE_CONSTANT_ENCODE_AC01(disable_constant_encode_ac01) |
                                   S_028424_DISABLE_CONSTANT_ENCODE_REG(disable_constant_encode));
      }
   }

   cmd_buffer->state.dirty &= ~RADV_CMD_DIRTY_FRAMEBUFFER;
}
2895
2896 static void
radv_emit_index_buffer(struct radv_cmd_buffer * cmd_buffer,bool indirect)2897 radv_emit_index_buffer(struct radv_cmd_buffer *cmd_buffer, bool indirect)
2898 {
2899 struct radeon_cmdbuf *cs = cmd_buffer->cs;
2900 struct radv_cmd_state *state = &cmd_buffer->state;
2901
2902 /* With indirect generated commands the index buffer bind may be part of the
2903 * indirect command buffer, in which case the app may not have bound any yet. */
2904 if (state->index_type < 0)
2905 return;
2906
2907 /* For the direct indexed draws we use DRAW_INDEX_2, which includes
2908 * the index_va and max_index_count already. */
2909 if (!indirect)
2910 return;
2911
2912 if (state->max_index_count ||
2913 !cmd_buffer->device->physical_device->rad_info.has_zero_index_buffer_bug) {
2914 radeon_emit(cs, PKT3(PKT3_INDEX_BASE, 1, 0));
2915 radeon_emit(cs, state->index_va);
2916 radeon_emit(cs, state->index_va >> 32);
2917
2918 radeon_emit(cs, PKT3(PKT3_INDEX_BUFFER_SIZE, 0, 0));
2919 radeon_emit(cs, state->max_index_count);
2920 }
2921
2922 cmd_buffer->state.dirty &= ~RADV_CMD_DIRTY_INDEX_BUFFER;
2923 }
2924
/**
 * Program DB_COUNT_CONTROL for enabling/disabling occlusion queries, and
 * toggle out-of-order rasterization as required by perfect queries.
 */
void
radv_set_db_count_control(struct radv_cmd_buffer *cmd_buffer, bool enable_occlusion_queries)
{
   bool has_perfect_queries = cmd_buffer->state.perfect_occlusion_queries_enabled;
   struct radv_graphics_pipeline *pipeline = cmd_buffer->state.graphics_pipeline;
   uint32_t pa_sc_mode_cntl_1 = pipeline ? pipeline->ms.pa_sc_mode_cntl_1 : 0;
   uint32_t db_count_control;

   if (!enable_occlusion_queries) {
      if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX7) {
         if (G_028A4C_OUT_OF_ORDER_PRIMITIVE_ENABLE(pa_sc_mode_cntl_1) &&
             pipeline->disable_out_of_order_rast_for_occlusion && has_perfect_queries) {
            /* Re-enable out-of-order rasterization if the
             * bound pipeline supports it and if it's has
             * been disabled before starting any perfect
             * occlusion queries.
             */
            radeon_set_context_reg(cmd_buffer->cs, R_028A4C_PA_SC_MODE_CNTL_1, pa_sc_mode_cntl_1);
         }
      }
      db_count_control = S_028004_ZPASS_INCREMENT_DISABLE(1);
   } else {
      const struct radv_subpass *subpass = cmd_buffer->state.subpass;
      uint32_t sample_rate = subpass ? util_logbase2(subpass->max_sample_count) : 0;
      bool gfx10_perfect =
         cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX10 && has_perfect_queries;

      if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX7) {
         /* Always enable PERFECT_ZPASS_COUNTS due to issues with partially
          * covered tiles, discards, and early depth testing. For more details,
          * see https://gitlab.freedesktop.org/mesa/mesa/-/issues/3218 */
         db_count_control = S_028004_PERFECT_ZPASS_COUNTS(1) |
                            S_028004_DISABLE_CONSERVATIVE_ZPASS_COUNTS(gfx10_perfect) |
                            S_028004_SAMPLE_RATE(sample_rate) | S_028004_ZPASS_ENABLE(1) |
                            S_028004_SLICE_EVEN_ENABLE(1) | S_028004_SLICE_ODD_ENABLE(1);

         if (G_028A4C_OUT_OF_ORDER_PRIMITIVE_ENABLE(pa_sc_mode_cntl_1) &&
             pipeline->disable_out_of_order_rast_for_occlusion && has_perfect_queries) {
            /* If the bound pipeline has enabled
             * out-of-order rasterization, we should
             * disable it before starting any perfect
             * occlusion queries.
             */
            pa_sc_mode_cntl_1 &= C_028A4C_OUT_OF_ORDER_PRIMITIVE_ENABLE;

            radeon_set_context_reg(cmd_buffer->cs, R_028A4C_PA_SC_MODE_CNTL_1, pa_sc_mode_cntl_1);
         }
      } else {
         db_count_control = S_028004_PERFECT_ZPASS_COUNTS(1) | S_028004_SAMPLE_RATE(sample_rate);
      }
   }

   radeon_set_context_reg(cmd_buffer->cs, R_028004_DB_COUNT_CONTROL, db_count_control);

   /* Context-register write rolls the context. */
   cmd_buffer->state.context_roll_without_scissor_emitted = true;
}
2981
/* Map (total attribute count, instance-rate mask) to an index into the
 * flattened instance_rate_vs_prologs array, which is sorted ascending by:
 * - total number of attributes
 * - number of instanced attributes
 * - index of first instanced attribute
 */
unsigned
radv_instance_rate_prolog_index(unsigned num_attributes, uint32_t instance_rate_inputs)
{
   /* Start of the group for a given total attribute count. */
   static const uint16_t total_to_offset[16] = {0, 1, 4, 10, 20, 35, 56, 84,
                                                120, 165, 220, 286, 364, 455, 560, 680};

   /* Offsets by instanced-attribute count, valid for 16 total attributes.
    * The same LUT serves fewer attributes thanks to a pattern in the
    * flattened layout, corrected by the subtraction below. */
   static const uint8_t count_to_offset_total16[16] = {0, 16, 31, 45, 58, 70, 81, 91,
                                                       100, 108, 115, 121, 126, 130, 133, 135};

   const unsigned base = total_to_offset[num_attributes - 1];
   const unsigned num_instanced = util_bitcount(instance_rate_inputs);
   const unsigned group_offset =
      count_to_offset_total16[num_instanced - 1] - ((16 - num_attributes) * (num_instanced - 1));
   const unsigned first_instanced = ffs(instance_rate_inputs) - 1;

   return base + group_offset + first_instanced;
}
3010
/* Packed header stored in the first dword of a VS prolog key. The key_size
 * field is read by radv_hash_vs_prolog/radv_cmp_vs_prolog so that hashing
 * and comparison cover exactly the significant bytes of the key. */
union vs_prolog_key_header {
   struct {
      uint32_t key_size : 8; /* total key size in bytes */
      uint32_t num_attributes : 6;
      uint32_t as_ls : 1;
      uint32_t is_ngg : 1;
      uint32_t wave32 : 1;
      uint32_t next_stage : 3;
      uint32_t instance_rate_inputs : 1;
      uint32_t alpha_adjust_lo : 1;
      uint32_t alpha_adjust_hi : 1;
      uint32_t misaligned_mask : 1;
      uint32_t post_shuffle : 1;
      uint32_t nontrivial_divisors : 1;
      uint32_t zero_divisors : 1;
      /* We need this to ensure the padding is zero. It's useful even if it's unused. */
      uint32_t padding0 : 5;
   };
   uint32_t v; /* the whole header as one dword */
};
3031
3032 uint32_t
radv_hash_vs_prolog(const void * key_)3033 radv_hash_vs_prolog(const void *key_)
3034 {
3035 const uint32_t *key = key_;
3036 union vs_prolog_key_header header;
3037 header.v = key[0];
3038 return _mesa_hash_data(key, header.key_size);
3039 }
3040
3041 bool
radv_cmp_vs_prolog(const void * a_,const void * b_)3042 radv_cmp_vs_prolog(const void *a_, const void *b_)
3043 {
3044 const uint32_t *a = a_;
3045 const uint32_t *b = b_;
3046 if (a[0] != b[0])
3047 return false;
3048
3049 union vs_prolog_key_header header;
3050 header.v = a[0];
3051 return memcmp(a, b, header.key_size) == 0;
3052 }
3053
/* Find (or lazily create) the vertex-shader prolog matching the current
 * dynamic vertex-input state.  Lookup order: pre-compiled simple /
 * instance-rate prologs, the prolog most recently emitted by this command
 * buffer, then the device-wide hash table (creating and inserting a new
 * prolog on miss).  Returns NULL on allocation failure.
 *
 * *nontrivial_divisors receives the mask of attributes whose instance-rate
 * divisor is neither 0 nor 1 (those need the divisor table uploaded by
 * emit_prolog_inputs()).
 */
static struct radv_shader_part *
lookup_vs_prolog(struct radv_cmd_buffer *cmd_buffer, struct radv_shader *vs_shader,
                 uint32_t *nontrivial_divisors)
{
   STATIC_ASSERT(sizeof(union vs_prolog_key_header) == 4);
   assert(vs_shader->info.vs.dynamic_inputs);

   const struct radv_vs_input_state *state = &cmd_buffer->state.dynamic_vs_input;
   struct radv_graphics_pipeline *pipeline = cmd_buffer->state.graphics_pipeline;
   struct radv_device *device = cmd_buffer->device;

   unsigned num_attributes = pipeline->last_vertex_attrib_bit;
   uint32_t attribute_mask = BITFIELD_MASK(num_attributes);

   uint32_t instance_rate_inputs = state->instance_rate_inputs & attribute_mask;
   uint32_t zero_divisors = state->zero_divisors & attribute_mask;
   *nontrivial_divisors = state->nontrivial_divisors & attribute_mask;
   uint32_t misaligned_mask = cmd_buffer->state.vbo_misaligned_mask;
   if (cmd_buffer->state.vbo_misaligned_mask_invalid) {
      /* Recompute the stale misalignment bits from the bound vertex buffers.
       * NOTE(review): the assert suggests alignment only matters on GFX6 and
       * GFX10+ in this path — confirm against the mask-invalidation sites.
       */
      assert(device->physical_device->rad_info.gfx_level == GFX6 ||
             device->physical_device->rad_info.gfx_level >= GFX10);

      u_foreach_bit (index, cmd_buffer->state.vbo_misaligned_mask_invalid & attribute_mask) {
         uint8_t binding = state->bindings[index];
         if (!(cmd_buffer->state.vbo_bound_mask & BITFIELD_BIT(binding)))
            continue;
         /* format_align_req_minus_1 is an alignment mask: any set low bit in
          * the attribute offset or binding stride means a misaligned fetch. */
         uint8_t req = state->format_align_req_minus_1[index];
         struct radv_vertex_binding *vb = &cmd_buffer->vertex_bindings[binding];
         VkDeviceSize offset = vb->offset + state->offsets[index];
         if ((offset & req) || (vb->stride & req))
            misaligned_mask |= BITFIELD_BIT(index);
      }
      cmd_buffer->state.vbo_misaligned_mask = misaligned_mask;
      cmd_buffer->state.vbo_misaligned_mask_invalid &= ~attribute_mask;
   }

   /* try to use a pre-compiled prolog first */
   struct radv_shader_part *prolog = NULL;
   if (pipeline->can_use_simple_input &&
       (!vs_shader->info.vs.as_ls || !instance_rate_inputs) &&
       !misaligned_mask && !state->alpha_adjust_lo && !state->alpha_adjust_hi) {
      if (!instance_rate_inputs) {
         prolog = device->simple_vs_prologs[num_attributes - 1];
      } else if (num_attributes <= 16 && !*nontrivial_divisors && !zero_divisors &&
                 util_bitcount(instance_rate_inputs) ==
                    (util_last_bit(instance_rate_inputs) - ffs(instance_rate_inputs) + 1)) {
         /* Contiguous run of instance-rate attributes with trivial divisors:
          * covered by the pre-compiled instance-rate prolog table. */
         unsigned index = radv_instance_rate_prolog_index(num_attributes, instance_rate_inputs);
         prolog = device->instance_rate_vs_prologs[index];
      }
   }
   if (prolog)
      return prolog;

   /* if we couldn't use a pre-compiled prolog, find one in the cache or create one */
   uint32_t key_words[17];
   unsigned key_size = 1;

   struct radv_vs_prolog_key key;
   key.state = state;
   key.num_attributes = num_attributes;
   key.misaligned_mask = misaligned_mask;
   /* The instance ID input VGPR is placed differently when as_ls=true. */
   key.as_ls = vs_shader->info.vs.as_ls && instance_rate_inputs;
   key.is_ngg = vs_shader->info.is_ngg;
   key.wave32 = vs_shader->info.wave_size == 32;
   key.next_stage = pipeline->next_vertex_stage;

   /* Build a variable-length hash key: one header dword followed by only the
    * optional mask words that are non-zero (flagged in the header). */
   union vs_prolog_key_header header;
   header.v = 0;
   header.num_attributes = num_attributes;
   header.as_ls = key.as_ls;
   header.is_ngg = key.is_ngg;
   header.wave32 = key.wave32;
   header.next_stage = key.next_stage;

   if (instance_rate_inputs & ~*nontrivial_divisors) {
      header.instance_rate_inputs = true;
      key_words[key_size++] = instance_rate_inputs;
   }
   if (*nontrivial_divisors) {
      header.nontrivial_divisors = true;
      key_words[key_size++] = *nontrivial_divisors;
   }
   if (zero_divisors) {
      header.zero_divisors = true;
      key_words[key_size++] = zero_divisors;
   }
   if (misaligned_mask) {
      header.misaligned_mask = true;
      key_words[key_size++] = misaligned_mask;

      /* Append one format byte per misaligned attribute, padded to a whole
       * number of dwords. */
      uint8_t *formats = (uint8_t *)&key_words[key_size];
      unsigned num_formats = 0;
      u_foreach_bit(index, misaligned_mask) formats[num_formats++] = state->formats[index];
      while (num_formats & 0x3)
         formats[num_formats++] = 0;
      key_size += num_formats / 4u;

      if (state->post_shuffle & attribute_mask) {
         header.post_shuffle = true;
         key_words[key_size++] = state->post_shuffle & attribute_mask;
      }
   }
   if (state->alpha_adjust_lo & attribute_mask) {
      header.alpha_adjust_lo = true;
      key_words[key_size++] = state->alpha_adjust_lo & attribute_mask;
   }
   if (state->alpha_adjust_hi & attribute_mask) {
      header.alpha_adjust_hi = true;
      key_words[key_size++] = state->alpha_adjust_hi & attribute_mask;
   }

   header.key_size = key_size * sizeof(key_words[0]);
   key_words[0] = header.v;

   uint32_t hash = radv_hash_vs_prolog(key_words);

   /* Fast path: same key as the prolog this command buffer last emitted. */
   if (cmd_buffer->state.emitted_vs_prolog &&
       cmd_buffer->state.emitted_vs_prolog_key_hash == hash &&
       radv_cmp_vs_prolog(key_words, cmd_buffer->state.emitted_vs_prolog_key))
      return cmd_buffer->state.emitted_vs_prolog;

   /* Optimistic read-locked lookup in the device-wide cache. */
   u_rwlock_rdlock(&device->vs_prologs_lock);
   struct hash_entry *prolog_entry =
      _mesa_hash_table_search_pre_hashed(device->vs_prologs, hash, key_words);
   u_rwlock_rdunlock(&device->vs_prologs_lock);

   if (!prolog_entry) {
      /* Re-check under the write lock: another thread may have inserted the
       * prolog between the read unlock and the write lock. */
      u_rwlock_wrlock(&device->vs_prologs_lock);
      prolog_entry = _mesa_hash_table_search_pre_hashed(device->vs_prologs, hash, key_words);
      if (prolog_entry) {
         u_rwlock_wrunlock(&device->vs_prologs_lock);
         return prolog_entry->data;
      }

      /* The table keeps its own copy of the key (key2); the prolog is owned
       * by the table from here on. */
      prolog = radv_create_vs_prolog(device, &key);
      uint32_t *key2 = malloc(key_size * 4);
      if (!prolog || !key2) {
         radv_shader_part_destroy(device, prolog);
         free(key2);
         u_rwlock_wrunlock(&device->vs_prologs_lock);
         return NULL;
      }
      memcpy(key2, key_words, key_size * 4);
      _mesa_hash_table_insert_pre_hashed(device->vs_prologs, hash, key2, prolog);

      u_rwlock_wrunlock(&device->vs_prologs_lock);
      return prolog;
   }

   return prolog_entry->data;
}
3206
/* Point the hardware shader-program registers at the prolog's code instead
 * of the main VS, choosing the PGM_LO/RSRC1 register pair for whichever
 * hardware stage the vertex shader runs as (GS/NGG, HS, LS, ES or VS).
 */
static void
emit_prolog_regs(struct radv_cmd_buffer *cmd_buffer, struct radv_shader *vs_shader,
                 struct radv_shader_part *prolog, bool pipeline_is_dirty)
{
   /* no need to re-emit anything in this case */
   if (cmd_buffer->state.emitted_vs_prolog == prolog && !pipeline_is_dirty)
      return;

   enum amd_gfx_level chip = cmd_buffer->device->physical_device->rad_info.gfx_level;
   struct radv_graphics_pipeline *pipeline = cmd_buffer->state.graphics_pipeline;
   uint64_t prolog_va = radv_buffer_get_va(prolog->bo) + prolog->alloc->offset;

   assert(cmd_buffer->state.emitted_graphics_pipeline == cmd_buffer->state.graphics_pipeline);

   /* Pre-GFX10 the SGPR count is programmed in RSRC1; if the prolog needs
    * more SGPRs than the main shader, splice its SGPRS field in. */
   uint32_t rsrc1 = vs_shader->config.rsrc1;
   if (chip < GFX10 && G_00B228_SGPRS(prolog->rsrc1) > G_00B228_SGPRS(vs_shader->config.rsrc1))
      rsrc1 = (rsrc1 & C_00B228_SGPRS) | (prolog->rsrc1 & ~C_00B228_SGPRS);

   /* The main shader must not use less VGPRs than the prolog, otherwise shared vgprs might not
    * work.
    */
   assert(G_00B848_VGPRS(vs_shader->config.rsrc1) >= G_00B848_VGPRS(prolog->rsrc1));

   unsigned pgm_lo_reg = R_00B120_SPI_SHADER_PGM_LO_VS;
   unsigned rsrc1_reg = R_00B128_SPI_SHADER_PGM_RSRC1_VS;
   if (vs_shader->info.is_ngg || pipeline->base.shaders[MESA_SHADER_GEOMETRY] == vs_shader) {
      pgm_lo_reg = chip >= GFX10 ? R_00B320_SPI_SHADER_PGM_LO_ES : R_00B210_SPI_SHADER_PGM_LO_ES;
      rsrc1_reg = R_00B228_SPI_SHADER_PGM_RSRC1_GS;
   } else if (pipeline->base.shaders[MESA_SHADER_TESS_CTRL] == vs_shader) {
      pgm_lo_reg = chip >= GFX10 ? R_00B520_SPI_SHADER_PGM_LO_LS : R_00B410_SPI_SHADER_PGM_LO_LS;
      rsrc1_reg = R_00B428_SPI_SHADER_PGM_RSRC1_HS;
   } else if (vs_shader->info.vs.as_ls) {
      pgm_lo_reg = R_00B520_SPI_SHADER_PGM_LO_LS;
      rsrc1_reg = R_00B528_SPI_SHADER_PGM_RSRC1_LS;
   } else if (vs_shader->info.vs.as_es) {
      pgm_lo_reg = R_00B320_SPI_SHADER_PGM_LO_ES;
      rsrc1_reg = R_00B328_SPI_SHADER_PGM_RSRC1_ES;
   }

   /* Shader addresses are 256-byte aligned; registers hold va >> 8. */
   radeon_set_sh_reg(cmd_buffer->cs, pgm_lo_reg, prolog_va >> 8);

   if (chip < GFX10)
      radeon_set_sh_reg(cmd_buffer->cs, rsrc1_reg, rsrc1);
   else
      assert(rsrc1 == vs_shader->config.rsrc1);

   /* Keep the prolog BO resident while this command buffer executes. */
   radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, prolog->bo);
}
3255
/* Upload the prolog's input table (main-shader VA plus one fast-division
 * descriptor per nontrivial instance-rate divisor) and write its address
 * into the AC_UD_VS_PROLOG_INPUTS user SGPR pair.  When there are no
 * nontrivial divisors, the SGPRs point directly at the main shader.
 */
static void
emit_prolog_inputs(struct radv_cmd_buffer *cmd_buffer, struct radv_shader *vs_shader,
                   uint32_t nontrivial_divisors, bool pipeline_is_dirty)
{
   /* no need to re-emit anything in this case */
   if (!nontrivial_divisors && !pipeline_is_dirty && cmd_buffer->state.emitted_vs_prolog &&
       !cmd_buffer->state.emitted_vs_prolog->nontrivial_divisors)
      return;

   const struct radv_vs_input_state *state = &cmd_buffer->state.dynamic_vs_input;
   uint64_t input_va = radv_shader_get_va(vs_shader);

   if (nontrivial_divisors) {
      unsigned inputs_offset;
      uint32_t *inputs;
      /* 8 bytes for the shader VA + 8 bytes per divisor entry. */
      unsigned size = 8 + util_bitcount(nontrivial_divisors) * 8;
      if (!radv_cmd_buffer_upload_alloc(cmd_buffer, size, &inputs_offset, (void **)&inputs))
         return;

      *(inputs++) = input_va;
      *(inputs++) = input_va >> 32;

      u_foreach_bit(index, nontrivial_divisors)
      {
         uint32_t div = state->divisors[index];
         if (div == 0) {
            *(inputs++) = 0;
            *(inputs++) = 1;
         } else if (util_is_power_of_two_or_zero(div)) {
            /* Power-of-two divisor: encode a shift; bit 8 presumably flags
             * the shift path for the prolog — TODO confirm against the
             * prolog codegen. */
            *(inputs++) = util_logbase2(div) | (1 << 8);
            *(inputs++) = 0xffffffffu;
         } else {
            /* General divisor: pack the fast-udiv constants
             * (pre_shift | increment << 8 | post_shift << 16, multiplier). */
            struct util_fast_udiv_info info = util_compute_fast_udiv_info(div, 32, 32);
            *(inputs++) = info.pre_shift | (info.increment << 8) | (info.post_shift << 16);
            *(inputs++) = info.multiplier;
         }
      }

      /* Redirect the user SGPRs at the uploaded table instead of the shader. */
      input_va = radv_buffer_get_va(cmd_buffer->upload.upload_bo) + inputs_offset;
   }

   struct radv_userdata_info *loc =
      &vs_shader->info.user_sgprs_locs.shader_data[AC_UD_VS_PROLOG_INPUTS];
   uint32_t base_reg = cmd_buffer->state.graphics_pipeline->base.user_data_0[MESA_SHADER_VERTEX];
   assert(loc->sgpr_idx != -1);
   assert(loc->num_sgprs == 2);
   radv_emit_shader_pointer(cmd_buffer->device, cmd_buffer->cs, base_reg + loc->sgpr_idx * 4,
                            input_va, true);
}
3305
3306 static void
radv_emit_vertex_input(struct radv_cmd_buffer * cmd_buffer,bool pipeline_is_dirty)3307 radv_emit_vertex_input(struct radv_cmd_buffer *cmd_buffer, bool pipeline_is_dirty)
3308 {
3309 struct radv_graphics_pipeline *pipeline = cmd_buffer->state.graphics_pipeline;
3310 struct radv_shader *vs_shader = radv_get_shader(&pipeline->base, MESA_SHADER_VERTEX);
3311
3312 assert(!cmd_buffer->state.mesh_shading);
3313
3314 if (!vs_shader->info.vs.has_prolog)
3315 return;
3316
3317 uint32_t nontrivial_divisors;
3318 struct radv_shader_part *prolog =
3319 lookup_vs_prolog(cmd_buffer, vs_shader, &nontrivial_divisors);
3320 if (!prolog) {
3321 cmd_buffer->record_result = VK_ERROR_OUT_OF_HOST_MEMORY;
3322 return;
3323 }
3324 emit_prolog_regs(cmd_buffer, vs_shader, prolog, pipeline_is_dirty);
3325 emit_prolog_inputs(cmd_buffer, vs_shader, nontrivial_divisors, pipeline_is_dirty);
3326
3327 cmd_buffer->state.emitted_vs_prolog = prolog;
3328
3329 if (unlikely(cmd_buffer->device->trace_bo))
3330 radv_save_vs_prolog(cmd_buffer, prolog);
3331 }
3332
/* Re-emit every piece of dynamic state that is both dirty and actually used
 * by the bound pipeline.  The emission order follows the dirty-bit checks
 * below; handled bits are cleared at the end.
 */
static void
radv_cmd_buffer_flush_dynamic_state(struct radv_cmd_buffer *cmd_buffer, bool pipeline_is_dirty)
{
   /* Only consider dirty state the current pipeline can consume. */
   uint64_t states =
      cmd_buffer->state.dirty & cmd_buffer->state.emitted_graphics_pipeline->needed_dynamic_state;

   if (states & (RADV_CMD_DIRTY_DYNAMIC_VIEWPORT))
      radv_emit_viewport(cmd_buffer);

   /* On chips with the GFX9 scissor bug the scissor is emitted elsewhere. */
   if (states & (RADV_CMD_DIRTY_DYNAMIC_SCISSOR | RADV_CMD_DIRTY_DYNAMIC_VIEWPORT) &&
       !cmd_buffer->device->physical_device->rad_info.has_gfx9_scissor_bug)
      radv_emit_scissor(cmd_buffer);

   if (states & RADV_CMD_DIRTY_DYNAMIC_LINE_WIDTH)
      radv_emit_line_width(cmd_buffer);

   if (states & RADV_CMD_DIRTY_DYNAMIC_BLEND_CONSTANTS)
      radv_emit_blend_constants(cmd_buffer);

   if (states &
       (RADV_CMD_DIRTY_DYNAMIC_STENCIL_REFERENCE | RADV_CMD_DIRTY_DYNAMIC_STENCIL_WRITE_MASK |
        RADV_CMD_DIRTY_DYNAMIC_STENCIL_COMPARE_MASK))
      radv_emit_stencil(cmd_buffer);

   if (states & RADV_CMD_DIRTY_DYNAMIC_DEPTH_BOUNDS)
      radv_emit_depth_bounds(cmd_buffer);

   if (states & RADV_CMD_DIRTY_DYNAMIC_DEPTH_BIAS)
      radv_emit_depth_bias(cmd_buffer);

   if (states & RADV_CMD_DIRTY_DYNAMIC_DISCARD_RECTANGLE)
      radv_emit_discard_rectangle(cmd_buffer);

   if (states & RADV_CMD_DIRTY_DYNAMIC_SAMPLE_LOCATIONS)
      radv_emit_sample_locations(cmd_buffer);

   if (states & (RADV_CMD_DIRTY_DYNAMIC_LINE_STIPPLE))
      radv_emit_line_stipple(cmd_buffer);

   if (states & (RADV_CMD_DIRTY_DYNAMIC_CULL_MODE | RADV_CMD_DIRTY_DYNAMIC_FRONT_FACE |
                 RADV_CMD_DIRTY_DYNAMIC_DEPTH_BIAS_ENABLE))
      radv_emit_culling(cmd_buffer, states);

   if (states & RADV_CMD_DIRTY_DYNAMIC_PRIMITIVE_TOPOLOGY)
      radv_emit_primitive_topology(cmd_buffer);

   if (states &
       (RADV_CMD_DIRTY_DYNAMIC_DEPTH_TEST_ENABLE | RADV_CMD_DIRTY_DYNAMIC_DEPTH_WRITE_ENABLE |
        RADV_CMD_DIRTY_DYNAMIC_DEPTH_COMPARE_OP | RADV_CMD_DIRTY_DYNAMIC_DEPTH_BOUNDS_TEST_ENABLE |
        RADV_CMD_DIRTY_DYNAMIC_STENCIL_TEST_ENABLE | RADV_CMD_DIRTY_DYNAMIC_STENCIL_OP))
      radv_emit_depth_control(cmd_buffer, states);

   if (states & RADV_CMD_DIRTY_DYNAMIC_STENCIL_OP)
      radv_emit_stencil_control(cmd_buffer);

   if (states & RADV_CMD_DIRTY_DYNAMIC_FRAGMENT_SHADING_RATE)
      radv_emit_fragment_shading_rate(cmd_buffer);

   if (states & RADV_CMD_DIRTY_DYNAMIC_PRIMITIVE_RESTART_ENABLE)
      radv_emit_primitive_restart_enable(cmd_buffer);

   if (states & RADV_CMD_DIRTY_DYNAMIC_RASTERIZER_DISCARD_ENABLE)
      radv_emit_rasterizer_discard_enable(cmd_buffer);

   if (states & RADV_CMD_DIRTY_DYNAMIC_LOGIC_OP)
      radv_emit_logic_op(cmd_buffer);

   if (states & RADV_CMD_DIRTY_DYNAMIC_COLOR_WRITE_ENABLE)
      radv_emit_color_write_enable(cmd_buffer);

   if (states & RADV_CMD_DIRTY_DYNAMIC_VERTEX_INPUT)
      radv_emit_vertex_input(cmd_buffer, pipeline_is_dirty);

   cmd_buffer->state.dirty &= ~states;
}
3408
3409 static void
radv_flush_push_descriptors(struct radv_cmd_buffer * cmd_buffer,VkPipelineBindPoint bind_point)3410 radv_flush_push_descriptors(struct radv_cmd_buffer *cmd_buffer, VkPipelineBindPoint bind_point)
3411 {
3412 struct radv_descriptor_state *descriptors_state =
3413 radv_get_descriptors_state(cmd_buffer, bind_point);
3414 struct radv_descriptor_set *set = (struct radv_descriptor_set *)&descriptors_state->push_set.set;
3415 unsigned bo_offset;
3416
3417 if (!radv_cmd_buffer_upload_data(cmd_buffer, set->header.size, set->header.mapped_ptr,
3418 &bo_offset))
3419 return;
3420
3421 set->header.va = radv_buffer_get_va(cmd_buffer->upload.upload_bo);
3422 set->header.va += bo_offset;
3423 }
3424
3425 static void
radv_flush_indirect_descriptor_sets(struct radv_cmd_buffer * cmd_buffer,struct radv_pipeline * pipeline,VkPipelineBindPoint bind_point)3426 radv_flush_indirect_descriptor_sets(struct radv_cmd_buffer *cmd_buffer,
3427 struct radv_pipeline *pipeline, VkPipelineBindPoint bind_point)
3428 {
3429 struct radv_descriptor_state *descriptors_state =
3430 radv_get_descriptors_state(cmd_buffer, bind_point);
3431 uint32_t size = MAX_SETS * 4;
3432 uint32_t offset;
3433 void *ptr;
3434
3435 if (!radv_cmd_buffer_upload_alloc(cmd_buffer, size, &offset, &ptr))
3436 return;
3437
3438 for (unsigned i = 0; i < MAX_SETS; i++) {
3439 uint32_t *uptr = ((uint32_t *)ptr) + i;
3440 uint64_t set_va = 0;
3441 struct radv_descriptor_set *set = descriptors_state->sets[i];
3442 if (descriptors_state->valid & (1u << i))
3443 set_va = set->header.va;
3444 uptr[0] = set_va & 0xffffffff;
3445 }
3446
3447 struct radeon_cmdbuf *cs = cmd_buffer->cs;
3448 struct radv_device *device = cmd_buffer->device;
3449 uint64_t va = radv_buffer_get_va(cmd_buffer->upload.upload_bo);
3450 va += offset;
3451
3452 if (bind_point == VK_PIPELINE_BIND_POINT_GRAPHICS) {
3453 struct radv_graphics_pipeline *graphics_pipeline = radv_pipeline_to_graphics(pipeline);
3454
3455 if (pipeline->shaders[MESA_SHADER_VERTEX])
3456 radv_emit_userdata_address(device, cs, pipeline, MESA_SHADER_VERTEX,
3457 AC_UD_INDIRECT_DESCRIPTOR_SETS, va);
3458
3459 if (pipeline->shaders[MESA_SHADER_FRAGMENT])
3460 radv_emit_userdata_address(device, cs, pipeline, MESA_SHADER_FRAGMENT,
3461 AC_UD_INDIRECT_DESCRIPTOR_SETS, va);
3462
3463 if (radv_pipeline_has_stage(graphics_pipeline, MESA_SHADER_MESH))
3464 radv_emit_userdata_address(device, cs, pipeline, MESA_SHADER_MESH,
3465 AC_UD_INDIRECT_DESCRIPTOR_SETS, va);
3466
3467 if (radv_pipeline_has_stage(graphics_pipeline, MESA_SHADER_TASK))
3468 radv_emit_userdata_address(device, cmd_buffer->ace_internal.cs, pipeline, MESA_SHADER_TASK,
3469 AC_UD_INDIRECT_DESCRIPTOR_SETS, va);
3470
3471 if (radv_pipeline_has_stage(graphics_pipeline, MESA_SHADER_GEOMETRY))
3472 radv_emit_userdata_address(device, cs, pipeline, MESA_SHADER_GEOMETRY,
3473 AC_UD_INDIRECT_DESCRIPTOR_SETS, va);
3474
3475 if (radv_pipeline_has_stage(graphics_pipeline, MESA_SHADER_TESS_CTRL))
3476 radv_emit_userdata_address(device, cs, pipeline, MESA_SHADER_TESS_CTRL,
3477 AC_UD_INDIRECT_DESCRIPTOR_SETS, va);
3478
3479 if (radv_pipeline_has_stage(graphics_pipeline, MESA_SHADER_TESS_CTRL))
3480 radv_emit_userdata_address(device, cs, pipeline, MESA_SHADER_TESS_EVAL,
3481 AC_UD_INDIRECT_DESCRIPTOR_SETS, va);
3482 } else {
3483 radv_emit_userdata_address(device, cs, pipeline, MESA_SHADER_COMPUTE,
3484 AC_UD_INDIRECT_DESCRIPTOR_SETS, va);
3485 }
3486 }
3487
/* Flush dirty descriptor state for one bind point: re-upload push
 * descriptors, rebuild the indirect set table when the pipeline needs it,
 * then re-emit the per-stage descriptor pointers (task shaders on the
 * internal ACE command stream).
 */
static void
radv_flush_descriptors(struct radv_cmd_buffer *cmd_buffer, VkShaderStageFlags stages,
                       struct radv_pipeline *pipeline, VkPipelineBindPoint bind_point)
{
   struct radv_descriptor_state *descriptors_state =
      radv_get_descriptors_state(cmd_buffer, bind_point);
   struct radv_device *device = cmd_buffer->device;
   struct radeon_cmdbuf *cs = cmd_buffer->cs;
   bool flush_indirect_descriptors;

   if (!descriptors_state->dirty)
      return;

   if (descriptors_state->push_dirty)
      radv_flush_push_descriptors(cmd_buffer, bind_point);

   flush_indirect_descriptors = pipeline->need_indirect_descriptor_sets;

   if (flush_indirect_descriptors)
      radv_flush_indirect_descriptor_sets(cmd_buffer, pipeline, bind_point);

   /* Worst case: a full set-pointer update for every stage. */
   ASSERTED unsigned cdw_max =
      radeon_check_space(device->ws, cs, MAX_SETS * MESA_VULKAN_SHADER_STAGES * 4);

   if (stages & VK_SHADER_STAGE_COMPUTE_BIT) {
      radv_emit_descriptor_pointers(device, cs, pipeline, descriptors_state, MESA_SHADER_COMPUTE);
   } else {
      /* Graphics: emit for each active stage; task is handled separately
       * below because it lives on the ACE queue's command stream. */
      radv_foreach_stage(stage, stages & ~VK_SHADER_STAGE_TASK_BIT_NV)
      {
         if (!cmd_buffer->state.graphics_pipeline->base.shaders[stage])
            continue;

         radv_emit_descriptor_pointers(device, cs, pipeline, descriptors_state, stage);
      }

      if (stages & VK_SHADER_STAGE_TASK_BIT_NV) {
         radv_emit_descriptor_pointers(device, cmd_buffer->ace_internal.cs, pipeline,
                                       descriptors_state, MESA_SHADER_TASK);
      }
   }

   descriptors_state->dirty = 0;
   descriptors_state->push_dirty = false;

   assert(cmd_buffer->cs->cdw <= cdw_max);

   if (unlikely(cmd_buffer->device->trace_bo))
      radv_save_descriptors(cmd_buffer, bind_point);
}
3537
3538 static bool
radv_shader_loads_push_constants(struct radv_pipeline * pipeline,gl_shader_stage stage)3539 radv_shader_loads_push_constants(struct radv_pipeline *pipeline, gl_shader_stage stage)
3540 {
3541 struct radv_userdata_info *loc =
3542 radv_lookup_user_sgpr(pipeline, stage, AC_UD_PUSH_CONSTANTS);
3543 return loc->sgpr_idx != -1;
3544 }
3545
3546 static void
radv_emit_all_inline_push_consts(struct radv_device * device,struct radeon_cmdbuf * cs,struct radv_pipeline * pipeline,gl_shader_stage stage,uint32_t * values,bool * need_push_constants)3547 radv_emit_all_inline_push_consts(struct radv_device *device, struct radeon_cmdbuf *cs,
3548 struct radv_pipeline *pipeline, gl_shader_stage stage,
3549 uint32_t *values, bool *need_push_constants)
3550 {
3551 const struct radv_shader *shader = radv_get_shader(pipeline, stage);
3552 if (!shader)
3553 return;
3554
3555 *need_push_constants |= radv_shader_loads_push_constants(pipeline, stage);
3556
3557 const uint64_t mask = shader->info.inline_push_constant_mask;
3558 if (!mask)
3559 return;
3560
3561 const uint8_t base = ffs(mask) - 1;
3562 if (mask == u_bit_consecutive64(base, util_last_bit64(mask) - base)) {
3563 /* consecutive inline push constants */
3564 radv_emit_inline_push_consts(device, cs, pipeline, stage, AC_UD_INLINE_PUSH_CONSTANTS,
3565 values + base);
3566 } else {
3567 /* sparse inline push constants */
3568 uint32_t consts[AC_MAX_INLINE_PUSH_CONSTS];
3569 unsigned num_consts = 0;
3570 u_foreach_bit64 (idx, mask)
3571 consts[num_consts++] = values[idx];
3572 radv_emit_inline_push_consts(device, cs, pipeline, stage, AC_UD_INLINE_PUSH_CONSTANTS,
3573 consts);
3574 }
3575 }
3576
/* Flush dirty push constants for the stages of one bind point: emit inline
 * push constants per stage and, when any stage loads the full buffer, upload
 * the push-constant block (plus dynamic-buffer descriptors) and point every
 * stage's AC_UD_PUSH_CONSTANTS user SGPR at it.
 */
static void
radv_flush_constants(struct radv_cmd_buffer *cmd_buffer, VkShaderStageFlags stages,
                     struct radv_pipeline *pipeline, VkPipelineBindPoint bind_point)
{
   struct radv_device *device = cmd_buffer->device;
   struct radeon_cmdbuf *cs = cmd_buffer->cs;
   struct radv_descriptor_state *descriptors_state =
      radv_get_descriptors_state(cmd_buffer, bind_point);
   struct radv_shader *shader, *prev_shader;
   bool need_push_constants = false;
   unsigned offset;
   void *ptr;
   uint64_t va;
   uint32_t internal_stages;
   uint32_t dirty_stages = 0;

   stages &= cmd_buffer->push_constant_stages;
   if (!stages || (!pipeline->push_constant_size && !pipeline->dynamic_offset_count))
      return;

   /* Ray tracing runs as compute internally; flushing one bind point marks
    * the overlapping stages of the other dirty again (NOTE(review):
    * presumably because they share the compute user SGPRs — confirm). */
   internal_stages = stages;
   switch (bind_point) {
   case VK_PIPELINE_BIND_POINT_GRAPHICS:
      break;
   case VK_PIPELINE_BIND_POINT_COMPUTE:
      dirty_stages = RADV_RT_STAGE_BITS;
      break;
   case VK_PIPELINE_BIND_POINT_RAY_TRACING_KHR:
      internal_stages = VK_SHADER_STAGE_COMPUTE_BIT;
      dirty_stages = VK_SHADER_STAGE_COMPUTE_BIT;
      break;
   default:
      unreachable("Unhandled bind point");
   }

   /* Task shaders are emitted on the internal ACE command stream. */
   radv_foreach_stage(stage, internal_stages & ~VK_SHADER_STAGE_TASK_BIT_NV)
   {
      radv_emit_all_inline_push_consts(
         device, cs, pipeline, stage, (uint32_t *)cmd_buffer->push_constants, &need_push_constants);
   }

   if (internal_stages & VK_SHADER_STAGE_TASK_BIT_NV) {
      radv_emit_all_inline_push_consts(device, cmd_buffer->ace_internal.cs, pipeline,
                                       MESA_SHADER_TASK, (uint32_t *)cmd_buffer->push_constants,
                                       &need_push_constants);
   }

   if (need_push_constants) {
      /* Upload push constants followed by the 16-byte dynamic-buffer
       * descriptors into a single block. */
      if (!radv_cmd_buffer_upload_alloc(
             cmd_buffer, pipeline->push_constant_size + 16 * pipeline->dynamic_offset_count, &offset,
             &ptr))
         return;

      memcpy(ptr, cmd_buffer->push_constants, pipeline->push_constant_size);
      memcpy((char *)ptr + pipeline->push_constant_size, descriptors_state->dynamic_buffers,
             16 * pipeline->dynamic_offset_count);

      va = radv_buffer_get_va(cmd_buffer->upload.upload_bo);
      va += offset;

      ASSERTED unsigned cdw_max =
         radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, MESA_VULKAN_SHADER_STAGES * 4);

      prev_shader = NULL;
      radv_foreach_stage(stage, internal_stages & ~VK_SHADER_STAGE_TASK_BIT_NV)
      {
         shader = radv_get_shader(pipeline, stage);

         /* Avoid redundantly emitting the address for merged stages. */
         if (shader && shader != prev_shader) {
            radv_emit_userdata_address(device, cs, pipeline, stage, AC_UD_PUSH_CONSTANTS, va);

            prev_shader = shader;
         }
      }

      if (internal_stages & VK_SHADER_STAGE_TASK_BIT_NV) {
         radv_emit_userdata_address(device, cmd_buffer->ace_internal.cs, pipeline, MESA_SHADER_TASK,
                                    AC_UD_PUSH_CONSTANTS, va);
      }

      assert(cmd_buffer->cs->cdw <= cdw_max);
   }

   cmd_buffer->push_constant_stages &= ~stages;
   cmd_buffer->push_constant_stages |= dirty_stages;
}
3664
/* Pre-combined DST_SEL_{X,Y,Z,W} swizzle words for buffer resource
 * descriptors: components absent from the fetched data read 0, alpha reads 1.
 * ZYXW swaps the R and B channels (used for post-shuffle formats below). */
enum radv_dst_sel {
   DST_SEL_0001 = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_0) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_0) |
                  S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_0) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_1),
   DST_SEL_X001 = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_0) |
                  S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_0) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_1),
   DST_SEL_XY01 = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
                  S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_0) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_1),
   DST_SEL_XYZ1 = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
                  S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_1),
   DST_SEL_XYZW = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
                  S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W),
   DST_SEL_ZYXW = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
                  S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W),
};
3679
/* Destination swizzle per hardware buffer data format, so components the
 * format doesn't supply are filled with (0, 0, 0, 1). Indexed by
 * V_008F0C_BUF_DATA_FORMAT_*. */
static const uint32_t data_format_dst_sel[] = {
   [V_008F0C_BUF_DATA_FORMAT_INVALID] = DST_SEL_0001,
   [V_008F0C_BUF_DATA_FORMAT_8] = DST_SEL_X001,
   [V_008F0C_BUF_DATA_FORMAT_16] = DST_SEL_X001,
   [V_008F0C_BUF_DATA_FORMAT_8_8] = DST_SEL_XY01,
   [V_008F0C_BUF_DATA_FORMAT_32] = DST_SEL_X001,
   [V_008F0C_BUF_DATA_FORMAT_16_16] = DST_SEL_XY01,
   [V_008F0C_BUF_DATA_FORMAT_10_11_11] = DST_SEL_XYZ1,
   [V_008F0C_BUF_DATA_FORMAT_11_11_10] = DST_SEL_XYZ1,
   [V_008F0C_BUF_DATA_FORMAT_10_10_10_2] = DST_SEL_XYZW,
   [V_008F0C_BUF_DATA_FORMAT_2_10_10_10] = DST_SEL_XYZW,
   [V_008F0C_BUF_DATA_FORMAT_8_8_8_8] = DST_SEL_XYZW,
   [V_008F0C_BUF_DATA_FORMAT_32_32] = DST_SEL_XY01,
   [V_008F0C_BUF_DATA_FORMAT_16_16_16_16] = DST_SEL_XYZW,
   [V_008F0C_BUF_DATA_FORMAT_32_32_32] = DST_SEL_XYZ1,
   [V_008F0C_BUF_DATA_FORMAT_32_32_32_32] = DST_SEL_XYZW,
};
3697
/* Write one vertex-buffer resource descriptor (4 dwords) per used slot into
 * vb_ptr.  With dynamic vertex input the format/swizzle/num_records come from
 * cmd_buffer->state.dynamic_vs_input; otherwise they come from the pipeline.
 * full_null_descriptors=true writes fully-populated descriptors even for
 * unbound/empty buffers (per the comments below, for the DGC generation
 * shader).
 */
void
radv_write_vertex_descriptors(const struct radv_cmd_buffer *cmd_buffer,
                              const struct radv_graphics_pipeline *pipeline,
                              bool full_null_descriptors, void *vb_ptr)
{
   struct radv_shader *vs_shader = radv_get_shader(&pipeline->base, MESA_SHADER_VERTEX);
   enum amd_gfx_level chip = cmd_buffer->device->physical_device->rad_info.gfx_level;
   unsigned desc_index = 0;
   uint32_t mask = pipeline->vb_desc_usage_mask;
   uint64_t va;
   const struct radv_vs_input_state *vs_state =
      vs_shader->info.vs.dynamic_inputs ? &cmd_buffer->state.dynamic_vs_input : NULL;
   assert(!vs_state || pipeline->use_per_attribute_vb_descs);

   while (mask) {
      unsigned i = u_bit_scan(&mask);
      uint32_t *desc = &((uint32_t *)vb_ptr)[desc_index++ * 4];
      uint32_t offset, rsrc_word3;
      /* With per-attribute descriptors, i is an attribute index and must be
       * mapped to its binding; otherwise slots are bindings directly. */
      unsigned binding =
         vs_state ? cmd_buffer->state.dynamic_vs_input.bindings[i]
                  : (pipeline->use_per_attribute_vb_descs ? pipeline->attrib_bindings[i] : i);
      struct radv_buffer *buffer = cmd_buffer->vertex_binding_buffers[binding];
      unsigned num_records;
      unsigned stride;

      if (vs_state) {
         /* Dynamic vertex input packs dfmt in bits 0-3 and nfmt in bits 4-6
          * of the format byte. */
         unsigned format = vs_state->formats[i];
         unsigned dfmt = format & 0xf;
         unsigned nfmt = (format >> 4) & 0x7;

         rsrc_word3 = vs_state->post_shuffle & (1u << i) ? DST_SEL_ZYXW : data_format_dst_sel[dfmt];

         if (chip >= GFX10)
            rsrc_word3 |= S_008F0C_FORMAT(ac_get_tbuffer_format(chip, dfmt, nfmt));
         else
            rsrc_word3 |= S_008F0C_NUM_FORMAT(nfmt) | S_008F0C_DATA_FORMAT(dfmt);
      } else {
         /* Static vertex input: the shader does the typed conversion itself,
          * so the descriptor uses a raw 32-bit uint format. */
         if (chip >= GFX10)
            rsrc_word3 = DST_SEL_XYZW | S_008F0C_FORMAT(V_008F0C_GFX10_FORMAT_32_UINT);
         else
            rsrc_word3 = DST_SEL_XYZW | S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_UINT) |
                         S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
      }

      if (pipeline->uses_dynamic_stride) {
         stride = cmd_buffer->vertex_bindings[binding].stride;
      } else {
         stride = pipeline->binding_stride[binding];
      }

      if (!buffer) {
         if (full_null_descriptors) {
            /* Put all the info in for the DGC generation shader in case the VBO gets overridden. */
            desc[0] = 0;
            desc[1] = S_008F04_STRIDE(stride);
            desc[2] = 0;
            desc[3] = rsrc_word3;
         } else if (vs_state) {
            /* Stride needs to be non-zero on GFX9, or else bounds checking is disabled. We need
             * to include the format/word3 so that the alpha channel is 1 for formats without an
             * alpha channel.
             */
            desc[0] = 0;
            desc[1] = S_008F04_STRIDE(16);
            desc[2] = 0;
            desc[3] = rsrc_word3;
         } else {
            memset(desc, 0, 4 * 4);
         }

         continue;
      }

      va = radv_buffer_get_va(buffer->bo);

      offset = cmd_buffer->vertex_bindings[binding].offset;
      va += offset + buffer->offset;
      if (vs_state)
         va += vs_state->offsets[i];

      /* An explicit vkCmdBindVertexBuffers2 size wins over the buffer range. */
      if (cmd_buffer->vertex_bindings[binding].size) {
         num_records = cmd_buffer->vertex_bindings[binding].size;
      } else {
         num_records = vk_buffer_range(&buffer->vk, offset, VK_WHOLE_SIZE);
      }

      if (pipeline->use_per_attribute_vb_descs) {
         /* Last byte (exclusive) of this attribute within a vertex. */
         uint32_t attrib_end =
            vs_state ? vs_state->offsets[i] + vs_state->format_sizes[i] : pipeline->attrib_ends[i];

         if (num_records < attrib_end) {
            num_records = 0; /* not enough space for one vertex */
         } else if (stride == 0) {
            num_records = 1; /* only one vertex */
         } else {
            num_records = (num_records - attrib_end) / stride + 1;
            /* If attrib_offset>stride, then the compiler will increase the vertex index by
             * attrib_offset/stride and decrease the offset by attrib_offset%stride. This is
             * only allowed with static strides.
             */
            num_records += pipeline->attrib_index_offset[i];
         }

         /* GFX10 uses OOB_SELECT_RAW if stride==0, so convert num_records from elements into
          * into bytes in that case. GFX8 always uses bytes.
          */
         if (num_records && (chip == GFX8 || (chip != GFX9 && !stride))) {
            num_records = (num_records - 1) * stride + attrib_end;
         } else if (!num_records) {
            /* On GFX9, it seems bounds checking is disabled if both
             * num_records and stride are zero. This doesn't seem necessary on GFX8, GFX10 and
             * GFX10.3 but it doesn't hurt.
             */
            if (full_null_descriptors) {
               /* Put all the info in for the DGC generation shader in case the VBO gets overridden.
                */
               desc[0] = 0;
               desc[1] = S_008F04_STRIDE(stride);
               desc[2] = 0;
               desc[3] = rsrc_word3;
            } else if (vs_state) {
               desc[0] = 0;
               desc[1] = S_008F04_STRIDE(16);
               desc[2] = 0;
               desc[3] = rsrc_word3;
            } else {
               memset(desc, 0, 16);
            }

            continue;
         }
      } else {
         /* Per-binding descriptors count whole vertices (except on GFX8,
          * which counts bytes). */
         if (chip != GFX8 && stride)
            num_records = DIV_ROUND_UP(num_records, stride);
      }

      if (chip >= GFX10) {
         /* OOB_SELECT chooses the out-of-bounds check:
          * - 1: index >= NUM_RECORDS (Structured)
          * - 3: offset >= NUM_RECORDS (Raw)
          */
         int oob_select = stride ? V_008F0C_OOB_SELECT_STRUCTURED : V_008F0C_OOB_SELECT_RAW;
         rsrc_word3 |= S_008F0C_OOB_SELECT(oob_select) | S_008F0C_RESOURCE_LEVEL(chip < GFX11);
      }

      desc[0] = va;
      desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32) | S_008F04_STRIDE(stride);
      desc[2] = num_records;
      desc[3] = rsrc_word3;
   }
}
3849
3850 static void
radv_flush_vertex_descriptors(struct radv_cmd_buffer * cmd_buffer,bool pipeline_is_dirty)3851 radv_flush_vertex_descriptors(struct radv_cmd_buffer *cmd_buffer, bool pipeline_is_dirty)
3852 {
3853 if ((pipeline_is_dirty || (cmd_buffer->state.dirty & RADV_CMD_DIRTY_VERTEX_BUFFER)) &&
3854 cmd_buffer->state.graphics_pipeline->vb_desc_usage_mask) {
3855 /* Mesh shaders don't have vertex descriptors. */
3856 assert(!cmd_buffer->state.mesh_shading);
3857
3858 struct radv_graphics_pipeline *pipeline = cmd_buffer->state.graphics_pipeline;
3859 unsigned vb_offset;
3860 void *vb_ptr;
3861 uint64_t va;
3862
3863 /* allocate some descriptor state for vertex buffers */
3864 if (!radv_cmd_buffer_upload_alloc(cmd_buffer, pipeline->vb_desc_alloc_size, &vb_offset,
3865 &vb_ptr))
3866 return;
3867
3868 radv_write_vertex_descriptors(cmd_buffer, pipeline, false, vb_ptr);
3869
3870 va = radv_buffer_get_va(cmd_buffer->upload.upload_bo);
3871 va += vb_offset;
3872
3873 radv_emit_userdata_address(cmd_buffer->device, cmd_buffer->cs, &pipeline->base,
3874 MESA_SHADER_VERTEX, AC_UD_VS_VERTEX_BUFFERS, va);
3875
3876 cmd_buffer->state.vb_va = va;
3877 cmd_buffer->state.prefetch_L2_mask |= RADV_PREFETCH_VBO_DESCRIPTORS;
3878
3879 if (unlikely(cmd_buffer->device->trace_bo))
3880 radv_save_vertex_descriptors(cmd_buffer, (uintptr_t)vb_ptr);
3881 }
3882 cmd_buffer->state.dirty &= ~RADV_CMD_DIRTY_VERTEX_BUFFER;
3883 }
3884
3885 static void
radv_emit_streamout_buffers(struct radv_cmd_buffer * cmd_buffer,uint64_t va)3886 radv_emit_streamout_buffers(struct radv_cmd_buffer *cmd_buffer, uint64_t va)
3887 {
3888 struct radv_graphics_pipeline *pipeline = cmd_buffer->state.graphics_pipeline;
3889 struct radv_userdata_info *loc;
3890 uint32_t base_reg;
3891
3892 for (unsigned stage = 0; stage < MESA_VULKAN_SHADER_STAGES; ++stage) {
3893 if (!radv_get_shader(&pipeline->base, stage))
3894 continue;
3895
3896 loc = radv_lookup_user_sgpr(&pipeline->base, stage, AC_UD_STREAMOUT_BUFFERS);
3897 if (loc->sgpr_idx == -1)
3898 continue;
3899
3900 base_reg = pipeline->base.user_data_0[stage];
3901
3902 radv_emit_shader_pointer(cmd_buffer->device, cmd_buffer->cs, base_reg + loc->sgpr_idx * 4, va,
3903 false);
3904 }
3905
3906 if (radv_pipeline_has_gs_copy_shader(&pipeline->base)) {
3907 loc = &pipeline->base.gs_copy_shader->info.user_sgprs_locs.shader_data[AC_UD_STREAMOUT_BUFFERS];
3908 if (loc->sgpr_idx != -1) {
3909 base_reg = R_00B130_SPI_SHADER_USER_DATA_VS_0;
3910
3911 radv_emit_shader_pointer(cmd_buffer->device, cmd_buffer->cs, base_reg + loc->sgpr_idx * 4,
3912 va, false);
3913 }
3914 }
3915 }
3916
static void
radv_flush_streamout_descriptors(struct radv_cmd_buffer *cmd_buffer)
{
   /* Upload a fresh descriptor table for the bound transform feedback buffers
    * and point the shaders at it, when the streamout bindings changed.
    */
   if (cmd_buffer->state.dirty & RADV_CMD_DIRTY_STREAMOUT_BUFFER) {
      struct radv_streamout_binding *sb = cmd_buffer->streamout_bindings;
      struct radv_streamout_state *so = &cmd_buffer->state.streamout;
      unsigned so_offset;
      void *so_ptr;
      uint64_t va;

      /* Allocate some descriptor state for streamout buffers (4 dwords per
       * slot). On failure, return without clearing the dirty bit so a later
       * flush can retry.
       */
      if (!radv_cmd_buffer_upload_alloc(cmd_buffer, MAX_SO_BUFFERS * 16, &so_offset, &so_ptr))
         return;

      for (uint32_t i = 0; i < MAX_SO_BUFFERS; i++) {
         struct radv_buffer *buffer = sb[i].buffer;
         uint32_t *desc = &((uint32_t *)so_ptr)[i * 4];

         /* Disabled slots are skipped; their descriptor dwords are left
          * unwritten.
          */
         if (!(so->enabled_mask & (1 << i)))
            continue;

         va = radv_buffer_get_va(buffer->bo) + buffer->offset;

         va += sb[i].offset;

         /* Set the descriptor.
          *
          * On GFX8, the format must be non-INVALID, otherwise
          * the buffer will be considered not bound and store
          * instructions will be no-ops.
          */
         uint32_t size = 0xffffffff;

         /* Compute the correct buffer size for NGG streamout
          * because it's used to determine the max emit per
          * buffer.
          */
         if (cmd_buffer->device->physical_device->use_ngg_streamout)
            size = buffer->vk.size - sb[i].offset;

         uint32_t rsrc_word3 =
            S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
            S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W);

         /* Per-generation format and out-of-bounds-check encoding. */
         if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX11) {
            rsrc_word3 |= S_008F0C_FORMAT(V_008F0C_GFX11_FORMAT_32_FLOAT) |
                          S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW);
         } else if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX10) {
            rsrc_word3 |= S_008F0C_FORMAT(V_008F0C_GFX10_FORMAT_32_FLOAT) |
                          S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) | S_008F0C_RESOURCE_LEVEL(1);
         } else {
            rsrc_word3 |= S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
         }

         desc[0] = va;
         desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32);
         desc[2] = size;
         desc[3] = rsrc_word3;
      }

      va = radv_buffer_get_va(cmd_buffer->upload.upload_bo);
      va += so_offset;

      radv_emit_streamout_buffers(cmd_buffer, va);
   }

   cmd_buffer->state.dirty &= ~RADV_CMD_DIRTY_STREAMOUT_BUFFER;
}
3985
3986 static void
radv_flush_ngg_query_state(struct radv_cmd_buffer * cmd_buffer)3987 radv_flush_ngg_query_state(struct radv_cmd_buffer *cmd_buffer)
3988 {
3989 struct radv_graphics_pipeline *pipeline = cmd_buffer->state.graphics_pipeline;
3990 const unsigned stage = pipeline->last_vgt_api_stage;
3991 struct radv_userdata_info *loc;
3992 uint32_t ngg_query_state = 0;
3993 uint32_t base_reg;
3994
3995 loc = radv_lookup_user_sgpr(&pipeline->base, stage, AC_UD_NGG_QUERY_STATE);
3996 if (loc->sgpr_idx == -1)
3997 return;
3998
3999 assert(pipeline->is_ngg);
4000
4001 /* By default NGG queries are disabled but they are enabled if the command buffer has active GDS
4002 * queries or if it's a secondary command buffer that inherits the number of generated
4003 * primitives.
4004 */
4005 if (cmd_buffer->state.active_pipeline_gds_queries ||
4006 (cmd_buffer->state.inherited_pipeline_statistics &
4007 VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_PRIMITIVES_BIT))
4008 ngg_query_state = 1;
4009
4010 base_reg = pipeline->base.user_data_0[stage];
4011 assert(loc->sgpr_idx != -1);
4012
4013 radeon_set_sh_reg(cmd_buffer->cs, base_reg + loc->sgpr_idx * 4, ngg_query_state);
4014 }
4015
4016 static void
radv_flush_force_vrs_state(struct radv_cmd_buffer * cmd_buffer)4017 radv_flush_force_vrs_state(struct radv_cmd_buffer *cmd_buffer)
4018 {
4019 struct radv_graphics_pipeline *pipeline = cmd_buffer->state.graphics_pipeline;
4020 enum amd_gfx_level gfx_level = pipeline->base.device->physical_device->rad_info.gfx_level;
4021 const unsigned stage = pipeline->last_vgt_api_stage;
4022 struct radv_userdata_info *loc;
4023 uint32_t vrs_rates = 0;
4024 uint32_t base_reg;
4025
4026 if (!pipeline->force_vrs_per_vertex) {
4027 /* Un-set the SGPR index so we know to re-emit it later. */
4028 cmd_buffer->state.last_vrs_rates_sgpr_idx = -1;
4029 return;
4030 }
4031
4032 loc = radv_lookup_user_sgpr(&pipeline->base, stage, AC_UD_FORCE_VRS_RATES);
4033 assert(loc->sgpr_idx != -1);
4034
4035 base_reg = pipeline->base.user_data_0[stage];
4036
4037 switch (cmd_buffer->device->force_vrs) {
4038 case RADV_FORCE_VRS_2x2:
4039 vrs_rates = gfx_level >= GFX11 ? V_0283D0_VRS_SHADING_RATE_2X2 : (1u << 2) | (1u << 4);
4040 break;
4041 case RADV_FORCE_VRS_2x1:
4042 vrs_rates = gfx_level >= GFX11 ? V_0283D0_VRS_SHADING_RATE_2X1 : (1u << 2) | (0u << 4);
4043 break;
4044 case RADV_FORCE_VRS_1x2:
4045 vrs_rates = gfx_level >= GFX11 ? V_0283D0_VRS_SHADING_RATE_1X2 : (0u << 2) | (1u << 4);
4046 break;
4047 default:
4048 break;
4049 }
4050
4051 if (cmd_buffer->state.last_vrs_rates != vrs_rates ||
4052 cmd_buffer->state.last_vrs_rates_sgpr_idx != loc->sgpr_idx) {
4053 radeon_set_sh_reg(cmd_buffer->cs, base_reg + loc->sgpr_idx * 4, vrs_rates);
4054 }
4055
4056 cmd_buffer->state.last_vrs_rates = vrs_rates;
4057 cmd_buffer->state.last_vrs_rates_sgpr_idx = loc->sgpr_idx;
4058 }
4059
4060 static void
radv_upload_graphics_shader_descriptors(struct radv_cmd_buffer * cmd_buffer,bool pipeline_is_dirty)4061 radv_upload_graphics_shader_descriptors(struct radv_cmd_buffer *cmd_buffer, bool pipeline_is_dirty)
4062 {
4063 struct radv_graphics_pipeline *pipeline = cmd_buffer->state.graphics_pipeline;
4064
4065 radv_flush_vertex_descriptors(cmd_buffer, pipeline_is_dirty);
4066 radv_flush_streamout_descriptors(cmd_buffer);
4067
4068 VkShaderStageFlags stages = VK_SHADER_STAGE_ALL_GRAPHICS | VK_SHADER_STAGE_MESH_BIT_NV;
4069 radv_flush_descriptors(cmd_buffer, stages, &pipeline->base, VK_PIPELINE_BIND_POINT_GRAPHICS);
4070 radv_flush_constants(cmd_buffer, stages, &pipeline->base, VK_PIPELINE_BIND_POINT_GRAPHICS);
4071 radv_flush_ngg_query_state(cmd_buffer);
4072 radv_flush_force_vrs_state(cmd_buffer);
4073 }
4074
/**
 * Parameters describing a single draw call, consumed by the draw-emission
 * helpers below.
 */
struct radv_draw_info {
   /**
    * Number of vertices.
    */
   uint32_t count;

   /**
    * First instance id.
    */
   uint32_t first_instance;

   /**
    * Number of instances.
    */
   uint32_t instance_count;

   /**
    * Whether it's an indexed draw.
    */
   bool indexed;

   /**
    * Indirect draw parameters resource (non-NULL when the draw is indirect).
    */
   struct radv_buffer *indirect;
   uint64_t indirect_offset;
   uint32_t stride;

   /**
    * Draw count parameters resource.
    */
   struct radv_buffer *count_buffer;
   uint64_t count_buffer_offset;

   /**
    * Stream output parameters resource.
    */
   struct radv_buffer *strmout_buffer;
   uint64_t strmout_buffer_offset;
};
4115
4116 static uint32_t
radv_get_primitive_reset_index(struct radv_cmd_buffer * cmd_buffer)4117 radv_get_primitive_reset_index(struct radv_cmd_buffer *cmd_buffer)
4118 {
4119 uint32_t index_type = G_028A7C_INDEX_TYPE(cmd_buffer->state.index_type);
4120 switch (index_type) {
4121 case V_028A7C_VGT_INDEX_8:
4122 return 0xffu;
4123 case V_028A7C_VGT_INDEX_16:
4124 return 0xffffu;
4125 case V_028A7C_VGT_INDEX_32:
4126 return 0xffffffffu;
4127 default:
4128 unreachable("invalid index type");
4129 }
4130 }
4131
4132 static void
si_emit_ia_multi_vgt_param(struct radv_cmd_buffer * cmd_buffer,bool instanced_draw,bool indirect_draw,bool count_from_stream_output,uint32_t draw_vertex_count)4133 si_emit_ia_multi_vgt_param(struct radv_cmd_buffer *cmd_buffer, bool instanced_draw,
4134 bool indirect_draw, bool count_from_stream_output,
4135 uint32_t draw_vertex_count)
4136 {
4137 struct radeon_info *info = &cmd_buffer->device->physical_device->rad_info;
4138 struct radv_cmd_state *state = &cmd_buffer->state;
4139 unsigned topology = state->dynamic.primitive_topology;
4140 bool prim_restart_enable = state->dynamic.primitive_restart_enable;
4141 struct radeon_cmdbuf *cs = cmd_buffer->cs;
4142 unsigned ia_multi_vgt_param;
4143
4144 ia_multi_vgt_param =
4145 si_get_ia_multi_vgt_param(cmd_buffer, instanced_draw, indirect_draw, count_from_stream_output,
4146 draw_vertex_count, topology, prim_restart_enable);
4147
4148 if (state->last_ia_multi_vgt_param != ia_multi_vgt_param) {
4149 if (info->gfx_level == GFX9) {
4150 radeon_set_uconfig_reg_idx(cmd_buffer->device->physical_device, cs,
4151 R_030960_IA_MULTI_VGT_PARAM, 4, ia_multi_vgt_param);
4152 } else if (info->gfx_level >= GFX7) {
4153 radeon_set_context_reg_idx(cs, R_028AA8_IA_MULTI_VGT_PARAM, 1, ia_multi_vgt_param);
4154 } else {
4155 radeon_set_context_reg(cs, R_028AA8_IA_MULTI_VGT_PARAM, ia_multi_vgt_param);
4156 }
4157 state->last_ia_multi_vgt_param = ia_multi_vgt_param;
4158 }
4159 }
4160
static void
radv_emit_draw_registers(struct radv_cmd_buffer *cmd_buffer, const struct radv_draw_info *draw_info)
{
   /* Emit the per-draw register state: IA_MULTI_VGT_PARAM (pre-GFX10), the
    * primitive restart index, the draw-opaque streamout registers, and the
    * index type.
    */
   struct radeon_info *info = &cmd_buffer->device->physical_device->rad_info;
   struct radv_cmd_state *state = &cmd_buffer->state;
   struct radeon_cmdbuf *cs = cmd_buffer->cs;
   uint32_t topology = state->dynamic.primitive_topology;
   bool disable_instance_packing = false;

   /* Draw state. */
   if (info->gfx_level < GFX10) {
      si_emit_ia_multi_vgt_param(cmd_buffer, draw_info->instance_count > 1, draw_info->indirect,
                                 !!draw_info->strmout_buffer,
                                 draw_info->indirect ? 0 : draw_info->count);
   }

   if (state->dynamic.primitive_restart_enable) {
      /* Only re-emit the restart index when it changed. */
      uint32_t primitive_reset_index = radv_get_primitive_reset_index(cmd_buffer);

      if (primitive_reset_index != state->last_primitive_reset_index) {
         radeon_set_context_reg(cs, R_02840C_VGT_MULTI_PRIM_IB_RESET_INDX, primitive_reset_index);
         state->last_primitive_reset_index = primitive_reset_index;
      }
   }

   if (draw_info->strmout_buffer) {
      /* Copy the buffer-filled size from memory into
       * VGT_STRMOUT_DRAW_OPAQUE_BUFFER_FILLED_SIZE for the draw-opaque path.
       */
      uint64_t va = radv_buffer_get_va(draw_info->strmout_buffer->bo);

      va += draw_info->strmout_buffer->offset + draw_info->strmout_buffer_offset;

      radeon_set_context_reg(cs, R_028B30_VGT_STRMOUT_DRAW_OPAQUE_VERTEX_STRIDE, draw_info->stride);

      if (info->gfx_level >= GFX10) {
         /* Emitting a COPY_DATA packet should be enough because RADV doesn't support preemption
          * (shadow memory) but for unknown reasons, it can lead to GPU hangs on GFX10+.
          */
         radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0));
         radeon_emit(cs, 0);

         radeon_emit(cs, PKT3(PKT3_LOAD_CONTEXT_REG_INDEX, 3, 0));
         radeon_emit(cs, va);
         radeon_emit(cs, va >> 32);
         radeon_emit(cs, (R_028B2C_VGT_STRMOUT_DRAW_OPAQUE_BUFFER_FILLED_SIZE - SI_CONTEXT_REG_OFFSET) >> 2);
         radeon_emit(cs, 1); /* 1 DWORD */
      } else {
         radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
         radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_SRC_MEM) | COPY_DATA_DST_SEL(COPY_DATA_REG) |
                            COPY_DATA_WR_CONFIRM);
         radeon_emit(cs, va);
         radeon_emit(cs, va >> 32);
         radeon_emit(cs, R_028B2C_VGT_STRMOUT_DRAW_OPAQUE_BUFFER_FILLED_SIZE >> 2);
         radeon_emit(cs, 0); /* unused */
      }

      radv_cs_add_buffer(cmd_buffer->device->ws, cs, draw_info->strmout_buffer->bo);
   }

   /* RDNA2 is affected by a hardware bug when instance packing is enabled for adjacent primitive
    * topologies and instance_count > 1, pipeline stats generated by GE are incorrect. It needs to
    * be applied for indexed and non-indexed draws.
    */
   if (info->gfx_level == GFX10_3 && state->active_pipeline_queries > 0 &&
       (draw_info->instance_count > 1 || draw_info->indirect) &&
       (topology == V_008958_DI_PT_LINELIST_ADJ || topology == V_008958_DI_PT_LINESTRIP_ADJ ||
        topology == V_008958_DI_PT_TRILIST_ADJ || topology == V_008958_DI_PT_TRISTRIP_ADJ)) {
      disable_instance_packing = true;
   }

   /* Re-emit the index type when it changed, or on GFX10.3 when the
    * instance-packing workaround bit changed (it is encoded in the same
    * register).
    */
   if ((draw_info->indexed && state->index_type != state->last_index_type) ||
       (info->gfx_level == GFX10_3 &&
        (state->last_index_type == -1 ||
         disable_instance_packing != G_028A7C_DISABLE_INSTANCE_PACKING(state->last_index_type)))) {
      uint32_t index_type = state->index_type | S_028A7C_DISABLE_INSTANCE_PACKING(disable_instance_packing);

      if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX9) {
         radeon_set_uconfig_reg_idx(cmd_buffer->device->physical_device, cs,
                                    R_03090C_VGT_INDEX_TYPE, 2, index_type);
      } else {
         radeon_emit(cs, PKT3(PKT3_INDEX_TYPE, 0, 0));
         radeon_emit(cs, index_type);
      }

      state->last_index_type = index_type;
   }
}
4246
4247 static void
radv_stage_flush(struct radv_cmd_buffer * cmd_buffer,VkPipelineStageFlags2 src_stage_mask)4248 radv_stage_flush(struct radv_cmd_buffer *cmd_buffer, VkPipelineStageFlags2 src_stage_mask)
4249 {
4250 /* For simplicity, if the barrier wants to wait for the task shader,
4251 * just make it wait for the mesh shader too.
4252 */
4253 if (src_stage_mask & VK_PIPELINE_STAGE_2_TASK_SHADER_BIT_NV)
4254 src_stage_mask |= VK_PIPELINE_STAGE_2_MESH_SHADER_BIT_NV;
4255
4256 if (src_stage_mask & (VK_PIPELINE_STAGE_2_COPY_BIT |
4257 VK_PIPELINE_STAGE_2_RESOLVE_BIT |
4258 VK_PIPELINE_STAGE_2_BLIT_BIT |
4259 VK_PIPELINE_STAGE_2_CLEAR_BIT)) {
4260 /* Be conservative for now. */
4261 src_stage_mask |= VK_PIPELINE_STAGE_2_ALL_TRANSFER_BIT;
4262 }
4263
4264 if (src_stage_mask &
4265 (VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT | VK_PIPELINE_STAGE_2_ALL_TRANSFER_BIT |
4266 VK_PIPELINE_STAGE_2_ACCELERATION_STRUCTURE_BUILD_BIT_KHR |
4267 VK_PIPELINE_STAGE_2_ACCELERATION_STRUCTURE_COPY_BIT_KHR |
4268 VK_PIPELINE_STAGE_2_RAY_TRACING_SHADER_BIT_KHR | VK_PIPELINE_STAGE_2_BOTTOM_OF_PIPE_BIT |
4269 VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT)) {
4270 cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_CS_PARTIAL_FLUSH;
4271 }
4272
4273 if (src_stage_mask &
4274 (VK_PIPELINE_STAGE_2_FRAGMENT_SHADER_BIT | VK_PIPELINE_STAGE_2_EARLY_FRAGMENT_TESTS_BIT |
4275 VK_PIPELINE_STAGE_2_LATE_FRAGMENT_TESTS_BIT | VK_PIPELINE_STAGE_2_COLOR_ATTACHMENT_OUTPUT_BIT |
4276 VK_PIPELINE_STAGE_2_ALL_TRANSFER_BIT | VK_PIPELINE_STAGE_2_BOTTOM_OF_PIPE_BIT |
4277 VK_PIPELINE_STAGE_2_ALL_GRAPHICS_BIT | VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT)) {
4278 cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_PS_PARTIAL_FLUSH;
4279 } else if (src_stage_mask &
4280 (VK_PIPELINE_STAGE_2_DRAW_INDIRECT_BIT | VK_PIPELINE_STAGE_2_VERTEX_INPUT_BIT |
4281 VK_PIPELINE_STAGE_2_VERTEX_SHADER_BIT |
4282 VK_PIPELINE_STAGE_2_TESSELLATION_CONTROL_SHADER_BIT |
4283 VK_PIPELINE_STAGE_2_TESSELLATION_EVALUATION_SHADER_BIT |
4284 VK_PIPELINE_STAGE_2_GEOMETRY_SHADER_BIT |
4285 VK_PIPELINE_STAGE_2_MESH_SHADER_BIT_NV |
4286 VK_PIPELINE_STAGE_2_TRANSFORM_FEEDBACK_BIT_EXT |
4287 VK_PIPELINE_STAGE_2_PRE_RASTERIZATION_SHADERS_BIT)) {
4288 cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_VS_PARTIAL_FLUSH;
4289 }
4290 }
4291
4292 static bool
can_skip_buffer_l2_flushes(struct radv_device * device)4293 can_skip_buffer_l2_flushes(struct radv_device *device)
4294 {
4295 return device->physical_device->rad_info.gfx_level == GFX9 ||
4296 (device->physical_device->rad_info.gfx_level >= GFX10 &&
4297 !device->physical_device->rad_info.tcc_rb_non_coherent);
4298 }
4299
4300 /*
 * In Vulkan, barriers have two kinds of operations:
4302 *
4303 * - visibility (implemented with radv_src_access_flush)
4304 * - availability (implemented with radv_dst_access_flush)
4305 *
4306 * for a memory operation to observe the result of a previous memory operation
4307 * one needs to do a visibility operation from the source memory and then an
4308 * availability operation to the target memory.
4309 *
4310 * The complication is the availability and visibility operations do not need to
4311 * be in the same barrier.
4312 *
 * The cleanest way to implement this is to define the visibility operation to
 * bring the caches to a "state of rest", in which none of the caches below
 * that level are dirty.
4316 *
4317 * For GFX8 and earlier this would be VRAM/GTT with none of the caches dirty.
4318 *
4319 * For GFX9+ we can define the state at rest to be L2 instead of VRAM for all
4320 * buffers and for images marked as coherent, and VRAM/GTT for non-coherent
4321 * images. However, given the existence of memory barriers which do not specify
4322 * the image/buffer it often devolves to just VRAM/GTT anyway.
4323 *
4324 * To help reducing the invalidations for GPUs that have L2 coherency between the
4325 * RB and the shader caches, we always invalidate L2 on the src side, as we can
4326 * use our knowledge of past usage to optimize flushes away.
4327 */
4328
4329 enum radv_cmd_flush_bits
radv_src_access_flush(struct radv_cmd_buffer * cmd_buffer,VkAccessFlags2 src_flags,const struct radv_image * image)4330 radv_src_access_flush(struct radv_cmd_buffer *cmd_buffer, VkAccessFlags2 src_flags,
4331 const struct radv_image *image)
4332 {
4333 bool has_CB_meta = true, has_DB_meta = true;
4334 bool image_is_coherent = image ? image->l2_coherent : false;
4335 enum radv_cmd_flush_bits flush_bits = 0;
4336
4337 if (image) {
4338 if (!radv_image_has_CB_metadata(image))
4339 has_CB_meta = false;
4340 if (!radv_image_has_htile(image))
4341 has_DB_meta = false;
4342 }
4343
4344 u_foreach_bit64(b, src_flags)
4345 {
4346 switch ((VkAccessFlags2)(1 << b)) {
4347 case VK_ACCESS_2_SHADER_WRITE_BIT:
4348 case VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT:
4349 /* since the STORAGE bit isn't set we know that this is a meta operation.
4350 * on the dst flush side we skip CB/DB flushes without the STORAGE bit, so
4351 * set it here. */
4352 if (image && !(image->vk.usage & VK_IMAGE_USAGE_STORAGE_BIT)) {
4353 if (vk_format_is_depth_or_stencil(image->vk.format)) {
4354 flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB;
4355 } else {
4356 flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB;
4357 }
4358 }
4359
4360 if (!image_is_coherent)
4361 flush_bits |= RADV_CMD_FLAG_INV_L2;
4362 break;
4363 case VK_ACCESS_2_ACCELERATION_STRUCTURE_WRITE_BIT_KHR:
4364 case VK_ACCESS_2_TRANSFORM_FEEDBACK_WRITE_BIT_EXT:
4365 case VK_ACCESS_2_TRANSFORM_FEEDBACK_COUNTER_WRITE_BIT_EXT:
4366 if (!image_is_coherent)
4367 flush_bits |= RADV_CMD_FLAG_WB_L2;
4368 break;
4369 case VK_ACCESS_2_COLOR_ATTACHMENT_WRITE_BIT:
4370 flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB;
4371 if (has_CB_meta)
4372 flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB_META;
4373 break;
4374 case VK_ACCESS_2_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT:
4375 flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB;
4376 if (has_DB_meta)
4377 flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB_META;
4378 break;
4379 case VK_ACCESS_2_TRANSFER_WRITE_BIT:
4380 flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB | RADV_CMD_FLAG_FLUSH_AND_INV_DB;
4381
4382 if (!image_is_coherent)
4383 flush_bits |= RADV_CMD_FLAG_INV_L2;
4384 if (has_CB_meta)
4385 flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB_META;
4386 if (has_DB_meta)
4387 flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB_META;
4388 break;
4389 case VK_ACCESS_2_MEMORY_WRITE_BIT:
4390 flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB | RADV_CMD_FLAG_FLUSH_AND_INV_DB;
4391
4392 if (!image_is_coherent)
4393 flush_bits |= RADV_CMD_FLAG_INV_L2;
4394 if (has_CB_meta)
4395 flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB_META;
4396 if (has_DB_meta)
4397 flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB_META;
4398 break;
4399 default:
4400 break;
4401 }
4402 }
4403 return flush_bits;
4404 }
4405
4406 enum radv_cmd_flush_bits
radv_dst_access_flush(struct radv_cmd_buffer * cmd_buffer,VkAccessFlags2 dst_flags,const struct radv_image * image)4407 radv_dst_access_flush(struct radv_cmd_buffer *cmd_buffer, VkAccessFlags2 dst_flags,
4408 const struct radv_image *image)
4409 {
4410 bool has_CB_meta = true, has_DB_meta = true;
4411 enum radv_cmd_flush_bits flush_bits = 0;
4412 bool flush_CB = true, flush_DB = true;
4413 bool image_is_coherent = image ? image->l2_coherent : false;
4414
4415 if (image) {
4416 if (!(image->vk.usage & VK_IMAGE_USAGE_STORAGE_BIT)) {
4417 flush_CB = false;
4418 flush_DB = false;
4419 }
4420
4421 if (!radv_image_has_CB_metadata(image))
4422 has_CB_meta = false;
4423 if (!radv_image_has_htile(image))
4424 has_DB_meta = false;
4425 }
4426
4427 /* All the L2 invalidations below are not the CB/DB. So if there are no incoherent images
4428 * in the L2 cache in CB/DB mode then they are already usable from all the other L2 clients. */
4429 image_is_coherent |=
4430 can_skip_buffer_l2_flushes(cmd_buffer->device) && !cmd_buffer->state.rb_noncoherent_dirty;
4431
4432 u_foreach_bit64(b, dst_flags)
4433 {
4434 switch ((VkAccessFlags2)(1 << b)) {
4435 case VK_ACCESS_2_INDIRECT_COMMAND_READ_BIT:
4436 /* SMEM loads are used to read compute dispatch size in shaders */
4437 if (!cmd_buffer->device->load_grid_size_from_user_sgpr)
4438 flush_bits |= RADV_CMD_FLAG_INV_SCACHE;
4439
4440 /* Ensure the DGC meta shader can read the commands. */
4441 if (cmd_buffer->device->uses_device_generated_commands) {
4442 flush_bits |= RADV_CMD_FLAG_INV_SCACHE | RADV_CMD_FLAG_INV_VCACHE;
4443
4444 if (cmd_buffer->device->physical_device->rad_info.gfx_level < GFX9)
4445 flush_bits |= RADV_CMD_FLAG_INV_L2;
4446 }
4447
4448 break;
4449 case VK_ACCESS_2_INDEX_READ_BIT:
4450 case VK_ACCESS_2_TRANSFORM_FEEDBACK_COUNTER_WRITE_BIT_EXT:
4451 break;
4452 case VK_ACCESS_2_UNIFORM_READ_BIT:
4453 flush_bits |= RADV_CMD_FLAG_INV_VCACHE | RADV_CMD_FLAG_INV_SCACHE;
4454 break;
4455 case VK_ACCESS_2_VERTEX_ATTRIBUTE_READ_BIT:
4456 case VK_ACCESS_2_INPUT_ATTACHMENT_READ_BIT:
4457 case VK_ACCESS_2_TRANSFER_READ_BIT:
4458 case VK_ACCESS_2_TRANSFER_WRITE_BIT:
4459 flush_bits |= RADV_CMD_FLAG_INV_VCACHE;
4460
4461 if (has_CB_meta || has_DB_meta)
4462 flush_bits |= RADV_CMD_FLAG_INV_L2_METADATA;
4463 if (!image_is_coherent)
4464 flush_bits |= RADV_CMD_FLAG_INV_L2;
4465 break;
4466 case VK_ACCESS_2_SHADER_BINDING_TABLE_READ_BIT_KHR:
4467 case VK_ACCESS_2_SHADER_READ_BIT:
4468 case VK_ACCESS_2_SHADER_STORAGE_READ_BIT:
4469 flush_bits |= RADV_CMD_FLAG_INV_VCACHE;
4470 /* Unlike LLVM, ACO uses SMEM for SSBOs and we have to
4471 * invalidate the scalar cache. */
4472 if (!cmd_buffer->device->physical_device->use_llvm && !image)
4473 flush_bits |= RADV_CMD_FLAG_INV_SCACHE;
4474
4475 if (has_CB_meta || has_DB_meta)
4476 flush_bits |= RADV_CMD_FLAG_INV_L2_METADATA;
4477 if (!image_is_coherent)
4478 flush_bits |= RADV_CMD_FLAG_INV_L2;
4479 break;
4480 case VK_ACCESS_2_ACCELERATION_STRUCTURE_READ_BIT_KHR:
4481 flush_bits |= RADV_CMD_FLAG_INV_VCACHE;
4482 if (cmd_buffer->device->physical_device->rad_info.gfx_level < GFX9)
4483 flush_bits |= RADV_CMD_FLAG_INV_L2;
4484 break;
4485 case VK_ACCESS_2_SHADER_WRITE_BIT:
4486 case VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT:
4487 case VK_ACCESS_2_ACCELERATION_STRUCTURE_WRITE_BIT_KHR:
4488 break;
4489 case VK_ACCESS_2_COLOR_ATTACHMENT_READ_BIT:
4490 case VK_ACCESS_2_COLOR_ATTACHMENT_WRITE_BIT:
4491 if (flush_CB)
4492 flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB;
4493 if (has_CB_meta)
4494 flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB_META;
4495 break;
4496 case VK_ACCESS_2_DEPTH_STENCIL_ATTACHMENT_READ_BIT:
4497 case VK_ACCESS_2_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT:
4498 if (flush_DB)
4499 flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB;
4500 if (has_DB_meta)
4501 flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB_META;
4502 break;
4503 case VK_ACCESS_2_MEMORY_READ_BIT:
4504 case VK_ACCESS_2_MEMORY_WRITE_BIT:
4505 flush_bits |= RADV_CMD_FLAG_INV_VCACHE | RADV_CMD_FLAG_INV_SCACHE;
4506 if (!image_is_coherent)
4507 flush_bits |= RADV_CMD_FLAG_INV_L2;
4508 if (flush_CB)
4509 flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB;
4510 if (has_CB_meta)
4511 flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB_META;
4512 if (flush_DB)
4513 flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB;
4514 if (has_DB_meta)
4515 flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB_META;
4516 break;
4517 default:
4518 break;
4519 }
4520 }
4521 return flush_bits;
4522 }
4523
4524 void
radv_emit_subpass_barrier(struct radv_cmd_buffer * cmd_buffer,const struct radv_subpass_barrier * barrier)4525 radv_emit_subpass_barrier(struct radv_cmd_buffer *cmd_buffer,
4526 const struct radv_subpass_barrier *barrier)
4527 {
4528 struct radv_render_pass *pass = cmd_buffer->state.pass;
4529
4530 for (uint32_t i = 0; i < pass->attachment_count; i++) {
4531 struct radv_image_view *iview = cmd_buffer->state.attachments[i].iview;
4532
4533 cmd_buffer->state.flush_bits |=
4534 radv_src_access_flush(cmd_buffer, barrier->src_access_mask, iview->image);
4535 }
4536
4537 radv_stage_flush(cmd_buffer, barrier->src_stage_mask);
4538
4539 for (uint32_t i = 0; i < pass->attachment_count; i++) {
4540 struct radv_image_view *iview = cmd_buffer->state.attachments[i].iview;
4541
4542 cmd_buffer->state.flush_bits |=
4543 radv_dst_access_flush(cmd_buffer, barrier->dst_access_mask, iview->image);
4544 }
4545
4546 radv_ace_internal_barrier(cmd_buffer, barrier->src_stage_mask, barrier->dst_stage_mask);
4547 }
4548
4549 uint32_t
radv_get_subpass_id(struct radv_cmd_buffer * cmd_buffer)4550 radv_get_subpass_id(struct radv_cmd_buffer *cmd_buffer)
4551 {
4552 struct radv_cmd_state *state = &cmd_buffer->state;
4553 uint32_t subpass_id = state->subpass - state->pass->subpasses;
4554
4555 /* The id of this subpass shouldn't exceed the number of subpasses in
4556 * this render pass minus 1.
4557 */
4558 assert(subpass_id < state->pass->subpass_count);
4559 return subpass_id;
4560 }
4561
static struct radv_sample_locations_state *
radv_get_attachment_sample_locations(struct radv_cmd_buffer *cmd_buffer, uint32_t att_idx,
                                     bool begin_subpass)
{
   /* Return the user-provided sample locations for the given attachment's
    * layout transition, or NULL if the default HW sample locations apply.
    */
   struct radv_cmd_state *state = &cmd_buffer->state;
   uint32_t subpass_id = radv_get_subpass_id(cmd_buffer);
   struct radv_image_view *view = state->attachments[att_idx].iview;

   /* Sample locations are irrelevant for single-sampled attachments. */
   if (view->image->info.samples == 1)
      return NULL;

   if (state->pass->attachments[att_idx].first_subpass_idx == subpass_id) {
      /* Return the initial sample locations if this is the initial
       * layout transition of the given subpass attachment.
       */
      if (state->attachments[att_idx].sample_location.count > 0)
         return &state->attachments[att_idx].sample_location;
   } else {
      /* Otherwise return the subpass sample locations if defined. */
      if (state->subpass_sample_locs) {
         /* Because the driver sets the current subpass before
          * initial layout transitions, we should use the sample
          * locations from the previous subpass to avoid an
          * off-by-one problem. Otherwise, use the sample
          * locations for the current subpass for final layout
          * transitions.
          */
         if (begin_subpass)
            subpass_id--;

         for (uint32_t i = 0; i < state->num_subpass_sample_locs; i++) {
            if (state->subpass_sample_locs[i].subpass_idx == subpass_id)
               return &state->subpass_sample_locs[i].sample_location;
         }
      }
   }

   return NULL;
}
4601
static void
radv_handle_subpass_image_transition(struct radv_cmd_buffer *cmd_buffer,
                                     struct radv_subpass_attachment att, bool begin_subpass)
{
   /* Perform the layout transition for one subpass attachment, then record
    * the new layouts as current in the command buffer state.
    */
   unsigned idx = att.attachment;
   struct radv_image_view *view = cmd_buffer->state.attachments[idx].iview;
   struct radv_sample_locations_state *sample_locs;
   VkImageSubresourceRange range;
   range.aspectMask = view->vk.aspects;
   range.baseMipLevel = view->vk.base_mip_level;
   range.levelCount = 1;
   range.baseArrayLayer = view->vk.base_array_layer;
   range.layerCount = cmd_buffer->state.framebuffer->layers;

   if (cmd_buffer->state.subpass->view_mask) {
      /* If the current subpass uses multiview, the driver might have
       * performed a fast color/depth clear to the whole image
       * (including all layers). To make sure the driver will
       * decompress the image correctly (if needed), we have to
       * account for the "real" number of layers. If the view mask is
       * sparse, this will decompress more layers than needed.
       */
      range.layerCount = util_last_bit(cmd_buffer->state.subpass->view_mask);
   }

   /* Get the subpass sample locations for the given attachment, if NULL
    * is returned the driver will use the default HW locations.
    */
   sample_locs = radv_get_attachment_sample_locations(cmd_buffer, idx, begin_subpass);

   /* Determine if the subpass uses separate depth/stencil layouts. */
   bool uses_separate_depth_stencil_layouts = false;
   if ((cmd_buffer->state.attachments[idx].current_layout !=
        cmd_buffer->state.attachments[idx].current_stencil_layout) ||
       (att.layout != att.stencil_layout)) {
      uses_separate_depth_stencil_layouts = true;
   }

   /* For separate layouts, perform depth and stencil transitions
    * separately.
    */
   if (uses_separate_depth_stencil_layouts &&
       (range.aspectMask == (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT))) {
      /* Depth-only transitions. */
      range.aspectMask = VK_IMAGE_ASPECT_DEPTH_BIT;
      radv_handle_image_transition(cmd_buffer, view->image,
                                   cmd_buffer->state.attachments[idx].current_layout,
                                   cmd_buffer->state.attachments[idx].current_in_render_loop,
                                   att.layout, att.in_render_loop, 0, 0, &range, sample_locs);

      /* Stencil-only transitions. */
      range.aspectMask = VK_IMAGE_ASPECT_STENCIL_BIT;
      radv_handle_image_transition(
         cmd_buffer, view->image, cmd_buffer->state.attachments[idx].current_stencil_layout,
         cmd_buffer->state.attachments[idx].current_in_render_loop, att.stencil_layout,
         att.in_render_loop, 0, 0, &range, sample_locs);
   } else {
      /* Combined (or single-aspect) transition. */
      radv_handle_image_transition(cmd_buffer, view->image,
                                   cmd_buffer->state.attachments[idx].current_layout,
                                   cmd_buffer->state.attachments[idx].current_in_render_loop,
                                   att.layout, att.in_render_loop, 0, 0, &range, sample_locs);
   }

   /* Track the attachment's new layouts for subsequent transitions. */
   cmd_buffer->state.attachments[idx].current_layout = att.layout;
   cmd_buffer->state.attachments[idx].current_stencil_layout = att.stencil_layout;
   cmd_buffer->state.attachments[idx].current_in_render_loop = att.in_render_loop;
}
4669
4670 void
radv_cmd_buffer_set_subpass(struct radv_cmd_buffer * cmd_buffer,const struct radv_subpass * subpass)4671 radv_cmd_buffer_set_subpass(struct radv_cmd_buffer *cmd_buffer, const struct radv_subpass *subpass)
4672 {
4673 cmd_buffer->state.subpass = subpass;
4674
4675 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_FRAMEBUFFER;
4676 }
4677
4678 static VkResult
radv_cmd_state_setup_sample_locations(struct radv_cmd_buffer * cmd_buffer,struct radv_render_pass * pass,const VkRenderPassBeginInfo * info)4679 radv_cmd_state_setup_sample_locations(struct radv_cmd_buffer *cmd_buffer,
4680 struct radv_render_pass *pass,
4681 const VkRenderPassBeginInfo *info)
4682 {
4683 const struct VkRenderPassSampleLocationsBeginInfoEXT *sample_locs =
4684 vk_find_struct_const(info->pNext, RENDER_PASS_SAMPLE_LOCATIONS_BEGIN_INFO_EXT);
4685 struct radv_cmd_state *state = &cmd_buffer->state;
4686
4687 if (!sample_locs) {
4688 state->subpass_sample_locs = NULL;
4689 return VK_SUCCESS;
4690 }
4691
4692 for (uint32_t i = 0; i < sample_locs->attachmentInitialSampleLocationsCount; i++) {
4693 const VkAttachmentSampleLocationsEXT *att_sample_locs =
4694 &sample_locs->pAttachmentInitialSampleLocations[i];
4695 uint32_t att_idx = att_sample_locs->attachmentIndex;
4696 struct radv_image *image = cmd_buffer->state.attachments[att_idx].iview->image;
4697
4698 assert(vk_format_is_depth_or_stencil(image->vk.format));
4699
4700 /* From the Vulkan spec 1.1.108:
4701 *
4702 * "If the image referenced by the framebuffer attachment at
4703 * index attachmentIndex was not created with
4704 * VK_IMAGE_CREATE_SAMPLE_LOCATIONS_COMPATIBLE_DEPTH_BIT_EXT
4705 * then the values specified in sampleLocationsInfo are
4706 * ignored."
4707 */
4708 if (!(image->vk.create_flags & VK_IMAGE_CREATE_SAMPLE_LOCATIONS_COMPATIBLE_DEPTH_BIT_EXT))
4709 continue;
4710
4711 const VkSampleLocationsInfoEXT *sample_locs_info = &att_sample_locs->sampleLocationsInfo;
4712
4713 state->attachments[att_idx].sample_location.per_pixel =
4714 sample_locs_info->sampleLocationsPerPixel;
4715 state->attachments[att_idx].sample_location.grid_size =
4716 sample_locs_info->sampleLocationGridSize;
4717 state->attachments[att_idx].sample_location.count = sample_locs_info->sampleLocationsCount;
4718 typed_memcpy(&state->attachments[att_idx].sample_location.locations[0],
4719 sample_locs_info->pSampleLocations, sample_locs_info->sampleLocationsCount);
4720 }
4721
4722 state->subpass_sample_locs =
4723 vk_alloc(&cmd_buffer->pool->vk.alloc,
4724 sample_locs->postSubpassSampleLocationsCount * sizeof(state->subpass_sample_locs[0]),
4725 8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
4726 if (state->subpass_sample_locs == NULL) {
4727 cmd_buffer->record_result = VK_ERROR_OUT_OF_HOST_MEMORY;
4728 return cmd_buffer->record_result;
4729 }
4730
4731 state->num_subpass_sample_locs = sample_locs->postSubpassSampleLocationsCount;
4732
4733 for (uint32_t i = 0; i < sample_locs->postSubpassSampleLocationsCount; i++) {
4734 const VkSubpassSampleLocationsEXT *subpass_sample_locs_info =
4735 &sample_locs->pPostSubpassSampleLocations[i];
4736 const VkSampleLocationsInfoEXT *sample_locs_info =
4737 &subpass_sample_locs_info->sampleLocationsInfo;
4738
4739 state->subpass_sample_locs[i].subpass_idx = subpass_sample_locs_info->subpassIndex;
4740 state->subpass_sample_locs[i].sample_location.per_pixel =
4741 sample_locs_info->sampleLocationsPerPixel;
4742 state->subpass_sample_locs[i].sample_location.grid_size =
4743 sample_locs_info->sampleLocationGridSize;
4744 state->subpass_sample_locs[i].sample_location.count = sample_locs_info->sampleLocationsCount;
4745 typed_memcpy(&state->subpass_sample_locs[i].sample_location.locations[0],
4746 sample_locs_info->pSampleLocations, sample_locs_info->sampleLocationsCount);
4747 }
4748
4749 return VK_SUCCESS;
4750 }
4751
4752 static VkResult
radv_cmd_state_setup_attachments(struct radv_cmd_buffer * cmd_buffer,struct radv_render_pass * pass,const VkRenderPassBeginInfo * info)4753 radv_cmd_state_setup_attachments(struct radv_cmd_buffer *cmd_buffer, struct radv_render_pass *pass,
4754 const VkRenderPassBeginInfo *info)
4755 {
4756 struct radv_cmd_state *state = &cmd_buffer->state;
4757 const struct VkRenderPassAttachmentBeginInfo *attachment_info = NULL;
4758
4759 if (info) {
4760 attachment_info = vk_find_struct_const(info->pNext, RENDER_PASS_ATTACHMENT_BEGIN_INFO);
4761 }
4762
4763 if (pass->attachment_count == 0) {
4764 state->attachments = NULL;
4765 return VK_SUCCESS;
4766 }
4767
4768 state->attachments =
4769 vk_alloc(&cmd_buffer->pool->vk.alloc, pass->attachment_count * sizeof(state->attachments[0]),
4770 8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
4771 if (state->attachments == NULL) {
4772 cmd_buffer->record_result = VK_ERROR_OUT_OF_HOST_MEMORY;
4773 return cmd_buffer->record_result;
4774 }
4775
4776 for (uint32_t i = 0; i < pass->attachment_count; ++i) {
4777 struct radv_render_pass_attachment *att = &pass->attachments[i];
4778 VkImageAspectFlags att_aspects = vk_format_aspects(att->format);
4779 VkImageAspectFlags clear_aspects = 0;
4780
4781 if (att_aspects == VK_IMAGE_ASPECT_COLOR_BIT) {
4782 /* color attachment */
4783 if (att->load_op == VK_ATTACHMENT_LOAD_OP_CLEAR) {
4784 clear_aspects |= VK_IMAGE_ASPECT_COLOR_BIT;
4785 }
4786 } else {
4787 /* depthstencil attachment */
4788 if ((att_aspects & VK_IMAGE_ASPECT_DEPTH_BIT) &&
4789 att->load_op == VK_ATTACHMENT_LOAD_OP_CLEAR) {
4790 clear_aspects |= VK_IMAGE_ASPECT_DEPTH_BIT;
4791 if ((att_aspects & VK_IMAGE_ASPECT_STENCIL_BIT) &&
4792 att->stencil_load_op == VK_ATTACHMENT_LOAD_OP_DONT_CARE)
4793 clear_aspects |= VK_IMAGE_ASPECT_STENCIL_BIT;
4794 }
4795 if ((att_aspects & VK_IMAGE_ASPECT_STENCIL_BIT) &&
4796 att->stencil_load_op == VK_ATTACHMENT_LOAD_OP_CLEAR) {
4797 clear_aspects |= VK_IMAGE_ASPECT_STENCIL_BIT;
4798 }
4799 }
4800
4801 state->attachments[i].pending_clear_aspects = clear_aspects;
4802 state->attachments[i].cleared_views = 0;
4803 if (clear_aspects && info) {
4804 assert(info->clearValueCount > i);
4805 state->attachments[i].clear_value = info->pClearValues[i];
4806 }
4807
4808 state->attachments[i].current_layout = att->initial_layout;
4809 state->attachments[i].current_in_render_loop = false;
4810 state->attachments[i].current_stencil_layout = att->stencil_initial_layout;
4811 state->attachments[i].sample_location.count = 0;
4812
4813 struct radv_image_view *iview;
4814 if (attachment_info && attachment_info->attachmentCount > i) {
4815 iview = radv_image_view_from_handle(attachment_info->pAttachments[i]);
4816 } else {
4817 iview = radv_image_view_from_handle(state->framebuffer->attachments[i]);
4818 }
4819
4820 state->attachments[i].iview = iview;
4821 if (iview->vk.aspects & (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT)) {
4822 radv_initialise_ds_surface(cmd_buffer->device, &state->attachments[i].ds, iview);
4823 } else {
4824 radv_initialise_color_surface(cmd_buffer->device, &state->attachments[i].cb, iview);
4825 }
4826 }
4827
4828 return VK_SUCCESS;
4829 }
4830
4831 VKAPI_ATTR VkResult VKAPI_CALL
radv_AllocateCommandBuffers(VkDevice _device,const VkCommandBufferAllocateInfo * pAllocateInfo,VkCommandBuffer * pCommandBuffers)4832 radv_AllocateCommandBuffers(VkDevice _device, const VkCommandBufferAllocateInfo *pAllocateInfo,
4833 VkCommandBuffer *pCommandBuffers)
4834 {
4835 RADV_FROM_HANDLE(radv_device, device, _device);
4836 RADV_FROM_HANDLE(radv_cmd_pool, pool, pAllocateInfo->commandPool);
4837
4838 VkResult result = VK_SUCCESS;
4839 uint32_t i;
4840
4841 for (i = 0; i < pAllocateInfo->commandBufferCount; i++) {
4842
4843 if (!list_is_empty(&pool->free_cmd_buffers)) {
4844 struct radv_cmd_buffer *cmd_buffer =
4845 list_first_entry(&pool->free_cmd_buffers, struct radv_cmd_buffer, pool_link);
4846
4847 list_del(&cmd_buffer->pool_link);
4848 list_addtail(&cmd_buffer->pool_link, &pool->cmd_buffers);
4849
4850 result = radv_reset_cmd_buffer(cmd_buffer);
4851 vk_command_buffer_finish(&cmd_buffer->vk);
4852 VkResult init_result =
4853 vk_command_buffer_init(&cmd_buffer->vk, &pool->vk, pAllocateInfo->level);
4854 if (init_result != VK_SUCCESS)
4855 result = init_result;
4856
4857 pCommandBuffers[i] = radv_cmd_buffer_to_handle(cmd_buffer);
4858 } else {
4859 result = radv_create_cmd_buffer(device, pool, pAllocateInfo->level, &pCommandBuffers[i]);
4860 }
4861 if (result != VK_SUCCESS)
4862 break;
4863 }
4864
4865 if (result != VK_SUCCESS) {
4866 radv_FreeCommandBuffers(_device, pAllocateInfo->commandPool, i, pCommandBuffers);
4867
4868 /* From the Vulkan 1.0.66 spec:
4869 *
4870 * "vkAllocateCommandBuffers can be used to create multiple
4871 * command buffers. If the creation of any of those command
4872 * buffers fails, the implementation must destroy all
4873 * successfully created command buffer objects from this
4874 * command, set all entries of the pCommandBuffers array to
4875 * NULL and return the error."
4876 */
4877 memset(pCommandBuffers, 0, sizeof(*pCommandBuffers) * pAllocateInfo->commandBufferCount);
4878 }
4879
4880 return result;
4881 }
4882
4883 VKAPI_ATTR void VKAPI_CALL
radv_FreeCommandBuffers(VkDevice device,VkCommandPool commandPool,uint32_t commandBufferCount,const VkCommandBuffer * pCommandBuffers)4884 radv_FreeCommandBuffers(VkDevice device, VkCommandPool commandPool, uint32_t commandBufferCount,
4885 const VkCommandBuffer *pCommandBuffers)
4886 {
4887 RADV_FROM_HANDLE(radv_cmd_pool, pool, commandPool);
4888
4889 for (uint32_t i = 0; i < commandBufferCount; i++) {
4890 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, pCommandBuffers[i]);
4891
4892 if (!cmd_buffer)
4893 continue;
4894 assert(cmd_buffer->pool == pool);
4895
4896 list_del(&cmd_buffer->pool_link);
4897 list_addtail(&cmd_buffer->pool_link, &pool->free_cmd_buffers);
4898 }
4899 }
4900
4901 VKAPI_ATTR VkResult VKAPI_CALL
radv_ResetCommandBuffer(VkCommandBuffer commandBuffer,VkCommandBufferResetFlags flags)4902 radv_ResetCommandBuffer(VkCommandBuffer commandBuffer, VkCommandBufferResetFlags flags)
4903 {
4904 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
4905 return radv_reset_cmd_buffer(cmd_buffer);
4906 }
4907
/* Build a throwaway render pass matching the dynamic rendering state a
 * secondary command buffer inherits through
 * VkCommandBufferInheritanceRenderingInfo, and install it as the command
 * buffer's render pass (marked own_render_pass so it is destroyed with the
 * command buffer).  On failure, the error is recorded in record_result.
 */
static void
radv_inherit_dynamic_rendering(struct radv_cmd_buffer *cmd_buffer,
                               const VkCommandBufferInheritanceInfo *inherit_info,
                               const VkCommandBufferInheritanceRenderingInfo *dyn_info)
{
   const VkAttachmentSampleCountInfoAMD *sample_info =
      vk_find_struct_const(inherit_info->pNext, ATTACHMENT_SAMPLE_COUNT_INFO_AMD);
   VkResult result;
   /* (normal + resolve) for color attachments and ds and a VRS attachment */
   VkAttachmentDescription2 att_desc[MAX_RTS * 2 + 3];
   VkAttachmentReference2 color_refs[MAX_RTS], ds_ref;
   unsigned att_count = 0;

   VkSubpassDescription2 subpass = {
      .sType = VK_STRUCTURE_TYPE_SUBPASS_DESCRIPTION_2,
      .pipelineBindPoint = VK_PIPELINE_BIND_POINT_GRAPHICS,
      .viewMask = dyn_info->viewMask,
      .colorAttachmentCount = dyn_info->colorAttachmentCount,
      .pColorAttachments = color_refs,
   };

   for (unsigned i = 0; i < dyn_info->colorAttachmentCount; ++i) {
      /* VK_FORMAT_UNDEFINED marks an unused color attachment slot. */
      if (dyn_info->pColorAttachmentFormats[i] == VK_FORMAT_UNDEFINED) {
         color_refs[i] = (VkAttachmentReference2){
            .sType = VK_STRUCTURE_TYPE_ATTACHMENT_REFERENCE_2,
            .attachment = VK_ATTACHMENT_UNUSED,
         };
         continue;
      }

      color_refs[i] = (VkAttachmentReference2){
         .sType = VK_STRUCTURE_TYPE_ATTACHMENT_REFERENCE_2,
         .attachment = att_count,
         .layout = VK_IMAGE_LAYOUT_GENERAL, /* Shouldn't be used */
         .aspectMask = 0,                   /* Shouldn't be used */
      };

      VkAttachmentDescription2 *att = att_desc + att_count++;
      memset(att, 0, sizeof(*att));
      att->sType = VK_STRUCTURE_TYPE_ATTACHMENT_DESCRIPTION_2;
      att->format = dyn_info->pColorAttachmentFormats[i];
      /* Per-attachment sample counts (AMD extension) take priority over the
       * single rasterization sample count. */
      att->samples =
         sample_info ? sample_info->pColorAttachmentSamples[i] : dyn_info->rasterizationSamples;
      /* LOAD/STORE ops: the primary command buffer owns the real contents,
       * so the inherited pass must preserve them. */
      att->loadOp = VK_ATTACHMENT_LOAD_OP_LOAD;
      att->storeOp = VK_ATTACHMENT_STORE_OP_STORE;
      att->initialLayout = VK_IMAGE_LAYOUT_GENERAL;
      att->finalLayout = VK_IMAGE_LAYOUT_GENERAL;
   }

   if (dyn_info->depthAttachmentFormat != VK_FORMAT_UNDEFINED ||
       dyn_info->stencilAttachmentFormat != VK_FORMAT_UNDEFINED) {
      /* Depth and stencil share a single attachment; prefer the depth
       * format when both are given. */
      VkFormat fmt = dyn_info->depthAttachmentFormat != VK_FORMAT_UNDEFINED
                        ? dyn_info->depthAttachmentFormat
                        : dyn_info->stencilAttachmentFormat;

      ds_ref = (VkAttachmentReference2){
         .sType = VK_STRUCTURE_TYPE_ATTACHMENT_REFERENCE_2,
         .attachment = att_count,
         .layout = VK_IMAGE_LAYOUT_GENERAL, /* Shouldn't be used */
         .aspectMask = 0,                   /* Shouldn't be used */
      };
      subpass.pDepthStencilAttachment = &ds_ref;

      VkAttachmentDescription2 *att = att_desc + att_count++;

      memset(att, 0, sizeof(*att));
      att->sType = VK_STRUCTURE_TYPE_ATTACHMENT_DESCRIPTION_2;
      att->format = fmt;
      att->samples =
         sample_info ? sample_info->depthStencilAttachmentSamples : dyn_info->rasterizationSamples;
      att->loadOp = VK_ATTACHMENT_LOAD_OP_LOAD;
      att->storeOp = VK_ATTACHMENT_STORE_OP_STORE;
      att->stencilLoadOp = VK_ATTACHMENT_LOAD_OP_LOAD;
      att->stencilStoreOp = VK_ATTACHMENT_STORE_OP_STORE;
   }

   VkRenderPassCreateInfo2 rp_create_info = {
      .sType = VK_STRUCTURE_TYPE_RENDER_PASS_CREATE_INFO_2,
      .attachmentCount = att_count,
      .pAttachments = att_desc,
      .subpassCount = 1,
      .pSubpasses = &subpass,
   };

   VkRenderPass rp;
   result =
      radv_CreateRenderPass2(radv_device_to_handle(cmd_buffer->device), &rp_create_info, NULL, &rp);
   if (result != VK_SUCCESS) {
      cmd_buffer->record_result = result;
      return;
   }

   /* own_render_pass: this pass was created here, so the command buffer is
    * responsible for destroying it. */
   cmd_buffer->state.pass = radv_render_pass_from_handle(rp);
   cmd_buffer->state.own_render_pass = true;
}
5003
/* vkBeginCommandBuffer: reset the command buffer if needed, clear all
 * tracked state, and — for secondary command buffers continuing a render
 * pass — inherit the render pass/subpass/framebuffer from the inheritance
 * info (or a dynamic-rendering equivalent).
 */
VKAPI_ATTR VkResult VKAPI_CALL
radv_BeginCommandBuffer(VkCommandBuffer commandBuffer, const VkCommandBufferBeginInfo *pBeginInfo)
{
   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   VkResult result = VK_SUCCESS;

   if (cmd_buffer->status != RADV_CMD_BUFFER_STATUS_INITIAL) {
      /* If the command buffer has already been reset with
       * vkResetCommandBuffer, no need to do it again.
       */
      result = radv_reset_cmd_buffer(cmd_buffer);
      if (result != VK_SUCCESS)
         return result;
   }

   /* Clear all tracked state; the -1 sentinels mean "unknown", forcing the
    * corresponding registers/state to be emitted on first use. */
   memset(&cmd_buffer->state, 0, sizeof(cmd_buffer->state));
   cmd_buffer->state.last_primitive_reset_en = -1;
   cmd_buffer->state.last_index_type = -1;
   cmd_buffer->state.last_num_instances = -1;
   cmd_buffer->state.last_vertex_offset = -1;
   cmd_buffer->state.last_first_instance = -1;
   cmd_buffer->state.last_drawid = -1;
   cmd_buffer->state.last_subpass_color_count = MAX_RTS;
   cmd_buffer->state.predication_type = -1;
   cmd_buffer->state.last_sx_ps_downconvert = -1;
   cmd_buffer->state.last_sx_blend_opt_epsilon = -1;
   cmd_buffer->state.last_sx_blend_opt_control = -1;
   cmd_buffer->state.last_nggc_settings = -1;
   cmd_buffer->state.last_nggc_settings_sgpr_idx = -1;
   cmd_buffer->state.mesh_shading = false;
   cmd_buffer->state.last_vrs_rates = -1;
   cmd_buffer->state.last_vrs_rates_sgpr_idx = -1;
   cmd_buffer->usage_flags = pBeginInfo->flags;

   /* Secondary command buffers recorded entirely inside a render pass must
    * inherit the pass state from pInheritanceInfo. */
   if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY &&
       (pBeginInfo->flags & VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT)) {
      struct radv_subpass *subpass = NULL;

      assert(pBeginInfo->pInheritanceInfo);

      cmd_buffer->state.framebuffer =
         vk_framebuffer_from_handle(pBeginInfo->pInheritanceInfo->framebuffer);

      if (pBeginInfo->pInheritanceInfo->renderPass) {
         cmd_buffer->state.pass =
            radv_render_pass_from_handle(pBeginInfo->pInheritanceInfo->renderPass);
         assert(pBeginInfo->pInheritanceInfo->subpass < cmd_buffer->state.pass->subpass_count);
         subpass = &cmd_buffer->state.pass->subpasses[pBeginInfo->pInheritanceInfo->subpass];
      } else {
         /* No render pass handle: dynamic rendering.  Build a matching pass
          * from VkCommandBufferInheritanceRenderingInfo if present. */
         const VkCommandBufferInheritanceRenderingInfo *dyn_info =
            vk_find_struct_const(pBeginInfo->pInheritanceInfo->pNext,
                                 COMMAND_BUFFER_INHERITANCE_RENDERING_INFO);
         if (dyn_info) {
            radv_inherit_dynamic_rendering(cmd_buffer, pBeginInfo->pInheritanceInfo, dyn_info);
            subpass = &cmd_buffer->state.pass->subpasses[0];
         }
      }

      /* The framebuffer may be NULL (e.g. imageless/dynamic rendering);
       * only set up attachment state when one is provided. */
      if (cmd_buffer->state.framebuffer) {
         result = radv_cmd_state_setup_attachments(cmd_buffer, cmd_buffer->state.pass, NULL);
         if (result != VK_SUCCESS)
            return result;
      }

      cmd_buffer->state.inherited_pipeline_statistics =
         pBeginInfo->pInheritanceInfo->pipelineStatistics;

      if (cmd_buffer->state.pass) {
         cmd_buffer->state.subpass = subpass;
         if (cmd_buffer->state.framebuffer)
            cmd_buffer->state.dirty |= RADV_CMD_DIRTY_FRAMEBUFFER;
      }
   }

   if (unlikely(cmd_buffer->device->trace_bo))
      radv_cmd_buffer_trace_emit(cmd_buffer);

   radv_describe_begin_cmd_buffer(cmd_buffer);

   cmd_buffer->status = RADV_CMD_BUFFER_STATUS_RECORDING;

   return result;
}
5087
/* vkCmdBindVertexBuffers2: record vertex buffer bindings (buffer, offset,
 * size, stride) and track which bindings may have become misaligned so the
 * vertex-input state can be revalidated at draw time.
 */
VKAPI_ATTR void VKAPI_CALL
radv_CmdBindVertexBuffers2(VkCommandBuffer commandBuffer, uint32_t firstBinding,
                           uint32_t bindingCount, const VkBuffer *pBuffers,
                           const VkDeviceSize *pOffsets, const VkDeviceSize *pSizes,
                           const VkDeviceSize *pStrides)
{
   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   struct radv_vertex_binding *vb = cmd_buffer->vertex_bindings;
   const struct radv_vs_input_state *state = &cmd_buffer->state.dynamic_vs_input;

   /* We have to defer setting up vertex buffer since we need the buffer
    * stride from the pipeline. */

   assert(firstBinding + bindingCount <= MAX_VBS);
   enum amd_gfx_level chip = cmd_buffer->device->physical_device->rad_info.gfx_level;

   if (firstBinding + bindingCount > cmd_buffer->used_vertex_bindings)
      cmd_buffer->used_vertex_bindings = firstBinding + bindingCount;

   /* Bindings whose cached misalignment status becomes stale below. */
   uint32_t misaligned_mask_invalid = 0;

   for (uint32_t i = 0; i < bindingCount; i++) {
      RADV_FROM_HANDLE(radv_buffer, buffer, pBuffers[i]);
      uint32_t idx = firstBinding + i;
      VkDeviceSize size = pSizes ? pSizes[i] : 0;
      /* if pStrides=NULL, it shouldn't overwrite the strides specified by CmdSetVertexInputEXT */
      VkDeviceSize stride = pStrides ? pStrides[i] : vb[idx].stride;

      /* Invalidate the cached misalignment bit when buffer presence flips
       * or the low two bits of offset/stride change.  When bindings don't
       * match attributes 1:1, all bits must be invalidated. */
      if (!!cmd_buffer->vertex_binding_buffers[idx] != !!buffer ||
          (buffer && ((vb[idx].offset & 0x3) != (pOffsets[i] & 0x3) ||
                      (vb[idx].stride & 0x3) != (stride & 0x3)))) {
         misaligned_mask_invalid |= state->bindings_match_attrib ? BITFIELD_BIT(idx) : 0xffffffff;
      }

      cmd_buffer->vertex_binding_buffers[idx] = buffer;
      vb[idx].offset = pOffsets[i];
      vb[idx].size = size;
      vb[idx].stride = stride;

      uint32_t bit = BITFIELD_BIT(idx);
      if (buffer) {
         /* Keep the buffer's BO resident for this command stream. */
         radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, cmd_buffer->vertex_binding_buffers[idx]->bo);
         cmd_buffer->state.vbo_bound_mask |= bit;
      } else {
         cmd_buffer->state.vbo_bound_mask &= ~bit;
      }
   }

   /* Misalignment only matters on GFX6 and GFX10+ here. */
   if ((chip == GFX6 || chip >= GFX10) && misaligned_mask_invalid) {
      cmd_buffer->state.vbo_misaligned_mask_invalid = misaligned_mask_invalid;
      cmd_buffer->state.vbo_misaligned_mask &= ~misaligned_mask_invalid;
   }

   cmd_buffer->state.dirty |= RADV_CMD_DIRTY_VERTEX_BUFFER |
                              RADV_CMD_DIRTY_DYNAMIC_VERTEX_INPUT;
}
5144
5145 static uint32_t
vk_to_index_type(VkIndexType type)5146 vk_to_index_type(VkIndexType type)
5147 {
5148 switch (type) {
5149 case VK_INDEX_TYPE_UINT8_EXT:
5150 return V_028A7C_VGT_INDEX_8;
5151 case VK_INDEX_TYPE_UINT16:
5152 return V_028A7C_VGT_INDEX_16;
5153 case VK_INDEX_TYPE_UINT32:
5154 return V_028A7C_VGT_INDEX_32;
5155 default:
5156 unreachable("invalid index type");
5157 }
5158 }
5159
5160 uint32_t
radv_get_vgt_index_size(uint32_t type)5161 radv_get_vgt_index_size(uint32_t type)
5162 {
5163 uint32_t index_type = G_028A7C_INDEX_TYPE(type);
5164 switch (index_type) {
5165 case V_028A7C_VGT_INDEX_8:
5166 return 1;
5167 case V_028A7C_VGT_INDEX_16:
5168 return 2;
5169 case V_028A7C_VGT_INDEX_32:
5170 return 4;
5171 default:
5172 unreachable("invalid index type");
5173 }
5174 }
5175
5176 VKAPI_ATTR void VKAPI_CALL
radv_CmdBindIndexBuffer(VkCommandBuffer commandBuffer,VkBuffer buffer,VkDeviceSize offset,VkIndexType indexType)5177 radv_CmdBindIndexBuffer(VkCommandBuffer commandBuffer, VkBuffer buffer, VkDeviceSize offset,
5178 VkIndexType indexType)
5179 {
5180 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5181 RADV_FROM_HANDLE(radv_buffer, index_buffer, buffer);
5182
5183 cmd_buffer->state.index_buffer = index_buffer;
5184 cmd_buffer->state.index_offset = offset;
5185 cmd_buffer->state.index_type = vk_to_index_type(indexType);
5186 cmd_buffer->state.index_va = radv_buffer_get_va(index_buffer->bo);
5187 cmd_buffer->state.index_va += index_buffer->offset + offset;
5188
5189 int index_size = radv_get_vgt_index_size(vk_to_index_type(indexType));
5190 cmd_buffer->state.max_index_count =
5191 (vk_buffer_range(&index_buffer->vk, offset, VK_WHOLE_SIZE)) / index_size;
5192 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_INDEX_BUFFER;
5193 radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, index_buffer->bo);
5194 }
5195
5196 static void
radv_bind_descriptor_set(struct radv_cmd_buffer * cmd_buffer,VkPipelineBindPoint bind_point,struct radv_descriptor_set * set,unsigned idx)5197 radv_bind_descriptor_set(struct radv_cmd_buffer *cmd_buffer, VkPipelineBindPoint bind_point,
5198 struct radv_descriptor_set *set, unsigned idx)
5199 {
5200 struct radeon_winsys *ws = cmd_buffer->device->ws;
5201
5202 radv_set_descriptor_set(cmd_buffer, bind_point, set, idx);
5203
5204 assert(set);
5205 assert(!(set->header.layout->flags & VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR));
5206
5207 if (!cmd_buffer->device->use_global_bo_list) {
5208 for (unsigned j = 0; j < set->header.buffer_count; ++j)
5209 if (set->descriptors[j])
5210 radv_cs_add_buffer(ws, cmd_buffer->cs, set->descriptors[j]);
5211 }
5212
5213 if (set->header.bo)
5214 radv_cs_add_buffer(ws, cmd_buffer->cs, set->header.bo);
5215 }
5216
/* vkCmdBindDescriptorSets: bind descriptor sets and (re)pack the buffer
 * descriptors for any dynamic uniform/storage buffers they contain, applying
 * pDynamicOffsets.
 */
VKAPI_ATTR void VKAPI_CALL
radv_CmdBindDescriptorSets(VkCommandBuffer commandBuffer, VkPipelineBindPoint pipelineBindPoint,
                           VkPipelineLayout _layout, uint32_t firstSet, uint32_t descriptorSetCount,
                           const VkDescriptorSet *pDescriptorSets, uint32_t dynamicOffsetCount,
                           const uint32_t *pDynamicOffsets)
{
   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   RADV_FROM_HANDLE(radv_pipeline_layout, layout, _layout);
   unsigned dyn_idx = 0;

   /* Debug option: disable bounds checking on dynamic buffers by using the
    * maximum range instead of the declared one. */
   const bool no_dynamic_bounds =
      cmd_buffer->device->instance->debug_flags & RADV_DEBUG_NO_DYNAMIC_BOUNDS;
   struct radv_descriptor_state *descriptors_state =
      radv_get_descriptors_state(cmd_buffer, pipelineBindPoint);

   for (unsigned i = 0; i < descriptorSetCount; ++i) {
      unsigned set_idx = i + firstSet;
      RADV_FROM_HANDLE(radv_descriptor_set, set, pDescriptorSets[i]);

      if (!set) {
         /* From the Vulkan spec 1.3.211:
          *
          * "VUID-vkCmdBindDescriptorSets-layout-06564
          *  If layout was not created with VK_PIPELINE_LAYOUT_CREATE_INDEPENDENT_SETS_BIT_EXT, each
          *  element of pDescriptorSets must be a valid VkDescriptorSet"
          */
         assert(layout->independent_sets);
         continue;
      }

      /* If the set is already bound we only need to update the
       * (potentially changed) dynamic offsets. */
      if (descriptors_state->sets[set_idx] != set ||
          !(descriptors_state->valid & (1u << set_idx))) {
         radv_bind_descriptor_set(cmd_buffer, pipelineBindPoint, set, set_idx);
      }

      for (unsigned j = 0; j < set->header.layout->dynamic_offset_count; ++j, ++dyn_idx) {
         unsigned idx = j + layout->set[i + firstSet].dynamic_offset_start;
         /* Each dynamic buffer occupies one 4-dword buffer descriptor. */
         uint32_t *dst = descriptors_state->dynamic_buffers + idx * 4;
         assert(dyn_idx < dynamicOffsetCount);

         struct radv_descriptor_range *range = set->header.dynamic_descriptors + j;

         if (!range->va) {
            /* Unbound buffer: write a null descriptor. */
            memset(dst, 0, 4 * 4);
         } else {
            /* Pack the buffer SRD: base address, range, swizzle and the
             * GFX-level-specific format/out-of-bounds fields. */
            uint64_t va = range->va + pDynamicOffsets[dyn_idx];
            dst[0] = va;
            dst[1] = S_008F04_BASE_ADDRESS_HI(va >> 32);
            dst[2] = no_dynamic_bounds ? 0xffffffffu : range->size;
            dst[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
                     S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W);

            if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX11) {
               dst[3] |= S_008F0C_FORMAT(V_008F0C_GFX11_FORMAT_32_FLOAT) |
                         S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW);
            } else if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX10) {
               dst[3] |= S_008F0C_FORMAT(V_008F0C_GFX10_FORMAT_32_FLOAT) |
                         S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) | S_008F0C_RESOURCE_LEVEL(1);
            } else {
               dst[3] |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
                         S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
            }
         }

         cmd_buffer->push_constant_stages |= set->header.layout->dynamic_shader_stages;
      }
   }
}
5287
5288 static bool
radv_init_push_descriptor_set(struct radv_cmd_buffer * cmd_buffer,struct radv_descriptor_set * set,struct radv_descriptor_set_layout * layout,VkPipelineBindPoint bind_point)5289 radv_init_push_descriptor_set(struct radv_cmd_buffer *cmd_buffer, struct radv_descriptor_set *set,
5290 struct radv_descriptor_set_layout *layout,
5291 VkPipelineBindPoint bind_point)
5292 {
5293 struct radv_descriptor_state *descriptors_state =
5294 radv_get_descriptors_state(cmd_buffer, bind_point);
5295 set->header.size = layout->size;
5296
5297 if (set->header.layout != layout) {
5298 if (set->header.layout)
5299 vk_descriptor_set_layout_unref(&cmd_buffer->device->vk, &set->header.layout->vk);
5300 vk_descriptor_set_layout_ref(&layout->vk);
5301 set->header.layout = layout;
5302 }
5303
5304 if (descriptors_state->push_set.capacity < set->header.size) {
5305 size_t new_size = MAX2(set->header.size, 1024);
5306 new_size = MAX2(new_size, 2 * descriptors_state->push_set.capacity);
5307 new_size = MIN2(new_size, 96 * MAX_PUSH_DESCRIPTORS);
5308
5309 free(set->header.mapped_ptr);
5310 set->header.mapped_ptr = malloc(new_size);
5311
5312 if (!set->header.mapped_ptr) {
5313 descriptors_state->push_set.capacity = 0;
5314 cmd_buffer->record_result = VK_ERROR_OUT_OF_HOST_MEMORY;
5315 return false;
5316 }
5317
5318 descriptors_state->push_set.capacity = new_size;
5319 }
5320
5321 return true;
5322 }
5323
5324 void
radv_meta_push_descriptor_set(struct radv_cmd_buffer * cmd_buffer,VkPipelineBindPoint pipelineBindPoint,VkPipelineLayout _layout,uint32_t set,uint32_t descriptorWriteCount,const VkWriteDescriptorSet * pDescriptorWrites)5325 radv_meta_push_descriptor_set(struct radv_cmd_buffer *cmd_buffer,
5326 VkPipelineBindPoint pipelineBindPoint, VkPipelineLayout _layout,
5327 uint32_t set, uint32_t descriptorWriteCount,
5328 const VkWriteDescriptorSet *pDescriptorWrites)
5329 {
5330 RADV_FROM_HANDLE(radv_pipeline_layout, layout, _layout);
5331 struct radv_descriptor_set *push_set =
5332 (struct radv_descriptor_set *)&cmd_buffer->meta_push_descriptors;
5333 unsigned bo_offset;
5334
5335 assert(set == 0);
5336 assert(layout->set[set].layout->flags & VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR);
5337
5338 push_set->header.size = layout->set[set].layout->size;
5339 push_set->header.layout = layout->set[set].layout;
5340
5341 if (!radv_cmd_buffer_upload_alloc(cmd_buffer, push_set->header.size, &bo_offset,
5342 (void **)&push_set->header.mapped_ptr))
5343 return;
5344
5345 push_set->header.va = radv_buffer_get_va(cmd_buffer->upload.upload_bo);
5346 push_set->header.va += bo_offset;
5347
5348 radv_cmd_update_descriptor_sets(cmd_buffer->device, cmd_buffer,
5349 radv_descriptor_set_to_handle(push_set), descriptorWriteCount,
5350 pDescriptorWrites, 0, NULL);
5351
5352 radv_set_descriptor_set(cmd_buffer, pipelineBindPoint, push_set, set);
5353 }
5354
5355 VKAPI_ATTR void VKAPI_CALL
radv_CmdPushDescriptorSetKHR(VkCommandBuffer commandBuffer,VkPipelineBindPoint pipelineBindPoint,VkPipelineLayout _layout,uint32_t set,uint32_t descriptorWriteCount,const VkWriteDescriptorSet * pDescriptorWrites)5356 radv_CmdPushDescriptorSetKHR(VkCommandBuffer commandBuffer, VkPipelineBindPoint pipelineBindPoint,
5357 VkPipelineLayout _layout, uint32_t set, uint32_t descriptorWriteCount,
5358 const VkWriteDescriptorSet *pDescriptorWrites)
5359 {
5360 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5361 RADV_FROM_HANDLE(radv_pipeline_layout, layout, _layout);
5362 struct radv_descriptor_state *descriptors_state =
5363 radv_get_descriptors_state(cmd_buffer, pipelineBindPoint);
5364 struct radv_descriptor_set *push_set =
5365 (struct radv_descriptor_set *)&descriptors_state->push_set.set;
5366
5367 assert(layout->set[set].layout->flags & VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR);
5368
5369 if (!radv_init_push_descriptor_set(cmd_buffer, push_set, layout->set[set].layout,
5370 pipelineBindPoint))
5371 return;
5372
5373 /* Check that there are no inline uniform block updates when calling vkCmdPushDescriptorSetKHR()
5374 * because it is invalid, according to Vulkan spec.
5375 */
5376 for (int i = 0; i < descriptorWriteCount; i++) {
5377 ASSERTED const VkWriteDescriptorSet *writeset = &pDescriptorWrites[i];
5378 assert(writeset->descriptorType != VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK);
5379 }
5380
5381 radv_cmd_update_descriptor_sets(cmd_buffer->device, cmd_buffer,
5382 radv_descriptor_set_to_handle(push_set), descriptorWriteCount,
5383 pDescriptorWrites, 0, NULL);
5384
5385 radv_set_descriptor_set(cmd_buffer, pipelineBindPoint, push_set, set);
5386 descriptors_state->push_dirty = true;
5387 }
5388
VKAPI_ATTR void VKAPI_CALL
radv_CmdPushDescriptorSetWithTemplateKHR(VkCommandBuffer commandBuffer,
                                         VkDescriptorUpdateTemplate descriptorUpdateTemplate,
                                         VkPipelineLayout _layout, uint32_t set, const void *pData)
{
   /* Push-descriptor update via an update template: decode pData according to
    * the template, write the descriptors into the command buffer's internal
    * push descriptor set, and mark it dirty so it is flushed to memory before
    * the next draw/dispatch on the template's bind point.
    */
   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   RADV_FROM_HANDLE(radv_pipeline_layout, layout, _layout);
   RADV_FROM_HANDLE(radv_descriptor_update_template, templ, descriptorUpdateTemplate);
   struct radv_descriptor_state *descriptors_state =
      radv_get_descriptors_state(cmd_buffer, templ->bind_point);
   struct radv_descriptor_set *push_set =
      (struct radv_descriptor_set *)&descriptors_state->push_set.set;

   /* The target set layout must have been created with the push-descriptor flag. */
   assert(layout->set[set].layout->flags & VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR);

   /* (Re)initialize the internal push set for this layout; bails out (with the
    * command buffer's record_result already set) if allocation failed.
    */
   if (!radv_init_push_descriptor_set(cmd_buffer, push_set, layout->set[set].layout,
                                      templ->bind_point))
      return;

   radv_cmd_update_descriptor_set_with_template(cmd_buffer->device, cmd_buffer, push_set,
                                                descriptorUpdateTemplate, pData);

   /* Bind the freshly written set and remember to upload it before the next draw. */
   radv_set_descriptor_set(cmd_buffer, templ->bind_point, push_set, set);
   descriptors_state->push_dirty = true;
}
5414
5415 VKAPI_ATTR void VKAPI_CALL
radv_CmdPushConstants(VkCommandBuffer commandBuffer,VkPipelineLayout layout,VkShaderStageFlags stageFlags,uint32_t offset,uint32_t size,const void * pValues)5416 radv_CmdPushConstants(VkCommandBuffer commandBuffer, VkPipelineLayout layout,
5417 VkShaderStageFlags stageFlags, uint32_t offset, uint32_t size,
5418 const void *pValues)
5419 {
5420 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5421 memcpy(cmd_buffer->push_constants + offset, pValues, size);
5422 cmd_buffer->push_constant_stages |= stageFlags;
5423 }
5424
VKAPI_ATTR VkResult VKAPI_CALL
radv_EndCommandBuffer(VkCommandBuffer commandBuffer)
{
   /* Finish recording: emit all deferred cache flushes and idle requirements,
    * finalize any internal ACE (compute) command stream, and hand the CS to the
    * winsys. Returns the accumulated record_result so earlier recording errors
    * are reported here, as the spec requires.
    */
   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);

   radv_emit_mip_change_flush_default(cmd_buffer);

   if (cmd_buffer->qf != RADV_QUEUE_TRANSFER) {
      /* GFX6-specific end-of-IB flush: idle CS/PS and write back L2. */
      if (cmd_buffer->device->physical_device->rad_info.gfx_level == GFX6)
         cmd_buffer->state.flush_bits |=
            RADV_CMD_FLAG_CS_PARTIAL_FLUSH | RADV_CMD_FLAG_PS_PARTIAL_FLUSH | RADV_CMD_FLAG_WB_L2;

      /* Make sure to sync all pending active queries at the end of
       * command buffer.
       */
      cmd_buffer->state.flush_bits |= cmd_buffer->active_query_flush_bits;

      /* Flush noncoherent images on GFX9+ so we can assume they're clean on the start of a
       * command buffer.
       */
      if (cmd_buffer->state.rb_noncoherent_dirty && can_skip_buffer_l2_flushes(cmd_buffer->device))
         cmd_buffer->state.flush_bits |= radv_src_access_flush(
            cmd_buffer,
            VK_ACCESS_2_COLOR_ATTACHMENT_WRITE_BIT |
               VK_ACCESS_2_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT,
            NULL);

      /* Since NGG streamout uses GDS, we need to make GDS idle when
       * we leave the IB, otherwise another process might overwrite
       * it while our shaders are busy.
       */
      if (cmd_buffer->gds_needed)
         cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_PS_PARTIAL_FLUSH;

      /* Finalize the internal compute command stream, if it exists. */
      if (cmd_buffer->ace_internal.cs) {
         VkResult result = radv_ace_internal_finalize(cmd_buffer);
         if (result != VK_SUCCESS)
            return vk_error(cmd_buffer, result);
      }

      /* Emit everything accumulated in flush_bits above. */
      si_emit_cache_flush(cmd_buffer);
   }

   /* Make sure CP DMA is idle at the end of IBs because the kernel
    * doesn't wait for it.
    */
   si_cp_dma_wait_for_idle(cmd_buffer);

   radv_describe_end_cmd_buffer(cmd_buffer);

   /* Per-recording scratch state owned by the pool allocator; safe to free now. */
   vk_free(&cmd_buffer->pool->vk.alloc, cmd_buffer->state.attachments);
   vk_free(&cmd_buffer->pool->vk.alloc, cmd_buffer->state.subpass_sample_locs);

   VkResult result = cmd_buffer->device->ws->cs_finalize(cmd_buffer->cs);
   if (result != VK_SUCCESS)
      return vk_error(cmd_buffer, result);

   cmd_buffer->status = RADV_CMD_BUFFER_STATUS_EXECUTABLE;

   /* Report any error recorded earlier during command recording. */
   return cmd_buffer->record_result;
}
5487
static void
radv_emit_compute_pipeline(struct radv_cmd_buffer *cmd_buffer,
                           struct radv_compute_pipeline *pipeline)
{
   /* Emit the pipeline's pre-recorded register writes into the main command
    * stream, skipping the work when this pipeline is already the one on HW.
    */
   if (pipeline == cmd_buffer->state.emitted_compute_pipeline)
      return;

   /* Compute pipelines never record context registers. */
   assert(!pipeline->base.ctx_cs.cdw);

   cmd_buffer->state.emitted_compute_pipeline = pipeline;

   radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, pipeline->base.cs.cdw);
   radeon_emit_array(cmd_buffer->cs, pipeline->base.cs.buf, pipeline->base.cs.cdw);

   /* Track the worst-case scratch requirements across all pipelines used by
    * this command buffer; the queue sizes the scratch rings at submit time.
    */
   cmd_buffer->compute_scratch_size_per_wave_needed =
      MAX2(cmd_buffer->compute_scratch_size_per_wave_needed, pipeline->base.scratch_bytes_per_wave);
   cmd_buffer->compute_scratch_waves_wanted =
      MAX2(cmd_buffer->compute_scratch_waves_wanted, pipeline->base.max_waves);

   /* Keep the shader code BO referenced for the kernel's residency list. */
   radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, pipeline->base.slab_bo);

   if (unlikely(cmd_buffer->device->trace_bo))
      radv_save_pipeline(cmd_buffer, &pipeline->base);
}
5512
5513 static void
radv_mark_descriptor_sets_dirty(struct radv_cmd_buffer * cmd_buffer,VkPipelineBindPoint bind_point)5514 radv_mark_descriptor_sets_dirty(struct radv_cmd_buffer *cmd_buffer, VkPipelineBindPoint bind_point)
5515 {
5516 struct radv_descriptor_state *descriptors_state =
5517 radv_get_descriptors_state(cmd_buffer, bind_point);
5518
5519 descriptors_state->dirty |= descriptors_state->valid;
5520 }
5521
VKAPI_ATTR void VKAPI_CALL
radv_CmdBindPipeline(VkCommandBuffer commandBuffer, VkPipelineBindPoint pipelineBindPoint,
                     VkPipeline _pipeline)
{
   /* Bind a pipeline to one of the three bind points. Binding is lazy: this
    * only updates CPU-side state and dirty bits; registers are emitted at the
    * next draw/dispatch.
    */
   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   RADV_FROM_HANDLE(radv_pipeline, pipeline, _pipeline);

   switch (pipelineBindPoint) {
   case VK_PIPELINE_BIND_POINT_COMPUTE: {
      struct radv_compute_pipeline *compute_pipeline = radv_pipeline_to_compute(pipeline);

      if (cmd_buffer->state.compute_pipeline == compute_pipeline)
         return;
      /* A new pipeline may use different user-SGPR locations, so all bound
       * descriptor sets must be re-emitted.
       */
      radv_mark_descriptor_sets_dirty(cmd_buffer, pipelineBindPoint);

      cmd_buffer->state.compute_pipeline = compute_pipeline;
      cmd_buffer->push_constant_stages |= VK_SHADER_STAGE_COMPUTE_BIT;
      cmd_buffer->task_rings_needed |=
         pipeline->shaders[MESA_SHADER_COMPUTE]->info.cs.uses_task_rings;
      break;
   }
   case VK_PIPELINE_BIND_POINT_RAY_TRACING_KHR: {
      /* RT pipelines are compiled to an internal compute pipeline. */
      struct radv_compute_pipeline *compute_pipeline = radv_pipeline_to_compute(pipeline);

      if (cmd_buffer->state.rt_pipeline == compute_pipeline)
         return;
      radv_mark_descriptor_sets_dirty(cmd_buffer, pipelineBindPoint);

      cmd_buffer->state.rt_pipeline = compute_pipeline;
      cmd_buffer->push_constant_stages |= RADV_RT_STAGE_BITS;
      /* Re-apply the previously set stack size if the pipeline defers it to
       * vkCmdSetRayTracingPipelineStackSizeKHR.
       */
      if (compute_pipeline->dynamic_stack_size)
         radv_set_rt_stack_size(cmd_buffer, cmd_buffer->state.rt_stack_size);
      break;
   }
   case VK_PIPELINE_BIND_POINT_GRAPHICS: {
      struct radv_graphics_pipeline *graphics_pipeline =
         pipeline ? radv_pipeline_to_graphics(pipeline) : NULL;

      if (cmd_buffer->state.graphics_pipeline == graphics_pipeline)
         return;
      radv_mark_descriptor_sets_dirty(cmd_buffer, pipelineBindPoint);

      /* Detect whether the draw-parameter user SGPRs moved; computed before
       * the state pointer is overwritten below.
       */
      bool vtx_emit_count_changed =
         !pipeline || !cmd_buffer->state.graphics_pipeline ||
         cmd_buffer->state.graphics_pipeline->vtx_emit_num != graphics_pipeline->vtx_emit_num ||
         cmd_buffer->state.graphics_pipeline->vtx_base_sgpr != graphics_pipeline->vtx_base_sgpr;
      cmd_buffer->state.graphics_pipeline = graphics_pipeline;
      if (!pipeline)
         break;

      bool mesh_shading = radv_pipeline_has_stage(graphics_pipeline, MESA_SHADER_MESH);
      if (mesh_shading != cmd_buffer->state.mesh_shading) {
         /* Re-emit VRS state because the combiner is different (vertex vs primitive).
          * Re-emit primitive topology because the mesh shading pipeline clobbered it.
          */
         cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_FRAGMENT_SHADING_RATE |
                                    RADV_CMD_DIRTY_DYNAMIC_PRIMITIVE_TOPOLOGY;
      }

      cmd_buffer->state.mesh_shading = mesh_shading;
      cmd_buffer->state.dirty |= RADV_CMD_DIRTY_PIPELINE | RADV_CMD_DIRTY_DYNAMIC_VERTEX_INPUT;
      cmd_buffer->push_constant_stages |= graphics_pipeline->active_stages;

      /* the new vertex shader might not have the same user regs */
      if (vtx_emit_count_changed) {
         cmd_buffer->state.last_first_instance = -1;
         cmd_buffer->state.last_vertex_offset = -1;
         cmd_buffer->state.last_drawid = -1;
      }

      /* Prefetch all pipeline shaders at first draw time. */
      cmd_buffer->state.prefetch_L2_mask |= RADV_PREFETCH_SHADERS;

      if (cmd_buffer->device->physical_device->rad_info.has_vgt_flush_ngg_legacy_bug &&
          cmd_buffer->state.emitted_graphics_pipeline &&
          cmd_buffer->state.emitted_graphics_pipeline->is_ngg &&
          !cmd_buffer->state.graphics_pipeline->is_ngg) {
         /* Transitioning from NGG to legacy GS requires
          * VGT_FLUSH on GFX10 and Navi21. VGT_FLUSH
          * is also emitted at the beginning of IBs when legacy
          * GS ring pointers are set.
          */
         cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_VGT_FLUSH;
      }

      /* Adopt the pipeline's non-dynamic state as current dynamic state. */
      radv_bind_dynamic_state(cmd_buffer, &graphics_pipeline->dynamic_state);

      /* Track worst-case GS ring sizes for submit-time allocation. */
      if (graphics_pipeline->esgs_ring_size > cmd_buffer->esgs_ring_size_needed)
         cmd_buffer->esgs_ring_size_needed = graphics_pipeline->esgs_ring_size;
      if (graphics_pipeline->gsvs_ring_size > cmd_buffer->gsvs_ring_size_needed)
         cmd_buffer->gsvs_ring_size_needed = graphics_pipeline->gsvs_ring_size;

      if (radv_pipeline_has_stage(graphics_pipeline, MESA_SHADER_TESS_CTRL))
         cmd_buffer->tess_rings_needed = true;
      if (mesh_shading)
         cmd_buffer->mesh_scratch_ring_needed |=
            pipeline->shaders[MESA_SHADER_MESH]->info.ms.needs_ms_scratch_ring;

      if (radv_pipeline_has_stage(graphics_pipeline, MESA_SHADER_TASK)) {
         /* Task shaders run on the async compute queue; lazily create the
          * internal ACE command stream the first time one is bound.
          */
         if (!cmd_buffer->ace_internal.cs) {
            cmd_buffer->ace_internal.cs = radv_ace_internal_create(cmd_buffer);
            /* On failure record_result was set by the helper; abort binding. */
            if (!cmd_buffer->ace_internal.cs)
               return;
         }

         cmd_buffer->task_rings_needed = true;
      }
      break;
   }
   default:
      assert(!"invalid bind point");
      break;
   }
}
5636
5637 VKAPI_ATTR void VKAPI_CALL
radv_CmdSetViewport(VkCommandBuffer commandBuffer,uint32_t firstViewport,uint32_t viewportCount,const VkViewport * pViewports)5638 radv_CmdSetViewport(VkCommandBuffer commandBuffer, uint32_t firstViewport, uint32_t viewportCount,
5639 const VkViewport *pViewports)
5640 {
5641 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5642 struct radv_cmd_state *state = &cmd_buffer->state;
5643 ASSERTED const uint32_t total_count = firstViewport + viewportCount;
5644
5645 assert(firstViewport < MAX_VIEWPORTS);
5646 assert(total_count >= 1 && total_count <= MAX_VIEWPORTS);
5647
5648 if (state->dynamic.viewport.count < total_count)
5649 state->dynamic.viewport.count = total_count;
5650
5651 memcpy(state->dynamic.viewport.viewports + firstViewport, pViewports,
5652 viewportCount * sizeof(*pViewports));
5653 for (unsigned i = 0; i < viewportCount; i++) {
5654 radv_get_viewport_xform(&pViewports[i],
5655 state->dynamic.viewport.xform[i + firstViewport].scale,
5656 state->dynamic.viewport.xform[i + firstViewport].translate);
5657 }
5658
5659 state->dirty |= RADV_CMD_DIRTY_DYNAMIC_VIEWPORT;
5660 }
5661
5662 VKAPI_ATTR void VKAPI_CALL
radv_CmdSetScissor(VkCommandBuffer commandBuffer,uint32_t firstScissor,uint32_t scissorCount,const VkRect2D * pScissors)5663 radv_CmdSetScissor(VkCommandBuffer commandBuffer, uint32_t firstScissor, uint32_t scissorCount,
5664 const VkRect2D *pScissors)
5665 {
5666 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5667 struct radv_cmd_state *state = &cmd_buffer->state;
5668 ASSERTED const uint32_t total_count = firstScissor + scissorCount;
5669
5670 assert(firstScissor < MAX_SCISSORS);
5671 assert(total_count >= 1 && total_count <= MAX_SCISSORS);
5672
5673 if (state->dynamic.scissor.count < total_count)
5674 state->dynamic.scissor.count = total_count;
5675
5676 memcpy(state->dynamic.scissor.scissors + firstScissor, pScissors,
5677 scissorCount * sizeof(*pScissors));
5678
5679 state->dirty |= RADV_CMD_DIRTY_DYNAMIC_SCISSOR;
5680 }
5681
VKAPI_ATTR void VKAPI_CALL
radv_CmdSetLineWidth(VkCommandBuffer commandBuffer, float lineWidth)
{
   /* Record the dynamic line width and flag it for re-emission. */
   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);

   /* NOTE(review): a changed line width also dirties the scissor state —
    * presumably because the guardband registers (emitted with the scissor)
    * depend on the line width; confirm against the scissor emit path.
    */
   if (cmd_buffer->state.dynamic.line_width != lineWidth)
      cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_SCISSOR;

   cmd_buffer->state.dynamic.line_width = lineWidth;
   cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_LINE_WIDTH;
}
5693
5694 VKAPI_ATTR void VKAPI_CALL
radv_CmdSetDepthBias(VkCommandBuffer commandBuffer,float depthBiasConstantFactor,float depthBiasClamp,float depthBiasSlopeFactor)5695 radv_CmdSetDepthBias(VkCommandBuffer commandBuffer, float depthBiasConstantFactor,
5696 float depthBiasClamp, float depthBiasSlopeFactor)
5697 {
5698 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5699 struct radv_cmd_state *state = &cmd_buffer->state;
5700
5701 state->dynamic.depth_bias.bias = depthBiasConstantFactor;
5702 state->dynamic.depth_bias.clamp = depthBiasClamp;
5703 state->dynamic.depth_bias.slope = depthBiasSlopeFactor;
5704
5705 state->dirty |= RADV_CMD_DIRTY_DYNAMIC_DEPTH_BIAS;
5706 }
5707
5708 VKAPI_ATTR void VKAPI_CALL
radv_CmdSetBlendConstants(VkCommandBuffer commandBuffer,const float blendConstants[4])5709 radv_CmdSetBlendConstants(VkCommandBuffer commandBuffer, const float blendConstants[4])
5710 {
5711 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5712 struct radv_cmd_state *state = &cmd_buffer->state;
5713
5714 memcpy(state->dynamic.blend_constants, blendConstants, sizeof(float) * 4);
5715
5716 state->dirty |= RADV_CMD_DIRTY_DYNAMIC_BLEND_CONSTANTS;
5717 }
5718
5719 VKAPI_ATTR void VKAPI_CALL
radv_CmdSetDepthBounds(VkCommandBuffer commandBuffer,float minDepthBounds,float maxDepthBounds)5720 radv_CmdSetDepthBounds(VkCommandBuffer commandBuffer, float minDepthBounds, float maxDepthBounds)
5721 {
5722 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5723 struct radv_cmd_state *state = &cmd_buffer->state;
5724
5725 state->dynamic.depth_bounds.min = minDepthBounds;
5726 state->dynamic.depth_bounds.max = maxDepthBounds;
5727
5728 state->dirty |= RADV_CMD_DIRTY_DYNAMIC_DEPTH_BOUNDS;
5729 }
5730
5731 VKAPI_ATTR void VKAPI_CALL
radv_CmdSetStencilCompareMask(VkCommandBuffer commandBuffer,VkStencilFaceFlags faceMask,uint32_t compareMask)5732 radv_CmdSetStencilCompareMask(VkCommandBuffer commandBuffer, VkStencilFaceFlags faceMask,
5733 uint32_t compareMask)
5734 {
5735 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5736 struct radv_cmd_state *state = &cmd_buffer->state;
5737
5738 if (faceMask & VK_STENCIL_FACE_FRONT_BIT)
5739 state->dynamic.stencil_compare_mask.front = compareMask;
5740 if (faceMask & VK_STENCIL_FACE_BACK_BIT)
5741 state->dynamic.stencil_compare_mask.back = compareMask;
5742
5743 state->dirty |= RADV_CMD_DIRTY_DYNAMIC_STENCIL_COMPARE_MASK;
5744 }
5745
5746 VKAPI_ATTR void VKAPI_CALL
radv_CmdSetStencilWriteMask(VkCommandBuffer commandBuffer,VkStencilFaceFlags faceMask,uint32_t writeMask)5747 radv_CmdSetStencilWriteMask(VkCommandBuffer commandBuffer, VkStencilFaceFlags faceMask,
5748 uint32_t writeMask)
5749 {
5750 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5751 struct radv_cmd_state *state = &cmd_buffer->state;
5752
5753 if (faceMask & VK_STENCIL_FACE_FRONT_BIT)
5754 state->dynamic.stencil_write_mask.front = writeMask;
5755 if (faceMask & VK_STENCIL_FACE_BACK_BIT)
5756 state->dynamic.stencil_write_mask.back = writeMask;
5757
5758 state->dirty |= RADV_CMD_DIRTY_DYNAMIC_STENCIL_WRITE_MASK;
5759 }
5760
5761 VKAPI_ATTR void VKAPI_CALL
radv_CmdSetStencilReference(VkCommandBuffer commandBuffer,VkStencilFaceFlags faceMask,uint32_t reference)5762 radv_CmdSetStencilReference(VkCommandBuffer commandBuffer, VkStencilFaceFlags faceMask,
5763 uint32_t reference)
5764 {
5765 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5766
5767 if (faceMask & VK_STENCIL_FACE_FRONT_BIT)
5768 cmd_buffer->state.dynamic.stencil_reference.front = reference;
5769 if (faceMask & VK_STENCIL_FACE_BACK_BIT)
5770 cmd_buffer->state.dynamic.stencil_reference.back = reference;
5771
5772 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_STENCIL_REFERENCE;
5773 }
5774
5775 VKAPI_ATTR void VKAPI_CALL
radv_CmdSetDiscardRectangleEXT(VkCommandBuffer commandBuffer,uint32_t firstDiscardRectangle,uint32_t discardRectangleCount,const VkRect2D * pDiscardRectangles)5776 radv_CmdSetDiscardRectangleEXT(VkCommandBuffer commandBuffer, uint32_t firstDiscardRectangle,
5777 uint32_t discardRectangleCount, const VkRect2D *pDiscardRectangles)
5778 {
5779 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5780 struct radv_cmd_state *state = &cmd_buffer->state;
5781 ASSERTED const uint32_t total_count = firstDiscardRectangle + discardRectangleCount;
5782
5783 assert(firstDiscardRectangle < MAX_DISCARD_RECTANGLES);
5784 assert(total_count >= 1 && total_count <= MAX_DISCARD_RECTANGLES);
5785
5786 typed_memcpy(&state->dynamic.discard_rectangle.rectangles[firstDiscardRectangle],
5787 pDiscardRectangles, discardRectangleCount);
5788
5789 state->dirty |= RADV_CMD_DIRTY_DYNAMIC_DISCARD_RECTANGLE;
5790 }
5791
5792 VKAPI_ATTR void VKAPI_CALL
radv_CmdSetSampleLocationsEXT(VkCommandBuffer commandBuffer,const VkSampleLocationsInfoEXT * pSampleLocationsInfo)5793 radv_CmdSetSampleLocationsEXT(VkCommandBuffer commandBuffer,
5794 const VkSampleLocationsInfoEXT *pSampleLocationsInfo)
5795 {
5796 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5797 struct radv_cmd_state *state = &cmd_buffer->state;
5798
5799 assert(pSampleLocationsInfo->sampleLocationsCount <= MAX_SAMPLE_LOCATIONS);
5800
5801 state->dynamic.sample_location.per_pixel = pSampleLocationsInfo->sampleLocationsPerPixel;
5802 state->dynamic.sample_location.grid_size = pSampleLocationsInfo->sampleLocationGridSize;
5803 state->dynamic.sample_location.count = pSampleLocationsInfo->sampleLocationsCount;
5804 typed_memcpy(&state->dynamic.sample_location.locations[0],
5805 pSampleLocationsInfo->pSampleLocations, pSampleLocationsInfo->sampleLocationsCount);
5806
5807 state->dirty |= RADV_CMD_DIRTY_DYNAMIC_SAMPLE_LOCATIONS;
5808 }
5809
5810 VKAPI_ATTR void VKAPI_CALL
radv_CmdSetLineStippleEXT(VkCommandBuffer commandBuffer,uint32_t lineStippleFactor,uint16_t lineStipplePattern)5811 radv_CmdSetLineStippleEXT(VkCommandBuffer commandBuffer, uint32_t lineStippleFactor,
5812 uint16_t lineStipplePattern)
5813 {
5814 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5815 struct radv_cmd_state *state = &cmd_buffer->state;
5816
5817 state->dynamic.line_stipple.factor = lineStippleFactor;
5818 state->dynamic.line_stipple.pattern = lineStipplePattern;
5819
5820 state->dirty |= RADV_CMD_DIRTY_DYNAMIC_LINE_STIPPLE;
5821 }
5822
5823 VKAPI_ATTR void VKAPI_CALL
radv_CmdSetCullMode(VkCommandBuffer commandBuffer,VkCullModeFlags cullMode)5824 radv_CmdSetCullMode(VkCommandBuffer commandBuffer, VkCullModeFlags cullMode)
5825 {
5826 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5827 struct radv_cmd_state *state = &cmd_buffer->state;
5828
5829 state->dynamic.cull_mode = cullMode;
5830
5831 state->dirty |= RADV_CMD_DIRTY_DYNAMIC_CULL_MODE;
5832 }
5833
5834 VKAPI_ATTR void VKAPI_CALL
radv_CmdSetFrontFace(VkCommandBuffer commandBuffer,VkFrontFace frontFace)5835 radv_CmdSetFrontFace(VkCommandBuffer commandBuffer, VkFrontFace frontFace)
5836 {
5837 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5838 struct radv_cmd_state *state = &cmd_buffer->state;
5839
5840 state->dynamic.front_face = frontFace;
5841
5842 state->dirty |= RADV_CMD_DIRTY_DYNAMIC_FRONT_FACE;
5843 }
5844
VKAPI_ATTR void VKAPI_CALL
radv_CmdSetPrimitiveTopology(VkCommandBuffer commandBuffer, VkPrimitiveTopology primitiveTopology)
{
   /* Record the dynamic primitive topology (translated to the HW DI_PT_*
    * encoding) and dirty dependent state that is emitted differently for
    * certain primitive classes.
    */
   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   struct radv_cmd_state *state = &cmd_buffer->state;
   unsigned primitive_topology = si_translate_prim(primitiveTopology);

   /* Line-stipple registers depend on whether we draw line strips, so re-emit
    * them when transitioning into or out of LINESTRIP.
    */
   if ((state->dynamic.primitive_topology == V_008958_DI_PT_LINESTRIP) !=
       (primitive_topology == V_008958_DI_PT_LINESTRIP))
      state->dirty |= RADV_CMD_DIRTY_DYNAMIC_LINE_STIPPLE;

   /* Scissor/guardband emission differs between points/lines and triangles,
    * so re-emit it when the primitive class changes.
    */
   if (radv_prim_is_points_or_lines(state->dynamic.primitive_topology) !=
       radv_prim_is_points_or_lines(primitive_topology))
      state->dirty |= RADV_CMD_DIRTY_DYNAMIC_SCISSOR;

   state->dynamic.primitive_topology = primitive_topology;

   state->dirty |= RADV_CMD_DIRTY_DYNAMIC_PRIMITIVE_TOPOLOGY;
}
5864
VKAPI_ATTR void VKAPI_CALL
radv_CmdSetViewportWithCount(VkCommandBuffer commandBuffer, uint32_t viewportCount,
                             const VkViewport *pViewports)
{
   /* VK_EXT_extended_dynamic_state entry point: equivalent to setting
    * viewports [0, viewportCount) (the count update happens in
    * radv_CmdSetViewport).
    */
   radv_CmdSetViewport(commandBuffer, 0, viewportCount, pViewports);
}
5871
VKAPI_ATTR void VKAPI_CALL
radv_CmdSetScissorWithCount(VkCommandBuffer commandBuffer, uint32_t scissorCount,
                            const VkRect2D *pScissors)
{
   /* VK_EXT_extended_dynamic_state entry point: equivalent to setting
    * scissors [0, scissorCount) (the count update happens in
    * radv_CmdSetScissor).
    */
   radv_CmdSetScissor(commandBuffer, 0, scissorCount, pScissors);
}
5878
5879 VKAPI_ATTR void VKAPI_CALL
radv_CmdSetDepthTestEnable(VkCommandBuffer commandBuffer,VkBool32 depthTestEnable)5880 radv_CmdSetDepthTestEnable(VkCommandBuffer commandBuffer, VkBool32 depthTestEnable)
5881
5882 {
5883 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5884 struct radv_cmd_state *state = &cmd_buffer->state;
5885
5886 state->dynamic.depth_test_enable = depthTestEnable;
5887
5888 state->dirty |= RADV_CMD_DIRTY_DYNAMIC_DEPTH_TEST_ENABLE;
5889 }
5890
5891 VKAPI_ATTR void VKAPI_CALL
radv_CmdSetDepthWriteEnable(VkCommandBuffer commandBuffer,VkBool32 depthWriteEnable)5892 radv_CmdSetDepthWriteEnable(VkCommandBuffer commandBuffer, VkBool32 depthWriteEnable)
5893 {
5894 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5895 struct radv_cmd_state *state = &cmd_buffer->state;
5896
5897 state->dynamic.depth_write_enable = depthWriteEnable;
5898
5899 state->dirty |= RADV_CMD_DIRTY_DYNAMIC_DEPTH_WRITE_ENABLE;
5900 }
5901
5902 VKAPI_ATTR void VKAPI_CALL
radv_CmdSetDepthCompareOp(VkCommandBuffer commandBuffer,VkCompareOp depthCompareOp)5903 radv_CmdSetDepthCompareOp(VkCommandBuffer commandBuffer, VkCompareOp depthCompareOp)
5904 {
5905 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5906 struct radv_cmd_state *state = &cmd_buffer->state;
5907
5908 state->dynamic.depth_compare_op = depthCompareOp;
5909
5910 state->dirty |= RADV_CMD_DIRTY_DYNAMIC_DEPTH_COMPARE_OP;
5911 }
5912
5913 VKAPI_ATTR void VKAPI_CALL
radv_CmdSetDepthBoundsTestEnable(VkCommandBuffer commandBuffer,VkBool32 depthBoundsTestEnable)5914 radv_CmdSetDepthBoundsTestEnable(VkCommandBuffer commandBuffer, VkBool32 depthBoundsTestEnable)
5915 {
5916 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5917 struct radv_cmd_state *state = &cmd_buffer->state;
5918
5919 state->dynamic.depth_bounds_test_enable = depthBoundsTestEnable;
5920
5921 state->dirty |= RADV_CMD_DIRTY_DYNAMIC_DEPTH_BOUNDS_TEST_ENABLE;
5922 }
5923
5924 VKAPI_ATTR void VKAPI_CALL
radv_CmdSetStencilTestEnable(VkCommandBuffer commandBuffer,VkBool32 stencilTestEnable)5925 radv_CmdSetStencilTestEnable(VkCommandBuffer commandBuffer, VkBool32 stencilTestEnable)
5926 {
5927 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5928 struct radv_cmd_state *state = &cmd_buffer->state;
5929
5930 state->dynamic.stencil_test_enable = stencilTestEnable;
5931
5932 state->dirty |= RADV_CMD_DIRTY_DYNAMIC_STENCIL_TEST_ENABLE;
5933 }
5934
5935 VKAPI_ATTR void VKAPI_CALL
radv_CmdSetStencilOp(VkCommandBuffer commandBuffer,VkStencilFaceFlags faceMask,VkStencilOp failOp,VkStencilOp passOp,VkStencilOp depthFailOp,VkCompareOp compareOp)5936 radv_CmdSetStencilOp(VkCommandBuffer commandBuffer, VkStencilFaceFlags faceMask,
5937 VkStencilOp failOp, VkStencilOp passOp, VkStencilOp depthFailOp,
5938 VkCompareOp compareOp)
5939 {
5940 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5941 struct radv_cmd_state *state = &cmd_buffer->state;
5942
5943 if (faceMask & VK_STENCIL_FACE_FRONT_BIT) {
5944 state->dynamic.stencil_op.front.fail_op = failOp;
5945 state->dynamic.stencil_op.front.pass_op = passOp;
5946 state->dynamic.stencil_op.front.depth_fail_op = depthFailOp;
5947 state->dynamic.stencil_op.front.compare_op = compareOp;
5948 }
5949
5950 if (faceMask & VK_STENCIL_FACE_BACK_BIT) {
5951 state->dynamic.stencil_op.back.fail_op = failOp;
5952 state->dynamic.stencil_op.back.pass_op = passOp;
5953 state->dynamic.stencil_op.back.depth_fail_op = depthFailOp;
5954 state->dynamic.stencil_op.back.compare_op = compareOp;
5955 }
5956
5957 state->dirty |= RADV_CMD_DIRTY_DYNAMIC_STENCIL_OP;
5958 }
5959
5960 VKAPI_ATTR void VKAPI_CALL
radv_CmdSetFragmentShadingRateKHR(VkCommandBuffer commandBuffer,const VkExtent2D * pFragmentSize,const VkFragmentShadingRateCombinerOpKHR combinerOps[2])5961 radv_CmdSetFragmentShadingRateKHR(VkCommandBuffer commandBuffer, const VkExtent2D *pFragmentSize,
5962 const VkFragmentShadingRateCombinerOpKHR combinerOps[2])
5963 {
5964 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5965 struct radv_cmd_state *state = &cmd_buffer->state;
5966
5967 state->dynamic.fragment_shading_rate.size = *pFragmentSize;
5968 for (unsigned i = 0; i < 2; i++)
5969 state->dynamic.fragment_shading_rate.combiner_ops[i] = combinerOps[i];
5970
5971 state->dirty |= RADV_CMD_DIRTY_DYNAMIC_FRAGMENT_SHADING_RATE;
5972 }
5973
5974 VKAPI_ATTR void VKAPI_CALL
radv_CmdSetDepthBiasEnable(VkCommandBuffer commandBuffer,VkBool32 depthBiasEnable)5975 radv_CmdSetDepthBiasEnable(VkCommandBuffer commandBuffer, VkBool32 depthBiasEnable)
5976 {
5977 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5978 struct radv_cmd_state *state = &cmd_buffer->state;
5979
5980 state->dynamic.depth_bias_enable = depthBiasEnable;
5981
5982 state->dirty |= RADV_CMD_DIRTY_DYNAMIC_DEPTH_BIAS_ENABLE;
5983 }
5984
5985 VKAPI_ATTR void VKAPI_CALL
radv_CmdSetPrimitiveRestartEnable(VkCommandBuffer commandBuffer,VkBool32 primitiveRestartEnable)5986 radv_CmdSetPrimitiveRestartEnable(VkCommandBuffer commandBuffer, VkBool32 primitiveRestartEnable)
5987 {
5988 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5989 struct radv_cmd_state *state = &cmd_buffer->state;
5990
5991 state->dynamic.primitive_restart_enable = primitiveRestartEnable;
5992
5993 state->dirty |= RADV_CMD_DIRTY_DYNAMIC_PRIMITIVE_RESTART_ENABLE;
5994 }
5995
5996 VKAPI_ATTR void VKAPI_CALL
radv_CmdSetRasterizerDiscardEnable(VkCommandBuffer commandBuffer,VkBool32 rasterizerDiscardEnable)5997 radv_CmdSetRasterizerDiscardEnable(VkCommandBuffer commandBuffer, VkBool32 rasterizerDiscardEnable)
5998 {
5999 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
6000 struct radv_cmd_state *state = &cmd_buffer->state;
6001
6002 state->dynamic.rasterizer_discard_enable = rasterizerDiscardEnable;
6003
6004 state->dirty |= RADV_CMD_DIRTY_DYNAMIC_RASTERIZER_DISCARD_ENABLE;
6005 }
6006
VKAPI_ATTR void VKAPI_CALL
radv_CmdSetPatchControlPointsEXT(VkCommandBuffer commandBuffer, uint32_t patchControlPoints)
{
   /* Intentional no-op: dynamic patch control points
    * (VK_EXT_extended_dynamic_state2's optional feature) are not supported by
    * this driver version; the entry point exists only to satisfy the
    * dispatch table.
    */
   /* not implemented */
}
6012
6013 VKAPI_ATTR void VKAPI_CALL
radv_CmdSetLogicOpEXT(VkCommandBuffer commandBuffer,VkLogicOp logicOp)6014 radv_CmdSetLogicOpEXT(VkCommandBuffer commandBuffer, VkLogicOp logicOp)
6015 {
6016 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
6017 struct radv_cmd_state *state = &cmd_buffer->state;
6018 unsigned logic_op = si_translate_blend_logic_op(logicOp);
6019
6020 state->dynamic.logic_op = logic_op;
6021
6022 state->dirty |= RADV_CMD_DIRTY_DYNAMIC_LOGIC_OP;
6023 }
6024
6025 VKAPI_ATTR void VKAPI_CALL
radv_CmdSetColorWriteEnableEXT(VkCommandBuffer commandBuffer,uint32_t attachmentCount,const VkBool32 * pColorWriteEnables)6026 radv_CmdSetColorWriteEnableEXT(VkCommandBuffer commandBuffer, uint32_t attachmentCount,
6027 const VkBool32 *pColorWriteEnables)
6028 {
6029 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
6030 struct radv_cmd_state *state = &cmd_buffer->state;
6031 uint32_t color_write_enable = 0;
6032
6033 assert(attachmentCount <= MAX_RTS);
6034
6035 for (uint32_t i = 0; i < attachmentCount; i++) {
6036 color_write_enable |= pColorWriteEnables[i] ? (0xfu << (i * 4)) : 0;
6037 }
6038
6039 state->dynamic.color_write_enable = color_write_enable;
6040
6041 state->dirty |= RADV_CMD_DIRTY_DYNAMIC_COLOR_WRITE_ENABLE;
6042 }
6043
VKAPI_ATTR void VKAPI_CALL
radv_CmdSetVertexInputEXT(VkCommandBuffer commandBuffer, uint32_t vertexBindingDescriptionCount,
                          const VkVertexInputBindingDescription2EXT *pVertexBindingDescriptions,
                          uint32_t vertexAttributeDescriptionCount,
                          const VkVertexInputAttributeDescription2EXT *pVertexAttributeDescriptions)
{
   /* Rebuild the dynamic vertex-input state from scratch: per-location
    * binding/offset/format data, instance-rate divisors, a per-format HW
    * translation (cached per command buffer), and the misalignment mask used
    * by GFX6/GFX10+ which require aligned typed buffer loads.
    */
   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   struct radv_vs_input_state *state = &cmd_buffer->state.dynamic_vs_input;

   /* Index binding descriptions by binding number. Entries for bindings not
    * described here stay uninitialized; the spec requires every attribute's
    * binding to be present in pVertexBindingDescriptions, so only valid
    * entries are read below.
    */
   const VkVertexInputBindingDescription2EXT *bindings[MAX_VBS];
   for (unsigned i = 0; i < vertexBindingDescriptionCount; i++)
      bindings[pVertexBindingDescriptions[i].binding] = &pVertexBindingDescriptions[i];

   cmd_buffer->state.vbo_misaligned_mask = 0;
   cmd_buffer->state.vbo_misaligned_mask_invalid = 0;

   memset(state, 0, sizeof(*state));
   state->bindings_match_attrib = true;

   enum amd_gfx_level chip = cmd_buffer->device->physical_device->rad_info.gfx_level;
   for (unsigned i = 0; i < vertexAttributeDescriptionCount; i++) {
      const VkVertexInputAttributeDescription2EXT *attrib = &pVertexAttributeDescriptions[i];
      const VkVertexInputBindingDescription2EXT *binding = bindings[attrib->binding];
      unsigned loc = attrib->location;

      state->attribute_mask |= 1u << loc;
      state->bindings[loc] = attrib->binding;
      /* Fast path flag: true while every attribute's binding equals its location. */
      if (attrib->binding != loc)
         state->bindings_match_attrib = false;
      if (binding->inputRate == VK_VERTEX_INPUT_RATE_INSTANCE) {
         state->instance_rate_inputs |= 1u << loc;
         state->divisors[loc] = binding->divisor;
         /* Divisor 0 (all instances use element 0) and divisors > 1 need
          * special handling in the vertex shader prolog.
          */
         if (binding->divisor == 0) {
            state->zero_divisors |= 1u << loc;
         } else if (binding->divisor > 1) {
            state->nontrivial_divisors |= 1u << loc;
         }
      }
      /* This entry point also supplies per-binding strides. */
      cmd_buffer->vertex_bindings[attrib->binding].stride = binding->stride;
      state->offsets[loc] = attrib->offset;

      /* Look up (or compute and cache) the HW translation of this Vulkan
       * format; the cache avoids re-running radv_translate_vertex_format for
       * formats already seen by this command buffer.
       */
      struct dynamic_vertex_format_cache *found = NULL;
      util_dynarray_foreach(&cmd_buffer->cached_vertex_formats,
                            struct dynamic_vertex_format_cache,
                            vf) {
         if (vf->format == attrib->format) {
            found = vf;
            break;
         }
      }
      if (!found) {
         unsigned nfmt, dfmt;
         bool post_shuffle;
         enum radv_vs_input_alpha_adjust alpha_adjust;
         const struct util_format_description *format_desc = vk_format_description(attrib->format);

         found = util_dynarray_grow(&cmd_buffer->cached_vertex_formats,
                                    struct dynamic_vertex_format_cache, 1);
         radv_translate_vertex_format(cmd_buffer->device->physical_device, attrib->format, format_desc,
                                      &dfmt, &nfmt, &post_shuffle, &alpha_adjust);
         found->format = attrib->format;
         /* Pack data format (low 4 bits) and numeric format together. */
         found->hw_fmt = dfmt | (nfmt << 4);
         /* Alignment requirement for typed loads: 4 bytes for >=32-bit
          * channels, otherwise the format's total byte size.
          */
         const uint8_t format_align_req_minus_1 = format_desc->channel[0].size >= 32 ? 3 :
                                                  (format_desc->block.bits / 8u - 1);
         found->fmt_align_req_minus_1 = format_align_req_minus_1;
         found->fmt_size = format_desc->block.bits / 8u;
         found->post_shuffle = post_shuffle;
         found->alpha_adjust_lo = alpha_adjust & 0x1;
         found->alpha_adjust_hi = (alpha_adjust >> 1) & 0x1;
      }

      state->formats[loc] = found->hw_fmt;
      state->format_align_req_minus_1[loc] = found->fmt_align_req_minus_1;
      state->format_sizes[loc] = found->fmt_size;
      state->alpha_adjust_lo |= found->alpha_adjust_lo << loc;
      state->alpha_adjust_hi |= found->alpha_adjust_hi << loc;
      if (found->post_shuffle)
         state->post_shuffle |= 1u << loc;

      /* GFX6 and GFX10+ need aligned vertex fetches; record which locations
       * are misaligned (by stride or by offset) for currently bound VBOs so
       * the prolog can fall back to per-component loads.
       */
      if ((chip == GFX6 || chip >= GFX10) &&
          cmd_buffer->state.vbo_bound_mask & BITFIELD_BIT(attrib->binding)) {
         if (binding->stride & found->fmt_align_req_minus_1) {
            cmd_buffer->state.vbo_misaligned_mask |= BITFIELD_BIT(loc);
         } else if ((cmd_buffer->vertex_bindings[attrib->binding].offset + state->offsets[loc]) &
                    found->fmt_align_req_minus_1) {
            cmd_buffer->state.vbo_misaligned_mask |= BITFIELD_BIT(loc);
         }
      }
   }

   /* Strides changed above, so vertex buffer descriptors must be re-uploaded
    * along with the vertex-input state itself.
    */
   cmd_buffer->state.dirty |= RADV_CMD_DIRTY_VERTEX_BUFFER |
                              RADV_CMD_DIRTY_DYNAMIC_VERTEX_INPUT;
}
6137
VKAPI_ATTR void VKAPI_CALL
radv_CmdExecuteCommands(VkCommandBuffer commandBuffer, uint32_t commandBufferCount,
                        const VkCommandBuffer *pCmdBuffers)
{
   RADV_FROM_HANDLE(radv_cmd_buffer, primary, commandBuffer);

   assert(commandBufferCount > 0);

   radv_emit_mip_change_flush_default(primary);

   /* Emit pending flushes on primary prior to executing secondary */
   si_emit_cache_flush(primary);

   /* Make sure CP DMA is idle on primary prior to executing secondary. */
   si_cp_dma_wait_for_idle(primary);

   for (uint32_t i = 0; i < commandBufferCount; i++) {
      RADV_FROM_HANDLE(radv_cmd_buffer, secondary, pCmdBuffers[i]);
      bool allow_ib2 = true;

      if (secondary->device->physical_device->rad_info.gfx_level == GFX7 &&
          secondary->state.uses_draw_indirect_multi) {
         /* Do not launch an IB2 for secondary command buffers that contain
          * DRAW_{INDEX}_INDIRECT_MULTI on GFX7 because it's illegal and hang the GPU.
          */
         allow_ib2 = false;
      }

      if (secondary->qf == RADV_QUEUE_COMPUTE) {
         /* IB2 packets are not supported on compute queues according to PAL. */
         allow_ib2 = false;
      }

      /* Grow the primary's scratch/ring requirements to cover the secondary's,
       * so that submission allocates resources large enough for both. */
      primary->scratch_size_per_wave_needed =
         MAX2(primary->scratch_size_per_wave_needed, secondary->scratch_size_per_wave_needed);
      primary->scratch_waves_wanted =
         MAX2(primary->scratch_waves_wanted, secondary->scratch_waves_wanted);
      primary->compute_scratch_size_per_wave_needed =
         MAX2(primary->compute_scratch_size_per_wave_needed,
              secondary->compute_scratch_size_per_wave_needed);
      primary->compute_scratch_waves_wanted =
         MAX2(primary->compute_scratch_waves_wanted, secondary->compute_scratch_waves_wanted);

      if (secondary->esgs_ring_size_needed > primary->esgs_ring_size_needed)
         primary->esgs_ring_size_needed = secondary->esgs_ring_size_needed;
      if (secondary->gsvs_ring_size_needed > primary->gsvs_ring_size_needed)
         primary->gsvs_ring_size_needed = secondary->gsvs_ring_size_needed;
      if (secondary->tess_rings_needed)
         primary->tess_rings_needed = true;
      if (secondary->task_rings_needed)
         primary->task_rings_needed = true;
      if (secondary->mesh_scratch_ring_needed)
         primary->mesh_scratch_ring_needed = true;
      if (secondary->sample_positions_needed)
         primary->sample_positions_needed = true;
      if (secondary->gds_needed)
         primary->gds_needed = true;

      if (!secondary->state.framebuffer && primary->state.pass && (primary->state.dirty & RADV_CMD_DIRTY_FRAMEBUFFER)) {
         /* Emit the framebuffer state from primary if secondary
          * has been recorded without a framebuffer, otherwise
          * fast color/depth clears can't work.
          */
         radv_emit_fb_mip_change_flush(primary);
         radv_emit_framebuffer_state(primary);
      }

      if (secondary->ace_internal.cs) {
         /* Lazily create the primary's internal ACE (compute) cmdbuf the
          * first time a secondary needs it. */
         if (!primary->ace_internal.cs) {
            primary->ace_internal.cs = radv_ace_internal_create(primary);
            if (!primary->ace_internal.cs)
               return;
         }

         struct radeon_cmdbuf *ace_primary = primary->ace_internal.cs;
         struct radeon_cmdbuf *ace_secondary = secondary->ace_internal.cs;

         /* Emit pending flushes on primary prior to executing secondary. */
         radv_ace_internal_cache_flush(primary);

         /* Wait for primary GFX->ACE semaphore, if necessary. */
         if (radv_flush_gfx2ace_semaphore(primary))
            radv_wait_gfx2ace_semaphore(primary);

         /* Execute the secondary compute cmdbuf.
          * Don't use IB2 packets because they are not supported on compute queues.
          */
         primary->device->ws->cs_execute_secondary(ace_primary, ace_secondary, false);
      }

      /* Update pending ACE internal flush bits from the secondary cmdbuf */
      primary->ace_internal.flush_bits |= secondary->ace_internal.flush_bits;

      /* Increment primary semaphore if secondary was dirty.
       * This happens when the secondary cmdbuf has a barrier which
       * isn't consumed by a draw call.
       */
      if (radv_ace_internal_sem_dirty(secondary))
         primary->ace_internal.sem.gfx2ace_value++;

      primary->device->ws->cs_execute_secondary(primary->cs, secondary->cs, allow_ib2);

      /* When the secondary command buffer is compute only we don't
       * need to re-emit the current graphics pipeline.
       */
      if (secondary->state.emitted_graphics_pipeline) {
         primary->state.emitted_graphics_pipeline = secondary->state.emitted_graphics_pipeline;
      }

      /* When the secondary command buffer is graphics only we don't
       * need to re-emit the current compute pipeline.
       */
      if (secondary->state.emitted_compute_pipeline) {
         primary->state.emitted_compute_pipeline = secondary->state.emitted_compute_pipeline;
      }

      /* Only re-emit the draw packets when needed. */
      if (secondary->state.last_primitive_reset_en != -1) {
         primary->state.last_primitive_reset_en = secondary->state.last_primitive_reset_en;
      }

      if (secondary->state.last_primitive_reset_index) {
         primary->state.last_primitive_reset_index = secondary->state.last_primitive_reset_index;
      }

      if (secondary->state.last_ia_multi_vgt_param) {
         primary->state.last_ia_multi_vgt_param = secondary->state.last_ia_multi_vgt_param;
      }

      /* Adopt the secondary's last-emitted draw state so the primary's
       * redundant-state elimination stays in sync with the GPU. */
      primary->state.last_first_instance = secondary->state.last_first_instance;
      primary->state.last_num_instances = secondary->state.last_num_instances;
      primary->state.last_drawid = secondary->state.last_drawid;
      primary->state.last_subpass_color_count = secondary->state.last_subpass_color_count;
      primary->state.last_vertex_offset = secondary->state.last_vertex_offset;
      primary->state.last_sx_ps_downconvert = secondary->state.last_sx_ps_downconvert;
      primary->state.last_sx_blend_opt_epsilon = secondary->state.last_sx_blend_opt_epsilon;
      primary->state.last_sx_blend_opt_control = secondary->state.last_sx_blend_opt_control;

      if (secondary->state.last_index_type != -1) {
         primary->state.last_index_type = secondary->state.last_index_type;
      }

      primary->state.last_nggc_settings = secondary->state.last_nggc_settings;
      primary->state.last_nggc_settings_sgpr_idx = secondary->state.last_nggc_settings_sgpr_idx;
      primary->state.last_nggc_skip = secondary->state.last_nggc_skip;

      primary->state.last_vrs_rates = secondary->state.last_vrs_rates;
      primary->state.last_vrs_rates_sgpr_idx = secondary->state.last_vrs_rates_sgpr_idx;
   }

   /* After executing commands from secondary buffers we have to dirty
    * some states.
    */
   primary->state.dirty |=
      RADV_CMD_DIRTY_PIPELINE | RADV_CMD_DIRTY_INDEX_BUFFER | RADV_CMD_DIRTY_DYNAMIC_ALL;
   radv_mark_descriptor_sets_dirty(primary, VK_PIPELINE_BIND_POINT_GRAPHICS);
   radv_mark_descriptor_sets_dirty(primary, VK_PIPELINE_BIND_POINT_COMPUTE);
}
6296
6297 VKAPI_ATTR VkResult VKAPI_CALL
radv_CreateCommandPool(VkDevice _device,const VkCommandPoolCreateInfo * pCreateInfo,const VkAllocationCallbacks * pAllocator,VkCommandPool * pCmdPool)6298 radv_CreateCommandPool(VkDevice _device, const VkCommandPoolCreateInfo *pCreateInfo,
6299 const VkAllocationCallbacks *pAllocator, VkCommandPool *pCmdPool)
6300 {
6301 RADV_FROM_HANDLE(radv_device, device, _device);
6302 struct radv_cmd_pool *pool;
6303
6304 pool =
6305 vk_alloc2(&device->vk.alloc, pAllocator, sizeof(*pool), 8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
6306 if (pool == NULL)
6307 return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
6308
6309 VkResult result = vk_command_pool_init(&pool->vk, &device->vk, pCreateInfo, pAllocator);
6310 if (result != VK_SUCCESS) {
6311 vk_free2(&device->vk.alloc, pAllocator, pool);
6312 return result;
6313 }
6314
6315 list_inithead(&pool->cmd_buffers);
6316 list_inithead(&pool->free_cmd_buffers);
6317
6318 *pCmdPool = radv_cmd_pool_to_handle(pool);
6319
6320 return VK_SUCCESS;
6321 }
6322
6323 VKAPI_ATTR void VKAPI_CALL
radv_DestroyCommandPool(VkDevice _device,VkCommandPool commandPool,const VkAllocationCallbacks * pAllocator)6324 radv_DestroyCommandPool(VkDevice _device, VkCommandPool commandPool,
6325 const VkAllocationCallbacks *pAllocator)
6326 {
6327 RADV_FROM_HANDLE(radv_device, device, _device);
6328 RADV_FROM_HANDLE(radv_cmd_pool, pool, commandPool);
6329
6330 if (!pool)
6331 return;
6332
6333 list_for_each_entry_safe(struct radv_cmd_buffer, cmd_buffer, &pool->cmd_buffers, pool_link)
6334 {
6335 radv_destroy_cmd_buffer(cmd_buffer);
6336 }
6337
6338 list_for_each_entry_safe(struct radv_cmd_buffer, cmd_buffer, &pool->free_cmd_buffers, pool_link)
6339 {
6340 radv_destroy_cmd_buffer(cmd_buffer);
6341 }
6342
6343 vk_command_pool_finish(&pool->vk);
6344 vk_free2(&device->vk.alloc, pAllocator, pool);
6345 }
6346
6347 VKAPI_ATTR VkResult VKAPI_CALL
radv_ResetCommandPool(VkDevice device,VkCommandPool commandPool,VkCommandPoolResetFlags flags)6348 radv_ResetCommandPool(VkDevice device, VkCommandPool commandPool, VkCommandPoolResetFlags flags)
6349 {
6350 RADV_FROM_HANDLE(radv_cmd_pool, pool, commandPool);
6351 VkResult result;
6352
6353 list_for_each_entry(struct radv_cmd_buffer, cmd_buffer, &pool->cmd_buffers, pool_link)
6354 {
6355 result = radv_reset_cmd_buffer(cmd_buffer);
6356 if (result != VK_SUCCESS)
6357 return result;
6358 }
6359
6360 return VK_SUCCESS;
6361 }
6362
6363 VKAPI_ATTR void VKAPI_CALL
radv_TrimCommandPool(VkDevice device,VkCommandPool commandPool,VkCommandPoolTrimFlags flags)6364 radv_TrimCommandPool(VkDevice device, VkCommandPool commandPool, VkCommandPoolTrimFlags flags)
6365 {
6366 RADV_FROM_HANDLE(radv_cmd_pool, pool, commandPool);
6367
6368 list_for_each_entry_safe(struct radv_cmd_buffer, cmd_buffer, &pool->free_cmd_buffers, pool_link)
6369 {
6370 radv_destroy_cmd_buffer(cmd_buffer);
6371 }
6372 }
6373
static void
radv_cmd_buffer_begin_subpass(struct radv_cmd_buffer *cmd_buffer, uint32_t subpass_id)
{
   struct radv_cmd_state *state = &cmd_buffer->state;
   struct radv_subpass *subpass = &state->pass->subpasses[subpass_id];

   /* Reserve CS space up front; the assert at the end checks we stayed
    * within the reservation. */
   ASSERTED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 4096);

   /* Execute the subpass start barrier before any other subpass work. */
   radv_emit_subpass_barrier(cmd_buffer, &subpass->start_barrier);

   radv_cmd_buffer_set_subpass(cmd_buffer, subpass);

   radv_describe_barrier_start(cmd_buffer, RGP_BARRIER_EXTERNAL_RENDER_PASS_SYNC);

   /* Handle image layout transitions for every attachment used by this subpass. */
   for (uint32_t i = 0; i < subpass->attachment_count; ++i) {
      const uint32_t a = subpass->attachments[i].attachment;
      if (a == VK_ATTACHMENT_UNUSED)
         continue;

      radv_handle_subpass_image_transition(cmd_buffer, subpass->attachments[i], true);
   }

   radv_ace_internal_barrier(cmd_buffer, 0, 0);
   radv_describe_barrier_end(cmd_buffer);

   radv_cmd_buffer_clear_subpass(cmd_buffer);

   if (subpass->vrs_attachment) {
      int idx = subpass->vrs_attachment->attachment;
      struct radv_image_view *vrs_iview = cmd_buffer->state.attachments[idx].iview;

      if (subpass->depth_stencil_attachment) {
         /* When a subpass uses a VRS attachment and a depth/stencil attachment, we just need to
          * copy the VRS rates to the HTILE buffer of the attachment.
          */
         int ds_idx = subpass->depth_stencil_attachment->attachment;
         struct radv_image_view *ds_iview = cmd_buffer->state.attachments[ds_idx].iview;
         struct radv_image *ds_image = ds_iview->image;
         uint32_t level = ds_iview->vk.base_mip_level;

         /* Extent of the depth/stencil mip level whose HTILE we target. */
         VkExtent2D extent = {
            .width = radv_minify(ds_image->info.width, level),
            .height = radv_minify(ds_image->info.height, level),
         };

         /* HTILE buffer: wrap the mip level's metadata range of the image's BO
          * in a temporary buffer object. */
         uint64_t htile_offset = ds_image->bindings[0].offset + ds_image->planes[0].surface.meta_offset +
                                 ds_image->planes[0].surface.u.gfx9.meta_levels[level].offset;
         uint64_t htile_size = ds_image->planes[0].surface.u.gfx9.meta_levels[level].size;
         struct radv_buffer htile_buffer;

         radv_buffer_init(&htile_buffer, cmd_buffer->device, ds_image->bindings[0].bo, htile_size, htile_offset);

         /* Copy the VRS rates to the HTILE buffer. */
         radv_copy_vrs_htile(cmd_buffer, vrs_iview->image, &extent, ds_image, &htile_buffer, true);

         radv_buffer_finish(&htile_buffer);
      } else {
         /* When a subpass uses a VRS attachment without binding a depth/stencil attachment, we have
          * to copy the VRS rates to our internal HTILE buffer.
          */
         struct vk_framebuffer *fb = cmd_buffer->state.framebuffer;
         struct radv_image *ds_image = radv_cmd_buffer_get_vrs_image(cmd_buffer);

         if (ds_image) {
            /* HTILE buffer */
            struct radv_buffer *htile_buffer = cmd_buffer->device->vrs.buffer;

            /* Clamp to the internal VRS image size in case the framebuffer is larger. */
            VkExtent2D extent = {
               .width = MIN2(fb->width, ds_image->info.width),
               .height = MIN2(fb->height, ds_image->info.height),
            };

            /* Copy the VRS rates to the HTILE buffer. */
            radv_copy_vrs_htile(cmd_buffer, vrs_iview->image, &extent, ds_image, htile_buffer, false);
         }
      }
   }

   assert(cmd_buffer->cs->cdw <= cdw_max);
}
6455
6456 static void
radv_mark_noncoherent_rb(struct radv_cmd_buffer * cmd_buffer)6457 radv_mark_noncoherent_rb(struct radv_cmd_buffer *cmd_buffer)
6458 {
6459 const struct radv_subpass *subpass = cmd_buffer->state.subpass;
6460
6461 /* Have to be conservative in cmdbuffers with inherited attachments. */
6462 if (!cmd_buffer->state.attachments) {
6463 cmd_buffer->state.rb_noncoherent_dirty = true;
6464 return;
6465 }
6466
6467 for (uint32_t i = 0; i < subpass->color_count; ++i) {
6468 const uint32_t a = subpass->color_attachments[i].attachment;
6469 if (a == VK_ATTACHMENT_UNUSED)
6470 continue;
6471 if (!cmd_buffer->state.attachments[a].iview->image->l2_coherent) {
6472 cmd_buffer->state.rb_noncoherent_dirty = true;
6473 return;
6474 }
6475 }
6476 if (subpass->depth_stencil_attachment &&
6477 !cmd_buffer->state.attachments[subpass->depth_stencil_attachment->attachment]
6478 .iview->image->l2_coherent)
6479 cmd_buffer->state.rb_noncoherent_dirty = true;
6480 }
6481
void
radv_cmd_buffer_restore_subpass(struct radv_cmd_buffer *cmd_buffer,
                                const struct radv_subpass *subpass)
{
   /* Re-entering a subpass: re-evaluate render backend coherency first,
    * then rebind the subpass state. */
   radv_mark_noncoherent_rb(cmd_buffer);
   radv_cmd_buffer_set_subpass(cmd_buffer, subpass);
}
6489
6490 static void
radv_cmd_buffer_end_subpass(struct radv_cmd_buffer * cmd_buffer)6491 radv_cmd_buffer_end_subpass(struct radv_cmd_buffer *cmd_buffer)
6492 {
6493 struct radv_cmd_state *state = &cmd_buffer->state;
6494 const struct radv_subpass *subpass = state->subpass;
6495 uint32_t subpass_id = radv_get_subpass_id(cmd_buffer);
6496
6497 radv_cmd_buffer_resolve_subpass(cmd_buffer);
6498
6499 radv_describe_barrier_start(cmd_buffer, RGP_BARRIER_EXTERNAL_RENDER_PASS_SYNC);
6500
6501 for (uint32_t i = 0; i < subpass->attachment_count; ++i) {
6502 const uint32_t a = subpass->attachments[i].attachment;
6503 if (a == VK_ATTACHMENT_UNUSED)
6504 continue;
6505
6506 if (state->pass->attachments[a].last_subpass_idx != subpass_id)
6507 continue;
6508
6509 VkImageLayout layout = state->pass->attachments[a].final_layout;
6510 VkImageLayout stencil_layout = state->pass->attachments[a].stencil_final_layout;
6511 struct radv_subpass_attachment att = {a, layout, stencil_layout};
6512 radv_handle_subpass_image_transition(cmd_buffer, att, false);
6513 }
6514
6515 radv_ace_internal_barrier(cmd_buffer, 0, 0);
6516 radv_describe_barrier_end(cmd_buffer);
6517 }
6518
6519 VKAPI_ATTR void VKAPI_CALL
radv_CmdBeginRenderPass2(VkCommandBuffer commandBuffer,const VkRenderPassBeginInfo * pRenderPassBeginInfo,const VkSubpassBeginInfo * pSubpassBeginInfo)6520 radv_CmdBeginRenderPass2(VkCommandBuffer commandBuffer,
6521 const VkRenderPassBeginInfo *pRenderPassBeginInfo,
6522 const VkSubpassBeginInfo *pSubpassBeginInfo)
6523 {
6524 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
6525 RADV_FROM_HANDLE(radv_render_pass, pass, pRenderPassBeginInfo->renderPass);
6526 RADV_FROM_HANDLE(vk_framebuffer, framebuffer, pRenderPassBeginInfo->framebuffer);
6527 VkResult result;
6528
6529 cmd_buffer->state.framebuffer = framebuffer;
6530 cmd_buffer->state.pass = pass;
6531 cmd_buffer->state.render_area = pRenderPassBeginInfo->renderArea;
6532
6533 result = radv_cmd_state_setup_attachments(cmd_buffer, pass, pRenderPassBeginInfo);
6534 if (result != VK_SUCCESS)
6535 return;
6536
6537 result = radv_cmd_state_setup_sample_locations(cmd_buffer, pass, pRenderPassBeginInfo);
6538 if (result != VK_SUCCESS)
6539 return;
6540
6541 radv_cmd_buffer_begin_subpass(cmd_buffer, 0);
6542 }
6543
6544 VKAPI_ATTR void VKAPI_CALL
radv_CmdNextSubpass2(VkCommandBuffer commandBuffer,const VkSubpassBeginInfo * pSubpassBeginInfo,const VkSubpassEndInfo * pSubpassEndInfo)6545 radv_CmdNextSubpass2(VkCommandBuffer commandBuffer, const VkSubpassBeginInfo *pSubpassBeginInfo,
6546 const VkSubpassEndInfo *pSubpassEndInfo)
6547 {
6548 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
6549
6550 radv_mark_noncoherent_rb(cmd_buffer);
6551
6552 uint32_t prev_subpass = radv_get_subpass_id(cmd_buffer);
6553 radv_cmd_buffer_end_subpass(cmd_buffer);
6554 radv_cmd_buffer_begin_subpass(cmd_buffer, prev_subpass + 1);
6555 }
6556
6557 static void
radv_emit_view_index_per_stage(struct radeon_cmdbuf * cs,struct radv_graphics_pipeline * pipeline,unsigned stage,unsigned index)6558 radv_emit_view_index_per_stage(struct radeon_cmdbuf *cs, struct radv_graphics_pipeline *pipeline,
6559 unsigned stage, unsigned index)
6560 {
6561 struct radv_userdata_info *loc = radv_lookup_user_sgpr(&pipeline->base, stage, AC_UD_VIEW_INDEX);
6562 if (loc->sgpr_idx == -1)
6563 return;
6564 uint32_t base_reg = pipeline->base.user_data_0[stage];
6565 radeon_set_sh_reg(cs, base_reg + loc->sgpr_idx * 4, index);
6566 }
6567
static void
radv_emit_view_index(struct radv_cmd_buffer *cmd_buffer, unsigned index)
{
   struct radv_graphics_pipeline *pipeline = cmd_buffer->state.graphics_pipeline;

   /* Emit the view index for every active stage on the GFX command stream;
    * the task shader is excluded here because it runs on the internal ACE
    * command stream (handled at the bottom). */
   radv_foreach_stage(stage, pipeline->active_stages & ~VK_SHADER_STAGE_TASK_BIT_NV) {
      radv_emit_view_index_per_stage(cmd_buffer->cs, pipeline, stage, index);
   }
   /* The GS copy shader is a separate shader object with its own user SGPR
    * layout, so its VIEW_INDEX SGPR is looked up and written directly. */
   if (radv_pipeline_has_gs_copy_shader(&pipeline->base)) {
      struct radv_userdata_info *loc =
         &pipeline->base.gs_copy_shader->info.user_sgprs_locs.shader_data[AC_UD_VIEW_INDEX];
      if (loc->sgpr_idx != -1) {
         uint32_t base_reg = R_00B130_SPI_SHADER_USER_DATA_VS_0;
         radeon_set_sh_reg(cmd_buffer->cs, base_reg + loc->sgpr_idx * 4, index);
      }
   }
   /* Task shaders execute on the internal ACE cmdbuf, not the GFX one. */
   if (pipeline->active_stages & VK_SHADER_STAGE_TASK_BIT_NV) {
      radv_emit_view_index_per_stage(cmd_buffer->ace_internal.cs, pipeline, MESA_SHADER_TASK,
                                     index);
   }
}
6589
6590 /**
6591 * Emulates predication for MEC using COND_EXEC.
6592 * When the current command buffer is predicating, emit a COND_EXEC packet
6593 * so that the MEC skips the next few dwords worth of packets.
6594 *
6595 * To make it work with inverted conditional rendering, we allocate
6596 * space in the upload BO and emit some packets to invert the condition.
6597 */
6598 static void
radv_cs_emit_compute_predication(struct radv_cmd_state * state,struct radeon_cmdbuf * cs,uint64_t inv_va,bool * inv_emitted,unsigned dwords)6599 radv_cs_emit_compute_predication(struct radv_cmd_state *state, struct radeon_cmdbuf *cs,
6600 uint64_t inv_va, bool *inv_emitted, unsigned dwords)
6601 {
6602 if (!state->predicating)
6603 return;
6604
6605 uint64_t va = state->predication_va;
6606
6607 if (!state->predication_type) {
6608 /* Invert the condition the first time it is needed. */
6609 if (!*inv_emitted) {
6610 *inv_emitted = true;
6611
6612 /* Write 1 to the inverted predication VA. */
6613 radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
6614 radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_IMM) | COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) |
6615 COPY_DATA_WR_CONFIRM);
6616 radeon_emit(cs, 1);
6617 radeon_emit(cs, 0);
6618 radeon_emit(cs, inv_va);
6619 radeon_emit(cs, inv_va >> 32);
6620
6621 /* If the API predication VA == 0, skip next command. */
6622 radeon_emit(cs, PKT3(PKT3_COND_EXEC, 3, 0));
6623 radeon_emit(cs, va);
6624 radeon_emit(cs, va >> 32);
6625 radeon_emit(cs, 0);
6626 radeon_emit(cs, 6); /* 1x COPY_DATA size */
6627
6628 /* Write 0 to the new predication VA (when the API condition != 0) */
6629 radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
6630 radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_IMM) | COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) |
6631 COPY_DATA_WR_CONFIRM);
6632 radeon_emit(cs, 0);
6633 radeon_emit(cs, 0);
6634 radeon_emit(cs, inv_va);
6635 radeon_emit(cs, inv_va >> 32);
6636 }
6637
6638 va = inv_va;
6639 }
6640
6641 radeon_emit(cs, PKT3(PKT3_COND_EXEC, 3, 0));
6642 radeon_emit(cs, va);
6643 radeon_emit(cs, va >> 32);
6644 radeon_emit(cs, 0); /* Cache policy */
6645 radeon_emit(cs, dwords); /* Size of the predicated packet(s) in DWORDs. */
6646 }
6647
6648 static void
radv_cs_emit_draw_packet(struct radv_cmd_buffer * cmd_buffer,uint32_t vertex_count,uint32_t use_opaque)6649 radv_cs_emit_draw_packet(struct radv_cmd_buffer *cmd_buffer, uint32_t vertex_count,
6650 uint32_t use_opaque)
6651 {
6652 radeon_emit(cmd_buffer->cs, PKT3(PKT3_DRAW_INDEX_AUTO, 1, cmd_buffer->state.predicating));
6653 radeon_emit(cmd_buffer->cs, vertex_count);
6654 radeon_emit(cmd_buffer->cs, V_0287F0_DI_SRC_SEL_AUTO_INDEX | use_opaque);
6655 }
6656
6657 /**
6658 * Emit a PKT3_DRAW_INDEX_2 packet to render "index_count` vertices.
6659 *
6660 * The starting address "index_va" may point anywhere within the index buffer. The number of
6661 * indexes allocated in the index buffer *past that point* is specified by "max_index_count".
6662 * Hardware uses this information to return 0 for out-of-bounds reads.
6663 */
6664 static void
radv_cs_emit_draw_indexed_packet(struct radv_cmd_buffer * cmd_buffer,uint64_t index_va,uint32_t max_index_count,uint32_t index_count,bool not_eop)6665 radv_cs_emit_draw_indexed_packet(struct radv_cmd_buffer *cmd_buffer, uint64_t index_va,
6666 uint32_t max_index_count, uint32_t index_count, bool not_eop)
6667 {
6668 radeon_emit(cmd_buffer->cs, PKT3(PKT3_DRAW_INDEX_2, 4, cmd_buffer->state.predicating));
6669 radeon_emit(cmd_buffer->cs, max_index_count);
6670 radeon_emit(cmd_buffer->cs, index_va);
6671 radeon_emit(cmd_buffer->cs, index_va >> 32);
6672 radeon_emit(cmd_buffer->cs, index_count);
6673 /* NOT_EOP allows merging multiple draws into 1 wave, but only user VGPRs
6674 * can be changed between draws and GS fast launch must be disabled.
6675 * NOT_EOP doesn't work on gfx9 and older.
6676 */
6677 radeon_emit(cmd_buffer->cs, V_0287F0_DI_SRC_SEL_DMA | S_0287F0_NOT_EOP(not_eop));
6678 }
6679
/* MUST inline this function to avoid massive perf loss in drawoverhead */
ALWAYS_INLINE static void
radv_cs_emit_indirect_draw_packet(struct radv_cmd_buffer *cmd_buffer, bool indexed,
                                  uint32_t draw_count, uint64_t count_va, uint32_t stride)
{
   struct radeon_cmdbuf *cs = cmd_buffer->cs;
   const unsigned di_src_sel = indexed ? V_0287F0_DI_SRC_SEL_DMA : V_0287F0_DI_SRC_SEL_AUTO_INDEX;
   bool draw_id_enable = cmd_buffer->state.graphics_pipeline->uses_drawid;
   uint32_t base_reg = cmd_buffer->state.graphics_pipeline->vtx_base_sgpr;
   uint32_t vertex_offset_reg, start_instance_reg = 0, draw_id_reg = 0;
   bool predicating = cmd_buffer->state.predicating;
   bool mesh = cmd_buffer->state.mesh_shading;
   assert(base_reg);

   /* just reset draw state for vertex data */
   cmd_buffer->state.last_first_instance = -1;
   cmd_buffer->state.last_num_instances = -1;
   cmd_buffer->state.last_drawid = -1;
   cmd_buffer->state.last_vertex_offset = -1;

   /* The packet takes user SGPR addresses as dword indices relative to
    * SI_SH_REG_OFFSET, hence the (reg - SI_SH_REG_OFFSET) >> 2 conversion. */
   vertex_offset_reg = (base_reg - SI_SH_REG_OFFSET) >> 2;
   if (cmd_buffer->state.graphics_pipeline->uses_baseinstance)
      start_instance_reg = ((base_reg + (draw_id_enable ? 8 : 4)) - SI_SH_REG_OFFSET) >> 2;
   if (draw_id_enable)
      /* NOTE(review): mesh pipelines appear to place the draw id 3 dwords
       * further into the user SGPR block (mesh * 12) — confirm against the
       * pipeline's vtx SGPR layout. */
      draw_id_reg = ((base_reg + mesh * 12 + 4) - SI_SH_REG_OFFSET) >> 2;

   if (draw_count == 1 && !count_va && !draw_id_enable) {
      /* Single draw without count buffer or draw id: the cheaper
       * non-MULTI packet suffices. */
      radeon_emit(cs,
                  PKT3(indexed ? PKT3_DRAW_INDEX_INDIRECT : PKT3_DRAW_INDIRECT, 3, predicating));
      radeon_emit(cs, 0);
      radeon_emit(cs, vertex_offset_reg);
      radeon_emit(cs, start_instance_reg);
      radeon_emit(cs, di_src_sel);
   } else {
      radeon_emit(cs, PKT3(indexed ? PKT3_DRAW_INDEX_INDIRECT_MULTI : PKT3_DRAW_INDIRECT_MULTI, 8,
                           predicating));
      radeon_emit(cs, 0);
      radeon_emit(cs, vertex_offset_reg);
      radeon_emit(cs, start_instance_reg);
      radeon_emit(cs, draw_id_reg | S_2C3_DRAW_INDEX_ENABLE(draw_id_enable) |
                         S_2C3_COUNT_INDIRECT_ENABLE(!!count_va));
      radeon_emit(cs, draw_count); /* count */
      radeon_emit(cs, count_va);   /* count_addr */
      radeon_emit(cs, count_va >> 32);
      radeon_emit(cs, stride); /* stride */
      radeon_emit(cs, di_src_sel);

      /* Remember that a *_MULTI packet was used: this forbids IB2 execution
       * of this cmdbuf on GFX7 (see radv_CmdExecuteCommands). */
      cmd_buffer->state.uses_draw_indirect_multi = true;
   }
}
6730
/* Emit a direct task shader dispatch on the internal ACE command stream. */
ALWAYS_INLINE static void
radv_cs_emit_dispatch_taskmesh_direct_ace_packet(struct radv_cmd_buffer *cmd_buffer,
                                                 const uint32_t x, const uint32_t y,
                                                 const uint32_t z)
{
   struct radv_pipeline *pipeline = &cmd_buffer->state.graphics_pipeline->base;
   struct radv_shader *compute_shader = radv_get_shader(pipeline, MESA_SHADER_TASK);
   struct radeon_cmdbuf *cs = cmd_buffer->ace_internal.cs;
   const bool predicating = cmd_buffer->state.predicating;
   const uint32_t dispatch_initiator = cmd_buffer->device->dispatch_initiator_task |
                                       S_00B800_CS_W32_EN(compute_shader->info.wave_size == 32);

   /* The task shader must expose a single-SGPR task ring entry slot; the
    * packet writes the ring entry index into it. */
   struct radv_userdata_info *ring_entry_loc =
      radv_lookup_user_sgpr(pipeline, MESA_SHADER_TASK, AC_UD_TASK_RING_ENTRY);
   assert(ring_entry_loc && ring_entry_loc->sgpr_idx != -1 && ring_entry_loc->num_sgprs == 1);

   /* Convert the register address to the dword index the packet expects. */
   uint32_t ring_entry_reg =
      (R_00B900_COMPUTE_USER_DATA_0 + ring_entry_loc->sgpr_idx * 4 - SI_SH_REG_OFFSET) >> 2;

   radeon_emit(cs, PKT3(PKT3_DISPATCH_TASKMESH_DIRECT_ACE, 4, predicating) | PKT3_SHADER_TYPE_S(1));
   radeon_emit(cs, x);
   radeon_emit(cs, y);
   radeon_emit(cs, z);
   radeon_emit(cs, dispatch_initiator);
   radeon_emit(cs, ring_entry_reg & 0xFFFF);
}
6757
/* Emit an indirect (optionally count-buffer driven) task shader dispatch on
 * the internal ACE command stream. */
ALWAYS_INLINE static void
radv_cs_emit_dispatch_taskmesh_indirect_multi_ace_packet(struct radv_cmd_buffer *cmd_buffer,
                                                         uint64_t data_va, uint32_t draw_count,
                                                         uint64_t count_va, uint32_t stride)
{
   /* The packet requires dword-aligned addresses. */
   assert((data_va & 0x03) == 0);
   assert((count_va & 0x03) == 0);

   struct radv_pipeline *pipeline = &cmd_buffer->state.graphics_pipeline->base;
   struct radv_shader *compute_shader = radv_get_shader(pipeline, MESA_SHADER_TASK);
   struct radeon_cmdbuf *cs = cmd_buffer->ace_internal.cs;

   const uint32_t count_indirect_enable = !!count_va;
   const uint32_t xyz_dim_enable = compute_shader->info.cs.uses_grid_size;
   /* NOTE(review): draw-id flag is read from info.vs for a task shader —
    * presumably a field shared across stages; confirm. */
   const uint32_t draw_id_enable = compute_shader->info.vs.needs_draw_id;
   const uint32_t dispatch_initiator = cmd_buffer->device->dispatch_initiator_task |
                                       S_00B800_CS_W32_EN(compute_shader->info.wave_size == 32);

   /* User SGPR slots the packet writes into: ring entry (mandatory),
    * grid size and draw id (each only if the shader consumes them). */
   const struct radv_userdata_info *ring_entry_loc =
      radv_lookup_user_sgpr(pipeline, MESA_SHADER_TASK, AC_UD_TASK_RING_ENTRY);
   const struct radv_userdata_info *xyz_dim_loc =
      radv_lookup_user_sgpr(pipeline, MESA_SHADER_TASK, AC_UD_CS_GRID_SIZE);
   const struct radv_userdata_info *draw_id_loc =
      radv_lookup_user_sgpr(pipeline, MESA_SHADER_TASK, AC_UD_CS_TASK_DRAW_ID);

   assert(ring_entry_loc->sgpr_idx != -1 && ring_entry_loc->num_sgprs == 1);
   assert(!xyz_dim_enable || (xyz_dim_loc->sgpr_idx != -1 && xyz_dim_loc->num_sgprs == 3));
   assert(!draw_id_enable || (draw_id_loc->sgpr_idx != -1 && draw_id_loc->num_sgprs == 1));

   /* Convert register addresses to the dword indices the packet expects. */
   const uint32_t ring_entry_reg =
      (R_00B900_COMPUTE_USER_DATA_0 + ring_entry_loc->sgpr_idx * 4 - SI_SH_REG_OFFSET) >> 2;
   const uint32_t xyz_dim_reg =
      !xyz_dim_enable
         ? 0
         : (R_00B900_COMPUTE_USER_DATA_0 + xyz_dim_loc->sgpr_idx * 4 - SI_SH_REG_OFFSET) >> 2;
   const uint32_t draw_id_reg =
      !draw_id_enable
         ? 0
         : (R_00B900_COMPUTE_USER_DATA_0 + draw_id_loc->sgpr_idx * 4 - SI_SH_REG_OFFSET) >> 2;

   radeon_emit(cs, PKT3(PKT3_DISPATCH_TASKMESH_INDIRECT_MULTI_ACE, 9, 0) | PKT3_SHADER_TYPE_S(1));
   radeon_emit(cs, data_va);
   radeon_emit(cs, data_va >> 32);
   radeon_emit(cs, ring_entry_reg & 0xFFFF);
   radeon_emit(cs, (count_indirect_enable << 1) | (draw_id_enable << 2) | (xyz_dim_enable << 3) |
                      (draw_id_reg << 16));
   radeon_emit(cs, xyz_dim_reg & 0xFFFF);
   radeon_emit(cs, draw_count);
   radeon_emit(cs, count_va);
   radeon_emit(cs, count_va >> 32);
   radeon_emit(cs, stride);
   radeon_emit(cs, dispatch_initiator);
}
6811
/* Emit the GFX-side half of a task+mesh dispatch: the mesh shader launch
 * that consumes the task ring entries produced on the ACE queue. */
ALWAYS_INLINE static void
radv_cs_emit_dispatch_taskmesh_gfx_packet(struct radv_cmd_buffer *cmd_buffer)
{
   struct radv_pipeline *pipeline = &cmd_buffer->state.graphics_pipeline->base;
   struct radeon_cmdbuf *cs = cmd_buffer->cs;
   bool predicating = cmd_buffer->state.predicating;

   struct radv_userdata_info *ring_entry_loc =
      radv_lookup_user_sgpr(pipeline, MESA_SHADER_MESH, AC_UD_TASK_RING_ENTRY);

   assert(ring_entry_loc && ring_entry_loc->sgpr_idx != -1);

   /* Convert register addresses to dword indices for the packet.
    * NOTE(review): xyz_dim is assumed to live at base_reg + 4 — presumably
    * fixed by the mesh pipeline's vtx SGPR layout; confirm. */
   uint32_t base_reg = cmd_buffer->state.graphics_pipeline->vtx_base_sgpr;
   uint32_t xyz_dim_reg = ((base_reg + 4) - SI_SH_REG_OFFSET) >> 2;
   uint32_t ring_entry_reg = ((base_reg + ring_entry_loc->sgpr_idx * 4) - SI_SH_REG_OFFSET) >> 2;

   radeon_emit(cs, PKT3(PKT3_DISPATCH_TASKMESH_GFX, 2, predicating));
   radeon_emit(cs, (ring_entry_reg << 16) | (xyz_dim_reg & 0xFFFF));
   radeon_emit(cs, 0);
   radeon_emit(cs, V_0287F0_DI_SRC_SEL_AUTO_INDEX);
}
6833
6834 static inline void
radv_emit_userdata_vertex_internal(struct radv_cmd_buffer * cmd_buffer,const struct radv_draw_info * info,const uint32_t vertex_offset)6835 radv_emit_userdata_vertex_internal(struct radv_cmd_buffer *cmd_buffer,
6836 const struct radv_draw_info *info, const uint32_t vertex_offset)
6837 {
6838 struct radv_cmd_state *state = &cmd_buffer->state;
6839 struct radeon_cmdbuf *cs = cmd_buffer->cs;
6840 const bool uses_baseinstance = state->graphics_pipeline->uses_baseinstance;
6841 const bool uses_drawid = state->graphics_pipeline->uses_drawid;
6842
6843 radeon_set_sh_reg_seq(cs, state->graphics_pipeline->vtx_base_sgpr, state->graphics_pipeline->vtx_emit_num);
6844
6845 radeon_emit(cs, vertex_offset);
6846 state->last_vertex_offset = vertex_offset;
6847 if (uses_drawid) {
6848 radeon_emit(cs, 0);
6849 state->last_drawid = 0;
6850 }
6851 if (uses_baseinstance) {
6852 radeon_emit(cs, info->first_instance);
6853 state->last_first_instance = info->first_instance;
6854 }
6855 }
6856
/* Emit the vertex-stage user SGPRs only when one of the tracked values
 * (vertex offset, draw id, base instance) actually changed since the
 * last draw; otherwise emit nothing.
 */
ALWAYS_INLINE static void
radv_emit_userdata_vertex(struct radv_cmd_buffer *cmd_buffer, const struct radv_draw_info *info,
                          const uint32_t vertex_offset)
{
   const struct radv_cmd_state *state = &cmd_buffer->state;
   const bool uses_baseinstance = state->graphics_pipeline->uses_baseinstance;
   const bool uses_drawid = state->graphics_pipeline->uses_drawid;

   /* this looks very dumb, but it allows the compiler to optimize better and yields
    * ~3-4% perf increase in drawoverhead
    */
   if (vertex_offset != state->last_vertex_offset) {
      radv_emit_userdata_vertex_internal(cmd_buffer, info, vertex_offset);
   } else if (uses_drawid && 0 != state->last_drawid) {
      radv_emit_userdata_vertex_internal(cmd_buffer, info, vertex_offset);
   } else if (uses_baseinstance && info->first_instance != state->last_first_instance) {
      radv_emit_userdata_vertex_internal(cmd_buffer, info, vertex_offset);
   }
}
6876
6877 ALWAYS_INLINE static void
radv_emit_userdata_vertex_drawid(struct radv_cmd_buffer * cmd_buffer,uint32_t vertex_offset,uint32_t drawid)6878 radv_emit_userdata_vertex_drawid(struct radv_cmd_buffer *cmd_buffer, uint32_t vertex_offset, uint32_t drawid)
6879 {
6880 struct radv_cmd_state *state = &cmd_buffer->state;
6881 struct radeon_cmdbuf *cs = cmd_buffer->cs;
6882 radeon_set_sh_reg_seq(cs, state->graphics_pipeline->vtx_base_sgpr, 1 + !!drawid);
6883 radeon_emit(cs, vertex_offset);
6884 state->last_vertex_offset = vertex_offset;
6885 if (drawid)
6886 radeon_emit(cs, drawid);
6887
6888 }
6889
/* Emit the mesh shader user SGPRs: first task index, the XYZ dispatch
 * dimensions and, when the shader consumes it, a zeroed draw id.
 */
ALWAYS_INLINE static void
radv_emit_userdata_mesh(struct radv_cmd_buffer *cmd_buffer,
                        const uint32_t x, const uint32_t y, const uint32_t z,
                        const uint32_t first_task)
{
   struct radv_cmd_state *state = &cmd_buffer->state;
   struct radeon_cmdbuf *cs = cmd_buffer->cs;
   const bool uses_drawid = state->graphics_pipeline->uses_drawid;

   /* SGPR order: first_task, x, y, z, then the optional draw id. */
   radeon_set_sh_reg_seq(cs, state->graphics_pipeline->vtx_base_sgpr, state->graphics_pipeline->vtx_emit_num);
   radeon_emit(cs, first_task);
   radeon_emit(cs, x);
   radeon_emit(cs, y);
   radeon_emit(cs, z);

   /* Direct mesh draws always use draw id 0; remember it so redundant
    * emits can be skipped later.
    */
   if (uses_drawid) {
      radeon_emit(cs, 0);
      state->last_drawid = 0;
   }
}
6910
6911 ALWAYS_INLINE static void
radv_emit_userdata_mesh_first_task_0_draw_id_0(struct radv_cmd_buffer * cmd_buffer)6912 radv_emit_userdata_mesh_first_task_0_draw_id_0(struct radv_cmd_buffer *cmd_buffer)
6913 {
6914 struct radv_cmd_state *state = &cmd_buffer->state;
6915 struct radeon_cmdbuf *cs = cmd_buffer->cs;
6916 struct radv_graphics_pipeline *pipeline = state->graphics_pipeline;
6917 const bool uses_drawid = pipeline->uses_drawid;
6918
6919 radeon_set_sh_reg_seq(cs, pipeline->vtx_base_sgpr, 1);
6920 radeon_emit(cs, 0);
6921
6922 if (uses_drawid) {
6923 radeon_set_sh_reg_seq(cs, pipeline->vtx_base_sgpr + (pipeline->vtx_emit_num - 1) * 4, 1);
6924 radeon_emit(cs, 0);
6925 }
6926 }
6927
6928 ALWAYS_INLINE static void
radv_emit_userdata_task_ib_only(struct radv_cmd_buffer * cmd_buffer,uint64_t ib_va,uint32_t ib_stride)6929 radv_emit_userdata_task_ib_only(struct radv_cmd_buffer *cmd_buffer, uint64_t ib_va,
6930 uint32_t ib_stride)
6931 {
6932 struct radv_pipeline *pipeline = &cmd_buffer->state.graphics_pipeline->base;
6933 struct radeon_cmdbuf *cs = cmd_buffer->ace_internal.cs;
6934
6935 struct radv_userdata_info *task_ib_loc =
6936 radv_lookup_user_sgpr(pipeline, MESA_SHADER_TASK, AC_UD_CS_TASK_IB);
6937
6938 if (task_ib_loc->sgpr_idx != -1) {
6939 assert(task_ib_loc->num_sgprs == 3);
6940 unsigned task_ib_reg = R_00B900_COMPUTE_USER_DATA_0 + task_ib_loc->sgpr_idx * 4;
6941
6942 radeon_set_sh_reg_seq(cs, task_ib_reg, 3);
6943 radeon_emit(cs, ib_va);
6944 radeon_emit(cs, ib_va >> 32);
6945 radeon_emit(cs, ib_stride);
6946 }
6947 }
6948
/* Emit the task shader's user SGPRs on the internal ACE command stream:
 * grid size (XYZ), draw id, and the IB used for firstTask emulation.
 * Each group is only written if the shader declares the SGPRs.
 */
ALWAYS_INLINE static void
radv_emit_userdata_task(struct radv_cmd_buffer *cmd_buffer, uint32_t x, uint32_t y, uint32_t z,
                        uint32_t draw_id, uint32_t first_task, uint64_t ib_va)
{
   struct radv_pipeline *pipeline = &cmd_buffer->state.graphics_pipeline->base;
   struct radeon_cmdbuf *cs = cmd_buffer->ace_internal.cs;

   struct radv_userdata_info *xyz_loc =
      radv_lookup_user_sgpr(pipeline, MESA_SHADER_TASK, AC_UD_CS_GRID_SIZE);
   struct radv_userdata_info *draw_id_loc =
      radv_lookup_user_sgpr(pipeline, MESA_SHADER_TASK, AC_UD_CS_TASK_DRAW_ID);

   if (xyz_loc->sgpr_idx != -1) {
      assert(xyz_loc->num_sgprs == 3);
      unsigned xyz_reg = R_00B900_COMPUTE_USER_DATA_0 + xyz_loc->sgpr_idx * 4;

      radeon_set_sh_reg_seq(cs, xyz_reg, 3);
      radeon_emit(cs, x);
      radeon_emit(cs, y);
      radeon_emit(cs, z);
   }

   if (draw_id_loc->sgpr_idx != -1) {
      assert(draw_id_loc->num_sgprs == 1);
      unsigned draw_id_reg = R_00B900_COMPUTE_USER_DATA_0 + draw_id_loc->sgpr_idx * 4;

      radeon_set_sh_reg_seq(cs, draw_id_reg, 1);
      radeon_emit(cs, draw_id);
   }

   /* A non-zero first_task means a fake IB (stride 8) is passed for the
    * firstTask emulation; stride 0 otherwise.
    */
   radv_emit_userdata_task_ib_only(cmd_buffer, ib_va, first_task ? 8 : 0);
}
6981
/* Emit indexed draw packets for a (multi-)draw.
 *
 * Four specialized paths keyed on two conditions:
 * - uses_drawid: the shader consumes gl_DrawID, which forces a per-draw
 *   SGPR update and disables the NOT_EOP draw-chaining optimization.
 * - vertexOffset: non-NULL when all draws share a single vertex offset
 *   (vkCmdDrawMultiIndexedEXT's pVertexOffset).
 */
ALWAYS_INLINE static void
radv_emit_draw_packets_indexed(struct radv_cmd_buffer *cmd_buffer,
                               const struct radv_draw_info *info,
                               uint32_t drawCount, const VkMultiDrawIndexedInfoEXT *minfo,
                               uint32_t stride,
                               const int32_t *vertexOffset)

{
   struct radv_cmd_state *state = &cmd_buffer->state;
   struct radeon_cmdbuf *cs = cmd_buffer->cs;
   const int index_size = radv_get_vgt_index_size(state->index_type);
   unsigned i = 0;
   const bool uses_drawid = state->graphics_pipeline->uses_drawid;
   /* NOT_EOP chaining is only used on GFX10+ and only when the draw id
    * SGPR does not have to change between consecutive draws.
    */
   const bool can_eop =
      !uses_drawid && cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX10;

   if (uses_drawid) {
      if (vertexOffset) {
         radv_emit_userdata_vertex(cmd_buffer, info, *vertexOffset);
         vk_foreach_multi_draw_indexed(draw, i, minfo, drawCount, stride) {
            /* MAX2 prevents underflow when firstIndex exceeds the bound index count. */
            const uint32_t remaining_indexes = MAX2(state->max_index_count, draw->firstIndex) - draw->firstIndex;

            /* Skip draw calls with 0-sized index buffers if the GPU can't handle them */
            if (!remaining_indexes &&
                cmd_buffer->device->physical_device->rad_info.has_zero_index_buffer_bug)
               continue;

            /* vtx_base_sgpr + 4 bytes is the draw id SGPR; draw 0 was zeroed above. */
            if (i > 0)
               radeon_set_sh_reg(cs, state->graphics_pipeline->vtx_base_sgpr + sizeof(uint32_t), i);

            const uint64_t index_va = state->index_va + draw->firstIndex * index_size;

            if (!state->subpass->view_mask) {
               radv_cs_emit_draw_indexed_packet(cmd_buffer, index_va, remaining_indexes, draw->indexCount, false);
            } else {
               u_foreach_bit(view, state->subpass->view_mask) {
                  radv_emit_view_index(cmd_buffer, view);

                  radv_cs_emit_draw_indexed_packet(cmd_buffer, index_va, remaining_indexes, draw->indexCount, false);
               }
            }
         }
      } else {
         vk_foreach_multi_draw_indexed(draw, i, minfo, drawCount, stride) {
            const uint32_t remaining_indexes = MAX2(state->max_index_count, draw->firstIndex) - draw->firstIndex;

            /* Skip draw calls with 0-sized index buffers if the GPU can't handle them */
            if (!remaining_indexes &&
                cmd_buffer->device->physical_device->rad_info.has_zero_index_buffer_bug)
               continue;

            /* Per-draw vertex offset: re-emit both SGPRs only when it changed,
             * otherwise just bump the draw id.
             */
            if (i > 0) {
               if (state->last_vertex_offset != draw->vertexOffset)
                  radv_emit_userdata_vertex_drawid(cmd_buffer, draw->vertexOffset, i);
               else
                  radeon_set_sh_reg(cs, state->graphics_pipeline->vtx_base_sgpr + sizeof(uint32_t), i);
            } else
               radv_emit_userdata_vertex(cmd_buffer, info, draw->vertexOffset);

            const uint64_t index_va = state->index_va + draw->firstIndex * index_size;

            if (!state->subpass->view_mask) {
               radv_cs_emit_draw_indexed_packet(cmd_buffer, index_va, remaining_indexes, draw->indexCount, false);
            } else {
               u_foreach_bit(view, state->subpass->view_mask) {
                  radv_emit_view_index(cmd_buffer, view);

                  radv_cs_emit_draw_indexed_packet(cmd_buffer, index_va, remaining_indexes, draw->indexCount, false);
               }
            }
         }
      }
      if (drawCount > 1) {
         state->last_drawid = drawCount - 1;
      }
   } else {
      if (vertexOffset) {
         if (cmd_buffer->device->physical_device->rad_info.gfx_level == GFX10) {
            /* GFX10 has a bug that consecutive draw packets with NOT_EOP must not have
             * count == 0 for the last draw that doesn't have NOT_EOP.
             */
            while (drawCount > 1) {
               const VkMultiDrawIndexedInfoEXT *last = (const VkMultiDrawIndexedInfoEXT*)(((const uint8_t*)minfo) + (drawCount - 1) * stride);
               if (last->indexCount)
                  break;
               drawCount--;
            }
         }

         radv_emit_userdata_vertex(cmd_buffer, info, *vertexOffset);
         vk_foreach_multi_draw_indexed(draw, i, minfo, drawCount, stride) {
            const uint32_t remaining_indexes = MAX2(state->max_index_count, draw->firstIndex) - draw->firstIndex;

            /* Skip draw calls with 0-sized index buffers if the GPU can't handle them */
            if (!remaining_indexes &&
                cmd_buffer->device->physical_device->rad_info.has_zero_index_buffer_bug)
               continue;

            const uint64_t index_va = state->index_va + draw->firstIndex * index_size;

            if (!state->subpass->view_mask) {
               /* All but the last draw can set NOT_EOP (multiview replays can't). */
               radv_cs_emit_draw_indexed_packet(cmd_buffer, index_va, remaining_indexes, draw->indexCount, can_eop && i < drawCount - 1);
            } else {
               u_foreach_bit(view, state->subpass->view_mask) {
                  radv_emit_view_index(cmd_buffer, view);

                  radv_cs_emit_draw_indexed_packet(cmd_buffer, index_va, remaining_indexes, draw->indexCount, false);
               }
            }
         }
      } else {
         vk_foreach_multi_draw_indexed(draw, i, minfo, drawCount, stride) {
            const uint32_t remaining_indexes = MAX2(state->max_index_count, draw->firstIndex) - draw->firstIndex;

            /* Skip draw calls with 0-sized index buffers if the GPU can't handle them */
            if (!remaining_indexes &&
                cmd_buffer->device->physical_device->rad_info.has_zero_index_buffer_bug)
               continue;

            /* NOT_EOP may only chain into the next draw if its vertex offset
             * is the same (no SGPR write in between).
             */
            const VkMultiDrawIndexedInfoEXT *next = (const VkMultiDrawIndexedInfoEXT*)(i < drawCount - 1 ? ((uint8_t*)draw + stride) : NULL);
            const bool offset_changes = next && next->vertexOffset != draw->vertexOffset;
            radv_emit_userdata_vertex(cmd_buffer, info, draw->vertexOffset);

            const uint64_t index_va = state->index_va + draw->firstIndex * index_size;

            if (!state->subpass->view_mask) {
               radv_cs_emit_draw_indexed_packet(cmd_buffer, index_va, remaining_indexes, draw->indexCount, can_eop && !offset_changes && i < drawCount - 1);
            } else {
               u_foreach_bit(view, state->subpass->view_mask) {
                  radv_emit_view_index(cmd_buffer, view);

                  radv_cs_emit_draw_indexed_packet(cmd_buffer, index_va, remaining_indexes, draw->indexCount, false);
               }
            }
         }
      }
      if (drawCount > 1) {
         state->last_drawid = drawCount - 1;
      }
   }
}
7123
/* Emit direct (non-indexed, non-indirect) draw packets for each draw in
 * a multi-draw, replaying each draw per view when multiview is enabled.
 */
ALWAYS_INLINE static void
radv_emit_direct_draw_packets(struct radv_cmd_buffer *cmd_buffer, const struct radv_draw_info *info,
                              uint32_t drawCount, const VkMultiDrawInfoEXT *minfo,
                              uint32_t use_opaque, uint32_t stride)
{
   unsigned i = 0;
   const uint32_t view_mask = cmd_buffer->state.subpass->view_mask;
   const bool uses_drawid = cmd_buffer->state.graphics_pipeline->uses_drawid;
   uint32_t last_start = 0;

   vk_foreach_multi_draw(draw, i, minfo, drawCount, stride) {
      /* The first draw emits the full SGPR set (with redundancy elimination);
       * subsequent draws only update the vertex offset and, when used, the draw id.
       */
      if (!i)
         radv_emit_userdata_vertex(cmd_buffer, info, draw->firstVertex);
      else
         radv_emit_userdata_vertex_drawid(cmd_buffer, draw->firstVertex, uses_drawid ? i : 0);

      if (!view_mask) {
         radv_cs_emit_draw_packet(cmd_buffer, draw->vertexCount, use_opaque);
      } else {
         u_foreach_bit(view, view_mask) {
            radv_emit_view_index(cmd_buffer, view);
            radv_cs_emit_draw_packet(cmd_buffer, draw->vertexCount, use_opaque);
         }
      }
      last_start = draw->firstVertex;
   }
   /* Keep the tracked SGPR state in sync after a multi-draw. */
   if (drawCount > 1) {
      struct radv_cmd_state *state = &cmd_buffer->state;
      state->last_vertex_offset = last_start;
      if (uses_drawid)
         state->last_drawid = drawCount - 1;
   }
}
7157
7158 ALWAYS_INLINE static void
radv_emit_direct_mesh_draw_packet(struct radv_cmd_buffer * cmd_buffer,uint32_t x,uint32_t y,uint32_t z,uint32_t first_task)7159 radv_emit_direct_mesh_draw_packet(struct radv_cmd_buffer *cmd_buffer,
7160 uint32_t x, uint32_t y, uint32_t z,
7161 uint32_t first_task)
7162 {
7163 const uint32_t view_mask = cmd_buffer->state.subpass->view_mask;
7164 const uint32_t count = x * y * z;
7165
7166 radv_emit_userdata_mesh(cmd_buffer, x, y, z, first_task);
7167
7168 if (!view_mask) {
7169 radv_cs_emit_draw_packet(cmd_buffer, count, 0);
7170 } else {
7171 u_foreach_bit(view, view_mask) {
7172 radv_emit_view_index(cmd_buffer, view);
7173 radv_cs_emit_draw_packet(cmd_buffer, count, 0);
7174 }
7175 }
7176 }
7177
7178 ALWAYS_INLINE static void
radv_emit_direct_taskmesh_draw_packets(struct radv_cmd_buffer * cmd_buffer,uint32_t x,uint32_t y,uint32_t z,uint32_t first_task)7179 radv_emit_direct_taskmesh_draw_packets(struct radv_cmd_buffer *cmd_buffer, uint32_t x, uint32_t y,
7180 uint32_t z, uint32_t first_task)
7181 {
7182 uint64_t fake_ib_va = 0;
7183 const uint32_t view_mask = cmd_buffer->state.subpass->view_mask;
7184 const unsigned num_views = MAX2(1, util_bitcount(view_mask));
7185 unsigned ace_predication_size = num_views * 6; /* DISPATCH_TASKMESH_DIRECT_ACE size */
7186
7187 if (first_task) {
7188 /* Pass this as the IB to the shader for emulating firstTask in task shaders. */
7189 uint32_t fake_ib_dwords[2] = {x, first_task};
7190 unsigned fake_ib_offset;
7191 radv_cmd_buffer_upload_data(cmd_buffer, 8, fake_ib_dwords, &fake_ib_offset);
7192 fake_ib_va = radv_buffer_get_va(cmd_buffer->upload.upload_bo) + fake_ib_offset;
7193 }
7194
7195 radv_emit_userdata_task(cmd_buffer, x, y, z, 0, first_task, fake_ib_va);
7196 radv_emit_userdata_mesh_first_task_0_draw_id_0(cmd_buffer);
7197 radv_cs_emit_compute_predication(&cmd_buffer->state, cmd_buffer->ace_internal.cs,
7198 cmd_buffer->mec_inv_pred_va, &cmd_buffer->mec_inv_pred_emitted,
7199 ace_predication_size);
7200
7201 if (!view_mask) {
7202 radv_cs_emit_dispatch_taskmesh_direct_ace_packet(cmd_buffer, x, y, z);
7203 radv_cs_emit_dispatch_taskmesh_gfx_packet(cmd_buffer);
7204 } else {
7205 u_foreach_bit (view, view_mask) {
7206 radv_emit_view_index(cmd_buffer, view);
7207 radv_cs_emit_dispatch_taskmesh_direct_ace_packet(cmd_buffer, x, y, z);
7208 radv_cs_emit_dispatch_taskmesh_gfx_packet(cmd_buffer);
7209 }
7210 }
7211 }
7212
/* Emit an indirect task+mesh draw, including the MEC firmware workaround
 * for DISPATCH_TASKMESH_INDIRECT_MULTI_ACE hanging on a zero draw count.
 */
static void
radv_emit_indirect_taskmesh_draw_packets(struct radv_cmd_buffer *cmd_buffer,
                                         const struct radv_draw_info *info, uint64_t nv_ib_va,
                                         uint32_t nv_ib_stride)
{
   const uint32_t view_mask = cmd_buffer->state.subpass->view_mask;
   struct radeon_winsys *ws = cmd_buffer->device->ws;
   const unsigned num_views = MAX2(1, util_bitcount(view_mask));
   unsigned ace_predication_size = num_views * 11; /* DISPATCH_TASKMESH_INDIRECT_MULTI_ACE size */
   struct radeon_cmdbuf *ace_cs = cmd_buffer->ace_internal.cs;

   const uint64_t va =
      radv_buffer_get_va(info->indirect->bo) + info->indirect->offset + info->indirect_offset;
   const uint64_t count_va = !info->count_buffer
                                ? 0
                                : radv_buffer_get_va(info->count_buffer->bo) +
                                     info->count_buffer->offset + info->count_buffer_offset;
   uint64_t workaround_cond_va = 0;

   if (count_va) {
      radv_cs_add_buffer(ws, cmd_buffer->ace_internal.cs, info->count_buffer->bo);

      /* MEC firmware bug workaround.
       * When the count buffer contains zero, DISPATCH_TASKMESH_INDIRECT_MULTI_ACE hangs.
       * - We must ensure that DISPATCH_TASKMESH_INDIRECT_MULTI_ACE
       *   is only executed when the count buffer contains non-zero.
       * - Furthermore, we must also ensure that each DISPATCH_TASKMESH_GFX packet
       *   has a matching ACE packet.
       *
       * As a workaround:
       * - Reserve a dword in the upload buffer and initialize it to 1 for the workaround
       * - When count != 0, write 0 to the workaround BO and execute the indirect dispatch
       * - When workaround BO != 0 (count was 0), execute an empty direct dispatch
       */

      uint32_t workaround_cond_init = 0;
      uint32_t workaround_cond_off;
      if (!radv_cmd_buffer_upload_data(cmd_buffer, 4, &workaround_cond_init, &workaround_cond_off))
         cmd_buffer->record_result = VK_ERROR_OUT_OF_HOST_MEMORY;

      workaround_cond_va = radv_buffer_get_va(cmd_buffer->upload.upload_bo) + workaround_cond_off;

      /* Initialize the workaround dword to 1 via COPY_DATA (immediate -> memory). */
      radeon_emit(ace_cs, PKT3(PKT3_COPY_DATA, 4, 0));
      radeon_emit(ace_cs, COPY_DATA_SRC_SEL(COPY_DATA_IMM) | COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) |
                             COPY_DATA_WR_CONFIRM);
      radeon_emit(ace_cs, 1);
      radeon_emit(ace_cs, 0);
      radeon_emit(ace_cs, workaround_cond_va);
      radeon_emit(ace_cs, workaround_cond_va >> 32);

      /* 2x COND_EXEC + 1x COPY_DATA + Nx DISPATCH_TASKMESH_DIRECT_ACE */
      ace_predication_size += 2 * 5 + 6 + 6 * num_views;
   }

   radv_cs_add_buffer(ws, cmd_buffer->ace_internal.cs, info->indirect->bo);
   radv_emit_userdata_task_ib_only(cmd_buffer, nv_ib_va, nv_ib_stride);
   radv_emit_userdata_mesh_first_task_0_draw_id_0(cmd_buffer);
   radv_cs_emit_compute_predication(&cmd_buffer->state, cmd_buffer->ace_internal.cs,
                                    cmd_buffer->mec_inv_pred_va, &cmd_buffer->mec_inv_pred_emitted,
                                    ace_predication_size);

   if (workaround_cond_va) {
      /* Only run the indirect dispatch when the count buffer is non-zero;
       * the dispatch path below also clears the workaround dword.
       */
      radeon_emit(ace_cs, PKT3(PKT3_COND_EXEC, 3, 0));
      radeon_emit(ace_cs, count_va);
      radeon_emit(ace_cs, count_va >> 32);
      radeon_emit(ace_cs, 0);
      radeon_emit(ace_cs,
                  6 + 11 * num_views); /* 1x COPY_DATA + Nx DISPATCH_TASKMESH_INDIRECT_MULTI_ACE */

      radeon_emit(ace_cs, PKT3(PKT3_COPY_DATA, 4, 0));
      radeon_emit(ace_cs, COPY_DATA_SRC_SEL(COPY_DATA_IMM) | COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) |
                             COPY_DATA_WR_CONFIRM);
      radeon_emit(ace_cs, 0);
      radeon_emit(ace_cs, 0);
      radeon_emit(ace_cs, workaround_cond_va);
      radeon_emit(ace_cs, workaround_cond_va >> 32);
   }

   if (!view_mask) {
      radv_cs_emit_dispatch_taskmesh_indirect_multi_ace_packet(cmd_buffer, va, info->count,
                                                               count_va, info->stride);
      radv_cs_emit_dispatch_taskmesh_gfx_packet(cmd_buffer);
   } else {
      u_foreach_bit (view, view_mask) {
         radv_emit_view_index(cmd_buffer, view);
         radv_cs_emit_dispatch_taskmesh_indirect_multi_ace_packet(cmd_buffer, va, info->count,
                                                                  count_va, info->stride);
         radv_cs_emit_dispatch_taskmesh_gfx_packet(cmd_buffer);
      }
   }

   if (workaround_cond_va) {
      /* Count was zero: emit empty direct dispatches so every GFX packet
       * still has a matching ACE packet.
       */
      radeon_emit(ace_cs, PKT3(PKT3_COND_EXEC, 3, 0));
      radeon_emit(ace_cs, workaround_cond_va);
      radeon_emit(ace_cs, workaround_cond_va >> 32);
      radeon_emit(ace_cs, 0);
      radeon_emit(ace_cs, 6 * num_views); /* Nx DISPATCH_TASKMESH_DIRECT_ACE */

      for (unsigned v = 0; v < num_views; ++v) {
         radv_cs_emit_dispatch_taskmesh_direct_ace_packet(cmd_buffer, 0, 0, 0);
      }
   }
}
7316
7317 static void
radv_emit_indirect_draw_packets(struct radv_cmd_buffer * cmd_buffer,const struct radv_draw_info * info)7318 radv_emit_indirect_draw_packets(struct radv_cmd_buffer *cmd_buffer,
7319 const struct radv_draw_info *info)
7320 {
7321 const struct radv_cmd_state *state = &cmd_buffer->state;
7322 struct radeon_winsys *ws = cmd_buffer->device->ws;
7323 struct radeon_cmdbuf *cs = cmd_buffer->cs;
7324 const uint64_t va =
7325 radv_buffer_get_va(info->indirect->bo) + info->indirect->offset + info->indirect_offset;
7326 const uint64_t count_va = info->count_buffer
7327 ? radv_buffer_get_va(info->count_buffer->bo) +
7328 info->count_buffer->offset + info->count_buffer_offset
7329 : 0;
7330
7331 radv_cs_add_buffer(ws, cs, info->indirect->bo);
7332
7333 radeon_emit(cs, PKT3(PKT3_SET_BASE, 2, 0));
7334 radeon_emit(cs, 1);
7335 radeon_emit(cs, va);
7336 radeon_emit(cs, va >> 32);
7337
7338 if (info->count_buffer) {
7339 radv_cs_add_buffer(ws, cs, info->count_buffer->bo);
7340 }
7341
7342 if (!state->subpass->view_mask) {
7343 radv_cs_emit_indirect_draw_packet(cmd_buffer, info->indexed, info->count, count_va,
7344 info->stride);
7345 } else {
7346 u_foreach_bit(i, state->subpass->view_mask)
7347 {
7348 radv_emit_view_index(cmd_buffer, i);
7349
7350 radv_cs_emit_indirect_draw_packet(cmd_buffer, info->indexed, info->count, count_va,
7351 info->stride);
7352 }
7353 }
7354 }
7355
7356 /*
7357 * Vega and raven have a bug which triggers if there are multiple context
7358 * register contexts active at the same time with different scissor values.
7359 *
7360 * There are two possible workarounds:
7361 * 1) Wait for PS_PARTIAL_FLUSH every time the scissor is changed. That way
7362 * there is only ever 1 active set of scissor values at the same time.
7363 *
7364 * 2) Whenever the hardware switches contexts we have to set the scissor
7365 * registers again even if it is a noop. That way the new context gets
7366 * the correct scissor values.
7367 *
7368 * This implements option 2. radv_need_late_scissor_emission needs to
7369 * return true on affected HW if radv_emit_all_graphics_states sets
7370 * any context registers.
7371 */
7372 static bool
radv_need_late_scissor_emission(struct radv_cmd_buffer * cmd_buffer,const struct radv_draw_info * info)7373 radv_need_late_scissor_emission(struct radv_cmd_buffer *cmd_buffer,
7374 const struct radv_draw_info *info)
7375 {
7376 struct radv_cmd_state *state = &cmd_buffer->state;
7377
7378 if (!cmd_buffer->device->physical_device->rad_info.has_gfx9_scissor_bug)
7379 return false;
7380
7381 if (cmd_buffer->state.context_roll_without_scissor_emitted || info->strmout_buffer)
7382 return true;
7383
7384 uint64_t used_states =
7385 cmd_buffer->state.graphics_pipeline->needed_dynamic_state | ~RADV_CMD_DIRTY_DYNAMIC_ALL;
7386
7387 /* Index, vertex and streamout buffers don't change context regs, and
7388 * pipeline is already handled.
7389 */
7390 used_states &= ~(RADV_CMD_DIRTY_INDEX_BUFFER | RADV_CMD_DIRTY_VERTEX_BUFFER |
7391 RADV_CMD_DIRTY_DYNAMIC_VERTEX_INPUT | RADV_CMD_DIRTY_STREAMOUT_BUFFER |
7392 RADV_CMD_DIRTY_PIPELINE);
7393
7394 if (cmd_buffer->state.dirty & used_states)
7395 return true;
7396
7397 uint32_t primitive_reset_index = radv_get_primitive_reset_index(cmd_buffer);
7398
7399 if (info->indexed && state->dynamic.primitive_restart_enable &&
7400 primitive_reset_index != state->last_primitive_reset_index)
7401 return true;
7402
7403 return false;
7404 }
7405
7406 ALWAYS_INLINE static bool
radv_skip_ngg_culling(bool has_tess,const unsigned vtx_cnt,bool indirect)7407 radv_skip_ngg_culling(bool has_tess, const unsigned vtx_cnt,
7408 bool indirect)
7409 {
7410 /* If we have to draw only a few vertices, we get better latency if
7411 * we disable NGG culling.
7412 *
7413 * When tessellation is used, what matters is the number of tessellated
7414 * vertices, so let's always assume it's not a small draw.
7415 */
7416 return !has_tess && !indirect && vtx_cnt < 128;
7417 }
7418
/* Compute the NGG culling settings bitfield that is written to the
 * shader's culling-settings user SGPR, based on the current pipeline
 * registers and dynamic state.
 */
ALWAYS_INLINE static uint32_t
radv_get_ngg_culling_settings(struct radv_cmd_buffer *cmd_buffer, bool vp_y_inverted)
{
   const struct radv_graphics_pipeline *pipeline = cmd_buffer->state.graphics_pipeline;
   const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;

   /* Cull every triangle when rasterizer discard is enabled. */
   if (d->rasterizer_discard_enable ||
       G_028810_DX_RASTERIZATION_KILL(cmd_buffer->state.graphics_pipeline->pa_cl_clip_cntl))
      return radv_nggc_front_face | radv_nggc_back_face;

   uint32_t pa_su_sc_mode_cntl = cmd_buffer->state.graphics_pipeline->pa_su_sc_mode_cntl;
   uint32_t nggc_settings = radv_nggc_none;

   /* The culling code needs to know whether face is CW or CCW.
    * Dynamic front-face state takes precedence over the baked register value.
    */
   bool ccw = (pipeline->needed_dynamic_state & RADV_DYNAMIC_FRONT_FACE)
              ? d->front_face == VK_FRONT_FACE_COUNTER_CLOCKWISE
              : G_028814_FACE(pa_su_sc_mode_cntl) == 0;

   /* Take inverted viewport into account. */
   ccw ^= vp_y_inverted;

   if (ccw)
      nggc_settings |= radv_nggc_face_is_ccw;

   /* Face culling settings (dynamic cull mode wins over the register). */
   if ((pipeline->needed_dynamic_state & RADV_DYNAMIC_CULL_MODE)
          ? (d->cull_mode & VK_CULL_MODE_FRONT_BIT)
          : G_028814_CULL_FRONT(pa_su_sc_mode_cntl))
      nggc_settings |= radv_nggc_front_face;
   if ((pipeline->needed_dynamic_state & RADV_DYNAMIC_CULL_MODE)
          ? (d->cull_mode & VK_CULL_MODE_BACK_BIT)
          : G_028814_CULL_BACK(pa_su_sc_mode_cntl))
      nggc_settings |= radv_nggc_back_face;

   /* Small primitive culling is only valid when conservative overestimation is not used. It's also
    * disabled for user sample locations because small primitive culling assumes a sample
    * position at (0.5, 0.5). */
   if (!pipeline->uses_conservative_overestimate && !pipeline->uses_user_sample_locations) {
      nggc_settings |= radv_nggc_small_primitives;

      /* small_prim_precision = num_samples / 2^subpixel_bits
       * num_samples is also always a power of two, so the small prim precision can only be
       * a power of two between 2^-2 and 2^-6, therefore it's enough to remember the exponent.
       */
      /* NOTE: despite the name, this variable holds the denominator 2^8 = 256
       * itself; only its log2 (8) is used below.
       */
      unsigned subpixel_bits = 256;
      int32_t small_prim_precision_log2 = util_logbase2(pipeline->ms.num_samples) - util_logbase2(subpixel_bits);
      /* The exponent is packed into the top byte of the settings word. */
      nggc_settings |= ((uint32_t) small_prim_precision_log2 << 24u);
   }

   return nggc_settings;
}
7471
/* Emit (or disable) the NGG culling state for the current draw: the
 * culling-settings SGPR, the viewport-transform SGPRs it depends on,
 * and the LDS size adjustment in SPI_SHADER_PGM_RSRC2_GS.
 */
static void
radv_emit_ngg_culling_state(struct radv_cmd_buffer *cmd_buffer, const struct radv_draw_info *draw_info)
{
   struct radv_graphics_pipeline *pipeline = cmd_buffer->state.graphics_pipeline;
   const unsigned stage = pipeline->last_vgt_api_stage;
   const bool nggc_supported = pipeline->has_ngg_culling;

   if (!nggc_supported && !cmd_buffer->state.last_nggc_settings) {
      /* Current shader doesn't support culling and culling was already disabled:
       * No further steps needed, just remember the SGPR's location is not set.
       */
      cmd_buffer->state.last_nggc_settings_sgpr_idx = -1;
      return;
   }

   /* Check dirty flags:
    * - Dirty pipeline: SGPR index may have changed (we have to re-emit if changed).
    * - Dirty dynamic flags: culling settings may have changed.
    */
   const bool dirty =
      cmd_buffer->state.dirty &
      (RADV_CMD_DIRTY_PIPELINE |
       RADV_CMD_DIRTY_DYNAMIC_CULL_MODE | RADV_CMD_DIRTY_DYNAMIC_FRONT_FACE |
       RADV_CMD_DIRTY_DYNAMIC_RASTERIZER_DISCARD_ENABLE | RADV_CMD_DIRTY_DYNAMIC_VIEWPORT);

   /* Check small draw status:
    * For small draw calls, we disable culling by setting the SGPR to 0.
    */
   const bool skip =
      radv_skip_ngg_culling(stage == MESA_SHADER_TESS_EVAL, draw_info->count, draw_info->indirect);

   /* See if anything changed. */
   if (!dirty && skip == cmd_buffer->state.last_nggc_skip)
      return;

   /* Remember small draw state. */
   cmd_buffer->state.last_nggc_skip = skip;
   const struct radv_shader *v = pipeline->base.shaders[stage];
   assert(v->info.has_ngg_culling == nggc_supported);

   /* Find the user SGPR. */
   const uint32_t base_reg = pipeline->base.user_data_0[stage];
   const int8_t nggc_sgpr_idx = v->info.user_sgprs_locs.shader_data[AC_UD_NGG_CULLING_SETTINGS].sgpr_idx;
   assert(!nggc_supported || nggc_sgpr_idx != -1);

   /* Get viewport transform (only viewport 0's X/Y scale and translate are used). */
   float vp_scale[2], vp_translate[2];
   memcpy(vp_scale, cmd_buffer->state.dynamic.viewport.xform[0].scale, 2 * sizeof(float));
   memcpy(vp_translate, cmd_buffer->state.dynamic.viewport.xform[0].translate, 2 * sizeof(float));
   bool vp_y_inverted = (-vp_scale[1] + vp_translate[1]) > (vp_scale[1] + vp_translate[1]);

   /* Get current culling settings. */
   uint32_t nggc_settings = nggc_supported && !skip
                            ? radv_get_ngg_culling_settings(cmd_buffer, vp_y_inverted)
                            : radv_nggc_none;

   /* The viewport SGPRs only need re-emitting when the viewport changed,
    * the SGPR location moved, or culling is being turned on.
    */
   bool emit_viewport = nggc_settings &&
                        (cmd_buffer->state.dirty & RADV_CMD_DIRTY_DYNAMIC_VIEWPORT ||
                         cmd_buffer->state.last_nggc_settings_sgpr_idx != nggc_sgpr_idx ||
                         !cmd_buffer->state.last_nggc_settings);

   if (emit_viewport) {
      /* Correction for inverted Y */
      if (vp_y_inverted) {
         vp_scale[1] = -vp_scale[1];
         vp_translate[1] = -vp_translate[1];
      }

      /* Correction for number of samples per pixel. */
      for (unsigned i = 0; i < 2; ++i) {
         vp_scale[i] *= (float) pipeline->ms.num_samples;
         vp_translate[i] *= (float) pipeline->ms.num_samples;
      }

      uint32_t vp_reg_values[4] = {fui(vp_scale[0]), fui(vp_scale[1]), fui(vp_translate[0]), fui(vp_translate[1])};
      const int8_t vp_sgpr_idx = v->info.user_sgprs_locs.shader_data[AC_UD_NGG_VIEWPORT].sgpr_idx;
      assert(vp_sgpr_idx != -1);
      radeon_set_sh_reg_seq(cmd_buffer->cs, base_reg + vp_sgpr_idx * 4, 4);
      radeon_emit_array(cmd_buffer->cs, vp_reg_values, 4);
   }

   bool emit_settings = nggc_supported &&
                        (cmd_buffer->state.last_nggc_settings != nggc_settings ||
                         cmd_buffer->state.last_nggc_settings_sgpr_idx != nggc_sgpr_idx);

   /* This needs to be emitted when culling is turned on
    * and when it's already on but some settings change.
    */
   if (emit_settings) {
      assert(nggc_sgpr_idx >= 0);
      radeon_set_sh_reg(cmd_buffer->cs, base_reg + nggc_sgpr_idx * 4, nggc_settings);
   }

   /* These only need to be emitted when culling is turned on or off,
    * but not when it stays on and just some settings change.
    */
   if (!!cmd_buffer->state.last_nggc_settings != !!nggc_settings) {
      uint32_t rsrc2 = v->config.rsrc2;

      if (!nggc_settings) {
         /* Allocate less LDS when culling is disabled. (But GS always needs it.) */
         if (stage != MESA_SHADER_GEOMETRY)
            rsrc2 = (rsrc2 & C_00B22C_LDS_SIZE) | S_00B22C_LDS_SIZE(v->info.num_lds_blocks_when_not_culling);
      }

      /* When the pipeline is dirty and not yet emitted, don't write it here
       * because radv_emit_graphics_pipeline will overwrite this register.
       */
      if (!(cmd_buffer->state.dirty & RADV_CMD_DIRTY_PIPELINE) ||
          cmd_buffer->state.emitted_graphics_pipeline == pipeline) {
         radeon_set_sh_reg(cmd_buffer->cs, R_00B22C_SPI_SHADER_PGM_RSRC2_GS, rsrc2);
      }
   }

   /* Remember the emitted state for redundancy elimination on the next draw. */
   cmd_buffer->state.last_nggc_settings = nggc_settings;
   cmd_buffer->state.last_nggc_settings_sgpr_idx = nggc_sgpr_idx;
}
7589
/* Emit all dirty graphics state needed before a draw: RB+, NGG culling,
 * the pipeline itself, framebuffer, index buffer, dynamic state and the
 * draw-specific registers. The statement order below is deliberate —
 * see the inline comments about late scissor emission.
 */
static void
radv_emit_all_graphics_states(struct radv_cmd_buffer *cmd_buffer, const struct radv_draw_info *info,
                              bool pipeline_is_dirty)
{
   bool late_scissor_emission;

   /* RB+ register state depends on both the framebuffer and the bound pipeline. */
   if ((cmd_buffer->state.dirty & RADV_CMD_DIRTY_FRAMEBUFFER) ||
       cmd_buffer->state.emitted_graphics_pipeline != cmd_buffer->state.graphics_pipeline)
      radv_emit_rbplus_state(cmd_buffer);

   /* NGG culling state only applies to NGG pipelines on HW where it's enabled. */
   if (cmd_buffer->device->physical_device->use_ngg_culling &&
       cmd_buffer->state.graphics_pipeline->is_ngg)
      radv_emit_ngg_culling_state(cmd_buffer, info);

   if (cmd_buffer->state.dirty & RADV_CMD_DIRTY_PIPELINE)
      radv_emit_graphics_pipeline(cmd_buffer);

   /* This should be before the cmd_buffer->state.dirty is cleared
    * (excluding RADV_CMD_DIRTY_PIPELINE) and after
    * cmd_buffer->state.context_roll_without_scissor_emitted is set. */
   late_scissor_emission = radv_need_late_scissor_emission(cmd_buffer, info);

   if (cmd_buffer->state.dirty & RADV_CMD_DIRTY_FRAMEBUFFER)
      radv_emit_framebuffer_state(cmd_buffer);

   if (info->indexed) {
      if (cmd_buffer->state.dirty & RADV_CMD_DIRTY_INDEX_BUFFER)
         radv_emit_index_buffer(cmd_buffer, info->indirect);
   } else {
      /* On GFX7 and later, non-indexed draws overwrite VGT_INDEX_TYPE,
       * so the state must be re-emitted before the next indexed
       * draw.
       */
      if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX7) {
         cmd_buffer->state.last_index_type = -1;
         cmd_buffer->state.dirty |= RADV_CMD_DIRTY_INDEX_BUFFER;
      }
   }

   if (cmd_buffer->device->force_vrs != RADV_FORCE_VRS_1x1) {
      struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
      /* Only consider dynamic states the current pipeline actually consumes. */
      uint64_t dynamic_states =
         cmd_buffer->state.dirty & cmd_buffer->state.emitted_graphics_pipeline->needed_dynamic_state;

      if ((dynamic_states & RADV_CMD_DIRTY_DYNAMIC_FRAGMENT_SHADING_RATE) &&
          d->fragment_shading_rate.size.width == 1 &&
          d->fragment_shading_rate.size.height == 1 &&
          d->fragment_shading_rate.combiner_ops[0] == VK_FRAGMENT_SHADING_RATE_COMBINER_OP_KEEP_KHR &&
          d->fragment_shading_rate.combiner_ops[1] == VK_FRAGMENT_SHADING_RATE_COMBINER_OP_KEEP_KHR) {
         /* When per-vertex VRS is forced and the dynamic fragment shading rate is a no-op, ignore
          * it. This is needed for vkd3d-proton because it always declares per-draw VRS as dynamic.
          */
         cmd_buffer->state.dirty &= ~RADV_CMD_DIRTY_DYNAMIC_FRAGMENT_SHADING_RATE;
      }
   }

   radv_cmd_buffer_flush_dynamic_state(cmd_buffer, pipeline_is_dirty);

   radv_emit_draw_registers(cmd_buffer, info);

   /* Re-emit the scissor last if a context roll was detected above. */
   if (late_scissor_emission)
      radv_emit_scissor(cmd_buffer);
}
7653
/* Prepare everything needed before emitting a draw packet: validate the
 * draw (skipping degenerate direct draws), flush caches, emit dirty
 * graphics state and upload shader descriptors — in an order chosen to
 * minimize the time the CUs sit idle. Returns false when the draw
 * should be skipped entirely.
 *
 * MUST inline this function to avoid massive perf loss in drawoverhead */
ALWAYS_INLINE static bool
radv_before_draw(struct radv_cmd_buffer *cmd_buffer, const struct radv_draw_info *info, uint32_t drawCount)
{
   /* GFX7+ supports prefetching shader code/descriptors into L2. */
   const bool has_prefetch = cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX7;
   const bool pipeline_is_dirty = (cmd_buffer->state.dirty & RADV_CMD_DIRTY_PIPELINE) &&
                                  cmd_buffer->state.graphics_pipeline != cmd_buffer->state.emitted_graphics_pipeline;

   /* Reserve worst-case CS space up front so the emits below don't need
    * individual checks; validated by the assert at the end.
    */
   ASSERTED const unsigned cdw_max =
      radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 4096 + 128 * (drawCount - 1));

   if (likely(!info->indirect)) {
      /* GFX6-GFX7 treat instance_count==0 as instance_count==1. There is
       * no workaround for indirect draws, but we can at least skip
       * direct draws.
       */
      if (unlikely(!info->instance_count))
         return false;

      /* Handle count == 0. */
      if (unlikely(!info->count && !info->strmout_buffer))
         return false;
   }

   /* Need to apply this workaround early as it can set flush flags. */
   if (cmd_buffer->state.dirty & RADV_CMD_DIRTY_FRAMEBUFFER)
      radv_emit_fb_mip_change_flush(cmd_buffer);

   /* Use optimal packet order based on whether we need to sync the
    * pipeline.
    */
   if (cmd_buffer->state.flush_bits &
       (RADV_CMD_FLAG_FLUSH_AND_INV_CB | RADV_CMD_FLAG_FLUSH_AND_INV_DB |
        RADV_CMD_FLAG_PS_PARTIAL_FLUSH | RADV_CMD_FLAG_CS_PARTIAL_FLUSH)) {
      /* If we have to wait for idle, set all states first, so that
       * all SET packets are processed in parallel with previous draw
       * calls. Then upload descriptors, set shader pointers, and
       * draw, and prefetch at the end. This ensures that the time
       * the CUs are idle is very short. (there are only SET_SH
       * packets between the wait and the draw)
       */
      radv_emit_all_graphics_states(cmd_buffer, info, pipeline_is_dirty);
      si_emit_cache_flush(cmd_buffer);
      /* <-- CUs are idle here --> */

      radv_upload_graphics_shader_descriptors(cmd_buffer, pipeline_is_dirty);
   } else {
      /* If we don't wait for idle, start prefetches first, then set
       * states, and draw at the end.
       */
      si_emit_cache_flush(cmd_buffer);

      if (has_prefetch && cmd_buffer->state.prefetch_L2_mask) {
         /* Only prefetch the vertex shader and VBO descriptors
          * in order to start the draw as soon as possible.
          */
         radv_emit_prefetch_L2(cmd_buffer, cmd_buffer->state.graphics_pipeline, true);
      }

      radv_upload_graphics_shader_descriptors(cmd_buffer, pipeline_is_dirty);

      radv_emit_all_graphics_states(cmd_buffer, info, pipeline_is_dirty);
   }

   radv_describe_draw(cmd_buffer);
   if (likely(!info->indirect)) {
      struct radv_cmd_state *state = &cmd_buffer->state;
      struct radeon_cmdbuf *cs = cmd_buffer->cs;
      assert(state->graphics_pipeline->vtx_base_sgpr);
      /* Only emit NUM_INSTANCES when the instance count actually changed. */
      if (state->last_num_instances != info->instance_count) {
         radeon_emit(cs, PKT3(PKT3_NUM_INSTANCES, 0, false));
         radeon_emit(cs, info->instance_count);
         state->last_num_instances = info->instance_count;
      }
   }
   assert(cmd_buffer->cs->cdw <= cdw_max);

   return true;
}
7733
/* Pre-draw preparation for task+mesh draws. Runs the normal graphics
 * pre-draw path first, then — if a task shader is bound — prepares the
 * internal ACE (async compute) cmdbuf: semaphore sync, task shader
 * state, cache flush, and re-flushing descriptors/push constants to the
 * ACE stream. Returns false when the draw should be skipped.
 */
ALWAYS_INLINE static bool
radv_before_taskmesh_draw(struct radv_cmd_buffer *cmd_buffer, const struct radv_draw_info *info,
                          uint32_t drawCount)
{
   struct radv_descriptor_state *descriptors_state =
      radv_get_descriptors_state(cmd_buffer, VK_PIPELINE_BIND_POINT_GRAPHICS);
   const bool pipeline_is_dirty =
      cmd_buffer->state.dirty & RADV_CMD_DIRTY_PIPELINE &&
      cmd_buffer->state.graphics_pipeline != cmd_buffer->state.emitted_graphics_pipeline;
   /* Snapshot descriptor dirty bits: radv_before_draw clears them, but the
    * ACE cmdbuf still needs them re-emitted (see restore below).
    */
   const bool push_dirty = descriptors_state->push_dirty;
   const uint32_t desc_dirty = descriptors_state->dirty;

   const bool gfx_result = radv_before_draw(cmd_buffer, info, drawCount);
   struct radv_graphics_pipeline *pipeline = cmd_buffer->state.graphics_pipeline;
   struct radv_shader *task_shader = radv_get_shader(&pipeline->base, MESA_SHADER_TASK);

   /* If there is no task shader, no need to do anything special. */
   if (!task_shader)
      return gfx_result;

   /* Need to check the count even for indirect draws to work around
    * an issue with DISPATCH_TASKMESH_INDIRECT_MULTI_ACE.
    */
   if (!info->count || !gfx_result)
      return false;

   const bool need_task_semaphore = radv_flush_gfx2ace_semaphore(cmd_buffer);
   struct radv_physical_device *pdevice = cmd_buffer->device->physical_device;
   struct radeon_cmdbuf *ace_cs = cmd_buffer->ace_internal.cs;
   struct radeon_winsys *ws = cmd_buffer->device->ws;

   /* Task shaders always dispatch on the internal ACE cmdbuf. */
   assert(ace_cs);
   ASSERTED const unsigned ace_cdw_max =
      radeon_check_space(ws, ace_cs, 4096 + 128 * (drawCount - 1));

   if (need_task_semaphore)
      radv_wait_gfx2ace_semaphore(cmd_buffer);

   if (pipeline_is_dirty) {
      radv_pipeline_emit_hw_cs(pdevice, ace_cs, task_shader);
      radv_pipeline_emit_compute_state(pdevice, ace_cs, task_shader);
   }

   radv_ace_internal_cache_flush(cmd_buffer);

   /* Restore dirty state of descriptors
    * They were marked non-dirty in radv_before_draw,
    * but they need to be re-emitted now to the ACE cmdbuf.
    */
   descriptors_state->push_dirty = push_dirty;
   descriptors_state->dirty = desc_dirty;

   /* Flush descriptors and push constants for task shaders. */
   radv_flush_descriptors(cmd_buffer, VK_SHADER_STAGE_TASK_BIT_NV, &pipeline->base,
                          VK_PIPELINE_BIND_POINT_GRAPHICS);
   radv_flush_constants(cmd_buffer, VK_SHADER_STAGE_TASK_BIT_NV, &pipeline->base,
                        VK_PIPELINE_BIND_POINT_GRAPHICS);

   assert(ace_cs->cdw <= ace_cdw_max);
   return true;
}
7795
7796 static void
radv_after_draw(struct radv_cmd_buffer * cmd_buffer)7797 radv_after_draw(struct radv_cmd_buffer *cmd_buffer)
7798 {
7799 const struct radeon_info *rad_info = &cmd_buffer->device->physical_device->rad_info;
7800 bool has_prefetch = cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX7;
7801 /* Start prefetches after the draw has been started. Both will
7802 * run in parallel, but starting the draw first is more
7803 * important.
7804 */
7805 if (has_prefetch && cmd_buffer->state.prefetch_L2_mask) {
7806 radv_emit_prefetch_L2(cmd_buffer, cmd_buffer->state.graphics_pipeline, false);
7807 }
7808
7809 /* Workaround for a VGT hang when streamout is enabled.
7810 * It must be done after drawing.
7811 */
7812 if (radv_is_streamout_enabled(cmd_buffer) &&
7813 (rad_info->family == CHIP_HAWAII || rad_info->family == CHIP_TONGA ||
7814 rad_info->family == CHIP_FIJI)) {
7815 cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_VGT_STREAMOUT_SYNC;
7816 }
7817
7818 radv_cmd_buffer_after_draw(cmd_buffer, RADV_CMD_FLAG_PS_PARTIAL_FLUSH);
7819 }
7820
/* Build a DRAW_INDIRECT-compatible indirect buffer from an
 * NV_mesh_shader-style one, by allocating upload-BO space pre-filled on
 * the CPU (instanceCount=1, everything else 0) and then GPU-copying
 * taskCount->vertexCount and firstTask->firstVertex per draw.
 * Returns a radv_buffer aliasing the upload BO; caller uses it for the
 * subsequent indirect draw packet.
 */
static struct radv_buffer
radv_nv_mesh_indirect_bo(struct radv_cmd_buffer *cmd_buffer,
                         struct radv_buffer *buffer, VkDeviceSize offset,
                         uint32_t draw_count, uint32_t stride)
{
   /* Translates the indirect BO format used by NV_mesh_shader API
    * to the BO format used by DRAW_INDIRECT / DRAW_INDIRECT_MULTI.
    */

   struct radeon_cmdbuf *cs = cmd_buffer->cs;
   struct radeon_winsys *ws = cmd_buffer->device->ws;

   const size_t src_stride = MAX2(stride, sizeof(VkDrawMeshTasksIndirectCommandNV));
   const size_t dst_stride = sizeof(VkDrawIndirectCommand);
   const size_t src_off_task_count = offsetof(VkDrawMeshTasksIndirectCommandNV, taskCount);
   const size_t src_off_first_task = offsetof(VkDrawMeshTasksIndirectCommandNV, firstTask);
   const size_t dst_off_vertex_count = offsetof(VkDrawIndirectCommand, vertexCount);
   const size_t dst_off_first_vertex = offsetof(VkDrawIndirectCommand, firstVertex);

   /* Fill the buffer with all zeroes except instanceCount = 1.
    * This helps emit fewer copy packets below.
    *
    * NOTE(review): alloca size scales with draw_count — presumably callers
    * keep draw_count small enough to avoid stack overflow; confirm.
    */
   VkDrawIndirectCommand *fill_data = (VkDrawIndirectCommand *) alloca(dst_stride * draw_count);
   const VkDrawIndirectCommand filler = { .instanceCount = 1 };
   for (unsigned i = 0; i < draw_count; ++i)
      fill_data[i] = filler;

   /* We'll have to copy data from the API BO. */
   uint64_t va = radv_buffer_get_va(buffer->bo) + buffer->offset + offset;
   radv_cs_add_buffer(ws, cs, buffer->bo);

   /* Allocate some space in the upload BO.
    * NOTE(review): unlike radv_nv_task_indirect_bo, the upload result is not
    * checked and draw_count is not clamped to the source buffer size — verify
    * callers guarantee in-bounds reads.
    */
   unsigned out_offset;
   radv_cmd_buffer_upload_data(cmd_buffer, dst_stride * draw_count, fill_data, &out_offset);
   const uint64_t new_va = radv_buffer_get_va(cmd_buffer->upload.upload_bo) + out_offset;

   /* 2 COPY_DATA packets (6 dwords each) per draw, plus PFP_SYNC_ME. */
   ASSERTED unsigned cdw_max = radeon_check_space(ws, cs, 12 * draw_count + 2);

   /* Copy data from the API BO so that the format is suitable for the
    * indirect draw packet:
    * - vertexCount = taskCount (copied here)
    * - instanceCount = 1 (filled by CPU above)
    * - firstVertex = firstTask (copied here)
    * - firstInstance = 0 (filled by CPU above)
    */
   for (unsigned i = 0; i < draw_count; ++i) {
      const uint64_t src_task_count = va + i * src_stride + src_off_task_count;
      const uint64_t src_first_task = va + i * src_stride + src_off_first_task;
      const uint64_t dst_vertex_count = new_va + i * dst_stride + dst_off_vertex_count;
      const uint64_t dst_first_vertex = new_va + i * dst_stride + dst_off_first_vertex;

      radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, cmd_buffer->state.predicating));
      radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_SRC_MEM) | COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) |
                      COPY_DATA_WR_CONFIRM);
      radeon_emit(cs, src_task_count);
      radeon_emit(cs, src_task_count >> 32);
      radeon_emit(cs, dst_vertex_count);
      radeon_emit(cs, dst_vertex_count >> 32);

      radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, cmd_buffer->state.predicating));
      radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_SRC_MEM) | COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) |
                      COPY_DATA_WR_CONFIRM);
      radeon_emit(cs, src_first_task);
      radeon_emit(cs, src_first_task >> 32);
      radeon_emit(cs, dst_first_vertex);
      radeon_emit(cs, dst_first_vertex >> 32);
   }

   /* Wait for the copies to finish */
   radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0));
   radeon_emit(cs, 0);

   /* The draw packet can now use this buffer: */
   struct radv_buffer buf = *buffer;
   buf.bo = cmd_buffer->upload.upload_bo;
   buf.offset = out_offset;

   assert(cmd_buffer->cs->cdw <= cdw_max);

   return buf;
}
7902
/* Build a DISPATCH_TASKMESH_INDIRECT_MULTI_ACE-compatible indirect buffer
 * from an NV_mesh_shader-style one: upload-BO space is CPU-filled with
 * (x=0, y=1, z=1), then the GPU copies each draw's taskCount into x.
 * Packets are emitted on the internal ACE cmdbuf. Returns a radv_buffer
 * aliasing the upload BO.
 */
static struct radv_buffer
radv_nv_task_indirect_bo(struct radv_cmd_buffer *cmd_buffer, struct radv_buffer *buffer,
                         VkDeviceSize offset, uint32_t draw_count, uint32_t stride)
{
   /* Translates the indirect BO format used by NV_mesh_shader API
    * to the BO format used by DISPATCH_TASKMESH_INDIRECT_MULTI_ACE.
    */

   assert(draw_count);
   static_assert(sizeof(VkDispatchIndirectCommand) == 12, "Incorrect size of taskmesh command.");

   /* Note: all packets below go to the internal ACE cmdbuf, not the GFX one. */
   struct radeon_cmdbuf *cs = cmd_buffer->ace_internal.cs;
   struct radeon_winsys *ws = cmd_buffer->device->ws;

   const size_t src_stride = MAX2(stride, sizeof(VkDrawMeshTasksIndirectCommandNV));
   const size_t dst_stride = sizeof(VkDispatchIndirectCommand);
   const size_t src_off_task_count = offsetof(VkDrawMeshTasksIndirectCommandNV, taskCount);
   const size_t dst_off_x = offsetof(VkDispatchIndirectCommand, x);

   const unsigned new_disp_size = dst_stride * draw_count;

   const uint64_t va = radv_buffer_get_va(buffer->bo) + buffer->offset + offset;
   radv_cs_add_buffer(ws, cs, buffer->bo);

   /* Fill the buffer with X=0, Y=1, Z=1. */
   VkDispatchIndirectCommand *fill_data = (VkDispatchIndirectCommand *)alloca(new_disp_size);
   for (unsigned i = 0; i < draw_count; ++i) {
      fill_data[i].x = 0;
      fill_data[i].y = 1;
      fill_data[i].z = 1;
   }

   /* Allocate space in the upload BO. */
   unsigned out_offset;
   ASSERTED bool uploaded =
      radv_cmd_buffer_upload_data(cmd_buffer, new_disp_size, fill_data, &out_offset);
   const uint64_t new_va = radv_buffer_get_va(cmd_buffer->upload.upload_bo) + out_offset;
   assert(uploaded);

   /* Clamp draw count to fit the actual size of the buffer.
    * This is to avoid potential out of bounds copies (eg. for draws with an indirect count buffer).
    * The remaining indirect draws will stay filled with X=0, Y=1, Z=1 which is harmless.
    */
   draw_count = MIN2(draw_count, (buffer->vk.size - buffer->offset - offset) / src_stride);

   /* One COPY_DATA packet (6 dwords) per draw. */
   ASSERTED unsigned cdw_max = radeon_check_space(ws, cs, 6 * draw_count + 2);

   /* Copy taskCount from the NV API BO to the X dispatch size of the compatible BO. */
   for (unsigned i = 0; i < draw_count; ++i) {
      const uint64_t src_task_count = va + i * src_stride + src_off_task_count;
      const uint64_t dst_x = new_va + i * dst_stride + dst_off_x;

      radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, cmd_buffer->state.predicating));
      radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_SRC_MEM) | COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) |
                      COPY_DATA_WR_CONFIRM);
      radeon_emit(cs, src_task_count);
      radeon_emit(cs, src_task_count >> 32);
      radeon_emit(cs, dst_x);
      radeon_emit(cs, dst_x >> 32);
   }

   assert(cs->cdw <= cdw_max);

   /* The draw packet can now use this buffer: */
   struct radv_buffer buf = *buffer;
   buf.bo = cmd_buffer->upload.upload_bo;
   buf.offset = out_offset;

   return buf;
}
7973
7974 VKAPI_ATTR void VKAPI_CALL
radv_CmdDraw(VkCommandBuffer commandBuffer,uint32_t vertexCount,uint32_t instanceCount,uint32_t firstVertex,uint32_t firstInstance)7975 radv_CmdDraw(VkCommandBuffer commandBuffer, uint32_t vertexCount, uint32_t instanceCount,
7976 uint32_t firstVertex, uint32_t firstInstance)
7977 {
7978 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
7979 struct radv_draw_info info;
7980
7981 info.count = vertexCount;
7982 info.instance_count = instanceCount;
7983 info.first_instance = firstInstance;
7984 info.strmout_buffer = NULL;
7985 info.indirect = NULL;
7986 info.indexed = false;
7987
7988 if (!radv_before_draw(cmd_buffer, &info, 1))
7989 return;
7990 const VkMultiDrawInfoEXT minfo = { firstVertex, vertexCount };
7991 radv_emit_direct_draw_packets(cmd_buffer, &info, 1, &minfo, 0, 0);
7992 radv_after_draw(cmd_buffer);
7993 }
7994
7995 VKAPI_ATTR void VKAPI_CALL
radv_CmdDrawMultiEXT(VkCommandBuffer commandBuffer,uint32_t drawCount,const VkMultiDrawInfoEXT * pVertexInfo,uint32_t instanceCount,uint32_t firstInstance,uint32_t stride)7996 radv_CmdDrawMultiEXT(VkCommandBuffer commandBuffer, uint32_t drawCount, const VkMultiDrawInfoEXT *pVertexInfo,
7997 uint32_t instanceCount, uint32_t firstInstance, uint32_t stride)
7998 {
7999 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
8000 struct radv_draw_info info;
8001
8002 if (!drawCount)
8003 return;
8004
8005 info.count = pVertexInfo->vertexCount;
8006 info.instance_count = instanceCount;
8007 info.first_instance = firstInstance;
8008 info.strmout_buffer = NULL;
8009 info.indirect = NULL;
8010 info.indexed = false;
8011
8012 if (!radv_before_draw(cmd_buffer, &info, drawCount))
8013 return;
8014 radv_emit_direct_draw_packets(cmd_buffer, &info, drawCount, pVertexInfo, 0, stride);
8015 radv_after_draw(cmd_buffer);
8016 }
8017
8018 VKAPI_ATTR void VKAPI_CALL
radv_CmdDrawIndexed(VkCommandBuffer commandBuffer,uint32_t indexCount,uint32_t instanceCount,uint32_t firstIndex,int32_t vertexOffset,uint32_t firstInstance)8019 radv_CmdDrawIndexed(VkCommandBuffer commandBuffer, uint32_t indexCount, uint32_t instanceCount,
8020 uint32_t firstIndex, int32_t vertexOffset, uint32_t firstInstance)
8021 {
8022 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
8023 struct radv_draw_info info;
8024
8025 info.indexed = true;
8026 info.count = indexCount;
8027 info.instance_count = instanceCount;
8028 info.first_instance = firstInstance;
8029 info.strmout_buffer = NULL;
8030 info.indirect = NULL;
8031
8032 if (!radv_before_draw(cmd_buffer, &info, 1))
8033 return;
8034 const VkMultiDrawIndexedInfoEXT minfo = { firstIndex, indexCount, vertexOffset };
8035 radv_emit_draw_packets_indexed(cmd_buffer, &info, 1, &minfo, 0, NULL);
8036 radv_after_draw(cmd_buffer);
8037 }
8038
8039 VKAPI_ATTR void VKAPI_CALL
radv_CmdDrawMultiIndexedEXT(VkCommandBuffer commandBuffer,uint32_t drawCount,const VkMultiDrawIndexedInfoEXT * pIndexInfo,uint32_t instanceCount,uint32_t firstInstance,uint32_t stride,const int32_t * pVertexOffset)8040 radv_CmdDrawMultiIndexedEXT(VkCommandBuffer commandBuffer, uint32_t drawCount, const VkMultiDrawIndexedInfoEXT *pIndexInfo,
8041 uint32_t instanceCount, uint32_t firstInstance, uint32_t stride, const int32_t *pVertexOffset)
8042 {
8043 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
8044 struct radv_draw_info info;
8045
8046 if (!drawCount)
8047 return;
8048
8049 const VkMultiDrawIndexedInfoEXT *minfo = pIndexInfo;
8050 info.indexed = true;
8051 info.count = minfo->indexCount;
8052 info.instance_count = instanceCount;
8053 info.first_instance = firstInstance;
8054 info.strmout_buffer = NULL;
8055 info.indirect = NULL;
8056
8057 if (!radv_before_draw(cmd_buffer, &info, drawCount))
8058 return;
8059 radv_emit_draw_packets_indexed(cmd_buffer, &info, drawCount, pIndexInfo, stride, pVertexOffset);
8060 radv_after_draw(cmd_buffer);
8061 }
8062
8063 VKAPI_ATTR void VKAPI_CALL
radv_CmdDrawIndirect(VkCommandBuffer commandBuffer,VkBuffer _buffer,VkDeviceSize offset,uint32_t drawCount,uint32_t stride)8064 radv_CmdDrawIndirect(VkCommandBuffer commandBuffer, VkBuffer _buffer, VkDeviceSize offset,
8065 uint32_t drawCount, uint32_t stride)
8066 {
8067 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
8068 RADV_FROM_HANDLE(radv_buffer, buffer, _buffer);
8069 struct radv_draw_info info;
8070
8071 info.count = drawCount;
8072 info.indirect = buffer;
8073 info.indirect_offset = offset;
8074 info.stride = stride;
8075 info.strmout_buffer = NULL;
8076 info.count_buffer = NULL;
8077 info.indexed = false;
8078 info.instance_count = 0;
8079
8080 if (!radv_before_draw(cmd_buffer, &info, 1))
8081 return;
8082 radv_emit_indirect_draw_packets(cmd_buffer, &info);
8083 radv_after_draw(cmd_buffer);
8084 }
8085
8086 VKAPI_ATTR void VKAPI_CALL
radv_CmdDrawIndexedIndirect(VkCommandBuffer commandBuffer,VkBuffer _buffer,VkDeviceSize offset,uint32_t drawCount,uint32_t stride)8087 radv_CmdDrawIndexedIndirect(VkCommandBuffer commandBuffer, VkBuffer _buffer, VkDeviceSize offset,
8088 uint32_t drawCount, uint32_t stride)
8089 {
8090 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
8091 RADV_FROM_HANDLE(radv_buffer, buffer, _buffer);
8092 struct radv_draw_info info;
8093
8094 info.indexed = true;
8095 info.count = drawCount;
8096 info.indirect = buffer;
8097 info.indirect_offset = offset;
8098 info.stride = stride;
8099 info.count_buffer = NULL;
8100 info.strmout_buffer = NULL;
8101 info.instance_count = 0;
8102
8103 if (!radv_before_draw(cmd_buffer, &info, 1))
8104 return;
8105 radv_emit_indirect_draw_packets(cmd_buffer, &info);
8106 radv_after_draw(cmd_buffer);
8107 }
8108
8109 VKAPI_ATTR void VKAPI_CALL
radv_CmdDrawIndirectCount(VkCommandBuffer commandBuffer,VkBuffer _buffer,VkDeviceSize offset,VkBuffer _countBuffer,VkDeviceSize countBufferOffset,uint32_t maxDrawCount,uint32_t stride)8110 radv_CmdDrawIndirectCount(VkCommandBuffer commandBuffer, VkBuffer _buffer, VkDeviceSize offset,
8111 VkBuffer _countBuffer, VkDeviceSize countBufferOffset,
8112 uint32_t maxDrawCount, uint32_t stride)
8113 {
8114 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
8115 RADV_FROM_HANDLE(radv_buffer, buffer, _buffer);
8116 RADV_FROM_HANDLE(radv_buffer, count_buffer, _countBuffer);
8117 struct radv_draw_info info;
8118
8119 info.count = maxDrawCount;
8120 info.indirect = buffer;
8121 info.indirect_offset = offset;
8122 info.count_buffer = count_buffer;
8123 info.count_buffer_offset = countBufferOffset;
8124 info.stride = stride;
8125 info.strmout_buffer = NULL;
8126 info.indexed = false;
8127 info.instance_count = 0;
8128
8129 if (!radv_before_draw(cmd_buffer, &info, 1))
8130 return;
8131 radv_emit_indirect_draw_packets(cmd_buffer, &info);
8132 radv_after_draw(cmd_buffer);
8133 }
8134
8135 VKAPI_ATTR void VKAPI_CALL
radv_CmdDrawIndexedIndirectCount(VkCommandBuffer commandBuffer,VkBuffer _buffer,VkDeviceSize offset,VkBuffer _countBuffer,VkDeviceSize countBufferOffset,uint32_t maxDrawCount,uint32_t stride)8136 radv_CmdDrawIndexedIndirectCount(VkCommandBuffer commandBuffer, VkBuffer _buffer,
8137 VkDeviceSize offset, VkBuffer _countBuffer,
8138 VkDeviceSize countBufferOffset, uint32_t maxDrawCount,
8139 uint32_t stride)
8140 {
8141 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
8142 RADV_FROM_HANDLE(radv_buffer, buffer, _buffer);
8143 RADV_FROM_HANDLE(radv_buffer, count_buffer, _countBuffer);
8144 struct radv_draw_info info;
8145
8146 info.indexed = true;
8147 info.count = maxDrawCount;
8148 info.indirect = buffer;
8149 info.indirect_offset = offset;
8150 info.count_buffer = count_buffer;
8151 info.count_buffer_offset = countBufferOffset;
8152 info.stride = stride;
8153 info.strmout_buffer = NULL;
8154 info.instance_count = 0;
8155
8156 if (!radv_before_draw(cmd_buffer, &info, 1))
8157 return;
8158 radv_emit_indirect_draw_packets(cmd_buffer, &info);
8159 radv_after_draw(cmd_buffer);
8160 }
8161
8162 VKAPI_ATTR void VKAPI_CALL
radv_CmdDrawMeshTasksNV(VkCommandBuffer commandBuffer,uint32_t taskCount,uint32_t firstTask)8163 radv_CmdDrawMeshTasksNV(VkCommandBuffer commandBuffer, uint32_t taskCount, uint32_t firstTask)
8164 {
8165 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
8166 struct radv_graphics_pipeline *pipeline = cmd_buffer->state.graphics_pipeline;
8167 struct radv_draw_info info;
8168
8169 info.count = taskCount;
8170 info.instance_count = 1;
8171 info.first_instance = 0;
8172 info.stride = 0;
8173 info.indexed = false;
8174 info.strmout_buffer = NULL;
8175 info.count_buffer = NULL;
8176 info.indirect = NULL;
8177
8178 if (!radv_before_taskmesh_draw(cmd_buffer, &info, 1))
8179 return;
8180
8181 if (radv_pipeline_has_stage(pipeline, MESA_SHADER_TASK)) {
8182 radv_emit_direct_taskmesh_draw_packets(cmd_buffer, taskCount, 1, 1, firstTask);
8183 } else {
8184 radv_emit_direct_mesh_draw_packet(cmd_buffer, taskCount, 1, 1, firstTask);
8185 }
8186
8187 radv_after_draw(cmd_buffer);
8188 }
8189
/* vkCmdDrawMeshTasksIndirectNV: indirect mesh/task draw. The NV indirect
 * buffer layout is incompatible with AMD HW, so the commands are first
 * rewritten into an upload-BO buffer (see the helpers) before emitting
 * the actual draw packets.
 */
VKAPI_ATTR void VKAPI_CALL
radv_CmdDrawMeshTasksIndirectNV(VkCommandBuffer commandBuffer, VkBuffer _buffer,
                                VkDeviceSize offset, uint32_t drawCount, uint32_t stride)
{
   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   RADV_FROM_HANDLE(radv_buffer, buffer, _buffer);

   struct radv_graphics_pipeline *pipeline = cmd_buffer->state.graphics_pipeline;
   struct radv_draw_info info;

   info.indirect = buffer;
   info.indirect_offset = offset;
   info.stride = stride;
   info.count = drawCount;
   info.strmout_buffer = NULL;
   info.count_buffer = NULL;
   info.indexed = false;
   info.instance_count = 0;

   if (!radv_before_taskmesh_draw(cmd_buffer, &info, drawCount))
      return;

   /* Indirect draw with mesh shader only:
    * Use DRAW_INDIRECT / DRAW_INDIRECT_MULTI like normal indirect draws.
    * Needed because DISPATCH_MESH_INDIRECT_MULTI doesn't support firstTask.
    *
    * Indirect draw with task + mesh shaders:
    * Use DISPATCH_TASKMESH_INDIRECT_MULTI_ACE + DISPATCH_TASKMESH_GFX.
    * These packets don't support firstTask so we implement that by
    * reading the NV command's indirect buffer in the shader.
    *
    * The indirect BO layout from the NV_mesh_shader API is incompatible
    * with AMD HW. To make it work, we allocate some space
    * in the upload buffer and copy the data to it.
    */

   if (radv_pipeline_has_stage(pipeline, MESA_SHADER_TASK)) {
      /* The shader still needs the original NV buffer for firstTask. */
      uint64_t nv_ib_va = radv_buffer_get_va(buffer->bo) + buffer->offset + offset;
      uint32_t nv_ib_stride = MAX2(stride, sizeof(VkDrawMeshTasksIndirectCommandNV));
      struct radv_buffer buf =
         radv_nv_task_indirect_bo(cmd_buffer, buffer, offset, drawCount, stride);
      /* Redirect the draw to the translated upload-BO buffer. */
      info.indirect = &buf;
      info.indirect_offset = 0;
      info.stride = sizeof(VkDispatchIndirectCommand);

      radv_emit_indirect_taskmesh_draw_packets(cmd_buffer, &info, nv_ib_va, nv_ib_stride);
   } else {
      struct radv_buffer buf =
         radv_nv_mesh_indirect_bo(cmd_buffer, buffer, offset, drawCount, stride);
      /* Redirect the draw to the translated upload-BO buffer. */
      info.indirect = &buf;
      info.indirect_offset = 0;
      info.stride = sizeof(VkDrawIndirectCommand);

      radv_emit_indirect_draw_packets(cmd_buffer, &info);
   }

   radv_after_draw(cmd_buffer);
}
8248
/* vkCmdDrawMeshTasksIndirectCountNV: like radv_CmdDrawMeshTasksIndirectNV
 * but the actual draw count comes from a GPU count buffer (capped by
 * maxDrawCount). The NV indirect buffer is rewritten into an upload-BO
 * buffer before emitting the draw packets.
 */
VKAPI_ATTR void VKAPI_CALL
radv_CmdDrawMeshTasksIndirectCountNV(VkCommandBuffer commandBuffer, VkBuffer _buffer,
                                     VkDeviceSize offset, VkBuffer _countBuffer,
                                     VkDeviceSize countBufferOffset, uint32_t maxDrawCount,
                                     uint32_t stride)
{
   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   RADV_FROM_HANDLE(radv_buffer, buffer, _buffer);
   RADV_FROM_HANDLE(radv_buffer, count_buffer, _countBuffer);

   struct radv_graphics_pipeline *pipeline = cmd_buffer->state.graphics_pipeline;
   struct radv_draw_info info;

   info.indirect = buffer;
   info.indirect_offset = offset;
   info.stride = stride;
   info.count = maxDrawCount;
   info.strmout_buffer = NULL;
   info.count_buffer = count_buffer;
   info.count_buffer_offset = countBufferOffset;
   info.indexed = false;
   info.instance_count = 0;

   if (!radv_before_taskmesh_draw(cmd_buffer, &info, maxDrawCount))
      return;

   /* See the packet-selection rationale in radv_CmdDrawMeshTasksIndirectNV. */
   if (radv_pipeline_has_stage(pipeline, MESA_SHADER_TASK)) {
      /* The shader still needs the original NV buffer for firstTask. */
      uint64_t nv_ib_va = radv_buffer_get_va(buffer->bo) + buffer->offset + offset;
      uint32_t nv_ib_stride = MAX2(stride, sizeof(VkDrawMeshTasksIndirectCommandNV));
      struct radv_buffer buf =
         radv_nv_task_indirect_bo(cmd_buffer, buffer, offset, maxDrawCount, stride);
      /* Redirect the draw to the translated upload-BO buffer. */
      info.indirect = &buf;
      info.indirect_offset = 0;
      info.stride = sizeof(VkDispatchIndirectCommand);

      radv_emit_indirect_taskmesh_draw_packets(cmd_buffer, &info, nv_ib_va, nv_ib_stride);
   } else {
      struct radv_buffer buf =
         radv_nv_mesh_indirect_bo(cmd_buffer, buffer, offset, maxDrawCount, stride);
      /* Redirect the draw to the translated upload-BO buffer. */
      info.indirect = &buf;
      info.indirect_offset = 0;
      info.stride = sizeof(VkDrawIndirectCommand);

      radv_emit_indirect_draw_packets(cmd_buffer, &info);
   }

   radv_after_draw(cmd_buffer);
}
8297
/* Implementation of VK_NV_device_generated_commands: executes the command
 * stream that radv_prepare_dgc() builds on the GPU into the preprocess
 * buffer. Note that isPreprocessed is not consulted; the commands are
 * (re)generated here unconditionally.
 */
void
radv_CmdExecuteGeneratedCommandsNV(VkCommandBuffer commandBuffer, VkBool32 isPreprocessed,
                                   const VkGeneratedCommandsInfoNV *pGeneratedCommandsInfo)
{
   VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   VK_FROM_HANDLE(radv_indirect_command_layout, layout,
                  pGeneratedCommandsInfo->indirectCommandsLayout);
   VK_FROM_HANDLE(radv_buffer, prep_buffer, pGeneratedCommandsInfo->preprocessBuffer);

   /* The only actions that can be done are draws, so skip on other queues. */
   if (cmd_buffer->qf != RADV_QUEUE_GENERAL)
      return;

   /* Secondary command buffers are needed for the full extension but can't use
    * PKT3_INDIRECT_BUFFER_CIK.
    */
   assert(cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_PRIMARY);

   /* Generate the indirect command buffer contents that the IB below executes. */
   radv_prepare_dgc(cmd_buffer, pGeneratedCommandsInfo);

   struct radv_draw_info info;

   info.count = pGeneratedCommandsInfo->sequencesCount;
   info.indirect = prep_buffer; /* We're not really going use it this way, but a good signal
                                   that this is not direct. */
   info.indirect_offset = 0;
   info.stride = 0;
   info.strmout_buffer = NULL;
   info.count_buffer = NULL;
   info.indexed = layout->indexed;
   info.instance_count = 0;

   if (!radv_before_draw(cmd_buffer, &info, 1))
      return;

   uint32_t cmdbuf_size = radv_get_indirect_cmdbuf_size(pGeneratedCommandsInfo);
   uint64_t va = radv_buffer_get_va(prep_buffer->bo) + prep_buffer->offset +
                 pGeneratedCommandsInfo->preprocessOffset;
   const uint32_t view_mask = cmd_buffer->state.subpass->view_mask;

   /* Make sure the PFP doesn't run ahead and fetch the generated IB before
    * the preceding packets have been processed. */
   radeon_emit(cmd_buffer->cs, PKT3(PKT3_PFP_SYNC_ME, 0, cmd_buffer->state.predicating));
   radeon_emit(cmd_buffer->cs, 0);

   /* Call into the generated commands with PKT3_INDIRECT_BUFFER_CIK, once
    * per view when multiview is enabled. */
   if (!view_mask) {
      radeon_emit(cmd_buffer->cs, PKT3(PKT3_INDIRECT_BUFFER_CIK, 2, 0));
      radeon_emit(cmd_buffer->cs, va);
      radeon_emit(cmd_buffer->cs, va >> 32);
      radeon_emit(cmd_buffer->cs, cmdbuf_size >> 2); /* IB size in DWORDs */
   } else {
      u_foreach_bit (view, view_mask) {
         radv_emit_view_index(cmd_buffer, view);

         radeon_emit(cmd_buffer->cs, PKT3(PKT3_INDIRECT_BUFFER_CIK, 2, 0));
         radeon_emit(cmd_buffer->cs, va);
         radeon_emit(cmd_buffer->cs, va >> 32);
         radeon_emit(cmd_buffer->cs, cmdbuf_size >> 2);
      }
   }

   /* The generated IB may have changed state behind the driver's back:
    * dirty everything the layout could have touched so it is re-emitted
    * before the next regular draw. */
   if (layout->binds_index_buffer) {
      cmd_buffer->state.last_index_type = -1;
      cmd_buffer->state.dirty |= RADV_CMD_DIRTY_INDEX_BUFFER;
   }

   if (layout->bind_vbo_mask)
      cmd_buffer->state.dirty |= RADV_CMD_DIRTY_VERTEX_BUFFER;

   if (layout->binds_state)
      cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_FRONT_FACE;

   cmd_buffer->push_constant_stages |= ~0;

   /* Invalidate the tracked draw parameters; the generated commands may have
    * overwritten any of them. */
   cmd_buffer->state.last_index_type = -1;
   cmd_buffer->state.last_num_instances = -1;
   cmd_buffer->state.last_vertex_offset = -1;
   cmd_buffer->state.last_first_instance = -1;
   cmd_buffer->state.last_drawid = -1;

   radv_after_draw(cmd_buffer);
}
8378
/* Parameters describing one compute dispatch (direct or indirect). */
struct radv_dispatch_info {
   /**
    * Determine the layout of the grid (in block units) to be used.
    */
   uint32_t blocks[3];

   /**
    * A starting offset for the grid. If unaligned is set, the offset
    * must still be aligned.
    */
   uint32_t offsets[3];
   /**
    * Whether it's an unaligned compute dispatch.
    */
   bool unaligned;

   /**
    * Indirect compute parameters resource.
    */
   struct radeon_winsys_bo *indirect;
   /* GPU address of the indirect parameters; a non-zero value selects the
    * indirect dispatch path. */
   uint64_t va;
};
8401
/* Emit the PM4 packets for one compute dispatch described by 'info'.
 *
 * Handles both direct (CPU-known grid) and indirect (grid size at info->va)
 * dispatches, unaligned grids via partial thread groups, loading the grid
 * size into user SGPRs, and conditional rendering on both the GFX ring and
 * the compute (MEC) ring.
 */
static void
radv_emit_dispatch_packets(struct radv_cmd_buffer *cmd_buffer,
                           struct radv_compute_pipeline *pipeline,
                           const struct radv_dispatch_info *info)
{
   struct radv_shader *compute_shader = pipeline->base.shaders[MESA_SHADER_COMPUTE];
   unsigned dispatch_initiator = cmd_buffer->device->dispatch_initiator;
   struct radeon_winsys *ws = cmd_buffer->device->ws;
   bool predicating = cmd_buffer->state.predicating;
   struct radeon_cmdbuf *cs = cmd_buffer->cs;
   struct radv_userdata_info *loc;

   radv_describe_dispatch(cmd_buffer, info->blocks[0], info->blocks[1], info->blocks[2]);

   loc = radv_lookup_user_sgpr(&pipeline->base, MESA_SHADER_COMPUTE, AC_UD_CS_GRID_SIZE);

   /* Reserve worst-case space up front; checked again at the end. */
   ASSERTED unsigned cdw_max = radeon_check_space(ws, cs, 30);

   if (compute_shader->info.wave_size == 32) {
      /* Wave32 compute only exists on GFX10+. */
      assert(cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX10);
      dispatch_initiator |= S_00B800_CS_W32_EN(1);
   }

   if (info->va) {
      /* Indirect dispatch: the grid size is read from GPU memory at info->va. */
      if (info->indirect)
         radv_cs_add_buffer(ws, cs, info->indirect);

      if (info->unaligned) {
         /* Program the full block size and tell the hardware the grid is
          * given in thread dimensions rather than full blocks. */
         radeon_set_sh_reg_seq(cs, R_00B81C_COMPUTE_NUM_THREAD_X, 3);
         radeon_emit(cs, S_00B81C_NUM_THREAD_FULL(compute_shader->info.cs.block_size[0]));
         radeon_emit(cs, S_00B81C_NUM_THREAD_FULL(compute_shader->info.cs.block_size[1]));
         radeon_emit(cs, S_00B81C_NUM_THREAD_FULL(compute_shader->info.cs.block_size[2]));

         dispatch_initiator |= S_00B800_USE_THREAD_DIMENSIONS(1);
      }

      if (loc->sgpr_idx != -1) {
         unsigned reg = R_00B900_COMPUTE_USER_DATA_0 + loc->sgpr_idx * 4;

         if (cmd_buffer->device->load_grid_size_from_user_sgpr) {
            /* Have the CP load the three grid-size DWORDs from memory
             * straight into the user SGPRs (GFX10.3+ only). */
            assert(cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX10_3);
            radeon_emit(cs, PKT3(PKT3_LOAD_SH_REG_INDEX, 3, 0));
            radeon_emit(cs, info->va);
            radeon_emit(cs, info->va >> 32);
            radeon_emit(cs, (reg - SI_SH_REG_OFFSET) >> 2);
            radeon_emit(cs, 3);
         } else {
            /* Otherwise pass a pointer so the shader reads the size itself. */
            radv_emit_shader_pointer(cmd_buffer->device, cmd_buffer->cs, reg, info->va, true);
         }
      }

      if (radv_cmd_buffer_uses_mec(cmd_buffer)) {
         /* The compute ring has no native packet predication; emit it
          * explicitly around the dispatch. */
         radv_cs_emit_compute_predication(&cmd_buffer->state, cs, cmd_buffer->mec_inv_pred_va,
                                          &cmd_buffer->mec_inv_pred_emitted,
                                          4 /* DISPATCH_INDIRECT size */);
         radeon_emit(cs, PKT3(PKT3_DISPATCH_INDIRECT, 2, 0) | PKT3_SHADER_TYPE_S(1));
         radeon_emit(cs, info->va);
         radeon_emit(cs, info->va >> 32);
         radeon_emit(cs, dispatch_initiator);
      } else {
         /* On the GFX ring, DISPATCH_INDIRECT takes an offset relative to a
          * base address programmed with SET_BASE. */
         radeon_emit(cs, PKT3(PKT3_SET_BASE, 2, 0) | PKT3_SHADER_TYPE_S(1));
         radeon_emit(cs, 1);
         radeon_emit(cs, info->va);
         radeon_emit(cs, info->va >> 32);

         radeon_emit(cs, PKT3(PKT3_DISPATCH_INDIRECT, 1, predicating) | PKT3_SHADER_TYPE_S(1));
         radeon_emit(cs, 0);
         radeon_emit(cs, dispatch_initiator);
      }
   } else {
      /* Direct dispatch with a CPU-known grid size. */
      unsigned blocks[3] = {info->blocks[0], info->blocks[1], info->blocks[2]};
      unsigned offsets[3] = {info->offsets[0], info->offsets[1], info->offsets[2]};

      if (info->unaligned) {
         unsigned *cs_block_size = compute_shader->info.cs.block_size;
         unsigned remainder[3];

         /* If aligned, these should be an entire block size,
          * not 0.
          */
         remainder[0] = blocks[0] + cs_block_size[0] - align_u32_npot(blocks[0], cs_block_size[0]);
         remainder[1] = blocks[1] + cs_block_size[1] - align_u32_npot(blocks[1], cs_block_size[1]);
         remainder[2] = blocks[2] + cs_block_size[2] - align_u32_npot(blocks[2], cs_block_size[2]);

         /* Round the thread counts up to whole blocks; the partial
          * thread-group registers below trim the excess. */
         blocks[0] = round_up_u32(blocks[0], cs_block_size[0]);
         blocks[1] = round_up_u32(blocks[1], cs_block_size[1]);
         blocks[2] = round_up_u32(blocks[2], cs_block_size[2]);

         /* Convert start offsets from threads to blocks; they must be
          * block-aligned. */
         for (unsigned i = 0; i < 3; ++i) {
            assert(offsets[i] % cs_block_size[i] == 0);
            offsets[i] /= cs_block_size[i];
         }

         radeon_set_sh_reg_seq(cs, R_00B81C_COMPUTE_NUM_THREAD_X, 3);
         radeon_emit(cs, S_00B81C_NUM_THREAD_FULL(cs_block_size[0]) |
                            S_00B81C_NUM_THREAD_PARTIAL(remainder[0]));
         radeon_emit(cs, S_00B81C_NUM_THREAD_FULL(cs_block_size[1]) |
                            S_00B81C_NUM_THREAD_PARTIAL(remainder[1]));
         radeon_emit(cs, S_00B81C_NUM_THREAD_FULL(cs_block_size[2]) |
                            S_00B81C_NUM_THREAD_PARTIAL(remainder[2]));

         dispatch_initiator |= S_00B800_PARTIAL_TG_EN(1);
      }

      if (loc->sgpr_idx != -1) {
         if (cmd_buffer->device->load_grid_size_from_user_sgpr) {
            assert(loc->num_sgprs == 3);

            /* Write the grid size directly into the user SGPRs. */
            radeon_set_sh_reg_seq(cs, R_00B900_COMPUTE_USER_DATA_0 + loc->sgpr_idx * 4, 3);
            radeon_emit(cs, blocks[0]);
            radeon_emit(cs, blocks[1]);
            radeon_emit(cs, blocks[2]);
         } else {
            /* Upload the grid size (3 x 4 bytes) and pass its address. */
            uint32_t offset;
            if (!radv_cmd_buffer_upload_data(cmd_buffer, 12, blocks, &offset))
               return;

            uint64_t va = radv_buffer_get_va(cmd_buffer->upload.upload_bo) + offset;
            radv_emit_shader_pointer(cmd_buffer->device, cmd_buffer->cs,
                                     R_00B900_COMPUTE_USER_DATA_0 + loc->sgpr_idx * 4, va, true);
         }
      }

      if (offsets[0] || offsets[1] || offsets[2]) {
         radeon_set_sh_reg_seq(cs, R_00B810_COMPUTE_START_X, 3);
         radeon_emit(cs, offsets[0]);
         radeon_emit(cs, offsets[1]);
         radeon_emit(cs, offsets[2]);

         /* The blocks in the packet are not counts but end values. */
         for (unsigned i = 0; i < 3; ++i)
            blocks[i] += offsets[i];
      } else {
         dispatch_initiator |= S_00B800_FORCE_START_AT_000(1);
      }

      if (radv_cmd_buffer_uses_mec(cmd_buffer)) {
         radv_cs_emit_compute_predication(&cmd_buffer->state, cs, cmd_buffer->mec_inv_pred_va,
                                          &cmd_buffer->mec_inv_pred_emitted,
                                          5 /* DISPATCH_DIRECT size */);
         /* Predication was just emitted explicitly; don't also set the PKT3
          * predicate bit. */
         predicating = false;
      }

      radeon_emit(cs, PKT3(PKT3_DISPATCH_DIRECT, 3, predicating) | PKT3_SHADER_TYPE_S(1));
      radeon_emit(cs, blocks[0]);
      radeon_emit(cs, blocks[1]);
      radeon_emit(cs, blocks[2]);
      radeon_emit(cs, dispatch_initiator);
   }

   assert(cmd_buffer->cs->cdw <= cdw_max);
}
8554
8555 static void
radv_upload_compute_shader_descriptors(struct radv_cmd_buffer * cmd_buffer,struct radv_compute_pipeline * pipeline,VkPipelineBindPoint bind_point)8556 radv_upload_compute_shader_descriptors(struct radv_cmd_buffer *cmd_buffer,
8557 struct radv_compute_pipeline *pipeline,
8558 VkPipelineBindPoint bind_point)
8559 {
8560 radv_flush_descriptors(cmd_buffer, VK_SHADER_STAGE_COMPUTE_BIT, &pipeline->base, bind_point);
8561 radv_flush_constants(cmd_buffer,
8562 bind_point == VK_PIPELINE_BIND_POINT_RAY_TRACING_KHR
8563 ? RADV_RT_STAGE_BITS
8564 : VK_SHADER_STAGE_COMPUTE_BIT,
8565 &pipeline->base, bind_point);
8566 }
8567
/* Common dispatch path shared by compute and ray tracing: flushes caches,
 * emits pipeline and descriptor state and the dispatch packets, ordered so
 * the compute units stay idle for as little time as possible.
 */
static void
radv_dispatch(struct radv_cmd_buffer *cmd_buffer, const struct radv_dispatch_info *info,
              struct radv_compute_pipeline *pipeline, VkPipelineBindPoint bind_point)
{
   bool has_prefetch = cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX7;
   bool pipeline_is_dirty = pipeline != cmd_buffer->state.emitted_compute_pipeline;

   /* Hardware workaround: this pipeline requires waiting for previous work
    * both before (here) and after (below) the dispatch. */
   if (pipeline->cs_regalloc_hang_bug)
      cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_PS_PARTIAL_FLUSH |
                                      RADV_CMD_FLAG_CS_PARTIAL_FLUSH;

   if (cmd_buffer->state.flush_bits &
       (RADV_CMD_FLAG_FLUSH_AND_INV_CB | RADV_CMD_FLAG_FLUSH_AND_INV_DB |
        RADV_CMD_FLAG_PS_PARTIAL_FLUSH | RADV_CMD_FLAG_CS_PARTIAL_FLUSH)) {
      /* If we have to wait for idle, set all states first, so that
       * all SET packets are processed in parallel with previous draw
       * calls. Then upload descriptors, set shader pointers, and
       * dispatch, and prefetch at the end. This ensures that the
       * time the CUs are idle is very short. (there are only SET_SH
       * packets between the wait and the draw)
       */
      radv_emit_compute_pipeline(cmd_buffer, pipeline);
      si_emit_cache_flush(cmd_buffer);
      /* <-- CUs are idle here --> */

      radv_upload_compute_shader_descriptors(cmd_buffer, pipeline, bind_point);

      radv_emit_dispatch_packets(cmd_buffer, pipeline, info);
      /* <-- CUs are busy here --> */

      /* Start prefetches after the dispatch has been started. Both
       * will run in parallel, but starting the dispatch first is
       * more important.
       */
      if (has_prefetch && pipeline_is_dirty) {
         radv_emit_shader_prefetch(cmd_buffer, pipeline->base.shaders[MESA_SHADER_COMPUTE]);
      }
   } else {
      /* If we don't wait for idle, start prefetches first, then set
       * states, and dispatch at the end.
       */
      si_emit_cache_flush(cmd_buffer);

      if (has_prefetch && pipeline_is_dirty) {
         radv_emit_shader_prefetch(cmd_buffer, pipeline->base.shaders[MESA_SHADER_COMPUTE]);
      }

      radv_upload_compute_shader_descriptors(cmd_buffer, pipeline, bind_point);

      radv_emit_compute_pipeline(cmd_buffer, pipeline);
      radv_emit_dispatch_packets(cmd_buffer, pipeline, info);
   }

   if (pipeline_is_dirty) {
      /* Raytracing uses compute shaders but has separate bind points and pipelines.
       * So if we set compute userdata & shader registers we should dirty the raytracing
       * ones and the other way around.
       *
       * We only need to do this when the pipeline is dirty because when we switch between
       * the two we always need to switch pipelines.
       */
      radv_mark_descriptor_sets_dirty(cmd_buffer, bind_point == VK_PIPELINE_BIND_POINT_COMPUTE
                                                     ? VK_PIPELINE_BIND_POINT_RAY_TRACING_KHR
                                                     : VK_PIPELINE_BIND_POINT_COMPUTE);
   }

   /* Second half of the cs_regalloc_hang_bug workaround (see above). */
   if (pipeline->cs_regalloc_hang_bug)
      cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_CS_PARTIAL_FLUSH;

   radv_cmd_buffer_after_draw(cmd_buffer, RADV_CMD_FLAG_CS_PARTIAL_FLUSH);
}
8639
8640 static void
radv_compute_dispatch(struct radv_cmd_buffer * cmd_buffer,const struct radv_dispatch_info * info)8641 radv_compute_dispatch(struct radv_cmd_buffer *cmd_buffer, const struct radv_dispatch_info *info)
8642 {
8643 radv_dispatch(cmd_buffer, info, cmd_buffer->state.compute_pipeline,
8644 VK_PIPELINE_BIND_POINT_COMPUTE);
8645 }
8646
8647 VKAPI_ATTR void VKAPI_CALL
radv_CmdDispatchBase(VkCommandBuffer commandBuffer,uint32_t base_x,uint32_t base_y,uint32_t base_z,uint32_t x,uint32_t y,uint32_t z)8648 radv_CmdDispatchBase(VkCommandBuffer commandBuffer, uint32_t base_x, uint32_t base_y,
8649 uint32_t base_z, uint32_t x, uint32_t y, uint32_t z)
8650 {
8651 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
8652 struct radv_dispatch_info info = {0};
8653
8654 info.blocks[0] = x;
8655 info.blocks[1] = y;
8656 info.blocks[2] = z;
8657
8658 info.offsets[0] = base_x;
8659 info.offsets[1] = base_y;
8660 info.offsets[2] = base_z;
8661 radv_compute_dispatch(cmd_buffer, &info);
8662 }
8663
8664 VKAPI_ATTR void VKAPI_CALL
radv_CmdDispatch(VkCommandBuffer commandBuffer,uint32_t x,uint32_t y,uint32_t z)8665 radv_CmdDispatch(VkCommandBuffer commandBuffer, uint32_t x, uint32_t y, uint32_t z)
8666 {
8667 radv_CmdDispatchBase(commandBuffer, 0, 0, 0, x, y, z);
8668 }
8669
8670 VKAPI_ATTR void VKAPI_CALL
radv_CmdDispatchIndirect(VkCommandBuffer commandBuffer,VkBuffer _buffer,VkDeviceSize offset)8671 radv_CmdDispatchIndirect(VkCommandBuffer commandBuffer, VkBuffer _buffer, VkDeviceSize offset)
8672 {
8673 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
8674 RADV_FROM_HANDLE(radv_buffer, buffer, _buffer);
8675 struct radv_dispatch_info info = {0};
8676
8677 info.indirect = buffer->bo;
8678 info.va = radv_buffer_get_va(buffer->bo) + buffer->offset + offset;
8679
8680 radv_compute_dispatch(cmd_buffer, &info);
8681 }
8682
8683 void
radv_unaligned_dispatch(struct radv_cmd_buffer * cmd_buffer,uint32_t x,uint32_t y,uint32_t z)8684 radv_unaligned_dispatch(struct radv_cmd_buffer *cmd_buffer, uint32_t x, uint32_t y, uint32_t z)
8685 {
8686 struct radv_dispatch_info info = {0};
8687
8688 info.blocks[0] = x;
8689 info.blocks[1] = y;
8690 info.blocks[2] = z;
8691 info.unaligned = 1;
8692
8693 radv_compute_dispatch(cmd_buffer, &info);
8694 }
8695
8696 void
radv_indirect_dispatch(struct radv_cmd_buffer * cmd_buffer,struct radeon_winsys_bo * bo,uint64_t va)8697 radv_indirect_dispatch(struct radv_cmd_buffer *cmd_buffer, struct radeon_winsys_bo *bo, uint64_t va)
8698 {
8699 struct radv_dispatch_info info = {0};
8700
8701 info.indirect = bo;
8702 info.va = va;
8703
8704 radv_compute_dispatch(cmd_buffer, &info);
8705 }
8706
/* Where a ray-tracing launch gets its SBT description and launch size from. */
enum radv_rt_mode {
   radv_rt_mode_direct,    /* both provided on the CPU */
   radv_rt_mode_indirect,  /* launch size read from a GPU address */
   radv_rt_mode_indirect2, /* SBT and launch size both read from a GPU address */
};
8712
/* Common implementation for the vkCmdTraceRays* entry points.
 *
 * 'mode' selects where the SBT description and launch size come from:
 *  - direct:    everything is known on the CPU ('tables')
 *  - indirect:  SBT from 'tables', launch size read from 'indirect_va'
 *  - indirect2: SBT and launch size both read from 'indirect_va'
 */
static void
radv_trace_rays(struct radv_cmd_buffer *cmd_buffer, const VkTraceRaysIndirectCommand2KHR *tables,
                uint64_t indirect_va, enum radv_rt_mode mode)
{
   struct radv_compute_pipeline *pipeline = cmd_buffer->state.rt_pipeline;
   uint32_t base_reg = pipeline->base.user_data_0[MESA_SHADER_COMPUTE];

   /* RT launches go through the unaligned compute dispatch path. */
   struct radv_dispatch_info info = {0};
   info.unaligned = true;

   uint64_t launch_size_va;
   uint64_t sbt_va;

   if (mode != radv_rt_mode_indirect2) {
      /* Upload the SBT pointers (and for the direct path also the launch
       * size, which follows them in VkTraceRaysIndirectCommand2KHR) so the
       * shader can read them from memory. */
      uint32_t upload_size = mode == radv_rt_mode_direct
                                ? sizeof(VkTraceRaysIndirectCommand2KHR)
                                : offsetof(VkTraceRaysIndirectCommand2KHR, width);

      uint32_t offset;
      if (!radv_cmd_buffer_upload_data(cmd_buffer, upload_size, tables, &offset))
         return;

      uint64_t upload_va = radv_buffer_get_va(cmd_buffer->upload.upload_bo) + offset;

      launch_size_va = (mode == radv_rt_mode_direct)
                          ? upload_va + offsetof(VkTraceRaysIndirectCommand2KHR, width)
                          : indirect_va;
      sbt_va = upload_va;
   } else {
      /* Everything already lives in GPU memory at indirect_va. */
      launch_size_va = indirect_va + offsetof(VkTraceRaysIndirectCommand2KHR, width);
      sbt_va = indirect_va;
   }

   if (mode == radv_rt_mode_direct) {
      info.blocks[0] = tables->width;
      info.blocks[1] = tables->height;
      info.blocks[2] = tables->depth;
   } else
      /* Non-direct modes read the launch size from memory at dispatch time. */
      info.va = launch_size_va;

   /* Pass the SBT descriptor address through its user SGPR, if the shader
    * uses one. */
   struct radv_userdata_info *desc_loc =
      radv_lookup_user_sgpr(&pipeline->base, MESA_SHADER_COMPUTE, AC_UD_CS_SBT_DESCRIPTORS);
   if (desc_loc->sgpr_idx != -1) {
      radv_emit_shader_pointer(cmd_buffer->device, cmd_buffer->cs,
                               base_reg + desc_loc->sgpr_idx * 4, sbt_va, true);
   }

   /* Likewise for the launch-size address. */
   struct radv_userdata_info *size_loc =
      radv_lookup_user_sgpr(&pipeline->base, MESA_SHADER_COMPUTE, AC_UD_CS_RAY_LAUNCH_SIZE_ADDR);
   if (size_loc->sgpr_idx != -1) {
      radv_emit_shader_pointer(cmd_buffer->device, cmd_buffer->cs,
                               base_reg + size_loc->sgpr_idx * 4, launch_size_va, true);
   }

   radv_dispatch(cmd_buffer, &info, pipeline, VK_PIPELINE_BIND_POINT_RAY_TRACING_KHR);
}
8769
8770 VKAPI_ATTR void VKAPI_CALL
radv_CmdTraceRaysKHR(VkCommandBuffer commandBuffer,const VkStridedDeviceAddressRegionKHR * pRaygenShaderBindingTable,const VkStridedDeviceAddressRegionKHR * pMissShaderBindingTable,const VkStridedDeviceAddressRegionKHR * pHitShaderBindingTable,const VkStridedDeviceAddressRegionKHR * pCallableShaderBindingTable,uint32_t width,uint32_t height,uint32_t depth)8771 radv_CmdTraceRaysKHR(VkCommandBuffer commandBuffer,
8772 const VkStridedDeviceAddressRegionKHR *pRaygenShaderBindingTable,
8773 const VkStridedDeviceAddressRegionKHR *pMissShaderBindingTable,
8774 const VkStridedDeviceAddressRegionKHR *pHitShaderBindingTable,
8775 const VkStridedDeviceAddressRegionKHR *pCallableShaderBindingTable,
8776 uint32_t width, uint32_t height, uint32_t depth)
8777 {
8778 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
8779
8780 VkTraceRaysIndirectCommand2KHR tables = {
8781 .raygenShaderRecordAddress = pRaygenShaderBindingTable->deviceAddress,
8782 .raygenShaderRecordSize = pRaygenShaderBindingTable->size,
8783 .missShaderBindingTableAddress = pMissShaderBindingTable->deviceAddress,
8784 .missShaderBindingTableSize = pMissShaderBindingTable->size,
8785 .missShaderBindingTableStride = pMissShaderBindingTable->stride,
8786 .hitShaderBindingTableAddress = pHitShaderBindingTable->deviceAddress,
8787 .hitShaderBindingTableSize = pHitShaderBindingTable->size,
8788 .hitShaderBindingTableStride = pHitShaderBindingTable->stride,
8789 .callableShaderBindingTableAddress = pCallableShaderBindingTable->deviceAddress,
8790 .callableShaderBindingTableSize = pCallableShaderBindingTable->size,
8791 .callableShaderBindingTableStride = pCallableShaderBindingTable->stride,
8792 .width = width,
8793 .height = height,
8794 .depth = depth,
8795 };
8796
8797 radv_trace_rays(cmd_buffer, &tables, 0, radv_rt_mode_direct);
8798 }
8799
8800 VKAPI_ATTR void VKAPI_CALL
radv_CmdTraceRaysIndirectKHR(VkCommandBuffer commandBuffer,const VkStridedDeviceAddressRegionKHR * pRaygenShaderBindingTable,const VkStridedDeviceAddressRegionKHR * pMissShaderBindingTable,const VkStridedDeviceAddressRegionKHR * pHitShaderBindingTable,const VkStridedDeviceAddressRegionKHR * pCallableShaderBindingTable,VkDeviceAddress indirectDeviceAddress)8801 radv_CmdTraceRaysIndirectKHR(VkCommandBuffer commandBuffer,
8802 const VkStridedDeviceAddressRegionKHR *pRaygenShaderBindingTable,
8803 const VkStridedDeviceAddressRegionKHR *pMissShaderBindingTable,
8804 const VkStridedDeviceAddressRegionKHR *pHitShaderBindingTable,
8805 const VkStridedDeviceAddressRegionKHR *pCallableShaderBindingTable,
8806 VkDeviceAddress indirectDeviceAddress)
8807 {
8808 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
8809
8810 assert(cmd_buffer->device->use_global_bo_list);
8811
8812 VkTraceRaysIndirectCommand2KHR tables = {
8813 .raygenShaderRecordAddress = pRaygenShaderBindingTable->deviceAddress,
8814 .raygenShaderRecordSize = pRaygenShaderBindingTable->size,
8815 .missShaderBindingTableAddress = pMissShaderBindingTable->deviceAddress,
8816 .missShaderBindingTableSize = pMissShaderBindingTable->size,
8817 .missShaderBindingTableStride = pMissShaderBindingTable->stride,
8818 .hitShaderBindingTableAddress = pHitShaderBindingTable->deviceAddress,
8819 .hitShaderBindingTableSize = pHitShaderBindingTable->size,
8820 .hitShaderBindingTableStride = pHitShaderBindingTable->stride,
8821 .callableShaderBindingTableAddress = pCallableShaderBindingTable->deviceAddress,
8822 .callableShaderBindingTableSize = pCallableShaderBindingTable->size,
8823 .callableShaderBindingTableStride = pCallableShaderBindingTable->stride,
8824 };
8825
8826 radv_trace_rays(cmd_buffer, &tables, indirectDeviceAddress, radv_rt_mode_indirect);
8827 }
8828
8829 VKAPI_ATTR void VKAPI_CALL
radv_CmdTraceRaysIndirect2KHR(VkCommandBuffer commandBuffer,VkDeviceAddress indirectDeviceAddress)8830 radv_CmdTraceRaysIndirect2KHR(VkCommandBuffer commandBuffer, VkDeviceAddress indirectDeviceAddress)
8831 {
8832 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
8833
8834 assert(cmd_buffer->device->use_global_bo_list);
8835
8836 radv_trace_rays(cmd_buffer, NULL, indirectDeviceAddress, radv_rt_mode_indirect2);
8837 }
8838
8839 static void
radv_set_rt_stack_size(struct radv_cmd_buffer * cmd_buffer,uint32_t size)8840 radv_set_rt_stack_size(struct radv_cmd_buffer *cmd_buffer, uint32_t size)
8841 {
8842 unsigned wave_size = 0;
8843 unsigned scratch_bytes_per_wave = 0;
8844
8845 if (cmd_buffer->state.rt_pipeline) {
8846 scratch_bytes_per_wave = cmd_buffer->state.rt_pipeline->base.scratch_bytes_per_wave;
8847 wave_size = cmd_buffer->state.rt_pipeline->base.shaders[MESA_SHADER_COMPUTE]->info.wave_size;
8848 }
8849
8850 /* The hardware register is specified as a multiple of 256 DWORDS. */
8851 scratch_bytes_per_wave += align(size * wave_size, 1024);
8852
8853 cmd_buffer->compute_scratch_size_per_wave_needed =
8854 MAX2(cmd_buffer->compute_scratch_size_per_wave_needed, scratch_bytes_per_wave);
8855 }
8856
8857 VKAPI_ATTR void VKAPI_CALL
radv_CmdSetRayTracingPipelineStackSizeKHR(VkCommandBuffer commandBuffer,uint32_t size)8858 radv_CmdSetRayTracingPipelineStackSizeKHR(VkCommandBuffer commandBuffer, uint32_t size)
8859 {
8860 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
8861
8862 radv_set_rt_stack_size(cmd_buffer, size);
8863 cmd_buffer->state.rt_stack_size = size;
8864 }
8865
/* Ends the current render pass: applies the pass' end barrier, finishes the
 * last subpass, then frees and clears all per-pass state held by the command
 * buffer.
 */
VKAPI_ATTR void VKAPI_CALL
radv_CmdEndRenderPass2(VkCommandBuffer commandBuffer, const VkSubpassEndInfo *pSubpassEndInfo)
{
   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);

   radv_mark_noncoherent_rb(cmd_buffer);

   /* The end barrier must run before the subpass is torn down. */
   radv_emit_subpass_barrier(cmd_buffer, &cmd_buffer->state.pass->end_barrier);

   radv_cmd_buffer_end_subpass(cmd_buffer);

   /* Release per-pass allocations and null the pointers so stale state
    * cannot be used after the pass ends. */
   vk_free(&cmd_buffer->pool->vk.alloc, cmd_buffer->state.attachments);
   vk_free(&cmd_buffer->pool->vk.alloc, cmd_buffer->state.subpass_sample_locs);

   cmd_buffer->state.pass = NULL;
   cmd_buffer->state.subpass = NULL;
   cmd_buffer->state.attachments = NULL;
   cmd_buffer->state.framebuffer = NULL;
   cmd_buffer->state.subpass_sample_locs = NULL;
}
8886
8887 VKAPI_ATTR void VKAPI_CALL
radv_CmdBeginRendering(VkCommandBuffer commandBuffer,const VkRenderingInfo * pRenderingInfo)8888 radv_CmdBeginRendering(VkCommandBuffer commandBuffer, const VkRenderingInfo *pRenderingInfo)
8889 {
8890 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
8891 const VkRenderingFragmentShadingRateAttachmentInfoKHR *vrs_info = vk_find_struct_const(
8892 pRenderingInfo->pNext, RENDERING_FRAGMENT_SHADING_RATE_ATTACHMENT_INFO_KHR);
8893 VkResult result;
8894 /* (normal + resolve) for color attachments and ds and a VRS attachment */
8895 VkAttachmentDescription2 att_desc[MAX_RTS * 2 + 3];
8896 VkAttachmentDescriptionStencilLayout ds_stencil_att, ds_stencil_resolve_att;
8897 VkImageView iviews[MAX_RTS * 2 + 3];
8898 VkAttachmentReference2 color_refs[MAX_RTS], color_resolve_refs[MAX_RTS];
8899 VkAttachmentReference2 ds_ref, ds_resolve_ref, vrs_ref;
8900 VkAttachmentReferenceStencilLayout ds_stencil_ref, ds_stencil_resolve_ref;
8901 VkSubpassDescriptionDepthStencilResolve ds_resolve_info;
8902 VkFragmentShadingRateAttachmentInfoKHR vrs_subpass_info;
8903 VkClearValue clear_values[MAX_RTS * 2 + 3];
8904 unsigned att_count = 0;
8905
8906 VkSubpassDescription2 subpass = {
8907 .sType = VK_STRUCTURE_TYPE_SUBPASS_DESCRIPTION_2,
8908 .pipelineBindPoint = VK_PIPELINE_BIND_POINT_GRAPHICS,
8909 .viewMask = pRenderingInfo->viewMask,
8910 .colorAttachmentCount = pRenderingInfo->colorAttachmentCount,
8911 .pColorAttachments = color_refs,
8912 .pResolveAttachments = color_resolve_refs,
8913 };
8914
8915 for (unsigned i = 0; i < pRenderingInfo->colorAttachmentCount; ++i) {
8916 color_refs[i] = (VkAttachmentReference2){
8917 .sType = VK_STRUCTURE_TYPE_ATTACHMENT_REFERENCE_2,
8918 .attachment = VK_ATTACHMENT_UNUSED,
8919 };
8920 color_resolve_refs[i] = (VkAttachmentReference2){
8921 .sType = VK_STRUCTURE_TYPE_ATTACHMENT_REFERENCE_2,
8922 .attachment = VK_ATTACHMENT_UNUSED,
8923 };
8924
8925 if (pRenderingInfo->pColorAttachments[i].imageView == VK_NULL_HANDLE)
8926 continue;
8927
8928 const VkRenderingAttachmentInfo *info = &pRenderingInfo->pColorAttachments[i];
8929 RADV_FROM_HANDLE(radv_image_view, iview, info->imageView);
8930 color_refs[i] = (VkAttachmentReference2){.sType = VK_STRUCTURE_TYPE_ATTACHMENT_REFERENCE_2,
8931 .attachment = att_count,
8932 .layout = info->imageLayout,
8933 .aspectMask = iview->vk.aspects};
8934
8935 iviews[att_count] = info->imageView;
8936 clear_values[att_count] = info->clearValue;
8937 VkAttachmentDescription2 *att = att_desc + att_count++;
8938
8939 memset(att, 0, sizeof(*att));
8940 att->sType = VK_STRUCTURE_TYPE_ATTACHMENT_DESCRIPTION_2;
8941 att->format = iview->vk.format;
8942 att->samples = iview->image->info.samples;
8943 att->loadOp = info->loadOp;
8944 att->storeOp = info->storeOp;
8945 att->initialLayout = info->imageLayout;
8946 att->finalLayout = info->imageLayout;
8947
8948 if (pRenderingInfo->flags & VK_RENDERING_RESUMING_BIT)
8949 att->loadOp = VK_ATTACHMENT_LOAD_OP_LOAD;
8950
8951 if (pRenderingInfo->flags & VK_RENDERING_SUSPENDING_BIT)
8952 att->storeOp = VK_ATTACHMENT_STORE_OP_STORE;
8953
8954 if (info->resolveMode != VK_RESOLVE_MODE_NONE &&
8955 !(pRenderingInfo->flags & VK_RENDERING_SUSPENDING_BIT)) {
8956 RADV_FROM_HANDLE(radv_image_view, resolve_iview, info->resolveImageView);
8957 color_resolve_refs[i] =
8958 (VkAttachmentReference2){.sType = VK_STRUCTURE_TYPE_ATTACHMENT_REFERENCE_2,
8959 .attachment = att_count,
8960 .layout = info->resolveImageLayout,
8961 .aspectMask = resolve_iview->vk.aspects};
8962
8963 iviews[att_count] = info->resolveImageView;
8964 att = att_desc + att_count++;
8965
8966 memset(att, 0, sizeof(*att));
8967 att->sType = VK_STRUCTURE_TYPE_ATTACHMENT_DESCRIPTION_2;
8968 att->format = resolve_iview->vk.format;
8969 att->samples = resolve_iview->image->info.samples;
8970 att->loadOp = VK_ATTACHMENT_LOAD_OP_DONT_CARE;
8971 att->storeOp = VK_ATTACHMENT_STORE_OP_STORE;
8972 att->initialLayout = info->resolveImageLayout;
8973 att->finalLayout = info->resolveImageLayout;
8974 }
8975 }
8976
8977 if (pRenderingInfo->pDepthAttachment || pRenderingInfo->pStencilAttachment) {
8978 const VkRenderingAttachmentInfo *common_info = pRenderingInfo->pDepthAttachment
8979 ? pRenderingInfo->pDepthAttachment
8980 : pRenderingInfo->pStencilAttachment;
8981 RADV_FROM_HANDLE(radv_image_view, iview, common_info->imageView);
8982
8983 if (common_info->imageView != VK_NULL_HANDLE) {
8984 ds_ref = (VkAttachmentReference2){
8985 .sType = VK_STRUCTURE_TYPE_ATTACHMENT_REFERENCE_2,
8986 .attachment = att_count,
8987 .layout = common_info->imageLayout,
8988 .aspectMask = (pRenderingInfo->pDepthAttachment ? VK_IMAGE_ASPECT_DEPTH_BIT : 0) |
8989 (pRenderingInfo->pStencilAttachment ? VK_IMAGE_ASPECT_STENCIL_BIT : 0)};
8990 subpass.pDepthStencilAttachment = &ds_ref;
8991
8992 iviews[att_count] = common_info->imageView;
8993 if (pRenderingInfo->pDepthAttachment)
8994 clear_values[att_count].depthStencil.depth =
8995 pRenderingInfo->pDepthAttachment->clearValue.depthStencil.depth;
8996 if (pRenderingInfo->pStencilAttachment)
8997 clear_values[att_count].depthStencil.stencil =
8998 pRenderingInfo->pStencilAttachment->clearValue.depthStencil.stencil;
8999 VkAttachmentDescription2 *att = att_desc + att_count++;
9000
9001 memset(att, 0, sizeof(*att));
9002 att->sType = VK_STRUCTURE_TYPE_ATTACHMENT_DESCRIPTION_2;
9003 att->format = iview->vk.format;
9004 att->samples = iview->image->info.samples;
9005
9006 if (pRenderingInfo->pDepthAttachment) {
9007 att->loadOp = pRenderingInfo->pDepthAttachment->loadOp;
9008 att->storeOp = pRenderingInfo->pDepthAttachment->storeOp;
9009 } else {
9010 att->loadOp = VK_ATTACHMENT_LOAD_OP_LOAD;
9011 att->storeOp = VK_ATTACHMENT_STORE_OP_STORE;
9012 }
9013
9014 if (pRenderingInfo->pStencilAttachment) {
9015 att->stencilLoadOp = pRenderingInfo->pStencilAttachment->loadOp;
9016 att->stencilStoreOp = pRenderingInfo->pStencilAttachment->storeOp;
9017 } else {
9018 att->stencilLoadOp = VK_ATTACHMENT_LOAD_OP_LOAD;
9019 att->stencilStoreOp = VK_ATTACHMENT_STORE_OP_STORE;
9020 }
9021
9022 if (pRenderingInfo->flags & VK_RENDERING_RESUMING_BIT) {
9023 att->loadOp = VK_ATTACHMENT_LOAD_OP_LOAD;
9024 att->stencilLoadOp = VK_ATTACHMENT_LOAD_OP_LOAD;
9025 }
9026
9027 if (pRenderingInfo->flags & VK_RENDERING_SUSPENDING_BIT) {
9028 att->storeOp = VK_ATTACHMENT_STORE_OP_STORE;
9029 att->stencilStoreOp = VK_ATTACHMENT_STORE_OP_STORE;
9030 }
9031
9032 att->initialLayout = common_info->imageLayout;
9033 att->finalLayout = common_info->imageLayout;
9034
9035 if (pRenderingInfo->pDepthAttachment && pRenderingInfo->pStencilAttachment) {
9036 ds_ref.pNext = &ds_stencil_ref;
9037 ds_stencil_ref = (VkAttachmentReferenceStencilLayout){
9038 .sType = VK_STRUCTURE_TYPE_ATTACHMENT_REFERENCE_STENCIL_LAYOUT,
9039 .stencilLayout = pRenderingInfo->pStencilAttachment->imageLayout};
9040
9041 att->pNext = &ds_stencil_att;
9042 ds_stencil_att = (VkAttachmentDescriptionStencilLayout){
9043 .sType = VK_STRUCTURE_TYPE_ATTACHMENT_DESCRIPTION_STENCIL_LAYOUT,
9044 .stencilInitialLayout = pRenderingInfo->pStencilAttachment->imageLayout,
9045 .stencilFinalLayout = pRenderingInfo->pStencilAttachment->imageLayout,
9046 };
9047 }
9048
9049 if (((pRenderingInfo->pDepthAttachment &&
9050 pRenderingInfo->pDepthAttachment->resolveMode != VK_RESOLVE_MODE_NONE) ||
9051 (pRenderingInfo->pStencilAttachment &&
9052 pRenderingInfo->pStencilAttachment->resolveMode != VK_RESOLVE_MODE_NONE)) &&
9053 !(pRenderingInfo->flags & VK_RENDERING_SUSPENDING_BIT)) {
9054 RADV_FROM_HANDLE(radv_image_view, resolve_iview, common_info->resolveImageView);
9055 ds_resolve_ref =
9056 (VkAttachmentReference2){.sType = VK_STRUCTURE_TYPE_ATTACHMENT_REFERENCE_2,
9057 .attachment = att_count,
9058 .layout = common_info->resolveImageLayout,
9059 .aspectMask = resolve_iview->vk.aspects};
9060
9061 iviews[att_count] = common_info->resolveImageView;
9062 att = att_desc + att_count++;
9063
9064 memset(att, 0, sizeof(*att));
9065 att->sType = VK_STRUCTURE_TYPE_ATTACHMENT_DESCRIPTION_2;
9066 att->format = resolve_iview->vk.format;
9067 att->samples = resolve_iview->image->info.samples;
9068 att->loadOp = VK_ATTACHMENT_LOAD_OP_DONT_CARE;
9069 att->storeOp = VK_ATTACHMENT_STORE_OP_STORE;
9070 att->initialLayout = common_info->resolveImageLayout;
9071 att->finalLayout = common_info->resolveImageLayout;
9072
9073 ds_resolve_info = (VkSubpassDescriptionDepthStencilResolve){
9074 .sType = VK_STRUCTURE_TYPE_SUBPASS_DESCRIPTION_DEPTH_STENCIL_RESOLVE,
9075 .pNext = subpass.pNext,
9076 .depthResolveMode =
9077 (pRenderingInfo->pDepthAttachment &&
9078 pRenderingInfo->pDepthAttachment->resolveMode != VK_RESOLVE_MODE_NONE)
9079 ? pRenderingInfo->pDepthAttachment->resolveMode
9080 : VK_RESOLVE_MODE_NONE,
9081 .stencilResolveMode =
9082 (pRenderingInfo->pStencilAttachment &&
9083 pRenderingInfo->pStencilAttachment->resolveMode != VK_RESOLVE_MODE_NONE)
9084 ? pRenderingInfo->pStencilAttachment->resolveMode
9085 : VK_RESOLVE_MODE_NONE,
9086 .pDepthStencilResolveAttachment = &ds_resolve_ref};
9087 subpass.pNext = &ds_resolve_info;
9088
9089 if (pRenderingInfo->pDepthAttachment && pRenderingInfo->pStencilAttachment &&
9090 pRenderingInfo->pDepthAttachment->resolveMode != VK_RESOLVE_MODE_NONE &&
9091 pRenderingInfo->pStencilAttachment->resolveMode != VK_RESOLVE_MODE_NONE) {
9092 ds_resolve_ref.pNext = &ds_stencil_resolve_ref;
9093 ds_stencil_resolve_ref = (VkAttachmentReferenceStencilLayout){
9094 .sType = VK_STRUCTURE_TYPE_ATTACHMENT_REFERENCE_STENCIL_LAYOUT,
9095 .stencilLayout = pRenderingInfo->pStencilAttachment->resolveImageLayout};
9096
9097 att->pNext = &ds_stencil_resolve_att;
9098 ds_stencil_resolve_att = (VkAttachmentDescriptionStencilLayout){
9099 .sType = VK_STRUCTURE_TYPE_ATTACHMENT_DESCRIPTION_STENCIL_LAYOUT,
9100 .stencilInitialLayout = pRenderingInfo->pStencilAttachment->resolveImageLayout,
9101 .stencilFinalLayout = pRenderingInfo->pStencilAttachment->resolveImageLayout,
9102 };
9103 }
9104 }
9105 }
9106 }
9107
9108 if (vrs_info && vrs_info->imageView) {
9109 RADV_FROM_HANDLE(radv_image_view, iview, vrs_info->imageView);
9110 vrs_ref = (VkAttachmentReference2){.sType = VK_STRUCTURE_TYPE_ATTACHMENT_REFERENCE_2,
9111 .attachment = att_count,
9112 .layout = vrs_info->imageLayout,
9113 .aspectMask = iview->vk.aspects};
9114
9115 iviews[att_count] = vrs_info->imageView;
9116 VkAttachmentDescription2 *att = att_desc + att_count++;
9117
9118 memset(att, 0, sizeof(*att));
9119 att->sType = VK_STRUCTURE_TYPE_ATTACHMENT_DESCRIPTION_2;
9120 att->format = iview->vk.format;
9121 att->samples = iview->image->info.samples;
9122 att->loadOp = VK_ATTACHMENT_LOAD_OP_LOAD;
9123 att->storeOp = VK_ATTACHMENT_STORE_OP_DONT_CARE;
9124 att->initialLayout = vrs_info->imageLayout;
9125 att->finalLayout = vrs_info->imageLayout;
9126
9127 vrs_subpass_info = (VkFragmentShadingRateAttachmentInfoKHR){
9128 .sType = VK_STRUCTURE_TYPE_FRAGMENT_SHADING_RATE_ATTACHMENT_INFO_KHR,
9129 .pNext = subpass.pNext,
9130 .pFragmentShadingRateAttachment = &vrs_ref,
9131 .shadingRateAttachmentTexelSize = vrs_info->shadingRateAttachmentTexelSize,
9132 };
9133 subpass.pNext = &vrs_subpass_info;
9134 }
9135
9136 VkRenderPassCreateInfo2 rp_create_info = {
9137 .sType = VK_STRUCTURE_TYPE_RENDER_PASS_CREATE_INFO_2,
9138 .attachmentCount = att_count,
9139 .pAttachments = att_desc,
9140 .subpassCount = 1,
9141 .pSubpasses = &subpass,
9142 };
9143
9144 VkRenderPass rp;
9145 result =
9146 radv_CreateRenderPass2(radv_device_to_handle(cmd_buffer->device), &rp_create_info, NULL, &rp);
9147 if (result != VK_SUCCESS) {
9148 cmd_buffer->record_result = result;
9149 return;
9150 }
9151
9152 unsigned w = pRenderingInfo->renderArea.offset.x + pRenderingInfo->renderArea.extent.width;
9153 unsigned h = pRenderingInfo->renderArea.offset.y + pRenderingInfo->renderArea.extent.height;
9154 for (unsigned i = 0; i < att_count; ++i) {
9155 RADV_FROM_HANDLE(radv_image_view, iview, iviews[i]);
9156
9157 if (vrs_info && vrs_info->imageView == iviews[i])
9158 continue;
9159
9160 w = MIN2(w, iview->extent.width);
9161 h = MIN2(h, iview->extent.height);
9162 }
9163 VkFramebufferCreateInfo fb_create_info = {
9164 .sType = VK_STRUCTURE_TYPE_FRAMEBUFFER_CREATE_INFO,
9165 .renderPass = rp,
9166 .attachmentCount = att_count,
9167 .pAttachments = iviews,
9168 .width = w,
9169 .height = h,
9170 .layers = pRenderingInfo->layerCount,
9171 };
9172
9173 VkFramebuffer fb;
9174 result =
9175 vk_common_CreateFramebuffer(radv_device_to_handle(cmd_buffer->device), &fb_create_info, NULL, &fb);
9176 if (result != VK_SUCCESS) {
9177 radv_DestroyRenderPass(radv_device_to_handle(cmd_buffer->device), rp, NULL);
9178 cmd_buffer->record_result = result;
9179 return;
9180 }
9181
9182 VkRenderPassBeginInfo begin_info = {.sType = VK_STRUCTURE_TYPE_RENDER_PASS_BEGIN_INFO,
9183 .renderPass = rp,
9184 .framebuffer = fb,
9185 .renderArea = pRenderingInfo->renderArea,
9186 .clearValueCount = att_count,
9187 .pClearValues = clear_values};
9188
9189 const VkSubpassBeginInfo pass_begin_info = {
9190 .sType = VK_STRUCTURE_TYPE_SUBPASS_BEGIN_INFO,
9191 .contents = (pRenderingInfo->flags & VK_RENDERING_CONTENTS_SECONDARY_COMMAND_BUFFERS_BIT)
9192 ? VK_SUBPASS_CONTENTS_SECONDARY_COMMAND_BUFFERS
9193 : VK_SUBPASS_CONTENTS_INLINE,
9194 };
9195
9196 radv_CmdBeginRenderPass2(commandBuffer, &begin_info, &pass_begin_info);
9197 }
9198
9199 VKAPI_ATTR void VKAPI_CALL
radv_CmdEndRendering(VkCommandBuffer commandBuffer)9200 radv_CmdEndRendering(VkCommandBuffer commandBuffer)
9201 {
9202 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
9203 struct radv_render_pass *pass = cmd_buffer->state.pass;
9204 struct vk_framebuffer *framebuffer = cmd_buffer->state.framebuffer;
9205
9206 radv_CmdEndRenderPass2(commandBuffer, NULL);
9207
9208 vk_common_DestroyFramebuffer(radv_device_to_handle(cmd_buffer->device),
9209 vk_framebuffer_to_handle(framebuffer), NULL);
9210 radv_DestroyRenderPass(radv_device_to_handle(cmd_buffer->device),
9211 radv_render_pass_to_handle(pass), NULL);
9212 }
9213
9214 /*
9215 * For HTILE we have the following interesting clear words:
9216 * 0xfffff30f: Uncompressed, full depth range, for depth+stencil HTILE
9217 * 0xfffc000f: Uncompressed, full depth range, for depth only HTILE.
9218 * 0xfffffff0: Clear depth to 1.0
9219 * 0x00000000: Clear depth to 0.0
9220 */
/* Initialize the HTILE metadata buffer of a depth/stencil image for a
 * transition out of VK_IMAGE_LAYOUT_UNDEFINED: clear HTILE to its initial
 * value and reset the depth/stencil clear metadata.
 */
static void
radv_initialize_htile(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image,
                      const VkImageSubresourceRange *range)
{
   struct radv_cmd_state *state = &cmd_buffer->state;
   uint32_t htile_value = radv_get_htile_initial_value(cmd_buffer->device, image);
   VkClearDepthStencilValue value = {0};
   struct radv_barrier_data barrier = {0};

   /* Report this metadata initialization as a layout transition. */
   barrier.layout_transitions.init_mask_ram = 1;
   radv_describe_layout_transition(cmd_buffer, &barrier);

   /* When transitioning from LAYOUT_UNDEFINED, not everyone is consistent
    * in considering previous rendering work for WAW hazards, so flush
    * pending depth/stencil writes first. */
   state->flush_bits |=
      radv_src_access_flush(cmd_buffer, VK_ACCESS_2_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT, image);

   if (image->planes[0].surface.has_stencil &&
       !(range->aspectMask == (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT))) {
      /* Flush caches before performing a separate aspect initialization because it's a
       * read-modify-write operation.
       */
      state->flush_bits |= radv_dst_access_flush(cmd_buffer, VK_ACCESS_2_SHADER_READ_BIT, image);
   }

   state->flush_bits |= radv_clear_htile(cmd_buffer, image, range, htile_value);

   /* Reset the recorded fast-clear value to 0 for the affected aspects. */
   radv_set_ds_clear_metadata(cmd_buffer, image, range, value, range->aspectMask);

   if (radv_image_is_tc_compat_htile(image) && (range->aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT)) {
      /* Initialize the TC-compat metadata value to 0 because by
       * default DB_Z_INFO.RANGE_PRECISION is set to 1, and we only
       * have to conditionally update its value when performing
       * a fast depth clear.
       */
      radv_set_tc_compat_zrange_metadata(cmd_buffer, image, range, 0);
   }
}
9259
9260 static void
radv_handle_depth_image_transition(struct radv_cmd_buffer * cmd_buffer,struct radv_image * image,VkImageLayout src_layout,bool src_render_loop,VkImageLayout dst_layout,bool dst_render_loop,unsigned src_queue_mask,unsigned dst_queue_mask,const VkImageSubresourceRange * range,struct radv_sample_locations_state * sample_locs)9261 radv_handle_depth_image_transition(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image,
9262 VkImageLayout src_layout, bool src_render_loop,
9263 VkImageLayout dst_layout, bool dst_render_loop,
9264 unsigned src_queue_mask, unsigned dst_queue_mask,
9265 const VkImageSubresourceRange *range,
9266 struct radv_sample_locations_state *sample_locs)
9267 {
9268 struct radv_device *device = cmd_buffer->device;
9269
9270 if (!radv_htile_enabled(image, range->baseMipLevel))
9271 return;
9272
9273 if (src_layout == VK_IMAGE_LAYOUT_UNDEFINED) {
9274 radv_initialize_htile(cmd_buffer, image, range);
9275 } else if (!radv_layout_is_htile_compressed(device, image, src_layout, src_render_loop,
9276 src_queue_mask) &&
9277 radv_layout_is_htile_compressed(device, image, dst_layout, dst_render_loop,
9278 dst_queue_mask)) {
9279 radv_initialize_htile(cmd_buffer, image, range);
9280 } else if (radv_layout_is_htile_compressed(device, image, src_layout, src_render_loop,
9281 src_queue_mask) &&
9282 !radv_layout_is_htile_compressed(device, image, dst_layout, dst_render_loop,
9283 dst_queue_mask)) {
9284 cmd_buffer->state.flush_bits |=
9285 RADV_CMD_FLAG_FLUSH_AND_INV_DB | RADV_CMD_FLAG_FLUSH_AND_INV_DB_META;
9286
9287 radv_expand_depth_stencil(cmd_buffer, image, range, sample_locs);
9288
9289 cmd_buffer->state.flush_bits |=
9290 RADV_CMD_FLAG_FLUSH_AND_INV_DB | RADV_CMD_FLAG_FLUSH_AND_INV_DB_META;
9291 }
9292 }
9293
9294 static uint32_t
radv_init_cmask(struct radv_cmd_buffer * cmd_buffer,struct radv_image * image,const VkImageSubresourceRange * range,uint32_t value)9295 radv_init_cmask(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image,
9296 const VkImageSubresourceRange *range, uint32_t value)
9297 {
9298 struct radv_barrier_data barrier = {0};
9299
9300 barrier.layout_transitions.init_mask_ram = 1;
9301 radv_describe_layout_transition(cmd_buffer, &barrier);
9302
9303 return radv_clear_cmask(cmd_buffer, image, range, value);
9304 }
9305
9306 uint32_t
radv_init_fmask(struct radv_cmd_buffer * cmd_buffer,struct radv_image * image,const VkImageSubresourceRange * range)9307 radv_init_fmask(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image,
9308 const VkImageSubresourceRange *range)
9309 {
9310 static const uint32_t fmask_clear_values[4] = {0x00000000, 0x02020202, 0xE4E4E4E4, 0x76543210};
9311 uint32_t log2_samples = util_logbase2(image->info.samples);
9312 uint32_t value = fmask_clear_values[log2_samples];
9313 struct radv_barrier_data barrier = {0};
9314
9315 barrier.layout_transitions.init_mask_ram = 1;
9316 radv_describe_layout_transition(cmd_buffer, &barrier);
9317
9318 return radv_clear_fmask(cmd_buffer, image, range, value);
9319 }
9320
/* Initialize the DCC metadata of an image to 'value', recording the
 * operation as a layout transition. On GFX8, mip levels that don't support
 * DCC fast clears are additionally forced to the fully-expanded state.
 * Returns flush bits the caller must OR into the command buffer state.
 */
uint32_t
radv_init_dcc(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image,
              const VkImageSubresourceRange *range, uint32_t value)
{
   struct radv_barrier_data barrier = {0};
   uint32_t flush_bits = 0;
   unsigned size = 0; /* End offset (within the metadata) of fast-clearable DCC. */

   barrier.layout_transitions.init_mask_ram = 1;
   radv_describe_layout_transition(cmd_buffer, &barrier);

   flush_bits |= radv_clear_dcc(cmd_buffer, image, range, value);

   if (cmd_buffer->device->physical_device->rad_info.gfx_level == GFX8) {
      /* When DCC is enabled with mipmaps, some levels might not
       * support fast clears and we have to initialize them as "fully
       * expanded".
       */
      /* Compute the size of all fast clearable DCC levels. */
      for (unsigned i = 0; i < image->planes[0].surface.num_meta_levels; i++) {
         struct legacy_surf_dcc_level *dcc_level = &image->planes[0].surface.u.legacy.color.dcc_level[i];
         unsigned dcc_fast_clear_size =
            dcc_level->dcc_slice_fast_clear_size * image->info.array_size;

         /* The first level with no fast-clear size ends the fast-clearable
          * range; later levels can't be fast cleared either.
          */
         if (!dcc_fast_clear_size)
            break;

         size = dcc_level->dcc_offset + dcc_fast_clear_size;
      }

      /* Initialize the remaining (non-fast-clearable) part of the DCC
       * metadata to 0xffffffff, i.e. fully expanded.
       */
      if (size != image->planes[0].surface.meta_size) {
         flush_bits |= radv_fill_buffer(cmd_buffer, image, image->bindings[0].bo,
                                        radv_buffer_get_va(image->bindings[0].bo) +
                                           image->bindings[0].offset +
                                           image->planes[0].surface.meta_offset + size,
                                        image->planes[0].surface.meta_size - size, 0xffffffff);
      }
   }

   return flush_bits;
}
9363
/**
 * Initialize DCC/FMASK/CMASK metadata for a color image.
 *
 * Used when transitioning out of VK_IMAGE_LAYOUT_UNDEFINED; the clear words
 * are chosen according to whether the destination layout keeps the metadata
 * compressed / fast-clearable.
 */
static void
radv_init_color_image_metadata(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image,
                               VkImageLayout src_layout, bool src_render_loop,
                               VkImageLayout dst_layout, bool dst_render_loop,
                               unsigned src_queue_mask, unsigned dst_queue_mask,
                               const VkImageSubresourceRange *range)
{
   uint32_t flush_bits = 0;

   /* Transitioning from LAYOUT_UNDEFINED layout not everyone is
    * consistent in considering previous rendering work for WAW hazards.
    */
   cmd_buffer->state.flush_bits |=
      radv_src_access_flush(cmd_buffer, VK_ACCESS_2_COLOR_ATTACHMENT_WRITE_BIT, image);

   if (radv_image_has_cmask(image)) {
      uint32_t value;

      if (cmd_buffer->device->physical_device->rad_info.gfx_level == GFX9) {
         /* TODO: Fix clearing CMASK layers on GFX9. */
         if (radv_image_is_tc_compat_cmask(image) ||
             (radv_image_has_fmask(image) &&
              radv_layout_can_fast_clear(cmd_buffer->device, image, range->baseMipLevel, dst_layout,
                                         dst_render_loop, dst_queue_mask))) {
            value = 0xccccccccu;
         } else {
            value = 0xffffffffu;
         }
      } else {
         /* CMASK clear word indexed by log2(sample count). */
         static const uint32_t cmask_clear_values[4] = {0xffffffff, 0xdddddddd, 0xeeeeeeee, 0xffffffff};
         uint32_t log2_samples = util_logbase2(image->info.samples);

         value = cmask_clear_values[log2_samples];
      }

      flush_bits |= radv_init_cmask(cmd_buffer, image, range, value);
   }

   if (radv_image_has_fmask(image)) {
      flush_bits |= radv_init_fmask(cmd_buffer, image, range);
   }

   if (radv_dcc_enabled(image, range->baseMipLevel)) {
      uint32_t value = 0xffffffffu; /* Fully expanded mode. */

      /* Only use the compressed clear word (0) when the destination layout
       * keeps DCC compressed.
       */
      if (radv_layout_dcc_compressed(cmd_buffer->device, image, range->baseMipLevel,
                                     dst_layout, dst_render_loop, dst_queue_mask)) {
         value = 0u;
      }

      flush_bits |= radv_init_dcc(cmd_buffer, image, range, value);
   }

   if (radv_image_has_cmask(image) || radv_dcc_enabled(image, range->baseMipLevel)) {
      /* Reset the fast-clear eliminate state and the stored clear color. */
      radv_update_fce_metadata(cmd_buffer, image, range, false);

      uint32_t color_values[2] = {0};
      radv_set_color_clear_metadata(cmd_buffer, image, range, color_values);
   }

   cmd_buffer->state.flush_bits |= flush_bits;
}
9429
9430 static void
radv_retile_transition(struct radv_cmd_buffer * cmd_buffer,struct radv_image * image,VkImageLayout src_layout,VkImageLayout dst_layout,unsigned dst_queue_mask)9431 radv_retile_transition(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image,
9432 VkImageLayout src_layout, VkImageLayout dst_layout, unsigned dst_queue_mask)
9433 {
9434 /* If the image is read-only, we don't have to retile DCC because it can't change. */
9435 if (!(image->vk.usage & RADV_IMAGE_USAGE_WRITE_BITS))
9436 return;
9437
9438 if (src_layout != VK_IMAGE_LAYOUT_PRESENT_SRC_KHR &&
9439 (dst_layout == VK_IMAGE_LAYOUT_PRESENT_SRC_KHR ||
9440 (dst_queue_mask & (1u << RADV_QUEUE_FOREIGN))))
9441 radv_retile_dcc(cmd_buffer, image);
9442 }
9443
9444 static bool
radv_image_need_retile(const struct radv_image * image)9445 radv_image_need_retile(const struct radv_image *image)
9446 {
9447 return image->planes[0].surface.display_dcc_offset &&
9448 image->planes[0].surface.display_dcc_offset != image->planes[0].surface.meta_offset;
9449 }
9450
/**
 * Handle color image transitions for DCC/FMASK/CMASK.
 *
 * Depending on the source/destination layouts and queue masks, this
 * initializes the metadata (UNDEFINED source), decompresses DCC, flushes
 * pending fast clears in place, and/or expands FMASK so the image is valid
 * for use in the destination layout.
 */
static void
radv_handle_color_image_transition(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image,
                                   VkImageLayout src_layout, bool src_render_loop,
                                   VkImageLayout dst_layout, bool dst_render_loop,
                                   unsigned src_queue_mask, unsigned dst_queue_mask,
                                   const VkImageSubresourceRange *range)
{
   /* Track what already happened so the FMASK expand path below doesn't
    * redo a DCC decompress / fast-clear flush.
    */
   bool dcc_decompressed = false, fast_clear_flushed = false;

   /* Nothing to do for images without any color metadata. */
   if (!radv_image_has_cmask(image) && !radv_image_has_fmask(image) &&
       !radv_dcc_enabled(image, range->baseMipLevel))
      return;

   if (src_layout == VK_IMAGE_LAYOUT_UNDEFINED) {
      radv_init_color_image_metadata(cmd_buffer, image, src_layout, src_render_loop, dst_layout,
                                     dst_render_loop, src_queue_mask, dst_queue_mask, range);

      if (radv_image_need_retile(image))
         radv_retile_transition(cmd_buffer, image, src_layout, dst_layout, dst_queue_mask);
      return;
   }

   if (radv_dcc_enabled(image, range->baseMipLevel)) {
      if (src_layout == VK_IMAGE_LAYOUT_PREINITIALIZED) {
         /* App-provided contents: initialize DCC to fully expanded. */
         cmd_buffer->state.flush_bits |= radv_init_dcc(cmd_buffer, image, range, 0xffffffffu);
      } else if (radv_layout_dcc_compressed(cmd_buffer->device, image, range->baseMipLevel,
                                            src_layout, src_render_loop, src_queue_mask) &&
                 !radv_layout_dcc_compressed(cmd_buffer->device, image, range->baseMipLevel,
                                             dst_layout, dst_render_loop, dst_queue_mask)) {
         /* Leaving a DCC-compressed layout: full decompress required. */
         radv_decompress_dcc(cmd_buffer, image, range);
         dcc_decompressed = true;
      } else if (radv_layout_can_fast_clear(cmd_buffer->device, image, range->baseMipLevel,
                                            src_layout, src_render_loop, src_queue_mask) &&
                 !radv_layout_can_fast_clear(cmd_buffer->device, image, range->baseMipLevel,
                                             dst_layout, dst_render_loop, dst_queue_mask)) {
         /* Leaving a fast-clearable layout: eliminate pending fast clears. */
         radv_fast_clear_flush_image_inplace(cmd_buffer, image, range);
         fast_clear_flushed = true;
      }

      if (radv_image_need_retile(image))
         radv_retile_transition(cmd_buffer, image, src_layout, dst_layout, dst_queue_mask);
   } else if (radv_image_has_cmask(image) || radv_image_has_fmask(image)) {
      if (radv_layout_can_fast_clear(cmd_buffer->device, image, range->baseMipLevel,
                                     src_layout, src_render_loop, src_queue_mask) &&
          !radv_layout_can_fast_clear(cmd_buffer->device, image, range->baseMipLevel,
                                      dst_layout, dst_render_loop, dst_queue_mask)) {
         /* Leaving a fast-clearable layout: eliminate pending fast clears. */
         radv_fast_clear_flush_image_inplace(cmd_buffer, image, range);
         fast_clear_flushed = true;
      }
   }

   /* MSAA color decompress. */
   if (radv_image_has_fmask(image) &&
       (image->vk.usage & (VK_IMAGE_USAGE_STORAGE_BIT | VK_IMAGE_USAGE_TRANSFER_DST_BIT)) &&
       radv_layout_fmask_compressed(cmd_buffer->device, image, src_layout, src_queue_mask) &&
       !radv_layout_fmask_compressed(cmd_buffer->device, image, dst_layout, dst_queue_mask)) {
      if (radv_dcc_enabled(image, range->baseMipLevel) &&
          !radv_image_use_dcc_image_stores(cmd_buffer->device, image) && !dcc_decompressed) {
         /* A DCC decompress is required before expanding FMASK
          * when DCC stores aren't supported to avoid being in
          * a state where DCC is compressed and the main
          * surface is uncompressed.
          */
         radv_decompress_dcc(cmd_buffer, image, range);
      } else if (!fast_clear_flushed) {
         /* A FMASK decompress is required before expanding
          * FMASK.
          */
         radv_fast_clear_flush_image_inplace(cmd_buffer, image, range);
      }

      struct radv_barrier_data barrier = {0};
      barrier.layout_transitions.fmask_color_expand = 1;
      radv_describe_layout_transition(cmd_buffer, &barrier);

      radv_expand_fmask_image_inplace(cmd_buffer, image, range);
   }
}
9532
/**
 * Perform a layout transition for one image subresource range, dispatching
 * to the depth or color handler.
 *
 * For exclusive-mode queue family ownership transfers (acquire/release),
 * the transition must only be executed once, on the most capable queue
 * involved; the other side (and external/foreign sources) returns early.
 */
static void
radv_handle_image_transition(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image,
                             VkImageLayout src_layout, bool src_render_loop,
                             VkImageLayout dst_layout, bool dst_render_loop, uint32_t src_family_index,
                             uint32_t dst_family_index, const VkImageSubresourceRange *range,
                             struct radv_sample_locations_state *sample_locs)
{
   enum radv_queue_family src_qf = vk_queue_to_radv(cmd_buffer->device->physical_device, src_family_index);
   enum radv_queue_family dst_qf = vk_queue_to_radv(cmd_buffer->device->physical_device, dst_family_index);
   if (image->exclusive && src_family_index != dst_family_index) {
      /* This is an acquire or a release operation and there will be
       * a corresponding release/acquire. Do the transition in the
       * most flexible queue. */

      assert(src_qf == cmd_buffer->qf ||
             dst_qf == cmd_buffer->qf);

      /* Acquires from external/foreign sources are handled elsewhere. */
      if (src_family_index == VK_QUEUE_FAMILY_EXTERNAL || src_family_index == VK_QUEUE_FAMILY_FOREIGN_EXT)
         return;

      if (cmd_buffer->qf == RADV_QUEUE_TRANSFER)
         return;

      /* Prefer the general queue side of a compute<->general transfer. */
      if (cmd_buffer->qf == RADV_QUEUE_COMPUTE &&
          (src_qf == RADV_QUEUE_GENERAL || dst_qf == RADV_QUEUE_GENERAL))
         return;
   }

   unsigned src_queue_mask =
      radv_image_queue_family_mask(image, src_qf, cmd_buffer->qf);
   unsigned dst_queue_mask =
      radv_image_queue_family_mask(image, dst_qf, cmd_buffer->qf);

   /* No-op transition: nothing relevant changes. */
   if (src_layout == dst_layout && src_render_loop == dst_render_loop && src_queue_mask == dst_queue_mask)
      return;

   /* Depth/stencil images use HTILE; color images use DCC/FMASK/CMASK. */
   if (vk_format_has_depth(image->vk.format)) {
      radv_handle_depth_image_transition(cmd_buffer, image, src_layout, src_render_loop, dst_layout,
                                         dst_render_loop, src_queue_mask, dst_queue_mask, range,
                                         sample_locs);
   } else {
      radv_handle_color_image_transition(cmd_buffer, image, src_layout, src_render_loop, dst_layout,
                                         dst_render_loop, src_queue_mask, dst_queue_mask, range);
   }
}
9578
9579 static void
radv_cp_dma_wait_for_stages(struct radv_cmd_buffer * cmd_buffer,VkPipelineStageFlags2 stage_mask)9580 radv_cp_dma_wait_for_stages(struct radv_cmd_buffer *cmd_buffer, VkPipelineStageFlags2 stage_mask)
9581 {
9582 /* Make sure CP DMA is idle because the driver might have performed a DMA operation for copying a
9583 * buffer (or a MSAA image using FMASK). Note that updating a buffer is considered a clear
9584 * operation but it might also use a CP DMA copy in some rare situations. Other operations using
9585 * a CP DMA clear are implicitly synchronized (see CP_DMA_SYNC).
9586 */
9587 if (stage_mask & (VK_PIPELINE_STAGE_2_COPY_BIT | VK_PIPELINE_STAGE_2_CLEAR_BIT |
9588 VK_PIPELINE_STAGE_2_ALL_TRANSFER_BIT | VK_PIPELINE_STAGE_2_BOTTOM_OF_PIPE_BIT |
9589 VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT))
9590 si_cp_dma_wait_for_idle(cmd_buffer);
9591 }
9592
/**
 * Common implementation behind the pipeline barrier / event wait entry
 * points: accumulate stage and access masks from all barriers in
 * 'dep_info', flush accordingly, and execute the layout transition of each
 * image barrier. 'reason' tags the barrier for RGP/SQTT profiling.
 */
static void
radv_barrier(struct radv_cmd_buffer *cmd_buffer, const VkDependencyInfo *dep_info,
             enum rgp_barrier_reason reason)
{
   enum radv_cmd_flush_bits src_flush_bits = 0;
   enum radv_cmd_flush_bits dst_flush_bits = 0;
   VkPipelineStageFlags2 src_stage_mask = 0;
   VkPipelineStageFlags2 dst_stage_mask = 0;

   if (cmd_buffer->state.subpass)
      radv_mark_noncoherent_rb(cmd_buffer);

   radv_describe_barrier_start(cmd_buffer, reason);

   /* Gather stage/flush masks from all three barrier arrays. */
   for (uint32_t i = 0; i < dep_info->memoryBarrierCount; i++) {
      src_stage_mask |= dep_info->pMemoryBarriers[i].srcStageMask;
      src_flush_bits |=
         radv_src_access_flush(cmd_buffer, dep_info->pMemoryBarriers[i].srcAccessMask, NULL);
      dst_stage_mask |= dep_info->pMemoryBarriers[i].dstStageMask;
      dst_flush_bits |=
         radv_dst_access_flush(cmd_buffer, dep_info->pMemoryBarriers[i].dstAccessMask, NULL);
   }

   for (uint32_t i = 0; i < dep_info->bufferMemoryBarrierCount; i++) {
      src_stage_mask |= dep_info->pBufferMemoryBarriers[i].srcStageMask;
      src_flush_bits |=
         radv_src_access_flush(cmd_buffer, dep_info->pBufferMemoryBarriers[i].srcAccessMask, NULL);
      dst_stage_mask |= dep_info->pBufferMemoryBarriers[i].dstStageMask;
      dst_flush_bits |=
         radv_dst_access_flush(cmd_buffer, dep_info->pBufferMemoryBarriers[i].dstAccessMask, NULL);
   }

   for (uint32_t i = 0; i < dep_info->imageMemoryBarrierCount; i++) {
      RADV_FROM_HANDLE(radv_image, image, dep_info->pImageMemoryBarriers[i].image);

      src_stage_mask |= dep_info->pImageMemoryBarriers[i].srcStageMask;
      src_flush_bits |=
         radv_src_access_flush(cmd_buffer, dep_info->pImageMemoryBarriers[i].srcAccessMask, image);
      dst_stage_mask |= dep_info->pImageMemoryBarriers[i].dstStageMask;
      dst_flush_bits |=
         radv_dst_access_flush(cmd_buffer, dep_info->pImageMemoryBarriers[i].dstAccessMask, image);
   }

   /* The Vulkan spec 1.1.98 says:
    *
    * "An execution dependency with only
    * VK_PIPELINE_STAGE_2_BOTTOM_OF_PIPE_BIT in the destination stage mask
    * will only prevent that stage from executing in subsequently
    * submitted commands. As this stage does not perform any actual
    * execution, this is not observable - in effect, it does not delay
    * processing of subsequent commands. Similarly an execution dependency
    * with only VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT in the source stage mask
    * will effectively not wait for any prior commands to complete."
    */
   if (dst_stage_mask != VK_PIPELINE_STAGE_2_BOTTOM_OF_PIPE_BIT)
      radv_stage_flush(cmd_buffer, src_stage_mask);
   cmd_buffer->state.flush_bits |= src_flush_bits;

   radv_ace_internal_barrier(cmd_buffer, src_stage_mask, 0);

   /* Image layout transitions, after the source-side flushes are queued. */
   for (uint32_t i = 0; i < dep_info->imageMemoryBarrierCount; i++) {
      RADV_FROM_HANDLE(radv_image, image, dep_info->pImageMemoryBarriers[i].image);

      /* Custom sample locations may be attached to a depth barrier. */
      const struct VkSampleLocationsInfoEXT *sample_locs_info =
         vk_find_struct_const(dep_info->pImageMemoryBarriers[i].pNext, SAMPLE_LOCATIONS_INFO_EXT);
      struct radv_sample_locations_state sample_locations;

      if (sample_locs_info) {
         assert(image->vk.create_flags & VK_IMAGE_CREATE_SAMPLE_LOCATIONS_COMPATIBLE_DEPTH_BIT_EXT);
         sample_locations.per_pixel = sample_locs_info->sampleLocationsPerPixel;
         sample_locations.grid_size = sample_locs_info->sampleLocationGridSize;
         sample_locations.count = sample_locs_info->sampleLocationsCount;
         typed_memcpy(&sample_locations.locations[0], sample_locs_info->pSampleLocations,
                      sample_locs_info->sampleLocationsCount);
      }

      radv_handle_image_transition(
         cmd_buffer, image, dep_info->pImageMemoryBarriers[i].oldLayout,
         false, /* Outside of a renderpass we are never in a renderloop */
         dep_info->pImageMemoryBarriers[i].newLayout,
         false, /* Outside of a renderpass we are never in a renderloop */
         dep_info->pImageMemoryBarriers[i].srcQueueFamilyIndex,
         dep_info->pImageMemoryBarriers[i].dstQueueFamilyIndex,
         &dep_info->pImageMemoryBarriers[i].subresourceRange, sample_locs_info ? &sample_locations : NULL);
   }

   radv_ace_internal_barrier(cmd_buffer, 0, dst_stage_mask);
   radv_cp_dma_wait_for_stages(cmd_buffer, src_stage_mask);

   cmd_buffer->state.flush_bits |= dst_flush_bits;

   radv_describe_barrier_end(cmd_buffer);
}
9686
9687 VKAPI_ATTR void VKAPI_CALL
radv_CmdPipelineBarrier2(VkCommandBuffer commandBuffer,const VkDependencyInfo * pDependencyInfo)9688 radv_CmdPipelineBarrier2(VkCommandBuffer commandBuffer,
9689 const VkDependencyInfo *pDependencyInfo)
9690 {
9691 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
9692
9693 radv_barrier(cmd_buffer, pDependencyInfo, RGP_BARRIER_EXTERNAL_CMD_PIPELINE_BARRIER);
9694 }
9695
/**
 * Emit packets that write 'value' to the event's BO once all work in
 * 'stageMask' has completed. The cheapest signaling mechanism that still
 * satisfies the stage mask is chosen: a PFP write, an ME write, or an
 * end-of-pipe (EOP) event.
 */
static void
write_event(struct radv_cmd_buffer *cmd_buffer, struct radv_event *event,
            VkPipelineStageFlags2 stageMask, unsigned value)
{
   struct radeon_cmdbuf *cs = cmd_buffer->cs;
   uint64_t va = radv_buffer_get_va(event->bo);

   si_emit_cache_flush(cmd_buffer);

   radv_cs_add_buffer(cmd_buffer->device->ws, cs, event->bo);

   ASSERTED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cs, 28);

   if (stageMask & (VK_PIPELINE_STAGE_2_COPY_BIT |
                    VK_PIPELINE_STAGE_2_RESOLVE_BIT |
                    VK_PIPELINE_STAGE_2_BLIT_BIT |
                    VK_PIPELINE_STAGE_2_CLEAR_BIT)) {
      /* Be conservative for now. */
      stageMask |= VK_PIPELINE_STAGE_2_ALL_TRANSFER_BIT;
   }

   /* Flags that only require a top-of-pipe event. */
   VkPipelineStageFlags2 top_of_pipe_flags = VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT;

   /* Flags that only require a post-index-fetch event. */
   VkPipelineStageFlags2 post_index_fetch_flags =
      top_of_pipe_flags | VK_PIPELINE_STAGE_2_DRAW_INDIRECT_BIT | VK_PIPELINE_STAGE_2_VERTEX_INPUT_BIT;

   /* Flags that only require signaling post PS. */
   VkPipelineStageFlags2 post_ps_flags =
      post_index_fetch_flags | VK_PIPELINE_STAGE_2_VERTEX_SHADER_BIT |
      VK_PIPELINE_STAGE_2_TESSELLATION_CONTROL_SHADER_BIT |
      VK_PIPELINE_STAGE_2_TESSELLATION_EVALUATION_SHADER_BIT | VK_PIPELINE_STAGE_2_GEOMETRY_SHADER_BIT |
      VK_PIPELINE_STAGE_2_MESH_SHADER_BIT_NV |
      VK_PIPELINE_STAGE_2_TRANSFORM_FEEDBACK_BIT_EXT |
      VK_PIPELINE_STAGE_2_PRE_RASTERIZATION_SHADERS_BIT |
      VK_PIPELINE_STAGE_2_FRAGMENT_SHADING_RATE_ATTACHMENT_BIT_KHR |
      VK_PIPELINE_STAGE_2_EARLY_FRAGMENT_TESTS_BIT | VK_PIPELINE_STAGE_2_FRAGMENT_SHADER_BIT;

   /* Flags that only require signaling post CS. */
   VkPipelineStageFlags2 post_cs_flags = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT;

   radv_cp_dma_wait_for_stages(cmd_buffer, stageMask);

   if (!(stageMask & ~top_of_pipe_flags)) {
      /* Just need to sync the PFP engine. */
      radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 3, 0));
      radeon_emit(cs, S_370_DST_SEL(V_370_MEM) | S_370_WR_CONFIRM(1) | S_370_ENGINE_SEL(V_370_PFP));
      radeon_emit(cs, va);
      radeon_emit(cs, va >> 32);
      radeon_emit(cs, value);
   } else if (!(stageMask & ~post_index_fetch_flags)) {
      /* Sync ME because PFP reads index and indirect buffers. */
      radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 3, 0));
      radeon_emit(cs, S_370_DST_SEL(V_370_MEM) | S_370_WR_CONFIRM(1) | S_370_ENGINE_SEL(V_370_ME));
      radeon_emit(cs, va);
      radeon_emit(cs, va >> 32);
      radeon_emit(cs, value);
   } else {
      unsigned event_type;

      if (!(stageMask & ~post_ps_flags)) {
         /* Sync previous fragment shaders. */
         event_type = V_028A90_PS_DONE;
      } else if (!(stageMask & ~post_cs_flags)) {
         /* Sync previous compute shaders. */
         event_type = V_028A90_CS_DONE;
      } else {
         /* Otherwise, sync all prior GPU work. */
         event_type = V_028A90_BOTTOM_OF_PIPE_TS;
      }

      si_cs_emit_write_event_eop(cs, cmd_buffer->device->physical_device->rad_info.gfx_level,
                                 radv_cmd_buffer_uses_mec(cmd_buffer), event_type, 0,
                                 EOP_DST_SEL_MEM, EOP_DATA_SEL_VALUE_32BIT, va, value,
                                 cmd_buffer->gfx9_eop_bug_va);
   }

   assert(cmd_buffer->cs->cdw <= cdw_max);
}
9776
9777 VKAPI_ATTR void VKAPI_CALL
radv_CmdSetEvent2(VkCommandBuffer commandBuffer,VkEvent _event,const VkDependencyInfo * pDependencyInfo)9778 radv_CmdSetEvent2(VkCommandBuffer commandBuffer, VkEvent _event,
9779 const VkDependencyInfo* pDependencyInfo)
9780 {
9781 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
9782 RADV_FROM_HANDLE(radv_event, event, _event);
9783 VkPipelineStageFlags2 src_stage_mask = 0;
9784
9785 for (uint32_t i = 0; i < pDependencyInfo->memoryBarrierCount; i++)
9786 src_stage_mask |= pDependencyInfo->pMemoryBarriers[i].srcStageMask;
9787 for (uint32_t i = 0; i < pDependencyInfo->bufferMemoryBarrierCount; i++)
9788 src_stage_mask |= pDependencyInfo->pBufferMemoryBarriers[i].srcStageMask;
9789 for (uint32_t i = 0; i < pDependencyInfo->imageMemoryBarrierCount; i++)
9790 src_stage_mask |= pDependencyInfo->pImageMemoryBarriers[i].srcStageMask;
9791
9792 write_event(cmd_buffer, event, src_stage_mask, 1);
9793 }
9794
VKAPI_ATTR void VKAPI_CALL
radv_CmdResetEvent2(VkCommandBuffer commandBuffer, VkEvent _event,
                    VkPipelineStageFlags2 stageMask)
{
   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   RADV_FROM_HANDLE(radv_event, event, _event);

   /* Write 0 ("unset") into the event BO once all work in stageMask has
    * completed; write_event() picks the cheapest signaling mechanism for
    * the given stages.
    */
   write_event(cmd_buffer, event, stageMask, 0);
}
9804
VKAPI_ATTR void VKAPI_CALL
radv_CmdWaitEvents2(VkCommandBuffer commandBuffer, uint32_t eventCount, const VkEvent *pEvents,
                    const VkDependencyInfo* pDependencyInfos)
{
   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   struct radeon_cmdbuf *cs = cmd_buffer->cs;

   /* Make the CP busy-wait until each event BO holds 1, i.e. the value
    * written by radv_CmdSetEvent2() / write_event(..., 1).
    */
   for (unsigned i = 0; i < eventCount; ++i) {
      RADV_FROM_HANDLE(radv_event, event, pEvents[i]);
      uint64_t va = radv_buffer_get_va(event->bo);

      radv_cs_add_buffer(cmd_buffer->device->ws, cs, event->bo);

      ASSERTED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cs, 7);

      radv_cp_wait_mem(cs, WAIT_REG_MEM_EQUAL, va, 1, 0xffffffff);
      assert(cmd_buffer->cs->cdw <= cdw_max);
   }

   /* After the waits, apply the cache flushes/invalidations and layout
    * transitions described by the dependency infos.
    */
   radv_barrier(cmd_buffer, pDependencyInfos, RGP_BARRIER_EXTERNAL_CMD_WAIT_EVENTS);
}
9826
VKAPI_ATTR void VKAPI_CALL
radv_CmdSetDeviceMask(VkCommandBuffer commandBuffer, uint32_t deviceMask)
{
   /* RADV only exposes single-GPU device groups, so the only valid device
    * mask is 0x1 and there is nothing to record.
    */
   /* No-op */
}
9832
9833 /* VK_EXT_conditional_rendering */
VKAPI_ATTR void VKAPI_CALL
radv_CmdBeginConditionalRenderingEXT(
   VkCommandBuffer commandBuffer,
   const VkConditionalRenderingBeginInfoEXT *pConditionalRenderingBegin)
{
   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   RADV_FROM_HANDLE(radv_buffer, buffer, pConditionalRenderingBegin->buffer);
   struct radeon_cmdbuf *cs = cmd_buffer->cs;
   unsigned pred_op = PREDICATION_OP_BOOL32;
   bool draw_visible = true;
   uint64_t va;

   /* GPU address of the application's 32-bit predicate value. */
   va = radv_buffer_get_va(buffer->bo) + buffer->offset + pConditionalRenderingBegin->offset;

   /* By default, if the 32-bit value at offset in buffer memory is zero,
    * then the rendering commands are discarded, otherwise they are
    * executed as normal. If the inverted flag is set, all commands are
    * discarded if the value is non zero.
    */
   if (pConditionalRenderingBegin->flags & VK_CONDITIONAL_RENDERING_INVERTED_BIT_EXT) {
      draw_visible = false;
   }

   /* Flush caches so the CP sees the predicate value the app wrote. */
   si_emit_cache_flush(cmd_buffer);

   if (cmd_buffer->qf == RADV_QUEUE_GENERAL &&
       !cmd_buffer->device->physical_device->rad_info.has_32bit_predication) {
      uint64_t pred_value = 0, pred_va;
      unsigned pred_offset;

      /* From the Vulkan spec 1.1.107:
       *
       * "If the 32-bit value at offset in buffer memory is zero,
       *  then the rendering commands are discarded, otherwise they
       *  are executed as normal. If the value of the predicate in
       *  buffer memory changes while conditional rendering is
       *  active, the rendering commands may be discarded in an
       *  implementation-dependent way. Some implementations may
       *  latch the value of the predicate upon beginning conditional
       *  rendering while others may read it before every rendering
       *  command."
       *
       * But, the AMD hardware treats the predicate as a 64-bit
       * value which means we need a workaround in the driver.
       * Luckily, it's not required to support if the value changes
       * when predication is active.
       *
       * The workaround is as follows:
       * 1) allocate a 64-value in the upload BO and initialize it
       *    to 0
       * 2) copy the 32-bit predicate value to the upload BO
       * 3) use the new allocated VA address for predication
       *
       * Based on the conditionalrender demo, it's faster to do the
       * COPY_DATA in ME  (+ sync PFP) instead of PFP.
       */
      radv_cmd_buffer_upload_data(cmd_buffer, 8, &pred_value, &pred_offset);

      pred_va = radv_buffer_get_va(cmd_buffer->upload.upload_bo) + pred_offset;

      /* Copy the app's 32-bit predicate into the zero-initialized 64-bit
       * slot; the upper 32 bits stay 0 so the 64-bit compare works.
       */
      radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
      radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_SRC_MEM) | COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) |
                         COPY_DATA_WR_CONFIRM);
      radeon_emit(cs, va);
      radeon_emit(cs, va >> 32);
      radeon_emit(cs, pred_va);
      radeon_emit(cs, pred_va >> 32);

      /* Stall the PFP until the ME has finished the copy above, since the
       * PFP is what fetches the predicate.
       */
      radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0));
      radeon_emit(cs, 0);

      va = pred_va;
      pred_op = PREDICATION_OP_BOOL64;
   }

   /* MEC doesn't support predication, we emulate it elsewhere. */
   if (!radv_cmd_buffer_uses_mec(cmd_buffer)) {
      si_emit_set_predication_state(cmd_buffer, draw_visible, pred_op, va);
   }

   /* Store conditional rendering user info. */
   cmd_buffer->state.predicating = true;
   cmd_buffer->state.predication_type = draw_visible;
   cmd_buffer->state.predication_op = pred_op;
   cmd_buffer->state.predication_va = va;
   cmd_buffer->mec_inv_pred_emitted = false;
}
9921
9922 VKAPI_ATTR void VKAPI_CALL
radv_CmdEndConditionalRenderingEXT(VkCommandBuffer commandBuffer)9923 radv_CmdEndConditionalRenderingEXT(VkCommandBuffer commandBuffer)
9924 {
9925 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
9926
9927 /* MEC doesn't support predication, no need to emit anything here. */
9928 if (!radv_cmd_buffer_uses_mec(cmd_buffer)) {
9929 si_emit_set_predication_state(cmd_buffer, false, 0, 0);
9930 }
9931
9932 /* Reset conditional rendering user info. */
9933 cmd_buffer->state.predicating = false;
9934 cmd_buffer->state.predication_type = -1;
9935 cmd_buffer->state.predication_op = 0;
9936 cmd_buffer->state.predication_va = 0;
9937 cmd_buffer->mec_inv_pred_emitted = false;
9938 }
9939
9940 /* VK_EXT_transform_feedback */
9941 VKAPI_ATTR void VKAPI_CALL
radv_CmdBindTransformFeedbackBuffersEXT(VkCommandBuffer commandBuffer,uint32_t firstBinding,uint32_t bindingCount,const VkBuffer * pBuffers,const VkDeviceSize * pOffsets,const VkDeviceSize * pSizes)9942 radv_CmdBindTransformFeedbackBuffersEXT(VkCommandBuffer commandBuffer, uint32_t firstBinding,
9943 uint32_t bindingCount, const VkBuffer *pBuffers,
9944 const VkDeviceSize *pOffsets, const VkDeviceSize *pSizes)
9945 {
9946 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
9947 struct radv_streamout_binding *sb = cmd_buffer->streamout_bindings;
9948 uint8_t enabled_mask = 0;
9949
9950 assert(firstBinding + bindingCount <= MAX_SO_BUFFERS);
9951 for (uint32_t i = 0; i < bindingCount; i++) {
9952 uint32_t idx = firstBinding + i;
9953
9954 sb[idx].buffer = radv_buffer_from_handle(pBuffers[i]);
9955 sb[idx].offset = pOffsets[i];
9956
9957 if (!pSizes || pSizes[i] == VK_WHOLE_SIZE) {
9958 sb[idx].size = sb[idx].buffer->vk.size - sb[idx].offset;
9959 } else {
9960 sb[idx].size = pSizes[i];
9961 }
9962
9963 radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, sb[idx].buffer->bo);
9964
9965 enabled_mask |= 1 << idx;
9966 }
9967
9968 cmd_buffer->state.streamout.enabled_mask |= enabled_mask;
9969
9970 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_STREAMOUT_BUFFER;
9971 }
9972
9973 bool
radv_is_streamout_enabled(struct radv_cmd_buffer * cmd_buffer)9974 radv_is_streamout_enabled(struct radv_cmd_buffer *cmd_buffer)
9975 {
9976 struct radv_streamout_state *so = &cmd_buffer->state.streamout;
9977
9978 /* Streamout must be enabled for the PRIMITIVES_GENERATED query to work. */
9979 return (so->streamout_enabled || cmd_buffer->state.prims_gen_query_enabled) &&
9980 !cmd_buffer->state.suspend_streamout;
9981 }
9982
9983 void
radv_emit_streamout_enable(struct radv_cmd_buffer * cmd_buffer)9984 radv_emit_streamout_enable(struct radv_cmd_buffer *cmd_buffer)
9985 {
9986 struct radv_streamout_state *so = &cmd_buffer->state.streamout;
9987 struct radv_graphics_pipeline *pipeline = cmd_buffer->state.graphics_pipeline;
9988 bool streamout_enabled = radv_is_streamout_enabled(cmd_buffer);
9989 struct radeon_cmdbuf *cs = cmd_buffer->cs;
9990 uint32_t enabled_stream_buffers_mask = 0;
9991
9992 if (pipeline && pipeline->streamout_shader) {
9993 enabled_stream_buffers_mask = pipeline->streamout_shader->info.so.enabled_stream_buffers_mask;
9994 }
9995
9996 radeon_set_context_reg_seq(cs, R_028B94_VGT_STRMOUT_CONFIG, 2);
9997 radeon_emit(cs, S_028B94_STREAMOUT_0_EN(streamout_enabled) | S_028B94_RAST_STREAM(0) |
9998 S_028B94_STREAMOUT_1_EN(streamout_enabled) |
9999 S_028B94_STREAMOUT_2_EN(streamout_enabled) |
10000 S_028B94_STREAMOUT_3_EN(streamout_enabled));
10001 radeon_emit(cs, so->hw_enabled_mask & enabled_stream_buffers_mask);
10002
10003 cmd_buffer->state.context_roll_without_scissor_emitted = true;
10004 }
10005
10006 static void
radv_set_streamout_enable(struct radv_cmd_buffer * cmd_buffer,bool enable)10007 radv_set_streamout_enable(struct radv_cmd_buffer *cmd_buffer, bool enable)
10008 {
10009 struct radv_streamout_state *so = &cmd_buffer->state.streamout;
10010 bool old_streamout_enabled = radv_is_streamout_enabled(cmd_buffer);
10011 uint32_t old_hw_enabled_mask = so->hw_enabled_mask;
10012
10013 so->streamout_enabled = enable;
10014
10015 so->hw_enabled_mask = so->enabled_mask | (so->enabled_mask << 4) | (so->enabled_mask << 8) |
10016 (so->enabled_mask << 12);
10017
10018 if (!cmd_buffer->device->physical_device->use_ngg_streamout &&
10019 ((old_streamout_enabled != radv_is_streamout_enabled(cmd_buffer)) ||
10020 (old_hw_enabled_mask != so->hw_enabled_mask)))
10021 radv_emit_streamout_enable(cmd_buffer);
10022
10023 if (cmd_buffer->device->physical_device->use_ngg_streamout) {
10024 cmd_buffer->gds_needed = true;
10025 cmd_buffer->gds_oa_needed = true;
10026 }
10027 }
10028
/* Make the VGT flush its streamout offsets and wait until the update is
 * visible, so the buffer offsets can be safely read or rewritten.
 */
static void
radv_flush_vgt_streamout(struct radv_cmd_buffer *cmd_buffer)
{
   struct radeon_cmdbuf *cs = cmd_buffer->cs;
   unsigned reg_strmout_cntl;

   /* The register is at different places on different ASICs. */
   if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX9) {
      /* GFX9+: the register is only reachable via WRITE_DATA on the ME. */
      reg_strmout_cntl = R_0300FC_CP_STRMOUT_CNTL;
      radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 3, 0));
      radeon_emit(cs, S_370_DST_SEL(V_370_MEM_MAPPED_REGISTER) | S_370_ENGINE_SEL(V_370_ME));
      radeon_emit(cs, R_0300FC_CP_STRMOUT_CNTL >> 2);
      radeon_emit(cs, 0);
      radeon_emit(cs, 0);
   } else if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX7) {
      reg_strmout_cntl = R_0300FC_CP_STRMOUT_CNTL;
      radeon_set_uconfig_reg(cs, reg_strmout_cntl, 0);
   } else {
      reg_strmout_cntl = R_0084FC_CP_STRMOUT_CNTL;
      radeon_set_config_reg(cs, reg_strmout_cntl, 0);
   }

   /* Ask the VGT to flush the streamout offsets; completion is signaled by
    * the CP setting OFFSET_UPDATE_DONE in CP_STRMOUT_CNTL.
    */
   radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
   radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_SO_VGTSTREAMOUT_FLUSH) | EVENT_INDEX(0));

   radeon_emit(cs, PKT3(PKT3_WAIT_REG_MEM, 5, 0));
   radeon_emit(cs,
               WAIT_REG_MEM_EQUAL); /* wait until the register is equal to the reference value */
   radeon_emit(cs, reg_strmout_cntl >> 2); /* register */
   radeon_emit(cs, 0);
   radeon_emit(cs, S_0084FC_OFFSET_UPDATE_DONE(1)); /* reference value */
   radeon_emit(cs, S_0084FC_OFFSET_UPDATE_DONE(1)); /* mask */
   radeon_emit(cs, 4);                              /* poll interval */
}
10063
/* Legacy (pre-NGG) streamout begin: program the per-buffer size/stride
 * registers and load each buffer's filled-size counter, either from the
 * optional counter buffer (append) or from zero.
 */
static void
radv_emit_streamout_begin(struct radv_cmd_buffer *cmd_buffer, uint32_t firstCounterBuffer,
                          uint32_t counterBufferCount, const VkBuffer *pCounterBuffers,
                          const VkDeviceSize *pCounterBufferOffsets)

{
   struct radv_streamout_binding *sb = cmd_buffer->streamout_bindings;
   struct radv_streamout_state *so = &cmd_buffer->state.streamout;
   struct radv_graphics_pipeline *pipeline = cmd_buffer->state.graphics_pipeline;
   struct radv_shader_info *info = &pipeline->streamout_shader->info;
   struct radeon_cmdbuf *cs = cmd_buffer->cs;

   /* Offsets must be idle before they can be reloaded. */
   radv_flush_vgt_streamout(cmd_buffer);

   assert(firstCounterBuffer + counterBufferCount <= MAX_SO_BUFFERS);
   u_foreach_bit(i, so->enabled_mask)
   {
      /* Map target i to its entry in the counter-buffer array; -1 when the
       * app didn't provide one for this target.
       */
      int32_t counter_buffer_idx = i - firstCounterBuffer;
      if (counter_buffer_idx >= 0 && counter_buffer_idx >= counterBufferCount)
         counter_buffer_idx = -1;

      /* AMD GCN binds streamout buffers as shader resources.
       * VGT only counts primitives and tells the shader through
       * SGPRs what to do.
       */
      radeon_set_context_reg_seq(cs, R_028AD0_VGT_STRMOUT_BUFFER_SIZE_0 + 16 * i, 2);
      radeon_emit(cs, sb[i].size >> 2);     /* BUFFER_SIZE (in DW) */
      radeon_emit(cs, info->so.strides[i]); /* VTX_STRIDE (in DW) */

      cmd_buffer->state.context_roll_without_scissor_emitted = true;

      if (counter_buffer_idx >= 0 && pCounterBuffers && pCounterBuffers[counter_buffer_idx]) {
         /* The array of counter buffers is optional. */
         RADV_FROM_HANDLE(radv_buffer, buffer, pCounterBuffers[counter_buffer_idx]);
         uint64_t va = radv_buffer_get_va(buffer->bo);
         uint64_t counter_buffer_offset = 0;

         if (pCounterBufferOffsets)
            counter_buffer_offset = pCounterBufferOffsets[counter_buffer_idx];

         va += buffer->offset + counter_buffer_offset;

         /* Append */
         radeon_emit(cs, PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0));
         radeon_emit(cs, STRMOUT_SELECT_BUFFER(i) | STRMOUT_DATA_TYPE(1) |   /* offset in bytes */
                            STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_FROM_MEM)); /* control */
         radeon_emit(cs, 0);                                                 /* unused */
         radeon_emit(cs, 0);                                                 /* unused */
         radeon_emit(cs, va);                                                /* src address lo */
         radeon_emit(cs, va >> 32);                                          /* src address hi */

         radv_cs_add_buffer(cmd_buffer->device->ws, cs, buffer->bo);
      } else {
         /* Start from the beginning. */
         radeon_emit(cs, PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0));
         radeon_emit(cs, STRMOUT_SELECT_BUFFER(i) | STRMOUT_DATA_TYPE(1) |      /* offset in bytes */
                            STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_FROM_PACKET)); /* control */
         radeon_emit(cs, 0);                                                    /* unused */
         radeon_emit(cs, 0);                                                    /* unused */
         radeon_emit(cs, 0);                                                    /* unused */
         radeon_emit(cs, 0);                                                    /* unused */
      }
   }

   radv_set_streamout_enable(cmd_buffer, true);
}
10130
/* NGG streamout begin (GFX10+): seed the per-buffer offsets in GDS, either
 * from the optional counter buffer (append) or with zero.
 */
static void
gfx10_emit_streamout_begin(struct radv_cmd_buffer *cmd_buffer, uint32_t firstCounterBuffer,
                           uint32_t counterBufferCount, const VkBuffer *pCounterBuffers,
                           const VkDeviceSize *pCounterBufferOffsets)
{
   struct radv_streamout_state *so = &cmd_buffer->state.streamout;
   unsigned last_target = util_last_bit(so->enabled_mask) - 1;
   struct radeon_cmdbuf *cs = cmd_buffer->cs;

   assert(cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX10);
   assert(firstCounterBuffer + counterBufferCount <= MAX_SO_BUFFERS);

   /* Sync because the next streamout operation will overwrite GDS and we
    * have to make sure it's idle.
    * TODO: Improve by tracking if there is a streamout operation in
    * flight.
    */
   cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_VS_PARTIAL_FLUSH;
   si_emit_cache_flush(cmd_buffer);

   u_foreach_bit(i, so->enabled_mask)
   {
      /* Map target i to its entry in the counter-buffer array; -1 when the
       * app didn't provide one for this target.
       */
      int32_t counter_buffer_idx = i - firstCounterBuffer;
      if (counter_buffer_idx >= 0 && counter_buffer_idx >= counterBufferCount)
         counter_buffer_idx = -1;

      bool append =
         counter_buffer_idx >= 0 && pCounterBuffers && pCounterBuffers[counter_buffer_idx];
      uint64_t va = 0;

      if (append) {
         RADV_FROM_HANDLE(radv_buffer, buffer, pCounterBuffers[counter_buffer_idx]);
         uint64_t counter_buffer_offset = 0;

         if (pCounterBufferOffsets)
            counter_buffer_offset = pCounterBufferOffsets[counter_buffer_idx];

         va += radv_buffer_get_va(buffer->bo);
         va += buffer->offset + counter_buffer_offset;

         radv_cs_add_buffer(cmd_buffer->device->ws, cs, buffer->bo);
      }

      /* DMA 4 bytes into GDS slot i: from the counter buffer when appending,
       * or the immediate zero in the packet otherwise. Only the last target
       * requests CP_SYNC / write confirmation.
       */
      radeon_emit(cs, PKT3(PKT3_DMA_DATA, 5, 0));
      radeon_emit(cs, S_411_SRC_SEL(append ? V_411_SRC_ADDR_TC_L2 : V_411_DATA) |
                         S_411_DST_SEL(V_411_GDS) | S_411_CP_SYNC(i == last_target));
      radeon_emit(cs, va);
      radeon_emit(cs, va >> 32);
      radeon_emit(cs, 4 * i); /* destination in GDS */
      radeon_emit(cs, 0);
      radeon_emit(cs, S_415_BYTE_COUNT_GFX9(4) | S_415_DISABLE_WR_CONFIRM_GFX9(i != last_target));
   }

   radv_set_streamout_enable(cmd_buffer, true);
}
10186
10187 VKAPI_ATTR void VKAPI_CALL
radv_CmdBeginTransformFeedbackEXT(VkCommandBuffer commandBuffer,uint32_t firstCounterBuffer,uint32_t counterBufferCount,const VkBuffer * pCounterBuffers,const VkDeviceSize * pCounterBufferOffsets)10188 radv_CmdBeginTransformFeedbackEXT(VkCommandBuffer commandBuffer, uint32_t firstCounterBuffer,
10189 uint32_t counterBufferCount, const VkBuffer *pCounterBuffers,
10190 const VkDeviceSize *pCounterBufferOffsets)
10191 {
10192 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
10193
10194 if (cmd_buffer->device->physical_device->use_ngg_streamout) {
10195 gfx10_emit_streamout_begin(cmd_buffer, firstCounterBuffer, counterBufferCount,
10196 pCounterBuffers, pCounterBufferOffsets);
10197 } else {
10198 radv_emit_streamout_begin(cmd_buffer, firstCounterBuffer, counterBufferCount, pCounterBuffers,
10199 pCounterBufferOffsets);
10200 }
10201 }
10202
/* Legacy (pre-NGG) streamout end: store each buffer's filled size back into
 * the optional counter buffer and deactivate the targets.
 */
static void
radv_emit_streamout_end(struct radv_cmd_buffer *cmd_buffer, uint32_t firstCounterBuffer,
                        uint32_t counterBufferCount, const VkBuffer *pCounterBuffers,
                        const VkDeviceSize *pCounterBufferOffsets)
{
   struct radv_streamout_state *so = &cmd_buffer->state.streamout;
   struct radeon_cmdbuf *cs = cmd_buffer->cs;

   /* Offsets must be idle before they can be read back. */
   radv_flush_vgt_streamout(cmd_buffer);

   assert(firstCounterBuffer + counterBufferCount <= MAX_SO_BUFFERS);
   u_foreach_bit(i, so->enabled_mask)
   {
      /* Map target i to its entry in the counter-buffer array; -1 when the
       * app didn't provide one for this target.
       */
      int32_t counter_buffer_idx = i - firstCounterBuffer;
      if (counter_buffer_idx >= 0 && counter_buffer_idx >= counterBufferCount)
         counter_buffer_idx = -1;

      if (counter_buffer_idx >= 0 && pCounterBuffers && pCounterBuffers[counter_buffer_idx]) {
         /* The array of counters buffer is optional. */
         RADV_FROM_HANDLE(radv_buffer, buffer, pCounterBuffers[counter_buffer_idx]);
         uint64_t va = radv_buffer_get_va(buffer->bo);
         uint64_t counter_buffer_offset = 0;

         if (pCounterBufferOffsets)
            counter_buffer_offset = pCounterBufferOffsets[counter_buffer_idx];

         va += buffer->offset + counter_buffer_offset;

         /* Store BUFFER_FILLED_SIZE so a later begin can resume (append). */
         radeon_emit(cs, PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0));
         radeon_emit(cs, STRMOUT_SELECT_BUFFER(i) | STRMOUT_DATA_TYPE(1) | /* offset in bytes */
                            STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_NONE) |
                            STRMOUT_STORE_BUFFER_FILLED_SIZE); /* control */
         radeon_emit(cs, va);                                  /* dst address lo */
         radeon_emit(cs, va >> 32);                            /* dst address hi */
         radeon_emit(cs, 0);                                   /* unused */
         radeon_emit(cs, 0);                                   /* unused */

         radv_cs_add_buffer(cmd_buffer->device->ws, cs, buffer->bo);
      }

      /* Deactivate transform feedback by zeroing the buffer size.
       * The counters (primitives generated, primitives emitted) may
       * be enabled even if there is not buffer bound. This ensures
       * that the primitives-emitted query won't increment.
       */
      radeon_set_context_reg(cs, R_028AD0_VGT_STRMOUT_BUFFER_SIZE_0 + 16 * i, 0);

      cmd_buffer->state.context_roll_without_scissor_emitted = true;
   }

   radv_set_streamout_enable(cmd_buffer, false);
}
10255
/* NGG streamout end (GFX10+): copy each buffer's filled size from GDS into
 * the optional counter buffer via an EOP event, then disable streamout.
 */
static void
gfx10_emit_streamout_end(struct radv_cmd_buffer *cmd_buffer, uint32_t firstCounterBuffer,
                         uint32_t counterBufferCount, const VkBuffer *pCounterBuffers,
                         const VkDeviceSize *pCounterBufferOffsets)
{
   struct radv_streamout_state *so = &cmd_buffer->state.streamout;
   struct radeon_cmdbuf *cs = cmd_buffer->cs;

   assert(cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX10);
   assert(firstCounterBuffer + counterBufferCount <= MAX_SO_BUFFERS);

   u_foreach_bit(i, so->enabled_mask)
   {
      /* Map target i to its entry in the counter-buffer array; -1 when the
       * app didn't provide one for this target.
       */
      int32_t counter_buffer_idx = i - firstCounterBuffer;
      if (counter_buffer_idx >= 0 && counter_buffer_idx >= counterBufferCount)
         counter_buffer_idx = -1;

      if (counter_buffer_idx >= 0 && pCounterBuffers && pCounterBuffers[counter_buffer_idx]) {
         /* The array of counters buffer is optional. */
         RADV_FROM_HANDLE(radv_buffer, buffer, pCounterBuffers[counter_buffer_idx]);
         uint64_t va = radv_buffer_get_va(buffer->bo);
         uint64_t counter_buffer_offset = 0;

         if (pCounterBufferOffsets)
            counter_buffer_offset = pCounterBufferOffsets[counter_buffer_idx];

         va += buffer->offset + counter_buffer_offset;

         /* EOP event that writes GDS slot i into the counter buffer once
          * previous fragment work is done.
          */
         si_cs_emit_write_event_eop(cs, cmd_buffer->device->physical_device->rad_info.gfx_level,
                                    radv_cmd_buffer_uses_mec(cmd_buffer), V_028A90_PS_DONE, 0,
                                    EOP_DST_SEL_TC_L2, EOP_DATA_SEL_GDS, va, EOP_DATA_GDS(i, 1), 0);

         radv_cs_add_buffer(cmd_buffer->device->ws, cs, buffer->bo);
      }
   }

   radv_set_streamout_enable(cmd_buffer, false);
}
10294
10295 VKAPI_ATTR void VKAPI_CALL
radv_CmdEndTransformFeedbackEXT(VkCommandBuffer commandBuffer,uint32_t firstCounterBuffer,uint32_t counterBufferCount,const VkBuffer * pCounterBuffers,const VkDeviceSize * pCounterBufferOffsets)10296 radv_CmdEndTransformFeedbackEXT(VkCommandBuffer commandBuffer, uint32_t firstCounterBuffer,
10297 uint32_t counterBufferCount, const VkBuffer *pCounterBuffers,
10298 const VkDeviceSize *pCounterBufferOffsets)
10299 {
10300 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
10301
10302 if (cmd_buffer->device->physical_device->use_ngg_streamout) {
10303 gfx10_emit_streamout_end(cmd_buffer, firstCounterBuffer, counterBufferCount, pCounterBuffers,
10304 pCounterBufferOffsets);
10305 } else {
10306 radv_emit_streamout_end(cmd_buffer, firstCounterBuffer, counterBufferCount, pCounterBuffers,
10307 pCounterBufferOffsets);
10308 }
10309 }
10310
10311 VKAPI_ATTR void VKAPI_CALL
radv_CmdDrawIndirectByteCountEXT(VkCommandBuffer commandBuffer,uint32_t instanceCount,uint32_t firstInstance,VkBuffer _counterBuffer,VkDeviceSize counterBufferOffset,uint32_t counterOffset,uint32_t vertexStride)10312 radv_CmdDrawIndirectByteCountEXT(VkCommandBuffer commandBuffer, uint32_t instanceCount,
10313 uint32_t firstInstance, VkBuffer _counterBuffer,
10314 VkDeviceSize counterBufferOffset, uint32_t counterOffset,
10315 uint32_t vertexStride)
10316 {
10317 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
10318 RADV_FROM_HANDLE(radv_buffer, counterBuffer, _counterBuffer);
10319 struct radv_draw_info info;
10320
10321 info.count = 0;
10322 info.instance_count = instanceCount;
10323 info.first_instance = firstInstance;
10324 info.strmout_buffer = counterBuffer;
10325 info.strmout_buffer_offset = counterBufferOffset;
10326 info.stride = vertexStride;
10327 info.indexed = false;
10328 info.indirect = NULL;
10329
10330 if (!radv_before_draw(cmd_buffer, &info, 1))
10331 return;
10332 struct VkMultiDrawInfoEXT minfo = { 0, 0 };
10333 radv_emit_direct_draw_packets(cmd_buffer, &info, 1, &minfo, S_0287F0_USE_OPAQUE(1), 0);
10334 radv_after_draw(cmd_buffer);
10335 }
10336
10337 /* VK_AMD_buffer_marker */
VKAPI_ATTR void VKAPI_CALL
radv_CmdWriteBufferMarker2AMD(VkCommandBuffer commandBuffer, VkPipelineStageFlags2 stage,
                              VkBuffer dstBuffer, VkDeviceSize dstOffset, uint32_t marker)
{
   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   RADV_FROM_HANDLE(radv_buffer, buffer, dstBuffer);
   struct radeon_cmdbuf *cs = cmd_buffer->cs;
   uint64_t va = radv_buffer_get_va(buffer->bo) + buffer->offset + dstOffset;

   si_emit_cache_flush(cmd_buffer);

   ASSERTED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 12);

   if (!(stage & ~VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT)) {
      /* Top-of-pipe only: an immediate COPY_DATA write is sufficient,
       * nothing has to be waited on.
       */
      radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
      radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_IMM) | COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) |
                         COPY_DATA_WR_CONFIRM);
      radeon_emit(cs, marker);
      radeon_emit(cs, 0);
      radeon_emit(cs, va);
      radeon_emit(cs, va >> 32);
   } else {
      /* Any later stage: write the marker with a bottom-of-pipe EOP event
       * so it lands only after prior work has finished.
       */
      si_cs_emit_write_event_eop(cs, cmd_buffer->device->physical_device->rad_info.gfx_level,
                                 radv_cmd_buffer_uses_mec(cmd_buffer), V_028A90_BOTTOM_OF_PIPE_TS,
                                 0, EOP_DST_SEL_MEM, EOP_DATA_SEL_VALUE_32BIT, va, marker,
                                 cmd_buffer->gfx9_eop_bug_va);
   }

   assert(cmd_buffer->cs->cdw <= cdw_max);
}
10368
/* VK_NV_device_generated_commands entry point — not implemented by RADV;
 * aborts if ever reached.
 */
void
radv_CmdBindPipelineShaderGroupNV(VkCommandBuffer commandBuffer,
                                  VkPipelineBindPoint pipelineBindPoint, VkPipeline pipeline,
                                  uint32_t groupIndex)
{
   fprintf(stderr, "radv: unimplemented vkCmdBindPipelineShaderGroupNV\n");
   abort();
}