/*
 * Copyright © 2016 Red Hat.
 * Copyright © 2016 Bas Nieuwenhuizen
 *
 * based in part on anv driver which is:
 * Copyright © 2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "radv_cs.h"
#include "radv_debug.h"
#include "radv_meta.h"
#include "radv_private.h"
#include "radv_radeon_winsys.h"
#include "radv_shader.h"
#include "sid.h"
#include "vk_format.h"
#include "vk_util.h"
#include "vk_enum_defines.h"
#include "vk_common_entrypoints.h"

#include "ac_debug.h"
#include "ac_shader_args.h"

#include "util/fast_idiv_by_const.h"

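/* Bitmask of pipeline resources (shader binaries and the vertex buffer
 * descriptors) that still need to be prefetched into L2; consumed by
 * radv_emit_prefetch_L2() below.
 */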
enum {
   RADV_PREFETCH_VBO_DESCRIPTORS = (1 << 0),
   RADV_PREFETCH_VS = (1 << 1),
   RADV_PREFETCH_TCS = (1 << 2),
   RADV_PREFETCH_TES = (1 << 3),
   RADV_PREFETCH_GS = (1 << 4),
   RADV_PREFETCH_PS = (1 << 5),
   RADV_PREFETCH_MS = (1 << 6),
   RADV_PREFETCH_SHADERS = (RADV_PREFETCH_VS | RADV_PREFETCH_TCS | RADV_PREFETCH_TES |
                            RADV_PREFETCH_GS | RADV_PREFETCH_PS | RADV_PREFETCH_MS)
};

static void radv_handle_image_transition(struct radv_cmd_buffer *cmd_buffer,
                                         struct radv_image *image, VkImageLayout src_layout,
                                         bool src_render_loop, VkImageLayout dst_layout,
                                         bool dst_render_loop, uint32_t src_family_index,
                                         uint32_t dst_family_index, const VkImageSubresourceRange *range,
                                         struct radv_sample_locations_state *sample_locs);

static void radv_set_rt_stack_size(struct radv_cmd_buffer *cmd_buffer, uint32_t size);

const struct radv_dynamic_state default_dynamic_state = {
   .viewport =
      {
         .count = 0,
      },
   .scissor =
      {
         .count = 0,
      },
   .line_width = 1.0f,
   .depth_bias =
      {
         .bias = 0.0f,
         .clamp = 0.0f,
         .slope = 0.0f,
      },
   .blend_constants = {0.0f, 0.0f, 0.0f, 0.0f},
   .depth_bounds =
      {
         .min = 0.0f,
         .max = 1.0f,
      },
   .stencil_compare_mask =
      {
         .front = ~0u,
         .back = ~0u,
      },
   .stencil_write_mask =
      {
         .front = ~0u,
         .back = ~0u,
      },
   .stencil_reference =
      {
         .front = 0u,
         .back = 0u,
      },
   .line_stipple =
      {
         .factor = 0u,
         .pattern = 0u,
      },
   .cull_mode = 0u,
   .front_face = 0u,
   .primitive_topology = 0u,
   .fragment_shading_rate =
      {
         .size = {1u, 1u},
         .combiner_ops = {VK_FRAGMENT_SHADING_RATE_COMBINER_OP_KEEP_KHR,
                          VK_FRAGMENT_SHADING_RATE_COMBINER_OP_KEEP_KHR},
      },
   .depth_bias_enable = 0u,
   .primitive_restart_enable = 0u,
   .rasterizer_discard_enable = 0u,
   .logic_op = 0u,
   .color_write_enable = 0xffffffffu,
};

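/* Copy the pipeline's dynamic state into the command buffer's state. Only
 * the states selected by src->mask are copied, and a state is only flagged
 * in cmd_buffer->state.dirty when its value actually changed, so unchanged
 * state is not re-emitted.
 */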
static void
radv_bind_dynamic_state(struct radv_cmd_buffer *cmd_buffer, const struct radv_dynamic_state *src)
{
   struct radv_dynamic_state *dest = &cmd_buffer->state.dynamic;
   uint64_t copy_mask = src->mask;
   uint64_t dest_mask = 0;

   dest->discard_rectangle.count = src->discard_rectangle.count;
   dest->sample_location.count = src->sample_location.count;

   if (copy_mask & RADV_DYNAMIC_VIEWPORT) {
      if (dest->viewport.count != src->viewport.count) {
         dest->viewport.count = src->viewport.count;
         dest_mask |= RADV_DYNAMIC_VIEWPORT;
      }

      if (memcmp(&dest->viewport.viewports, &src->viewport.viewports,
                 src->viewport.count * sizeof(VkViewport))) {
         typed_memcpy(dest->viewport.viewports, src->viewport.viewports, src->viewport.count);
         typed_memcpy(dest->viewport.xform, src->viewport.xform, src->viewport.count);
         dest_mask |= RADV_DYNAMIC_VIEWPORT;
      }
   }

   if (copy_mask & RADV_DYNAMIC_SCISSOR) {
      if (dest->scissor.count != src->scissor.count) {
         dest->scissor.count = src->scissor.count;
         dest_mask |= RADV_DYNAMIC_SCISSOR;
      }

      if (memcmp(&dest->scissor.scissors, &src->scissor.scissors,
                 src->scissor.count * sizeof(VkRect2D))) {
         typed_memcpy(dest->scissor.scissors, src->scissor.scissors, src->scissor.count);
         dest_mask |= RADV_DYNAMIC_SCISSOR;
      }
   }

   if (copy_mask & RADV_DYNAMIC_LINE_WIDTH) {
      if (dest->line_width != src->line_width) {
         dest->line_width = src->line_width;
         dest_mask |= RADV_DYNAMIC_LINE_WIDTH;
      }
   }

   if (copy_mask & RADV_DYNAMIC_DEPTH_BIAS) {
      if (memcmp(&dest->depth_bias, &src->depth_bias, sizeof(src->depth_bias))) {
         dest->depth_bias = src->depth_bias;
         dest_mask |= RADV_DYNAMIC_DEPTH_BIAS;
      }
   }

   if (copy_mask & RADV_DYNAMIC_BLEND_CONSTANTS) {
      if (memcmp(&dest->blend_constants, &src->blend_constants, sizeof(src->blend_constants))) {
         typed_memcpy(dest->blend_constants, src->blend_constants, 4);
         dest_mask |= RADV_DYNAMIC_BLEND_CONSTANTS;
      }
   }

   if (copy_mask & RADV_DYNAMIC_DEPTH_BOUNDS) {
      if (memcmp(&dest->depth_bounds, &src->depth_bounds, sizeof(src->depth_bounds))) {
         dest->depth_bounds = src->depth_bounds;
         dest_mask |= RADV_DYNAMIC_DEPTH_BOUNDS;
      }
   }

   if (copy_mask & RADV_DYNAMIC_STENCIL_COMPARE_MASK) {
      if (memcmp(&dest->stencil_compare_mask, &src->stencil_compare_mask,
                 sizeof(src->stencil_compare_mask))) {
         dest->stencil_compare_mask = src->stencil_compare_mask;
         dest_mask |= RADV_DYNAMIC_STENCIL_COMPARE_MASK;
      }
   }

   if (copy_mask & RADV_DYNAMIC_STENCIL_WRITE_MASK) {
      if (memcmp(&dest->stencil_write_mask, &src->stencil_write_mask,
                 sizeof(src->stencil_write_mask))) {
         dest->stencil_write_mask = src->stencil_write_mask;
         dest_mask |= RADV_DYNAMIC_STENCIL_WRITE_MASK;
      }
   }

   if (copy_mask & RADV_DYNAMIC_STENCIL_REFERENCE) {
      if (memcmp(&dest->stencil_reference, &src->stencil_reference,
                 sizeof(src->stencil_reference))) {
         dest->stencil_reference = src->stencil_reference;
         dest_mask |= RADV_DYNAMIC_STENCIL_REFERENCE;
      }
   }

   if (copy_mask & RADV_DYNAMIC_DISCARD_RECTANGLE) {
      if (memcmp(&dest->discard_rectangle.rectangles, &src->discard_rectangle.rectangles,
                 src->discard_rectangle.count * sizeof(VkRect2D))) {
         typed_memcpy(dest->discard_rectangle.rectangles, src->discard_rectangle.rectangles,
                      src->discard_rectangle.count);
         dest_mask |= RADV_DYNAMIC_DISCARD_RECTANGLE;
      }
   }

   if (copy_mask & RADV_DYNAMIC_SAMPLE_LOCATIONS) {
      if (dest->sample_location.per_pixel != src->sample_location.per_pixel ||
          dest->sample_location.grid_size.width != src->sample_location.grid_size.width ||
          dest->sample_location.grid_size.height != src->sample_location.grid_size.height ||
          memcmp(&dest->sample_location.locations, &src->sample_location.locations,
                 src->sample_location.count * sizeof(VkSampleLocationEXT))) {
         dest->sample_location.per_pixel = src->sample_location.per_pixel;
         dest->sample_location.grid_size = src->sample_location.grid_size;
         typed_memcpy(dest->sample_location.locations, src->sample_location.locations,
                      src->sample_location.count);
         dest_mask |= RADV_DYNAMIC_SAMPLE_LOCATIONS;
      }
   }

   if (copy_mask & RADV_DYNAMIC_LINE_STIPPLE) {
      if (memcmp(&dest->line_stipple, &src->line_stipple, sizeof(src->line_stipple))) {
         dest->line_stipple = src->line_stipple;
         dest_mask |= RADV_DYNAMIC_LINE_STIPPLE;
      }
   }

   if (copy_mask & RADV_DYNAMIC_CULL_MODE) {
      if (dest->cull_mode != src->cull_mode) {
         dest->cull_mode = src->cull_mode;
         dest_mask |= RADV_DYNAMIC_CULL_MODE;
      }
   }

   if (copy_mask & RADV_DYNAMIC_FRONT_FACE) {
      if (dest->front_face != src->front_face) {
         dest->front_face = src->front_face;
         dest_mask |= RADV_DYNAMIC_FRONT_FACE;
      }
   }

   if (copy_mask & RADV_DYNAMIC_PRIMITIVE_TOPOLOGY) {
      if (dest->primitive_topology != src->primitive_topology) {
         dest->primitive_topology = src->primitive_topology;
         dest_mask |= RADV_DYNAMIC_PRIMITIVE_TOPOLOGY;
      }
   }

   if (copy_mask & RADV_DYNAMIC_DEPTH_TEST_ENABLE) {
      if (dest->depth_test_enable != src->depth_test_enable) {
         dest->depth_test_enable = src->depth_test_enable;
         dest_mask |= RADV_DYNAMIC_DEPTH_TEST_ENABLE;
      }
   }

   if (copy_mask & RADV_DYNAMIC_DEPTH_WRITE_ENABLE) {
      if (dest->depth_write_enable != src->depth_write_enable) {
         dest->depth_write_enable = src->depth_write_enable;
         dest_mask |= RADV_DYNAMIC_DEPTH_WRITE_ENABLE;
      }
   }

   if (copy_mask & RADV_DYNAMIC_DEPTH_COMPARE_OP) {
      if (dest->depth_compare_op != src->depth_compare_op) {
         dest->depth_compare_op = src->depth_compare_op;
         dest_mask |= RADV_DYNAMIC_DEPTH_COMPARE_OP;
      }
   }

   if (copy_mask & RADV_DYNAMIC_DEPTH_BOUNDS_TEST_ENABLE) {
      if (dest->depth_bounds_test_enable != src->depth_bounds_test_enable) {
         dest->depth_bounds_test_enable = src->depth_bounds_test_enable;
         dest_mask |= RADV_DYNAMIC_DEPTH_BOUNDS_TEST_ENABLE;
      }
   }

   if (copy_mask & RADV_DYNAMIC_STENCIL_TEST_ENABLE) {
      if (dest->stencil_test_enable != src->stencil_test_enable) {
         dest->stencil_test_enable = src->stencil_test_enable;
         dest_mask |= RADV_DYNAMIC_STENCIL_TEST_ENABLE;
      }
   }

   if (copy_mask & RADV_DYNAMIC_STENCIL_OP) {
      if (memcmp(&dest->stencil_op, &src->stencil_op, sizeof(src->stencil_op))) {
         dest->stencil_op = src->stencil_op;
         dest_mask |= RADV_DYNAMIC_STENCIL_OP;
      }
   }

   if (copy_mask & RADV_DYNAMIC_FRAGMENT_SHADING_RATE) {
      if (memcmp(&dest->fragment_shading_rate, &src->fragment_shading_rate,
                 sizeof(src->fragment_shading_rate))) {
         dest->fragment_shading_rate = src->fragment_shading_rate;
         dest_mask |= RADV_DYNAMIC_FRAGMENT_SHADING_RATE;
      }
   }

   if (copy_mask & RADV_DYNAMIC_DEPTH_BIAS_ENABLE) {
      if (dest->depth_bias_enable != src->depth_bias_enable) {
         dest->depth_bias_enable = src->depth_bias_enable;
         dest_mask |= RADV_DYNAMIC_DEPTH_BIAS_ENABLE;
      }
   }

   if (copy_mask & RADV_DYNAMIC_PRIMITIVE_RESTART_ENABLE) {
      if (dest->primitive_restart_enable != src->primitive_restart_enable) {
         dest->primitive_restart_enable = src->primitive_restart_enable;
         dest_mask |= RADV_DYNAMIC_PRIMITIVE_RESTART_ENABLE;
      }
   }

   if (copy_mask & RADV_DYNAMIC_RASTERIZER_DISCARD_ENABLE) {
      if (dest->rasterizer_discard_enable != src->rasterizer_discard_enable) {
         dest->rasterizer_discard_enable = src->rasterizer_discard_enable;
         dest_mask |= RADV_DYNAMIC_RASTERIZER_DISCARD_ENABLE;
      }
   }

   if (copy_mask & RADV_DYNAMIC_LOGIC_OP) {
      if (dest->logic_op != src->logic_op) {
         dest->logic_op = src->logic_op;
         dest_mask |= RADV_DYNAMIC_LOGIC_OP;
      }
   }

   if (copy_mask & RADV_DYNAMIC_COLOR_WRITE_ENABLE) {
      if (dest->color_write_enable != src->color_write_enable) {
         dest->color_write_enable = src->color_write_enable;
         dest_mask |= RADV_DYNAMIC_COLOR_WRITE_ENABLE;
      }
   }

   cmd_buffer->state.dirty |= dest_mask;
}

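/* Compute queues are executed by the MEC (micro-engine compute) on GFX7 and
 * newer; the helpers below that emit EOP events and cache flushes take this
 * into account.
 */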
bool
radv_cmd_buffer_uses_mec(struct radv_cmd_buffer *cmd_buffer)
{
   return cmd_buffer->qf == RADV_QUEUE_COMPUTE &&
          cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX7;
}

enum amd_ip_type
radv_queue_family_to_ring(struct radv_physical_device *physical_device,
                          enum radv_queue_family f)
{
   switch (f) {
   case RADV_QUEUE_GENERAL:
      return AMD_IP_GFX;
   case RADV_QUEUE_COMPUTE:
      return AMD_IP_COMPUTE;
   case RADV_QUEUE_TRANSFER:
      return AMD_IP_SDMA;
   default:
      unreachable("Unknown queue family");
   }
}

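/* Emit a WRITE_DATA packet that stores `count` dwords from `data` at the
 * given GPU virtual address; WR_CONFIRM requests confirmation of the write
 * before the CP continues.
 */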
static void
radv_emit_write_data_packet(struct radv_cmd_buffer *cmd_buffer, unsigned engine_sel, uint64_t va,
                            unsigned count, const uint32_t *data)
{
   struct radeon_cmdbuf *cs = cmd_buffer->cs;

   radeon_check_space(cmd_buffer->device->ws, cs, 4 + count);

   radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 2 + count, 0));
   radeon_emit(cs, S_370_DST_SEL(V_370_MEM) | S_370_WR_CONFIRM(1) | S_370_ENGINE_SEL(engine_sel));
   radeon_emit(cs, va);
   radeon_emit(cs, va >> 32);
   radeon_emit_array(cs, data, count);
}

static void
radv_emit_clear_data(struct radv_cmd_buffer *cmd_buffer, unsigned engine_sel, uint64_t va,
                     unsigned size)
{
   uint32_t *zeroes = alloca(size);
   memset(zeroes, 0, size);
   radv_emit_write_data_packet(cmd_buffer, engine_sel, va, size / 4, zeroes);
}

static void
radv_destroy_cmd_buffer(struct radv_cmd_buffer *cmd_buffer)
{
   list_del(&cmd_buffer->pool_link);

   util_dynarray_fini(&cmd_buffer->cached_vertex_formats);

   list_for_each_entry_safe(struct radv_cmd_buffer_upload, up, &cmd_buffer->upload.list, list)
   {
      cmd_buffer->device->ws->buffer_destroy(cmd_buffer->device->ws, up->upload_bo);
      list_del(&up->list);
      free(up);
   }

   if (cmd_buffer->upload.upload_bo)
      cmd_buffer->device->ws->buffer_destroy(cmd_buffer->device->ws, cmd_buffer->upload.upload_bo);

   if (cmd_buffer->state.own_render_pass) {
      radv_DestroyRenderPass(radv_device_to_handle(cmd_buffer->device),
                             radv_render_pass_to_handle(cmd_buffer->state.pass), NULL);
      cmd_buffer->state.own_render_pass = false;
   }

   if (cmd_buffer->cs)
      cmd_buffer->device->ws->cs_destroy(cmd_buffer->cs);
   if (cmd_buffer->ace_internal.cs)
      cmd_buffer->device->ws->cs_destroy(cmd_buffer->ace_internal.cs);

   for (unsigned i = 0; i < MAX_BIND_POINTS; i++) {
      struct radv_descriptor_set_header *set = &cmd_buffer->descriptors[i].push_set.set;
      free(set->mapped_ptr);
      if (set->layout)
         vk_descriptor_set_layout_unref(&cmd_buffer->device->vk, &set->layout->vk);
      vk_object_base_finish(&set->base);
   }

   vk_object_base_finish(&cmd_buffer->meta_push_descriptors.base);

   vk_command_buffer_finish(&cmd_buffer->vk);
   vk_free(&cmd_buffer->pool->vk.alloc, cmd_buffer);
}

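/* Allocate a command buffer and create its main command stream on the ring
 * (GFX/compute/SDMA) that matches the pool's queue family.
 */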
static VkResult
radv_create_cmd_buffer(struct radv_device *device, struct radv_cmd_pool *pool,
                       VkCommandBufferLevel level, VkCommandBuffer *pCommandBuffer)
{
   struct radv_cmd_buffer *cmd_buffer;
   unsigned ring;
   cmd_buffer = vk_zalloc(&pool->vk.alloc, sizeof(*cmd_buffer), 8,
                          VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
   if (cmd_buffer == NULL)
      return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);

   VkResult result =
      vk_command_buffer_init(&cmd_buffer->vk, &pool->vk, level);
   if (result != VK_SUCCESS) {
      vk_free(&cmd_buffer->pool->vk.alloc, cmd_buffer);
      return result;
   }

   cmd_buffer->device = device;
   cmd_buffer->pool = pool;

   list_addtail(&cmd_buffer->pool_link, &pool->cmd_buffers);
   cmd_buffer->qf = vk_queue_to_radv(device->physical_device, pool->vk.queue_family_index);

   ring = radv_queue_family_to_ring(device->physical_device, cmd_buffer->qf);

   cmd_buffer->cs = device->ws->cs_create(device->ws, ring);
   if (!cmd_buffer->cs) {
      radv_destroy_cmd_buffer(cmd_buffer);
      return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
   }

   vk_object_base_init(&device->vk, &cmd_buffer->meta_push_descriptors.base,
                       VK_OBJECT_TYPE_DESCRIPTOR_SET);

   util_dynarray_init(&cmd_buffer->cached_vertex_formats, NULL);

   for (unsigned i = 0; i < MAX_BIND_POINTS; i++)
      vk_object_base_init(&device->vk, &cmd_buffer->descriptors[i].push_set.set.base,
                          VK_OBJECT_TYPE_DESCRIPTOR_SET);

   *pCommandBuffer = radv_cmd_buffer_to_handle(cmd_buffer);

   list_inithead(&cmd_buffer->upload.list);

   return VK_SUCCESS;
}

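/* Return a command buffer to its initial state: free the upload BOs of the
 * previous recording, clear cached state, and re-initialize the small
 * upload allocations (the MEC predication word on GFX7+, and the fence and
 * EOP-workaround slots on GFX9+ graphics queues).
 */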
static VkResult
radv_reset_cmd_buffer(struct radv_cmd_buffer *cmd_buffer)
{
   vk_command_buffer_reset(&cmd_buffer->vk);

   cmd_buffer->device->ws->cs_reset(cmd_buffer->cs);
   if (cmd_buffer->ace_internal.cs)
      cmd_buffer->device->ws->cs_reset(cmd_buffer->ace_internal.cs);

   list_for_each_entry_safe(struct radv_cmd_buffer_upload, up, &cmd_buffer->upload.list, list)
   {
      cmd_buffer->device->ws->buffer_destroy(cmd_buffer->device->ws, up->upload_bo);
      list_del(&up->list);
      free(up);
   }

   if (cmd_buffer->state.own_render_pass) {
      radv_DestroyRenderPass(radv_device_to_handle(cmd_buffer->device),
                             radv_render_pass_to_handle(cmd_buffer->state.pass), NULL);
      cmd_buffer->state.own_render_pass = false;
   }

   cmd_buffer->push_constant_stages = 0;
   cmd_buffer->scratch_size_per_wave_needed = 0;
   cmd_buffer->scratch_waves_wanted = 0;
   cmd_buffer->compute_scratch_size_per_wave_needed = 0;
   cmd_buffer->compute_scratch_waves_wanted = 0;
   cmd_buffer->esgs_ring_size_needed = 0;
   cmd_buffer->gsvs_ring_size_needed = 0;
   cmd_buffer->tess_rings_needed = false;
   cmd_buffer->task_rings_needed = false;
   cmd_buffer->mesh_scratch_ring_needed = false;
   cmd_buffer->gds_needed = false;
   cmd_buffer->gds_oa_needed = false;
   cmd_buffer->sample_positions_needed = false;
   cmd_buffer->ace_internal.sem.gfx2ace_value = 0;
   cmd_buffer->ace_internal.sem.emitted_gfx2ace_value = 0;
   cmd_buffer->ace_internal.sem.va = 0;

   if (cmd_buffer->upload.upload_bo)
      radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, cmd_buffer->upload.upload_bo);
   cmd_buffer->upload.offset = 0;

   cmd_buffer->record_result = VK_SUCCESS;

   memset(cmd_buffer->vertex_binding_buffers, 0, sizeof(struct radv_buffer *) * cmd_buffer->used_vertex_bindings);
   cmd_buffer->used_vertex_bindings = 0;

   for (unsigned i = 0; i < MAX_BIND_POINTS; i++) {
      cmd_buffer->descriptors[i].dirty = 0;
      cmd_buffer->descriptors[i].valid = 0;
      cmd_buffer->descriptors[i].push_dirty = false;
   }

   if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX7) {
      uint32_t pred_value = 0;
      uint32_t pred_offset;
      if (!radv_cmd_buffer_upload_data(cmd_buffer, 4, &pred_value, &pred_offset))
         cmd_buffer->record_result = VK_ERROR_OUT_OF_HOST_MEMORY;

      cmd_buffer->mec_inv_pred_emitted = false;
      cmd_buffer->mec_inv_pred_va = radv_buffer_get_va(cmd_buffer->upload.upload_bo) + pred_offset;
   }

   if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX9 &&
       cmd_buffer->qf == RADV_QUEUE_GENERAL) {
      unsigned num_db = cmd_buffer->device->physical_device->rad_info.max_render_backends;
      unsigned fence_offset, eop_bug_offset;
      void *fence_ptr;

      radv_cmd_buffer_upload_alloc(cmd_buffer, 8, &fence_offset, &fence_ptr);
      memset(fence_ptr, 0, 8);

      cmd_buffer->gfx9_fence_va = radv_buffer_get_va(cmd_buffer->upload.upload_bo);
      cmd_buffer->gfx9_fence_va += fence_offset;

      radv_emit_clear_data(cmd_buffer, V_370_PFP, cmd_buffer->gfx9_fence_va, 8);

      if (cmd_buffer->device->physical_device->rad_info.gfx_level == GFX9) {
         /* Allocate a buffer for the EOP bug on GFX9. */
         radv_cmd_buffer_upload_alloc(cmd_buffer, 16 * num_db, &eop_bug_offset, &fence_ptr);
         memset(fence_ptr, 0, 16 * num_db);
         cmd_buffer->gfx9_eop_bug_va = radv_buffer_get_va(cmd_buffer->upload.upload_bo);
         cmd_buffer->gfx9_eop_bug_va += eop_bug_offset;

         radv_emit_clear_data(cmd_buffer, V_370_PFP, cmd_buffer->gfx9_eop_bug_va, 16 * num_db);
      }
   }

   cmd_buffer->status = RADV_CMD_BUFFER_STATUS_INITIAL;

   return cmd_buffer->record_result;
}

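/* Grow the upload buffer so at least min_needed bytes fit. The size at
 * least doubles each time (starting at 16 KiB) to keep reallocations rare;
 * the old BO is kept on upload.list because previously emitted packets may
 * still reference it, and it is freed on reset/destroy.
 */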
static bool
radv_cmd_buffer_resize_upload_buf(struct radv_cmd_buffer *cmd_buffer, uint64_t min_needed)
{
   uint64_t new_size;
   struct radeon_winsys_bo *bo = NULL;
   struct radv_cmd_buffer_upload *upload;
   struct radv_device *device = cmd_buffer->device;

   new_size = MAX2(min_needed, 16 * 1024);
   new_size = MAX2(new_size, 2 * cmd_buffer->upload.size);

   VkResult result =
      device->ws->buffer_create(device->ws, new_size, 4096, device->ws->cs_domain(device->ws),
                                RADEON_FLAG_CPU_ACCESS | RADEON_FLAG_NO_INTERPROCESS_SHARING |
                                   RADEON_FLAG_32BIT | RADEON_FLAG_GTT_WC,
                                RADV_BO_PRIORITY_UPLOAD_BUFFER, 0, &bo);

   if (result != VK_SUCCESS) {
      cmd_buffer->record_result = result;
      return false;
   }

   radv_cs_add_buffer(device->ws, cmd_buffer->cs, bo);
   if (cmd_buffer->upload.upload_bo) {
      upload = malloc(sizeof(*upload));

      if (!upload) {
         cmd_buffer->record_result = VK_ERROR_OUT_OF_HOST_MEMORY;
         device->ws->buffer_destroy(device->ws, bo);
         return false;
      }

      memcpy(upload, &cmd_buffer->upload, sizeof(*upload));
      list_add(&upload->list, &cmd_buffer->upload.list);
   }

   cmd_buffer->upload.upload_bo = bo;
   cmd_buffer->upload.size = new_size;
   cmd_buffer->upload.offset = 0;
   cmd_buffer->upload.map = device->ws->buffer_map(cmd_buffer->upload.upload_bo);

   if (!cmd_buffer->upload.map) {
      cmd_buffer->record_result = VK_ERROR_OUT_OF_DEVICE_MEMORY;
      return false;
   }

   return true;
}

bool
radv_cmd_buffer_upload_alloc(struct radv_cmd_buffer *cmd_buffer, unsigned size,
                             unsigned *out_offset, void **ptr)
{
   assert(size % 4 == 0);

   struct radeon_info *rad_info = &cmd_buffer->device->physical_device->rad_info;

   /* Align to the scalar cache line size if it results in this allocation
    * being placed in fewer of them.
    */
   unsigned offset = cmd_buffer->upload.offset;
   unsigned line_size = rad_info->gfx_level >= GFX10 ? 64 : 32;
   unsigned gap = align(offset, line_size) - offset;
   if ((size & (line_size - 1)) > gap)
      offset = align(offset, line_size);

   if (offset + size > cmd_buffer->upload.size) {
      if (!radv_cmd_buffer_resize_upload_buf(cmd_buffer, size))
         return false;
      offset = 0;
   }

   *out_offset = offset;
   *ptr = cmd_buffer->upload.map + offset;

   cmd_buffer->upload.offset = offset + size;
   return true;
}

bool
radv_cmd_buffer_upload_data(struct radv_cmd_buffer *cmd_buffer, unsigned size, const void *data,
                            unsigned *out_offset)
{
   uint8_t *ptr;

   if (!radv_cmd_buffer_upload_alloc(cmd_buffer, size, out_offset, (void **)&ptr))
      return false;
   assert(ptr);

   memcpy(ptr, data, size);
   return true;
}

void
radv_cmd_buffer_trace_emit(struct radv_cmd_buffer *cmd_buffer)
{
   struct radv_device *device = cmd_buffer->device;
   struct radeon_cmdbuf *cs = cmd_buffer->cs;
   uint64_t va;

   va = radv_buffer_get_va(device->trace_bo);
   if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY)
      va += 4;

   ++cmd_buffer->state.trace_id;
   radv_emit_write_data_packet(cmd_buffer, V_370_ME, va, 1, &cmd_buffer->state.trace_id);

   radeon_check_space(cmd_buffer->device->ws, cs, 2);

   radeon_emit(cs, PKT3(PKT3_NOP, 0, 0));
   radeon_emit(cs, AC_ENCODE_TRACE_POINT(cmd_buffer->state.trace_id));
}

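/* The "ACE internal" command stream is a companion compute cmdbuf used when
 * task shaders run on ACE alongside the main GFX queue. The helpers below
 * keep its cache-flush state in sync with the main cmdbuf and manage the
 * GFX<->ACE semaphores that order the two streams.
 */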
static void
radv_ace_internal_barrier(struct radv_cmd_buffer *cmd_buffer, VkPipelineStageFlags2 src_stage_mask,
                          VkPipelineStageFlags2 dst_stage_mask)
{
   /* Update flush bits from the main cmdbuf, except the stage flush. */
   cmd_buffer->ace_internal.flush_bits |=
      cmd_buffer->state.flush_bits & RADV_CMD_FLUSH_ALL_COMPUTE & ~RADV_CMD_FLAG_CS_PARTIAL_FLUSH;

   /* Add stage flush only when necessary. */
   if (src_stage_mask &
       (VK_PIPELINE_STAGE_2_TASK_SHADER_BIT_NV | VK_PIPELINE_STAGE_2_TRANSFER_BIT |
        VK_PIPELINE_STAGE_2_BOTTOM_OF_PIPE_BIT | VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT))
      cmd_buffer->ace_internal.flush_bits |= RADV_CMD_FLAG_CS_PARTIAL_FLUSH;

   /* Block task shaders when we have to wait for CP DMA on the GFX cmdbuf. */
   if (src_stage_mask &
       (VK_PIPELINE_STAGE_2_COPY_BIT | VK_PIPELINE_STAGE_2_CLEAR_BIT |
        VK_PIPELINE_STAGE_2_ALL_TRANSFER_BIT | VK_PIPELINE_STAGE_2_BOTTOM_OF_PIPE_BIT |
        VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT))
      dst_stage_mask |= cmd_buffer->state.dma_is_busy ? VK_PIPELINE_STAGE_2_TASK_SHADER_BIT_NV : 0;

   /* Increment the GFX/ACE semaphore when task shaders are blocked. */
   if (dst_stage_mask &
       (VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT_KHR | VK_PIPELINE_STAGE_2_DRAW_INDIRECT_BIT |
        VK_PIPELINE_STAGE_2_TASK_SHADER_BIT_NV))
      cmd_buffer->ace_internal.sem.gfx2ace_value++;
}

static void
radv_ace_internal_cache_flush(struct radv_cmd_buffer *cmd_buffer)
{
   struct radeon_cmdbuf *ace_cs = cmd_buffer->ace_internal.cs;
   const uint32_t flush_bits = cmd_buffer->ace_internal.flush_bits;
   enum rgp_flush_bits sqtt_flush_bits = 0;

   si_cs_emit_cache_flush(ace_cs, cmd_buffer->device->physical_device->rad_info.gfx_level, NULL, 0,
                          true, flush_bits, &sqtt_flush_bits, 0);

   cmd_buffer->ace_internal.flush_bits = 0;
}

static uint64_t
radv_ace_internal_sem_create(struct radv_cmd_buffer *cmd_buffer)
{
   /* DWORD 0: GFX->ACE semaphore (GFX blocks ACE, i.e. ACE waits for GFX)
    * DWORD 1: ACE->GFX semaphore
    */
   uint64_t sem_init = 0;
   uint32_t va_off = 0;
   if (!radv_cmd_buffer_upload_data(cmd_buffer, sizeof(uint64_t), &sem_init, &va_off)) {
      cmd_buffer->record_result = VK_ERROR_OUT_OF_HOST_MEMORY;
      return 0;
   }

   return radv_buffer_get_va(cmd_buffer->upload.upload_bo) + va_off;
}

static bool
radv_ace_internal_sem_dirty(const struct radv_cmd_buffer *cmd_buffer)
{
   return cmd_buffer->ace_internal.sem.gfx2ace_value !=
          cmd_buffer->ace_internal.sem.emitted_gfx2ace_value;
}

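/* Make the GFX command stream publish the current GFX->ACE semaphore value
 * with a bottom-of-pipe EOP event, lazily creating the semaphore on first
 * use. Returns true if an update was emitted.
 */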
ALWAYS_INLINE static bool
radv_flush_gfx2ace_semaphore(struct radv_cmd_buffer *cmd_buffer)
{
   if (!radv_ace_internal_sem_dirty(cmd_buffer))
      return false;

   if (!cmd_buffer->ace_internal.sem.va) {
      cmd_buffer->ace_internal.sem.va = radv_ace_internal_sem_create(cmd_buffer);
      if (!cmd_buffer->ace_internal.sem.va)
         return false;
   }

   /* GFX writes a value to the semaphore which ACE can wait for. */
   si_cs_emit_write_event_eop(
      cmd_buffer->cs, cmd_buffer->device->physical_device->rad_info.gfx_level,
      radv_cmd_buffer_uses_mec(cmd_buffer), V_028A90_BOTTOM_OF_PIPE_TS, 0, EOP_DST_SEL_MEM,
      EOP_DATA_SEL_VALUE_32BIT, cmd_buffer->ace_internal.sem.va,
      cmd_buffer->ace_internal.sem.gfx2ace_value, cmd_buffer->gfx9_eop_bug_va);

   cmd_buffer->ace_internal.sem.emitted_gfx2ace_value = cmd_buffer->ace_internal.sem.gfx2ace_value;
   return true;
}

ALWAYS_INLINE static void
radv_wait_gfx2ace_semaphore(struct radv_cmd_buffer *cmd_buffer)
{
   assert(cmd_buffer->ace_internal.sem.va);
   struct radeon_cmdbuf *ace_cs = cmd_buffer->ace_internal.cs;
   radeon_check_space(cmd_buffer->device->ws, ace_cs, 7);

   /* ACE waits for the semaphore which GFX wrote. */
   radv_cp_wait_mem(ace_cs, WAIT_REG_MEM_GREATER_OR_EQUAL, cmd_buffer->ace_internal.sem.va,
                    cmd_buffer->ace_internal.sem.gfx2ace_value, 0xffffffff);
}

static struct radeon_cmdbuf *
radv_ace_internal_create(struct radv_cmd_buffer *cmd_buffer)
{
   assert(!cmd_buffer->ace_internal.cs);
   struct radv_device *device = cmd_buffer->device;
   struct radeon_cmdbuf *ace_cs = device->ws->cs_create(device->ws, AMD_IP_COMPUTE);

   if (!ace_cs) {
      cmd_buffer->record_result = VK_ERROR_OUT_OF_HOST_MEMORY;
   }

   return ace_cs;
}

static VkResult
radv_ace_internal_finalize(struct radv_cmd_buffer *cmd_buffer)
{
   assert(cmd_buffer->ace_internal.cs);
   struct radv_device *device = cmd_buffer->device;
   struct radeon_cmdbuf *ace_cs = cmd_buffer->ace_internal.cs;

   /* Emit pending cache flush. */
   radv_ace_internal_cache_flush(cmd_buffer);

   /* Clear the ACE semaphore if it exists.
    * This is necessary in case the same cmd buffer is submitted again in the future.
    */
   if (cmd_buffer->ace_internal.sem.va) {
      struct radeon_cmdbuf *main_cs = cmd_buffer->cs;
      uint64_t gfx2ace_va = cmd_buffer->ace_internal.sem.va;
      uint64_t ace2gfx_va = cmd_buffer->ace_internal.sem.va + 4;

      /* ACE: write 1 to the ACE->GFX semaphore. */
      si_cs_emit_write_event_eop(ace_cs, cmd_buffer->device->physical_device->rad_info.gfx_level,
                                 true, V_028A90_BOTTOM_OF_PIPE_TS, 0, EOP_DST_SEL_MEM,
                                 EOP_DATA_SEL_VALUE_32BIT, ace2gfx_va, 1,
                                 cmd_buffer->gfx9_eop_bug_va);

      /* Wait for ACE to finish, otherwise we may risk writing 0 to the semaphore
       * when ACE is still waiting for it. This may not happen in practice, but
       * better safe than sorry.
       */
      radv_cp_wait_mem(main_cs, WAIT_REG_MEM_GREATER_OR_EQUAL, ace2gfx_va, 1, 0xffffffff);

      /* GFX: clear GFX->ACE and ACE->GFX semaphores. */
      radv_emit_clear_data(cmd_buffer, V_370_ME, gfx2ace_va, 8);
   }

   device->ws->cs_add_buffers(ace_cs, cmd_buffer->cs);
   return device->ws->cs_finalize(ace_cs);
}

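/* Per-draw/dispatch epilogue: emits an SQTT thread-trace marker when thread
 * tracing is active, forces the pipeline idle with RADV_DEBUG_SYNC_SHADERS,
 * and writes a trace point when a trace BO exists.
 */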
static void
radv_cmd_buffer_after_draw(struct radv_cmd_buffer *cmd_buffer, enum radv_cmd_flush_bits flags)
{
   if (unlikely(cmd_buffer->device->thread_trace.bo)) {
      radeon_emit(cmd_buffer->cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
      radeon_emit(cmd_buffer->cs, EVENT_TYPE(V_028A90_THREAD_TRACE_MARKER) | EVENT_INDEX(0));
   }

   if (cmd_buffer->device->instance->debug_flags & RADV_DEBUG_SYNC_SHADERS) {
      enum rgp_flush_bits sqtt_flush_bits = 0;
      assert(flags & (RADV_CMD_FLAG_PS_PARTIAL_FLUSH | RADV_CMD_FLAG_CS_PARTIAL_FLUSH));

      radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 4);

      /* Force wait for graphics or compute engines to be idle. */
      si_cs_emit_cache_flush(cmd_buffer->cs,
                             cmd_buffer->device->physical_device->rad_info.gfx_level,
                             &cmd_buffer->gfx9_fence_idx, cmd_buffer->gfx9_fence_va,
                             radv_cmd_buffer_uses_mec(cmd_buffer), flags, &sqtt_flush_bits,
                             cmd_buffer->gfx9_eop_bug_va);

      if (cmd_buffer->state.graphics_pipeline && (flags & RADV_CMD_FLAG_PS_PARTIAL_FLUSH) &&
          radv_pipeline_has_stage(cmd_buffer->state.graphics_pipeline, MESA_SHADER_TASK)) {
         /* Force wait for compute engines to be idle on the internal cmdbuf. */
         si_cs_emit_cache_flush(cmd_buffer->ace_internal.cs,
                                cmd_buffer->device->physical_device->rad_info.gfx_level, NULL, 0,
                                true, RADV_CMD_FLAG_CS_PARTIAL_FLUSH, &sqtt_flush_bits, 0);
      }
   }

   if (unlikely(cmd_buffer->device->trace_bo))
      radv_cmd_buffer_trace_emit(cmd_buffer);
}

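/* The radv_save_* helpers below write debugging state (the bound pipeline,
 * vertex descriptor pointer, VS prolog and descriptor sets) at fixed offsets
 * into the trace BO so it can be inspected after a GPU hang.
 */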
static void
radv_save_pipeline(struct radv_cmd_buffer *cmd_buffer, struct radv_pipeline *pipeline)
{
   struct radv_device *device = cmd_buffer->device;
   enum amd_ip_type ring;
   uint32_t data[2];
   uint64_t va;

   va = radv_buffer_get_va(device->trace_bo);

   ring = radv_queue_family_to_ring(device->physical_device, cmd_buffer->qf);

   switch (ring) {
   case AMD_IP_GFX:
      va += 8;
      break;
   case AMD_IP_COMPUTE:
      va += 16;
      break;
   default:
      assert(!"invalid IP type");
   }

   uint64_t pipeline_address = (uintptr_t)pipeline;
   data[0] = pipeline_address;
   data[1] = pipeline_address >> 32;

   radv_emit_write_data_packet(cmd_buffer, V_370_ME, va, 2, data);
}

static void
radv_save_vertex_descriptors(struct radv_cmd_buffer *cmd_buffer, uint64_t vb_ptr)
{
   struct radv_device *device = cmd_buffer->device;
   uint32_t data[2];
   uint64_t va;

   va = radv_buffer_get_va(device->trace_bo);
   va += 24;

   data[0] = vb_ptr;
   data[1] = vb_ptr >> 32;

   radv_emit_write_data_packet(cmd_buffer, V_370_ME, va, 2, data);
}

static void
radv_save_vs_prolog(struct radv_cmd_buffer *cmd_buffer, const struct radv_shader_part *prolog)
{
   struct radv_device *device = cmd_buffer->device;
   uint32_t data[2];
   uint64_t va;

   va = radv_buffer_get_va(device->trace_bo);
   va += 32;

   uint64_t prolog_address = (uintptr_t)prolog;
   data[0] = prolog_address;
   data[1] = prolog_address >> 32;

   radv_emit_write_data_packet(cmd_buffer, V_370_ME, va, 2, data);
}

void
radv_set_descriptor_set(struct radv_cmd_buffer *cmd_buffer, VkPipelineBindPoint bind_point,
                        struct radv_descriptor_set *set, unsigned idx)
{
   struct radv_descriptor_state *descriptors_state =
      radv_get_descriptors_state(cmd_buffer, bind_point);

   descriptors_state->sets[idx] = set;

   descriptors_state->valid |= (1u << idx); /* active descriptors */
   descriptors_state->dirty |= (1u << idx);
}

static void
radv_save_descriptors(struct radv_cmd_buffer *cmd_buffer, VkPipelineBindPoint bind_point)
{
   struct radv_descriptor_state *descriptors_state =
      radv_get_descriptors_state(cmd_buffer, bind_point);
   struct radv_device *device = cmd_buffer->device;
   uint32_t data[MAX_SETS * 2] = {0};
   uint64_t va;
   va = radv_buffer_get_va(device->trace_bo) + 40;

   u_foreach_bit(i, descriptors_state->valid)
   {
      struct radv_descriptor_set *set = descriptors_state->sets[i];
      data[i * 2] = (uint64_t)(uintptr_t)set;
      data[i * 2 + 1] = (uint64_t)(uintptr_t)set >> 32;
   }

   radv_emit_write_data_packet(cmd_buffer, V_370_ME, va, MAX_SETS * 2, data);
}

struct radv_userdata_info *
radv_lookup_user_sgpr(struct radv_pipeline *pipeline, gl_shader_stage stage, int idx)
{
   struct radv_shader *shader = radv_get_shader(pipeline, stage);
   return &shader->info.user_sgprs_locs.shader_data[idx];
}

static void
radv_emit_userdata_address(struct radv_device *device, struct radeon_cmdbuf *cs,
                           struct radv_pipeline *pipeline, gl_shader_stage stage, int idx,
                           uint64_t va)
{
   struct radv_userdata_info *loc = radv_lookup_user_sgpr(pipeline, stage, idx);
   uint32_t base_reg = pipeline->user_data_0[stage];
   if (loc->sgpr_idx == -1)
      return;

   assert(loc->num_sgprs == 1);

   radv_emit_shader_pointer(device, cs, base_reg + loc->sgpr_idx * 4, va, false);
}

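/* Emit the user SGPR pointers for all descriptor sets that are both dirty
 * and valid and that the shader stage actually uses, batching runs of
 * consecutive sets into a single shader-pointer packet sequence.
 */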
static void
radv_emit_descriptor_pointers(struct radv_device *device, struct radeon_cmdbuf *cs,
                              struct radv_pipeline *pipeline,
                              struct radv_descriptor_state *descriptors_state,
                              gl_shader_stage stage)
{
   uint32_t sh_base = pipeline->user_data_0[stage];
   struct radv_userdata_locations *locs = &pipeline->shaders[stage]->info.user_sgprs_locs;
   unsigned mask = locs->descriptor_sets_enabled;

   mask &= descriptors_state->dirty & descriptors_state->valid;

   while (mask) {
      int start, count;

      u_bit_scan_consecutive_range(&mask, &start, &count);

      struct radv_userdata_info *loc = &locs->descriptor_sets[start];
      unsigned sh_offset = sh_base + loc->sgpr_idx * 4;

      radv_emit_shader_pointer_head(cs, sh_offset, count, true);
      for (int i = 0; i < count; i++) {
         struct radv_descriptor_set *set = descriptors_state->sets[start + i];

         radv_emit_shader_pointer_body(device, cs, set->header.va, true);
      }
   }
}

/**
 * Convert the user sample locations to hardware sample locations (the values
 * that will be emitted by PA_SC_AA_SAMPLE_LOCS_PIXEL_*).
 */
static void
radv_convert_user_sample_locs(struct radv_sample_locations_state *state, uint32_t x, uint32_t y,
                              VkOffset2D *sample_locs)
{
   uint32_t x_offset = x % state->grid_size.width;
   uint32_t y_offset = y % state->grid_size.height;
   uint32_t num_samples = (uint32_t)state->per_pixel;
   VkSampleLocationEXT *user_locs;
   uint32_t pixel_offset;

   pixel_offset = (x_offset + y_offset * state->grid_size.width) * num_samples;

   assert(pixel_offset <= MAX_SAMPLE_LOCATIONS);
   user_locs = &state->locations[pixel_offset];

   for (uint32_t i = 0; i < num_samples; i++) {
      float shifted_pos_x = user_locs[i].x - 0.5;
      float shifted_pos_y = user_locs[i].y - 0.5;

      int32_t scaled_pos_x = floorf(shifted_pos_x * 16);
      int32_t scaled_pos_y = floorf(shifted_pos_y * 16);

      sample_locs[i].x = CLAMP(scaled_pos_x, -8, 7);
      sample_locs[i].y = CLAMP(scaled_pos_y, -8, 7);
   }
}

/**
 * Compute the PA_SC_AA_SAMPLE_LOCS_PIXEL_* mask based on hardware sample
 * locations.
 */
static void
radv_compute_sample_locs_pixel(uint32_t num_samples, VkOffset2D *sample_locs,
                               uint32_t *sample_locs_pixel)
{
   for (uint32_t i = 0; i < num_samples; i++) {
      uint32_t sample_reg_idx = i / 4;
      uint32_t sample_loc_idx = i % 4;
      int32_t pos_x = sample_locs[i].x;
      int32_t pos_y = sample_locs[i].y;

      uint32_t shift_x = 8 * sample_loc_idx;
      uint32_t shift_y = shift_x + 4;

      sample_locs_pixel[sample_reg_idx] |= (pos_x & 0xf) << shift_x;
      sample_locs_pixel[sample_reg_idx] |= (pos_y & 0xf) << shift_y;
   }
}

/**
 * Compute the PA_SC_CENTROID_PRIORITY_* mask based on the top left hardware
 * sample locations.
 */
static uint64_t
radv_compute_centroid_priority(struct radv_cmd_buffer *cmd_buffer, VkOffset2D *sample_locs,
                               uint32_t num_samples)
{
   uint32_t *centroid_priorities = alloca(num_samples * sizeof(*centroid_priorities));
   uint32_t sample_mask = num_samples - 1;
   uint32_t *distances = alloca(num_samples * sizeof(*distances));
   uint64_t centroid_priority = 0;

   /* Compute the distances from center for each sample. */
   for (int i = 0; i < num_samples; i++) {
      distances[i] = (sample_locs[i].x * sample_locs[i].x) + (sample_locs[i].y * sample_locs[i].y);
   }

   /* Compute the centroid priorities by looking at the distances array. */
   for (int i = 0; i < num_samples; i++) {
      uint32_t min_idx = 0;

      for (int j = 1; j < num_samples; j++) {
         if (distances[j] < distances[min_idx])
            min_idx = j;
      }

      centroid_priorities[i] = min_idx;
      distances[min_idx] = 0xffffffff;
   }

   /* Compute the final centroid priority. */
   for (int i = 0; i < 8; i++) {
      centroid_priority |= centroid_priorities[i & sample_mask] << (i * 4);
   }

   return centroid_priority << 32 | centroid_priority;
}

/**
 * Emit the sample locations that are specified with VK_EXT_sample_locations.
 */
static void
radv_emit_sample_locations(struct radv_cmd_buffer *cmd_buffer)
{
   struct radv_sample_locations_state *sample_location = &cmd_buffer->state.dynamic.sample_location;
   uint32_t num_samples = (uint32_t)sample_location->per_pixel;
   struct radeon_cmdbuf *cs = cmd_buffer->cs;
   uint32_t sample_locs_pixel[4][2] = {0};
   VkOffset2D sample_locs[4][8]; /* 8 is the max. sample count supported */
   uint32_t max_sample_dist = 0;
   uint64_t centroid_priority;

   if (!cmd_buffer->state.dynamic.sample_location.count)
      return;

   /* Convert the user sample locations to hardware sample locations. */
   radv_convert_user_sample_locs(sample_location, 0, 0, sample_locs[0]);
   radv_convert_user_sample_locs(sample_location, 1, 0, sample_locs[1]);
   radv_convert_user_sample_locs(sample_location, 0, 1, sample_locs[2]);
   radv_convert_user_sample_locs(sample_location, 1, 1, sample_locs[3]);

   /* Compute the PA_SC_AA_SAMPLE_LOCS_PIXEL_* mask. */
   for (uint32_t i = 0; i < 4; i++) {
      radv_compute_sample_locs_pixel(num_samples, sample_locs[i], sample_locs_pixel[i]);
   }

   /* Compute the PA_SC_CENTROID_PRIORITY_* mask. */
   centroid_priority = radv_compute_centroid_priority(cmd_buffer, sample_locs[0], num_samples);

   /* Compute the maximum sample distance from the specified locations. */
   for (unsigned i = 0; i < 4; ++i) {
      for (uint32_t j = 0; j < num_samples; j++) {
         VkOffset2D offset = sample_locs[i][j];
         max_sample_dist = MAX2(max_sample_dist, MAX2(abs(offset.x), abs(offset.y)));
      }
   }

   /* Emit the specified user sample locations. */
   switch (num_samples) {
   case 2:
   case 4:
      radeon_set_context_reg(cs, R_028BF8_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_0,
                             sample_locs_pixel[0][0]);
      radeon_set_context_reg(cs, R_028C08_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_0,
                             sample_locs_pixel[1][0]);
      radeon_set_context_reg(cs, R_028C18_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_0,
                             sample_locs_pixel[2][0]);
      radeon_set_context_reg(cs, R_028C28_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_0,
                             sample_locs_pixel[3][0]);
      break;
   case 8:
      radeon_set_context_reg(cs, R_028BF8_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_0,
                             sample_locs_pixel[0][0]);
      radeon_set_context_reg(cs, R_028C08_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_0,
                             sample_locs_pixel[1][0]);
      radeon_set_context_reg(cs, R_028C18_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_0,
                             sample_locs_pixel[2][0]);
      radeon_set_context_reg(cs, R_028C28_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_0,
                             sample_locs_pixel[3][0]);
      radeon_set_context_reg(cs, R_028BFC_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_1,
                             sample_locs_pixel[0][1]);
      radeon_set_context_reg(cs, R_028C0C_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_1,
                             sample_locs_pixel[1][1]);
      radeon_set_context_reg(cs, R_028C1C_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_1,
                             sample_locs_pixel[2][1]);
      radeon_set_context_reg(cs, R_028C2C_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_1,
                             sample_locs_pixel[3][1]);
      break;
   default:
      unreachable("invalid number of samples");
   }

   /* Emit the maximum sample distance and the centroid priority. */
   radeon_set_context_reg_rmw(cs, R_028BE0_PA_SC_AA_CONFIG,
                              S_028BE0_MAX_SAMPLE_DIST(max_sample_dist), ~C_028BE0_MAX_SAMPLE_DIST);

   radeon_set_context_reg_seq(cs, R_028BD4_PA_SC_CENTROID_PRIORITY_0, 2);
   radeon_emit(cs, centroid_priority);
   radeon_emit(cs, centroid_priority >> 32);

   cmd_buffer->state.context_roll_without_scissor_emitted = true;
}

static void
radv_emit_inline_push_consts(struct radv_device *device, struct radeon_cmdbuf *cs,
                             struct radv_pipeline *pipeline, gl_shader_stage stage, int idx,
                             uint32_t *values)
{
   struct radv_userdata_info *loc = radv_lookup_user_sgpr(pipeline, stage, idx);
   uint32_t base_reg = pipeline->user_data_0[stage];
   if (loc->sgpr_idx == -1)
      return;

   radeon_check_space(device->ws, cs, 2 + loc->num_sgprs);

   radeon_set_sh_reg_seq(cs, base_reg + loc->sgpr_idx * 4, loc->num_sgprs);
   radeon_emit_array(cs, values, loc->num_sgprs);
}

static void
radv_update_multisample_state(struct radv_cmd_buffer *cmd_buffer,
                              struct radv_graphics_pipeline *pipeline)
{
   int num_samples = pipeline->ms.num_samples;
   struct radv_graphics_pipeline *old_pipeline = cmd_buffer->state.emitted_graphics_pipeline;

   if (pipeline->base.shaders[MESA_SHADER_FRAGMENT]->info.ps.needs_sample_positions)
      cmd_buffer->sample_positions_needed = true;

   if (old_pipeline && num_samples == old_pipeline->ms.num_samples)
      return;

   radv_emit_default_sample_locations(cmd_buffer->cs, num_samples);

   cmd_buffer->state.context_roll_without_scissor_emitted = true;
}

static void
radv_update_binning_state(struct radv_cmd_buffer *cmd_buffer,
                          struct radv_graphics_pipeline *pipeline)
{
   const struct radv_graphics_pipeline *old_pipeline = cmd_buffer->state.emitted_graphics_pipeline;

   if (pipeline->base.device->physical_device->rad_info.gfx_level < GFX9)
      return;

   if (old_pipeline &&
       old_pipeline->binning.pa_sc_binner_cntl_0 ==
          pipeline->binning.pa_sc_binner_cntl_0)
      return;

   bool binning_flush = false;
   if (cmd_buffer->device->physical_device->rad_info.family == CHIP_VEGA12 ||
       cmd_buffer->device->physical_device->rad_info.family == CHIP_VEGA20 ||
       cmd_buffer->device->physical_device->rad_info.family == CHIP_RAVEN2 ||
       cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX10) {
      binning_flush = !old_pipeline ||
                      G_028C44_BINNING_MODE(old_pipeline->binning.pa_sc_binner_cntl_0) !=
                         G_028C44_BINNING_MODE(pipeline->binning.pa_sc_binner_cntl_0);
   }

   radeon_set_context_reg(cmd_buffer->cs, R_028C44_PA_SC_BINNER_CNTL_0,
                          pipeline->binning.pa_sc_binner_cntl_0 |
                             S_028C44_FLUSH_ON_BINNING_TRANSITION(!!binning_flush));

   cmd_buffer->state.context_roll_without_scissor_emitted = true;
}

static void
radv_emit_shader_prefetch(struct radv_cmd_buffer *cmd_buffer, struct radv_shader *shader)
{
   uint64_t va;

   if (!shader)
      return;

   va = radv_shader_get_va(shader);

   si_cp_dma_prefetch(cmd_buffer, va, shader->code_size);
}

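/* Prefetch the shader binaries (and the vertex buffer descriptors) selected
 * by state->prefetch_L2_mask into L2 via CP DMA. With first_stage_only, only
 * what is needed to start the draw (VS/MS and the VBO descriptors) is
 * prefetched so the draw can begin sooner.
 */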
static void
radv_emit_prefetch_L2(struct radv_cmd_buffer *cmd_buffer,
                      struct radv_graphics_pipeline *pipeline, bool first_stage_only)
{
   struct radv_cmd_state *state = &cmd_buffer->state;
   uint32_t mask = state->prefetch_L2_mask;

   /* Fast prefetch path for starting draws as soon as possible. */
   if (first_stage_only)
      mask &= RADV_PREFETCH_VS | RADV_PREFETCH_VBO_DESCRIPTORS | RADV_PREFETCH_MS;

   if (mask & RADV_PREFETCH_VS)
      radv_emit_shader_prefetch(cmd_buffer, pipeline->base.shaders[MESA_SHADER_VERTEX]);

   if (mask & RADV_PREFETCH_MS)
      radv_emit_shader_prefetch(cmd_buffer, pipeline->base.shaders[MESA_SHADER_MESH]);

   if (mask & RADV_PREFETCH_VBO_DESCRIPTORS)
      si_cp_dma_prefetch(cmd_buffer, state->vb_va, pipeline->vb_desc_alloc_size);

   if (mask & RADV_PREFETCH_TCS)
      radv_emit_shader_prefetch(cmd_buffer, pipeline->base.shaders[MESA_SHADER_TESS_CTRL]);

   if (mask & RADV_PREFETCH_TES)
      radv_emit_shader_prefetch(cmd_buffer, pipeline->base.shaders[MESA_SHADER_TESS_EVAL]);

   if (mask & RADV_PREFETCH_GS) {
      radv_emit_shader_prefetch(cmd_buffer, pipeline->base.shaders[MESA_SHADER_GEOMETRY]);
      if (radv_pipeline_has_gs_copy_shader(&pipeline->base))
         radv_emit_shader_prefetch(cmd_buffer, pipeline->base.gs_copy_shader);
   }

   if (mask & RADV_PREFETCH_PS)
      radv_emit_shader_prefetch(cmd_buffer, pipeline->base.shaders[MESA_SHADER_FRAGMENT]);

   state->prefetch_L2_mask &= ~mask;
}

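/* Program the RB+ (render backend plus) registers: per-MRT down-conversion
 * formats, blend-opt epsilon and blend-opt control, derived from the bound
 * color attachments and the pipeline's color export formats.
 */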
static void
radv_emit_rbplus_state(struct radv_cmd_buffer *cmd_buffer)
{
   if (!cmd_buffer->device->physical_device->rad_info.rbplus_allowed)
      return;

   struct radv_graphics_pipeline *pipeline = cmd_buffer->state.graphics_pipeline;
   const struct radv_subpass *subpass = cmd_buffer->state.subpass;

   unsigned sx_ps_downconvert = 0;
   unsigned sx_blend_opt_epsilon = 0;
   unsigned sx_blend_opt_control = 0;

   for (unsigned i = 0; i < subpass->color_count; ++i) {
      unsigned format, swap;
      bool has_alpha, has_rgb;
      if (subpass->color_attachments[i].attachment == VK_ATTACHMENT_UNUSED) {
         /* We don't set the DISABLE bits, because the HW can't have holes,
          * so the SPI color format is set to 32-bit 1-component. */
         sx_ps_downconvert |= V_028754_SX_RT_EXPORT_32_R << (i * 4);
         continue;
      }

      int idx = subpass->color_attachments[i].attachment;
      if (cmd_buffer->state.attachments) {
         struct radv_color_buffer_info *cb = &cmd_buffer->state.attachments[idx].cb;

         format = cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX11
                     ? G_028C70_FORMAT_GFX11(cb->cb_color_info)
                     : G_028C70_FORMAT_GFX6(cb->cb_color_info);
         swap = G_028C70_COMP_SWAP(cb->cb_color_info);
         has_alpha = cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX11
                        ? !G_028C74_FORCE_DST_ALPHA_1_GFX11(cb->cb_color_attrib)
                        : !G_028C74_FORCE_DST_ALPHA_1_GFX6(cb->cb_color_attrib);
      } else {
         VkFormat fmt = cmd_buffer->state.pass->attachments[idx].format;
         format = radv_translate_colorformat(fmt);
         swap = radv_translate_colorswap(fmt, false);
         has_alpha = vk_format_description(fmt)->swizzle[3] != PIPE_SWIZZLE_1;
      }

      uint32_t spi_format = (pipeline->col_format >> (i * 4)) & 0xf;
      uint32_t colormask = (pipeline->cb_target_mask >> (i * 4)) & 0xf;

      if (format == V_028C70_COLOR_8 || format == V_028C70_COLOR_16 || format == V_028C70_COLOR_32)
         has_rgb = !has_alpha;
      else
         has_rgb = true;

      /* Check the colormask and export format. */
      if (!(colormask & 0x7))
         has_rgb = false;
      if (!(colormask & 0x8))
         has_alpha = false;

      if (spi_format == V_028714_SPI_SHADER_ZERO) {
         has_rgb = false;
         has_alpha = false;
      }

      /* The HW doesn't quite blend correctly with rgb9e5 if we disable the alpha
       * optimization, even though it has no alpha. */
      if (has_rgb && format == V_028C70_COLOR_5_9_9_9)
         has_alpha = true;

      /* Disable value checking for disabled channels. */
      if (!has_rgb)
         sx_blend_opt_control |= S_02875C_MRT0_COLOR_OPT_DISABLE(1) << (i * 4);
      if (!has_alpha)
         sx_blend_opt_control |= S_02875C_MRT0_ALPHA_OPT_DISABLE(1) << (i * 4);

      /* Enable down-conversion for 32bpp and smaller formats. */
      switch (format) {
      case V_028C70_COLOR_8:
      case V_028C70_COLOR_8_8:
      case V_028C70_COLOR_8_8_8_8:
         /* For 1 and 2-channel formats, use the superset thereof. */
         if (spi_format == V_028714_SPI_SHADER_FP16_ABGR ||
             spi_format == V_028714_SPI_SHADER_UINT16_ABGR ||
1400              spi_format == V_028714_SPI_SHADER_SINT16_ABGR) {
1401             sx_ps_downconvert |= V_028754_SX_RT_EXPORT_8_8_8_8 << (i * 4);
1402             sx_blend_opt_epsilon |= V_028758_8BIT_FORMAT << (i * 4);
1403          }
1404          break;
1405 
1406       case V_028C70_COLOR_5_6_5:
1407          if (spi_format == V_028714_SPI_SHADER_FP16_ABGR) {
1408             sx_ps_downconvert |= V_028754_SX_RT_EXPORT_5_6_5 << (i * 4);
1409             sx_blend_opt_epsilon |= V_028758_6BIT_FORMAT << (i * 4);
1410          }
1411          break;
1412 
1413       case V_028C70_COLOR_1_5_5_5:
1414          if (spi_format == V_028714_SPI_SHADER_FP16_ABGR) {
1415             sx_ps_downconvert |= V_028754_SX_RT_EXPORT_1_5_5_5 << (i * 4);
1416             sx_blend_opt_epsilon |= V_028758_5BIT_FORMAT << (i * 4);
1417          }
1418          break;
1419 
1420       case V_028C70_COLOR_4_4_4_4:
1421          if (spi_format == V_028714_SPI_SHADER_FP16_ABGR) {
1422             sx_ps_downconvert |= V_028754_SX_RT_EXPORT_4_4_4_4 << (i * 4);
1423             sx_blend_opt_epsilon |= V_028758_4BIT_FORMAT << (i * 4);
1424          }
1425          break;
1426 
1427       case V_028C70_COLOR_32:
1428          if (swap == V_028C70_SWAP_STD && spi_format == V_028714_SPI_SHADER_32_R)
1429             sx_ps_downconvert |= V_028754_SX_RT_EXPORT_32_R << (i * 4);
1430          else if (swap == V_028C70_SWAP_ALT_REV && spi_format == V_028714_SPI_SHADER_32_AR)
1431             sx_ps_downconvert |= V_028754_SX_RT_EXPORT_32_A << (i * 4);
1432          break;
1433 
1434       case V_028C70_COLOR_16:
1435       case V_028C70_COLOR_16_16:
1436          /* For 1-channel formats, use the superset thereof. */
1437          if (spi_format == V_028714_SPI_SHADER_UNORM16_ABGR ||
1438              spi_format == V_028714_SPI_SHADER_SNORM16_ABGR ||
1439              spi_format == V_028714_SPI_SHADER_UINT16_ABGR ||
1440              spi_format == V_028714_SPI_SHADER_SINT16_ABGR) {
1441             if (swap == V_028C70_SWAP_STD || swap == V_028C70_SWAP_STD_REV)
1442                sx_ps_downconvert |= V_028754_SX_RT_EXPORT_16_16_GR << (i * 4);
1443             else
1444                sx_ps_downconvert |= V_028754_SX_RT_EXPORT_16_16_AR << (i * 4);
1445          }
1446          break;
1447 
1448       case V_028C70_COLOR_10_11_11:
1449          if (spi_format == V_028714_SPI_SHADER_FP16_ABGR)
1450             sx_ps_downconvert |= V_028754_SX_RT_EXPORT_10_11_11 << (i * 4);
1451          break;
1452 
1453       case V_028C70_COLOR_2_10_10_10:
1454          if (spi_format == V_028714_SPI_SHADER_FP16_ABGR) {
1455             sx_ps_downconvert |= V_028754_SX_RT_EXPORT_2_10_10_10 << (i * 4);
1456             sx_blend_opt_epsilon |= V_028758_10BIT_FORMAT << (i * 4);
1457          }
1458          break;
1459       case V_028C70_COLOR_5_9_9_9:
1460          if (spi_format == V_028714_SPI_SHADER_FP16_ABGR)
1461             sx_ps_downconvert |= V_028754_SX_RT_EXPORT_9_9_9_E5 << (i * 4);
1462          break;
1463       }
1464    }
1465 
1466    /* Do not set the DISABLE bits for the unused attachments, as that
1467     * breaks dual source blending in SkQP and does not seem to improve
1468     * performance. */
1469 
1470    if (sx_ps_downconvert == cmd_buffer->state.last_sx_ps_downconvert &&
1471        sx_blend_opt_epsilon == cmd_buffer->state.last_sx_blend_opt_epsilon &&
1472        sx_blend_opt_control == cmd_buffer->state.last_sx_blend_opt_control)
1473       return;
1474 
1475    radeon_set_context_reg_seq(cmd_buffer->cs, R_028754_SX_PS_DOWNCONVERT, 3);
1476    radeon_emit(cmd_buffer->cs, sx_ps_downconvert);
1477    radeon_emit(cmd_buffer->cs, sx_blend_opt_epsilon);
1478    radeon_emit(cmd_buffer->cs, sx_blend_opt_control);
1479 
1480    cmd_buffer->state.context_roll_without_scissor_emitted = true;
1481 
1482    cmd_buffer->state.last_sx_ps_downconvert = sx_ps_downconvert;
1483    cmd_buffer->state.last_sx_blend_opt_epsilon = sx_blend_opt_epsilon;
1484    cmd_buffer->state.last_sx_blend_opt_control = sx_blend_opt_control;
1485 }
1486 
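/* With primitive batch binning (PBB) the hardware accumulates draws into
 * bins; a BREAK_BATCH event forces the current batch to flush. This is
 * needed when the fragment shader or the color write mask changes while
 * the binning settings allow more than one context/persistent state per
 * bin, since otherwise draws with different PS state could share a bin.
 */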
1487 static void
1488 radv_emit_batch_break_on_new_ps(struct radv_cmd_buffer *cmd_buffer)
1489 {
1490    if (!cmd_buffer->device->pbb_allowed)
1491       return;
1492 
1493    struct radv_binning_settings settings =
1494       radv_get_binning_settings(cmd_buffer->device->physical_device);
1495    bool break_for_new_ps =
1496       (!cmd_buffer->state.emitted_graphics_pipeline ||
1497        cmd_buffer->state.emitted_graphics_pipeline->base.shaders[MESA_SHADER_FRAGMENT] !=
1498           cmd_buffer->state.graphics_pipeline->base.shaders[MESA_SHADER_FRAGMENT]) &&
1499       (settings.context_states_per_bin > 1 || settings.persistent_states_per_bin > 1);
1500    bool break_for_new_cb_target_mask =
1501       (cmd_buffer->state.dirty & RADV_CMD_DIRTY_DYNAMIC_COLOR_WRITE_ENABLE) &&
1502       settings.context_states_per_bin > 1;
1503 
1504    if (!break_for_new_ps && !break_for_new_cb_target_mask)
1505       return;
1506 
1507    radeon_emit(cmd_buffer->cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
1508    radeon_emit(cmd_buffer->cs, EVENT_TYPE(V_028A90_BREAK_BATCH) | EVENT_INDEX(0));
1509 }
1510 
1511 static void
1512 radv_emit_graphics_pipeline(struct radv_cmd_buffer *cmd_buffer)
1513 {
1514    struct radv_graphics_pipeline *pipeline = cmd_buffer->state.graphics_pipeline;
1515 
1516    if (cmd_buffer->state.emitted_graphics_pipeline == pipeline)
1517       return;
1518 
1519    radv_update_multisample_state(cmd_buffer, pipeline);
1520    radv_update_binning_state(cmd_buffer, pipeline);
1521 
1522    cmd_buffer->scratch_size_per_wave_needed =
1523       MAX2(cmd_buffer->scratch_size_per_wave_needed, pipeline->base.scratch_bytes_per_wave);
1524    cmd_buffer->scratch_waves_wanted = MAX2(cmd_buffer->scratch_waves_wanted, pipeline->base.max_waves);
1525 
1526    if (!cmd_buffer->state.emitted_graphics_pipeline ||
1527        cmd_buffer->state.emitted_graphics_pipeline->negative_one_to_one != pipeline->negative_one_to_one ||
1528        cmd_buffer->state.emitted_graphics_pipeline->depth_clamp_mode != pipeline->depth_clamp_mode)
1529       cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_VIEWPORT;
1530 
1531    if (!cmd_buffer->state.emitted_graphics_pipeline ||
1532        radv_rast_prim_is_points_or_lines(cmd_buffer->state.emitted_graphics_pipeline->rast_prim) != radv_rast_prim_is_points_or_lines(pipeline->rast_prim) ||
1533        cmd_buffer->state.emitted_graphics_pipeline->line_width != pipeline->line_width)
1534       cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_SCISSOR;
1535 
1536    if (!cmd_buffer->state.emitted_graphics_pipeline ||
1537        cmd_buffer->state.emitted_graphics_pipeline->pa_su_sc_mode_cntl != pipeline->pa_su_sc_mode_cntl)
1538       cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_CULL_MODE |
1539                                  RADV_CMD_DIRTY_DYNAMIC_FRONT_FACE |
1540                                  RADV_CMD_DIRTY_DYNAMIC_DEPTH_BIAS;
1541 
1542    if (!cmd_buffer->state.emitted_graphics_pipeline ||
1543        cmd_buffer->state.emitted_graphics_pipeline->pa_cl_clip_cntl != pipeline->pa_cl_clip_cntl)
1544       cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_RASTERIZER_DISCARD_ENABLE;
1545 
1546    if (!cmd_buffer->state.emitted_graphics_pipeline ||
1547        cmd_buffer->state.emitted_graphics_pipeline->cb_color_control != pipeline->cb_color_control)
1548       cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_LOGIC_OP;
1549 
1550    if (!cmd_buffer->state.emitted_graphics_pipeline)
1551       cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_PRIMITIVE_TOPOLOGY |
1552                                  RADV_CMD_DIRTY_DYNAMIC_DEPTH_BIAS |
1553                                  RADV_CMD_DIRTY_DYNAMIC_DEPTH_BOUNDS |
1554                                  RADV_CMD_DIRTY_DYNAMIC_PRIMITIVE_RESTART_ENABLE;
1555 
1556    if (!cmd_buffer->state.emitted_graphics_pipeline ||
1557        cmd_buffer->state.emitted_graphics_pipeline->db_depth_control != pipeline->db_depth_control)
1558       cmd_buffer->state.dirty |=
1559          RADV_CMD_DIRTY_DYNAMIC_DEPTH_TEST_ENABLE | RADV_CMD_DIRTY_DYNAMIC_DEPTH_WRITE_ENABLE |
1560          RADV_CMD_DIRTY_DYNAMIC_DEPTH_COMPARE_OP | RADV_CMD_DIRTY_DYNAMIC_DEPTH_BOUNDS_TEST_ENABLE |
1561          RADV_CMD_DIRTY_DYNAMIC_STENCIL_TEST_ENABLE | RADV_CMD_DIRTY_DYNAMIC_STENCIL_OP;
1562 
1563    if (!cmd_buffer->state.emitted_graphics_pipeline)
1564       cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_STENCIL_OP;
1565 
1566    if (!cmd_buffer->state.emitted_graphics_pipeline ||
1567        cmd_buffer->state.emitted_graphics_pipeline->cb_target_mask != pipeline->cb_target_mask) {
1568       cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_COLOR_WRITE_ENABLE;
1569    }
1570 
1571    radeon_emit_array(cmd_buffer->cs, pipeline->base.cs.buf, pipeline->base.cs.cdw);
1572 
1573    if (pipeline->has_ngg_culling &&
1574        pipeline->last_vgt_api_stage != MESA_SHADER_GEOMETRY &&
1575        !cmd_buffer->state.last_nggc_settings) {
1576       /* The already emitted RSRC2 contains the LDS required for NGG culling.
1577        * Culling is currently disabled, so re-emit RSRC2 to reduce LDS usage.
1578        * API GS always needs LDS, so this isn't useful there.
1579        */
1580       struct radv_shader *v = pipeline->base.shaders[pipeline->last_vgt_api_stage];
1581       radeon_set_sh_reg(cmd_buffer->cs, R_00B22C_SPI_SHADER_PGM_RSRC2_GS,
1582                         (v->config.rsrc2 & C_00B22C_LDS_SIZE) |
1583                         S_00B22C_LDS_SIZE(v->info.num_lds_blocks_when_not_culling));
1584    }
1585 
1586    if (!cmd_buffer->state.emitted_graphics_pipeline ||
1587        cmd_buffer->state.emitted_graphics_pipeline->base.ctx_cs.cdw != pipeline->base.ctx_cs.cdw ||
1588        cmd_buffer->state.emitted_graphics_pipeline->base.ctx_cs_hash != pipeline->base.ctx_cs_hash ||
1589        memcmp(cmd_buffer->state.emitted_graphics_pipeline->base.ctx_cs.buf, pipeline->base.ctx_cs.buf,
1590               pipeline->base.ctx_cs.cdw * 4)) {
1591       radeon_emit_array(cmd_buffer->cs, pipeline->base.ctx_cs.buf, pipeline->base.ctx_cs.cdw);
1592       cmd_buffer->state.context_roll_without_scissor_emitted = true;
1593    }
1594 
1595    radv_emit_batch_break_on_new_ps(cmd_buffer);
1596 
1597    radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, pipeline->base.slab_bo);
1598 
1599    if (unlikely(cmd_buffer->device->trace_bo))
1600       radv_save_pipeline(cmd_buffer, &pipeline->base);
1601 
1602    cmd_buffer->state.emitted_graphics_pipeline = pipeline;
1603 
1604    cmd_buffer->state.dirty &= ~RADV_CMD_DIRTY_PIPELINE;
1605 }
1606 
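/* Depth viewport math: the xform scale/translate are derived for the Vulkan
 * [0,1] NDC depth range, i.e. zs = z * scale + translate with
 * scale = maxDepth - minDepth and translate = minDepth. With
 * VK_EXT_depth_clip_control's [-1,1] range the same mapping becomes
 *
 *    zs = z * (scale * 0.5) + (translate + maxDepth) * 0.5
 *
 * which is exactly the halving applied below when negative_one_to_one is
 * set.
 */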
1607 static void
1608 radv_emit_viewport(struct radv_cmd_buffer *cmd_buffer)
1609 {
1610    const struct radv_graphics_pipeline *pipeline = cmd_buffer->state.graphics_pipeline;
1611    const struct radv_viewport_state *viewport = &cmd_buffer->state.dynamic.viewport;
1612    int i;
1613    const unsigned count = viewport->count;
1614 
1615    assert(count);
1616    radeon_set_context_reg_seq(cmd_buffer->cs, R_02843C_PA_CL_VPORT_XSCALE, count * 6);
1617 
1618    for (i = 0; i < count; i++) {
1619       radeon_emit(cmd_buffer->cs, fui(viewport->xform[i].scale[0]));
1620       radeon_emit(cmd_buffer->cs, fui(viewport->xform[i].translate[0]));
1621       radeon_emit(cmd_buffer->cs, fui(viewport->xform[i].scale[1]));
1622       radeon_emit(cmd_buffer->cs, fui(viewport->xform[i].translate[1]));
1623 
1624       double scale_z, translate_z;
1625       if (pipeline->negative_one_to_one) {
1626          scale_z = viewport->xform[i].scale[2] * 0.5f;
1627          translate_z = (viewport->xform[i].translate[2] + viewport->viewports[i].maxDepth) * 0.5f;
1628       } else {
1629          scale_z = viewport->xform[i].scale[2];
1630          translate_z = viewport->xform[i].translate[2];
1632       }
1633       radeon_emit(cmd_buffer->cs, fui(scale_z));
1634       radeon_emit(cmd_buffer->cs, fui(translate_z));
1635    }
1636 
1637    radeon_set_context_reg_seq(cmd_buffer->cs, R_0282D0_PA_SC_VPORT_ZMIN_0, count * 2);
1638    for (i = 0; i < count; i++) {
1639       float zmin, zmax;
1640 
1641       if (pipeline->depth_clamp_mode == RADV_DEPTH_CLAMP_MODE_ZERO_TO_ONE) {
1642          zmin = 0.0f;
1643          zmax = 1.0f;
1644       } else {
1645          zmin = MIN2(viewport->viewports[i].minDepth, viewport->viewports[i].maxDepth);
1646          zmax = MAX2(viewport->viewports[i].minDepth, viewport->viewports[i].maxDepth);
1647       }
1648 
1649       radeon_emit(cmd_buffer->cs, fui(zmin));
1650       radeon_emit(cmd_buffer->cs, fui(zmax));
1651    }
1652 }
1653 
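/* The guardband computed by si_write_scissors() depends on the rasterized
 * primitive type (points and lines need a tighter guardband than
 * triangles), so the dynamic topology is only consulted when no TES/GS/MS
 * stage overrides the output primitive type.
 */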
1654 void
1655 radv_write_scissors(struct radv_cmd_buffer *cmd_buffer, struct radeon_cmdbuf *cs)
1656 {
1657    struct radv_graphics_pipeline *pipeline = cmd_buffer->state.graphics_pipeline;
1658    uint32_t count = cmd_buffer->state.dynamic.scissor.count;
1659    unsigned rast_prim;
1660 
1661    if (!(pipeline->dynamic_states & RADV_DYNAMIC_PRIMITIVE_TOPOLOGY) ||
1662        (pipeline->active_stages & (VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT |
1663                                    VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT |
1664                                    VK_SHADER_STAGE_GEOMETRY_BIT |
1665                                    VK_SHADER_STAGE_MESH_BIT_NV))) {
1666       /* Ignore dynamic primitive topology for TES/GS/MS stages. */
1667       rast_prim = pipeline->rast_prim;
1668    } else {
1669       rast_prim = si_conv_prim_to_gs_out(cmd_buffer->state.dynamic.primitive_topology);
1670    }
1671 
1672    si_write_scissors(cs, 0, count, cmd_buffer->state.dynamic.scissor.scissors,
1673                      cmd_buffer->state.dynamic.viewport.viewports, rast_prim,
1674                      cmd_buffer->state.dynamic.line_width);
1675 }
1676 
1677 static void
1678 radv_emit_scissor(struct radv_cmd_buffer *cmd_buffer)
1679 {
1680    radv_write_scissors(cmd_buffer, cmd_buffer->cs);
1681 
1682    cmd_buffer->state.context_roll_without_scissor_emitted = false;
1683 }
1684 
1685 static void
1686 radv_emit_discard_rectangle(struct radv_cmd_buffer *cmd_buffer)
1687 {
1688    if (!cmd_buffer->state.dynamic.discard_rectangle.count)
1689       return;
1690 
1691    radeon_set_context_reg_seq(cmd_buffer->cs, R_028210_PA_SC_CLIPRECT_0_TL,
1692                               cmd_buffer->state.dynamic.discard_rectangle.count * 2);
1693    for (unsigned i = 0; i < cmd_buffer->state.dynamic.discard_rectangle.count; ++i) {
1694       VkRect2D rect = cmd_buffer->state.dynamic.discard_rectangle.rectangles[i];
1695       radeon_emit(cmd_buffer->cs, S_028210_TL_X(rect.offset.x) | S_028210_TL_Y(rect.offset.y));
1696       radeon_emit(cmd_buffer->cs, S_028214_BR_X(rect.offset.x + rect.extent.width) |
1697                                      S_028214_BR_Y(rect.offset.y + rect.extent.height));
1698    }
1699 }
1700 
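/* PA_SU_LINE_CNTL.WIDTH holds the line half-width in 12.4 fixed point, so
 * the full API width is multiplied by 8 (width / 2 * 16); e.g. a 2.0 pixel
 * line is programmed as 16.
 */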
1701 static void
1702 radv_emit_line_width(struct radv_cmd_buffer *cmd_buffer)
1703 {
1704    unsigned width = cmd_buffer->state.dynamic.line_width * 8;
1705 
1706    radeon_set_context_reg(cmd_buffer->cs, R_028A08_PA_SU_LINE_CNTL,
1707                           S_028A08_WIDTH(CLAMP(width, 0, 0xFFFF)));
1708 }
1709 
1710 static void
1711 radv_emit_blend_constants(struct radv_cmd_buffer *cmd_buffer)
1712 {
1713    struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
1714 
1715    radeon_set_context_reg_seq(cmd_buffer->cs, R_028414_CB_BLEND_RED, 4);
1716    radeon_emit_array(cmd_buffer->cs, (uint32_t *)d->blend_constants, 4);
1717 }
1718 
1719 static void
1720 radv_emit_stencil(struct radv_cmd_buffer *cmd_buffer)
1721 {
1722    struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
1723 
1724    radeon_set_context_reg_seq(cmd_buffer->cs, R_028430_DB_STENCILREFMASK, 2);
1725    radeon_emit(cmd_buffer->cs, S_028430_STENCILTESTVAL(d->stencil_reference.front) |
1726                                   S_028430_STENCILMASK(d->stencil_compare_mask.front) |
1727                                   S_028430_STENCILWRITEMASK(d->stencil_write_mask.front) |
1728                                   S_028430_STENCILOPVAL(1));
1729    radeon_emit(cmd_buffer->cs, S_028434_STENCILTESTVAL_BF(d->stencil_reference.back) |
1730                                   S_028434_STENCILMASK_BF(d->stencil_compare_mask.back) |
1731                                   S_028434_STENCILWRITEMASK_BF(d->stencil_write_mask.back) |
1732                                   S_028434_STENCILOPVAL_BF(1));
1733 }
1734 
1735 static void
1736 radv_emit_depth_bounds(struct radv_cmd_buffer *cmd_buffer)
1737 {
1738    struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
1739 
1740    radeon_set_context_reg_seq(cmd_buffer->cs, R_028020_DB_DEPTH_BOUNDS_MIN, 2);
1741    radeon_emit(cmd_buffer->cs, fui(d->depth_bounds.min));
1742    radeon_emit(cmd_buffer->cs, fui(d->depth_bounds.max));
1743 }
1744 
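/* The POLY_OFFSET scale registers are in 1/16th-of-a-slope units, so the
 * API slope factor is pre-multiplied by 16. Vulkan has no per-face depth
 * bias, hence the same scale/offset pair is written for front and back.
 */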
1745 static void
1746 radv_emit_depth_bias(struct radv_cmd_buffer *cmd_buffer)
1747 {
1748    struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
1749    unsigned slope = fui(d->depth_bias.slope * 16.0f);
1750 
1751    radeon_set_context_reg_seq(cmd_buffer->cs, R_028B7C_PA_SU_POLY_OFFSET_CLAMP, 5);
1752    radeon_emit(cmd_buffer->cs, fui(d->depth_bias.clamp)); /* CLAMP */
1753    radeon_emit(cmd_buffer->cs, slope);                    /* FRONT SCALE */
1754    radeon_emit(cmd_buffer->cs, fui(d->depth_bias.bias));  /* FRONT OFFSET */
1755    radeon_emit(cmd_buffer->cs, slope);                    /* BACK SCALE */
1756    radeon_emit(cmd_buffer->cs, fui(d->depth_bias.bias));  /* BACK OFFSET */
1757 }
1758 
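/* AUTO_RESET_CNTL selects when the stipple pattern restarts: 1 resets it
 * per line, 2 per packet, so for line strips the pattern continues across
 * the segments of a single strip, as the spec requires.
 */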
1759 static void
1760 radv_emit_line_stipple(struct radv_cmd_buffer *cmd_buffer)
1761 {
1762    struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
1763    uint32_t auto_reset_cntl = 1;
1764 
1765    if (d->primitive_topology == V_008958_DI_PT_LINESTRIP)
1766       auto_reset_cntl = 2;
1767 
1768    radeon_set_context_reg(cmd_buffer->cs, R_028A0C_PA_SC_LINE_STIPPLE,
1769                           S_028A0C_LINE_PATTERN(d->line_stipple.pattern) |
1770                              S_028A0C_REPEAT_COUNT(d->line_stipple.factor - 1) |
1771                              S_028A0C_AUTO_RESET_CNTL(auto_reset_cntl));
1772 }
1773 
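/* The sid.h register helpers follow a fixed pattern: S_<reg>_<FIELD>(v)
 * packs a value into a field and C_<reg>_<FIELD> is the complement mask
 * that clears it. Rewriting one field while keeping the pipeline-provided
 * rest of the register therefore looks like:
 *
 *    reg &= C_028814_CULL_FRONT;     // clear the field
 *    reg |= S_028814_CULL_FRONT(1);  // refill it from dynamic state
 */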
1774 uint32_t
1775 radv_get_pa_su_sc_mode_cntl(const struct radv_cmd_buffer *cmd_buffer)
1776 {
1777    unsigned pa_su_sc_mode_cntl = cmd_buffer->state.graphics_pipeline->pa_su_sc_mode_cntl;
1778    const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
1779 
1780    pa_su_sc_mode_cntl &= C_028814_CULL_FRONT &
1781                          C_028814_CULL_BACK &
1782                          C_028814_FACE &
1783                          C_028814_POLY_OFFSET_FRONT_ENABLE &
1784                          C_028814_POLY_OFFSET_BACK_ENABLE &
1785                          C_028814_POLY_OFFSET_PARA_ENABLE;
1786 
1787    pa_su_sc_mode_cntl |= S_028814_CULL_FRONT(!!(d->cull_mode & VK_CULL_MODE_FRONT_BIT)) |
1788                          S_028814_CULL_BACK(!!(d->cull_mode & VK_CULL_MODE_BACK_BIT)) |
1789                          S_028814_FACE(d->front_face) |
1790                          S_028814_POLY_OFFSET_FRONT_ENABLE(d->depth_bias_enable) |
1791                          S_028814_POLY_OFFSET_BACK_ENABLE(d->depth_bias_enable) |
1792                          S_028814_POLY_OFFSET_PARA_ENABLE(d->depth_bias_enable);
1793    return pa_su_sc_mode_cntl;
1794 }
1795 
1796 static void
1797 radv_emit_culling(struct radv_cmd_buffer *cmd_buffer, uint64_t states)
1798 {
1799    unsigned pa_su_sc_mode_cntl = radv_get_pa_su_sc_mode_cntl(cmd_buffer);
1800 
1801    radeon_set_context_reg(cmd_buffer->cs, R_028814_PA_SU_SC_MODE_CNTL, pa_su_sc_mode_cntl);
1802 }
1803 
1804 static void
1805 radv_emit_primitive_topology(struct radv_cmd_buffer *cmd_buffer)
1806 {
1807    struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
1808 
1809    assert(!cmd_buffer->state.mesh_shading);
1810 
1811    if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX7) {
1812       radeon_set_uconfig_reg_idx(cmd_buffer->device->physical_device, cmd_buffer->cs,
1813                                  R_030908_VGT_PRIMITIVE_TYPE, 1, d->primitive_topology);
1814    } else {
1815       radeon_set_config_reg(cmd_buffer->cs, R_008958_VGT_PRIMITIVE_TYPE, d->primitive_topology);
1816    }
1817 }
1818 
1819 static void
1820 radv_emit_depth_control(struct radv_cmd_buffer *cmd_buffer, uint64_t states)
1821 {
1822    unsigned db_depth_control = cmd_buffer->state.graphics_pipeline->db_depth_control;
1823    struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
1824 
1825    db_depth_control &= C_028800_Z_ENABLE &
1826                        C_028800_Z_WRITE_ENABLE &
1827                        C_028800_ZFUNC &
1828                        C_028800_DEPTH_BOUNDS_ENABLE &
1829                        C_028800_STENCIL_ENABLE &
1830                        C_028800_BACKFACE_ENABLE &
1831                        C_028800_STENCILFUNC &
1832                        C_028800_STENCILFUNC_BF;
1833 
1834    db_depth_control |= S_028800_Z_ENABLE(d->depth_test_enable ? 1 : 0) |
1835                        S_028800_Z_WRITE_ENABLE(d->depth_write_enable ? 1 : 0) |
1836                        S_028800_ZFUNC(d->depth_compare_op) |
1837                        S_028800_DEPTH_BOUNDS_ENABLE(d->depth_bounds_test_enable ? 1 : 0) |
1838                        S_028800_STENCIL_ENABLE(d->stencil_test_enable ? 1 : 0) |
1839                        S_028800_BACKFACE_ENABLE(d->stencil_test_enable ? 1 : 0) |
1840                        S_028800_STENCILFUNC(d->stencil_op.front.compare_op) |
1841                        S_028800_STENCILFUNC_BF(d->stencil_op.back.compare_op);
1842 
1843    radeon_set_context_reg(cmd_buffer->cs, R_028800_DB_DEPTH_CONTROL, db_depth_control);
1844 }
1845 
1846 static void
1847 radv_emit_stencil_control(struct radv_cmd_buffer *cmd_buffer)
1848 {
1849    struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
1850 
1851    radeon_set_context_reg(
1852       cmd_buffer->cs, R_02842C_DB_STENCIL_CONTROL,
1853       S_02842C_STENCILFAIL(si_translate_stencil_op(d->stencil_op.front.fail_op)) |
1854          S_02842C_STENCILZPASS(si_translate_stencil_op(d->stencil_op.front.pass_op)) |
1855          S_02842C_STENCILZFAIL(si_translate_stencil_op(d->stencil_op.front.depth_fail_op)) |
1856          S_02842C_STENCILFAIL_BF(si_translate_stencil_op(d->stencil_op.back.fail_op)) |
1857          S_02842C_STENCILZPASS_BF(si_translate_stencil_op(d->stencil_op.back.pass_op)) |
1858          S_02842C_STENCILZFAIL_BF(si_translate_stencil_op(d->stencil_op.back.depth_fail_op)));
1859 }
1860 
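/* VRS rates are combined in a fixed order: the per-draw rate, then the
 * vertex/primitive rate, then the HTILE (attachment) rate, each step using
 * one of the combiner ops (keep/passthrough, replace, min, max). RATE_X/Y
 * encode log2 of the fragment size, so 0 means 1x and 1 means 2x per axis.
 */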
1861 static void
1862 radv_emit_fragment_shading_rate(struct radv_cmd_buffer *cmd_buffer)
1863 {
1864    struct radv_graphics_pipeline *pipeline = cmd_buffer->state.graphics_pipeline;
1865    const struct radv_subpass *subpass = cmd_buffer->state.subpass;
1866    struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
1867    uint32_t rate_x = MIN2(2, d->fragment_shading_rate.size.width) - 1;
1868    uint32_t rate_y = MIN2(2, d->fragment_shading_rate.size.height) - 1;
1869    uint32_t pa_cl_vrs_cntl = pipeline->vrs.pa_cl_vrs_cntl;
1870    uint32_t pipeline_comb_mode = d->fragment_shading_rate.combiner_ops[0];
1871    uint32_t htile_comb_mode = d->fragment_shading_rate.combiner_ops[1];
1872 
1873    assert(cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX10_3);
1874 
1875    if (subpass && !subpass->vrs_attachment) {
1876       /* When the current subpass has no VRS attachment, the VRS rates are expected to be 1x1, so we
1877        * can cheat by tweaking the different combiner modes.
1878        */
1879       switch (htile_comb_mode) {
1880       case VK_FRAGMENT_SHADING_RATE_COMBINER_OP_MIN_KHR:
1881          /* The result of min(A, 1x1) is always 1x1. */
1882          FALLTHROUGH;
1883       case VK_FRAGMENT_SHADING_RATE_COMBINER_OP_REPLACE_KHR:
1884          /* Force the per-draw VRS rate to 1x1. */
1885          rate_x = rate_y = 0;
1886 
1887          /* As the result of min(A, 1x1) or replace(A, 1x1) are always 1x1, set the vertex rate
1888           * combiner mode as passthrough.
1889           */
1890          pipeline_comb_mode = V_028848_VRS_COMB_MODE_PASSTHRU;
1891          break;
1892       case VK_FRAGMENT_SHADING_RATE_COMBINER_OP_MAX_KHR:
1893          /* The result of max(A, 1x1) is always A. */
1894          FALLTHROUGH;
1895       case VK_FRAGMENT_SHADING_RATE_COMBINER_OP_KEEP_KHR:
1896          /* Nothing to do here because the SAMPLE_ITER combiner mode should already be passthrough. */
1897          break;
1898       default:
1899          break;
1900       }
1901    }
1902 
1903    /* Emit per-draw VRS rate which is the first combiner. */
1904    radeon_set_uconfig_reg(cmd_buffer->cs, R_03098C_GE_VRS_RATE,
1905                           S_03098C_RATE_X(rate_x) | S_03098C_RATE_Y(rate_y));
1906 
1907    /* VERTEX_RATE_COMBINER_MODE controls the combiner mode between the
1908     * draw rate and the vertex rate.
1909     */
1910    if (cmd_buffer->state.mesh_shading) {
1911       pa_cl_vrs_cntl |= S_028848_VERTEX_RATE_COMBINER_MODE(V_028848_VRS_COMB_MODE_PASSTHRU) |
1912                         S_028848_PRIMITIVE_RATE_COMBINER_MODE(pipeline_comb_mode);
1913    } else {
1914       pa_cl_vrs_cntl |= S_028848_VERTEX_RATE_COMBINER_MODE(pipeline_comb_mode) |
1915                         S_028848_PRIMITIVE_RATE_COMBINER_MODE(V_028848_VRS_COMB_MODE_PASSTHRU);
1916    }
1917 
1918    /* HTILE_RATE_COMBINER_MODE controls the combiner mode between the primitive rate and the HTILE
1919     * rate.
1920     */
1921    pa_cl_vrs_cntl |= S_028848_HTILE_RATE_COMBINER_MODE(htile_comb_mode);
1922 
1923    radeon_set_context_reg(cmd_buffer->cs, R_028848_PA_CL_VRS_CNTL, pa_cl_vrs_cntl);
1924 }
1925 
1926 static void
1927 radv_emit_primitive_restart_enable(struct radv_cmd_buffer *cmd_buffer)
1928 {
1929    struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
1930 
1931    if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX11) {
1932       radeon_set_uconfig_reg(cmd_buffer->cs, R_03092C_GE_MULTI_PRIM_IB_RESET_EN,
1933                              d->primitive_restart_enable);
1934    } else if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX9) {
1935       radeon_set_uconfig_reg(cmd_buffer->cs, R_03092C_VGT_MULTI_PRIM_IB_RESET_EN,
1936                              d->primitive_restart_enable);
1937    } else {
1938       radeon_set_context_reg(cmd_buffer->cs, R_028A94_VGT_MULTI_PRIM_IB_RESET_EN,
1939                              d->primitive_restart_enable);
1940    }
1941 }
1942 
1943 static void
1944 radv_emit_rasterizer_discard_enable(struct radv_cmd_buffer *cmd_buffer)
1945 {
1946    unsigned pa_cl_clip_cntl = cmd_buffer->state.graphics_pipeline->pa_cl_clip_cntl;
1947    struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
1948 
1949    pa_cl_clip_cntl &= C_028810_DX_RASTERIZATION_KILL;
1950    pa_cl_clip_cntl |= S_028810_DX_RASTERIZATION_KILL(d->rasterizer_discard_enable);
1951 
1952    radeon_set_context_reg(cmd_buffer->cs, R_028810_PA_CL_CLIP_CNTL, pa_cl_clip_cntl);
1953 }
1954 
1955 static void
1956 radv_emit_logic_op(struct radv_cmd_buffer *cmd_buffer)
1957 {
1958    unsigned cb_color_control = cmd_buffer->state.graphics_pipeline->cb_color_control;
1959    struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
1960 
1961    cb_color_control &= C_028808_ROP3;
1962    cb_color_control |= S_028808_ROP3(d->logic_op);
1963 
1964    radeon_set_context_reg(cmd_buffer->cs, R_028808_CB_COLOR_CONTROL, cb_color_control);
1965 }
1966 
1967 static void
1968 radv_emit_color_write_enable(struct radv_cmd_buffer *cmd_buffer)
1969 {
1970    struct radv_graphics_pipeline *pipeline = cmd_buffer->state.graphics_pipeline;
1971    struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
1972 
1973    radeon_set_context_reg(cmd_buffer->cs, R_028238_CB_TARGET_MASK,
1974                           pipeline->cb_target_mask & d->color_write_enable);
1975 }
1976 
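/* The CB register layout differs per GFX level: GFX11 uses FDCC_CONTROL and
 * drops the CMASK/FMASK surfaces, GFX10 moved the high address bits into
 * dedicated _EXT registers, GFX9 packs them via BASE_256B fields, and
 * GFX6-8 still program pitch/slice directly, hence the four branches below.
 */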
1977 static void
1978 radv_emit_fb_color_state(struct radv_cmd_buffer *cmd_buffer, int index,
1979                          struct radv_color_buffer_info *cb, struct radv_image_view *iview,
1980                          VkImageLayout layout, bool in_render_loop)
1981 {
1982    bool is_vi = cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX8;
1983    uint32_t cb_fdcc_control = cb->cb_dcc_control;
1984    uint32_t cb_color_info = cb->cb_color_info;
1985    struct radv_image *image = iview->image;
1986 
1987    if (!radv_layout_dcc_compressed(
1988           cmd_buffer->device, image, iview->vk.base_mip_level, layout, in_render_loop,
1989           radv_image_queue_family_mask(image, cmd_buffer->qf,
1990                                        cmd_buffer->qf))) {
1991       if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX11) {
1992          cb_fdcc_control &= C_028C78_FDCC_ENABLE;
1993       } else {
1994          cb_color_info &= C_028C70_DCC_ENABLE;
1995       }
1996    }
1997 
1998    if (!radv_layout_fmask_compressed(
1999           cmd_buffer->device, image, layout,
2000           radv_image_queue_family_mask(image, cmd_buffer->qf,
2001                                        cmd_buffer->qf))) {
2002       cb_color_info &= C_028C70_COMPRESSION;
2003    }
2004 
2005    if (radv_image_is_tc_compat_cmask(image) && (radv_is_fmask_decompress_pipeline(cmd_buffer) ||
2006                                                 radv_is_dcc_decompress_pipeline(cmd_buffer))) {
2007       /* If this bit is set, the FMASK decompression operation
2008        * doesn't occur (DCC_COMPRESS also implies FMASK_DECOMPRESS).
2009        */
2010       cb_color_info &= C_028C70_FMASK_COMPRESS_1FRAG_ONLY;
2011    }
2012 
2013    if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX11) {
2014       radeon_set_context_reg_seq(cmd_buffer->cs, R_028C6C_CB_COLOR0_VIEW + index * 0x3c, 4);
2015       radeon_emit(cmd_buffer->cs, cb->cb_color_view);                      /* CB_COLOR0_VIEW */
2016       radeon_emit(cmd_buffer->cs, cb->cb_color_info);                      /* CB_COLOR0_INFO */
2017       radeon_emit(cmd_buffer->cs, cb->cb_color_attrib);                    /* CB_COLOR0_ATTRIB */
2018       radeon_emit(cmd_buffer->cs, cb_fdcc_control);                        /* CB_COLOR0_FDCC_CONTROL */
2019 
2020       radeon_set_context_reg(cmd_buffer->cs, R_028C60_CB_COLOR0_BASE + index * 0x3c, cb->cb_color_base);
2021       radeon_set_context_reg(cmd_buffer->cs, R_028E40_CB_COLOR0_BASE_EXT + index * 4, cb->cb_color_base >> 32);
2022       radeon_set_context_reg(cmd_buffer->cs, R_028C94_CB_COLOR0_DCC_BASE + index * 0x3c, cb->cb_dcc_base);
2023       radeon_set_context_reg(cmd_buffer->cs, R_028EA0_CB_COLOR0_DCC_BASE_EXT + index * 4, cb->cb_dcc_base >> 32);
2024       radeon_set_context_reg(cmd_buffer->cs, R_028EC0_CB_COLOR0_ATTRIB2 + index * 4, cb->cb_color_attrib2);
2025       radeon_set_context_reg(cmd_buffer->cs, R_028EE0_CB_COLOR0_ATTRIB3 + index * 4, cb->cb_color_attrib3);
2026    } else if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX10) {
2027       radeon_set_context_reg_seq(cmd_buffer->cs, R_028C60_CB_COLOR0_BASE + index * 0x3c, 11);
2028       radeon_emit(cmd_buffer->cs, cb->cb_color_base);
2029       radeon_emit(cmd_buffer->cs, 0);
2030       radeon_emit(cmd_buffer->cs, 0);
2031       radeon_emit(cmd_buffer->cs, cb->cb_color_view);
2032       radeon_emit(cmd_buffer->cs, cb_color_info);
2033       radeon_emit(cmd_buffer->cs, cb->cb_color_attrib);
2034       radeon_emit(cmd_buffer->cs, cb->cb_dcc_control);
2035       radeon_emit(cmd_buffer->cs, cb->cb_color_cmask);
2036       radeon_emit(cmd_buffer->cs, 0);
2037       radeon_emit(cmd_buffer->cs, cb->cb_color_fmask);
2038       radeon_emit(cmd_buffer->cs, 0);
2039 
2040       radeon_set_context_reg(cmd_buffer->cs, R_028C94_CB_COLOR0_DCC_BASE + index * 0x3c, cb->cb_dcc_base);
2041 
2042       radeon_set_context_reg(cmd_buffer->cs, R_028E40_CB_COLOR0_BASE_EXT + index * 4,
2043                              cb->cb_color_base >> 32);
2044       radeon_set_context_reg(cmd_buffer->cs, R_028E60_CB_COLOR0_CMASK_BASE_EXT + index * 4,
2045                              cb->cb_color_cmask >> 32);
2046       radeon_set_context_reg(cmd_buffer->cs, R_028E80_CB_COLOR0_FMASK_BASE_EXT + index * 4,
2047                              cb->cb_color_fmask >> 32);
2048       radeon_set_context_reg(cmd_buffer->cs, R_028EA0_CB_COLOR0_DCC_BASE_EXT + index * 4,
2049                              cb->cb_dcc_base >> 32);
2050       radeon_set_context_reg(cmd_buffer->cs, R_028EC0_CB_COLOR0_ATTRIB2 + index * 4,
2051                              cb->cb_color_attrib2);
2052       radeon_set_context_reg(cmd_buffer->cs, R_028EE0_CB_COLOR0_ATTRIB3 + index * 4,
2053                              cb->cb_color_attrib3);
2054    } else if (cmd_buffer->device->physical_device->rad_info.gfx_level == GFX9) {
2055       radeon_set_context_reg_seq(cmd_buffer->cs, R_028C60_CB_COLOR0_BASE + index * 0x3c, 11);
2056       radeon_emit(cmd_buffer->cs, cb->cb_color_base);
2057       radeon_emit(cmd_buffer->cs, S_028C64_BASE_256B(cb->cb_color_base >> 32));
2058       radeon_emit(cmd_buffer->cs, cb->cb_color_attrib2);
2059       radeon_emit(cmd_buffer->cs, cb->cb_color_view);
2060       radeon_emit(cmd_buffer->cs, cb_color_info);
2061       radeon_emit(cmd_buffer->cs, cb->cb_color_attrib);
2062       radeon_emit(cmd_buffer->cs, cb->cb_dcc_control);
2063       radeon_emit(cmd_buffer->cs, cb->cb_color_cmask);
2064       radeon_emit(cmd_buffer->cs, S_028C80_BASE_256B(cb->cb_color_cmask >> 32));
2065       radeon_emit(cmd_buffer->cs, cb->cb_color_fmask);
2066       radeon_emit(cmd_buffer->cs, S_028C88_BASE_256B(cb->cb_color_fmask >> 32));
2067 
2068       radeon_set_context_reg_seq(cmd_buffer->cs, R_028C94_CB_COLOR0_DCC_BASE + index * 0x3c, 2);
2069       radeon_emit(cmd_buffer->cs, cb->cb_dcc_base);
2070       radeon_emit(cmd_buffer->cs, S_028C98_BASE_256B(cb->cb_dcc_base >> 32));
2071 
2072       radeon_set_context_reg(cmd_buffer->cs, R_0287A0_CB_MRT0_EPITCH + index * 4,
2073                              cb->cb_mrt_epitch);
2074    } else {
2075       radeon_set_context_reg_seq(cmd_buffer->cs, R_028C60_CB_COLOR0_BASE + index * 0x3c, 11);
2076       radeon_emit(cmd_buffer->cs, cb->cb_color_base);
2077       radeon_emit(cmd_buffer->cs, cb->cb_color_pitch);
2078       radeon_emit(cmd_buffer->cs, cb->cb_color_slice);
2079       radeon_emit(cmd_buffer->cs, cb->cb_color_view);
2080       radeon_emit(cmd_buffer->cs, cb_color_info);
2081       radeon_emit(cmd_buffer->cs, cb->cb_color_attrib);
2082       radeon_emit(cmd_buffer->cs, cb->cb_dcc_control);
2083       radeon_emit(cmd_buffer->cs, cb->cb_color_cmask);
2084       radeon_emit(cmd_buffer->cs, cb->cb_color_cmask_slice);
2085       radeon_emit(cmd_buffer->cs, cb->cb_color_fmask);
2086       radeon_emit(cmd_buffer->cs, cb->cb_color_fmask_slice);
2087 
2088       if (is_vi) { /* DCC BASE */
2089          radeon_set_context_reg(cmd_buffer->cs, R_028C94_CB_COLOR0_DCC_BASE + index * 0x3c,
2090                                 cb->cb_dcc_base);
2091       }
2092    }
2093 
2094    if (G_028C70_DCC_ENABLE(cb_color_info)) {
2095       /* Drawing with DCC enabled also compresses colorbuffers. */
2096       VkImageSubresourceRange range = {
2097          .aspectMask = iview->vk.aspects,
2098          .baseMipLevel = iview->vk.base_mip_level,
2099          .levelCount = iview->vk.level_count,
2100          .baseArrayLayer = iview->vk.base_array_layer,
2101          .layerCount = iview->vk.layer_count,
2102       };
2103 
2104       radv_update_dcc_metadata(cmd_buffer, image, &range, true);
2105    }
2106 }
2107 
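/* Workaround for the TC-compat zrange bug: on affected chips,
 * ZRANGE_PRECISION must be 0 whenever the last fast depth clear value was
 * 0.0. The metadata word written by radv_set_tc_compat_zrange_metadata()
 * doubles as a COND_EXEC predicate: when it is non-zero, the conditional
 * SET_CONTEXT_REG emitted below re-programs DB_Z_INFO; otherwise the
 * packet is skipped.
 */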
2108 static void
2109 radv_update_zrange_precision(struct radv_cmd_buffer *cmd_buffer, struct radv_ds_buffer_info *ds,
2110                              const struct radv_image_view *iview, VkImageLayout layout,
2111                              bool in_render_loop, bool requires_cond_exec)
2112 {
2113    const struct radv_image *image = iview->image;
2114    uint32_t db_z_info = ds->db_z_info;
2115    uint32_t db_z_info_reg;
2116 
2117    if (!cmd_buffer->device->physical_device->rad_info.has_tc_compat_zrange_bug ||
2118        !radv_image_is_tc_compat_htile(image))
2119       return;
2120 
2121    if (!radv_layout_is_htile_compressed(
2122           cmd_buffer->device, image, layout, in_render_loop,
2123           radv_image_queue_family_mask(image, cmd_buffer->qf,
2124                                        cmd_buffer->qf))) {
2125       db_z_info &= C_028040_TILE_SURFACE_ENABLE;
2126    }
2127 
2128    db_z_info &= C_028040_ZRANGE_PRECISION;
2129 
2130    if (cmd_buffer->device->physical_device->rad_info.gfx_level == GFX9) {
2131       db_z_info_reg = R_028038_DB_Z_INFO;
2132    } else {
2133       db_z_info_reg = R_028040_DB_Z_INFO;
2134    }
2135 
2136    /* When we don't know the last fast clear value we need to emit a
2137     * conditional packet that will eventually skip the following
2138     * SET_CONTEXT_REG packet.
2139     */
2140    if (requires_cond_exec) {
2141       uint64_t va = radv_get_tc_compat_zrange_va(image, iview->vk.base_mip_level);
2142 
2143       radeon_emit(cmd_buffer->cs, PKT3(PKT3_COND_EXEC, 3, 0));
2144       radeon_emit(cmd_buffer->cs, va);
2145       radeon_emit(cmd_buffer->cs, va >> 32);
2146       radeon_emit(cmd_buffer->cs, 0);
2147       radeon_emit(cmd_buffer->cs, 3); /* SET_CONTEXT_REG size */
2148    }
2149 
2150    radeon_set_context_reg(cmd_buffer->cs, db_z_info_reg, db_z_info);
2151 }
2152 
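/* Depth/stencil surface state: when the current layout does not allow
 * HTILE compression, TILE_SURFACE_ENABLE is cleared and
 * TILE_STENCIL_DISABLE is set so the DB accesses the surface uncompressed;
 * on GFX10.3+ the HTILE VRS encoding is also masked out when the subpass
 * has no VRS attachment.
 */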
2153 static void
2154 radv_emit_fb_ds_state(struct radv_cmd_buffer *cmd_buffer, struct radv_ds_buffer_info *ds,
2155                       struct radv_image_view *iview, VkImageLayout layout, bool in_render_loop)
2156 {
2157    const struct radv_image *image = iview->image;
2158    uint32_t db_z_info = ds->db_z_info;
2159    uint32_t db_stencil_info = ds->db_stencil_info;
2160    uint32_t db_htile_surface = ds->db_htile_surface;
2161 
2162    if (!radv_layout_is_htile_compressed(
2163           cmd_buffer->device, image, layout, in_render_loop,
2164           radv_image_queue_family_mask(image, cmd_buffer->qf,
2165                                        cmd_buffer->qf))) {
2166       db_z_info &= C_028040_TILE_SURFACE_ENABLE;
2167       db_stencil_info |= S_028044_TILE_STENCIL_DISABLE(1);
2168    }
2169 
2170    if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX10_3 &&
2171        !cmd_buffer->state.subpass->vrs_attachment) {
2172       db_htile_surface &= C_028ABC_VRS_HTILE_ENCODING;
2173    }
2174 
2175    radeon_set_context_reg(cmd_buffer->cs, R_028008_DB_DEPTH_VIEW, ds->db_depth_view);
2176    radeon_set_context_reg(cmd_buffer->cs, R_028ABC_DB_HTILE_SURFACE, db_htile_surface);
2177 
2178    if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX10) {
2179       radeon_set_context_reg(cmd_buffer->cs, R_028014_DB_HTILE_DATA_BASE, ds->db_htile_data_base);
2180       radeon_set_context_reg(cmd_buffer->cs, R_02801C_DB_DEPTH_SIZE_XY, ds->db_depth_size);
2181 
2182       if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX11) {
2183          radeon_set_context_reg_seq(cmd_buffer->cs, R_028040_DB_Z_INFO, 6);
2184       } else {
2185          radeon_set_context_reg_seq(cmd_buffer->cs, R_02803C_DB_DEPTH_INFO, 7);
2186          radeon_emit(cmd_buffer->cs, S_02803C_RESOURCE_LEVEL(1));
2187       }
2188       radeon_emit(cmd_buffer->cs, db_z_info);
2189       radeon_emit(cmd_buffer->cs, db_stencil_info);
2190       radeon_emit(cmd_buffer->cs, ds->db_z_read_base);
2191       radeon_emit(cmd_buffer->cs, ds->db_stencil_read_base);
2192       radeon_emit(cmd_buffer->cs, ds->db_z_read_base);
2193       radeon_emit(cmd_buffer->cs, ds->db_stencil_read_base);
2194 
2195       radeon_set_context_reg_seq(cmd_buffer->cs, R_028068_DB_Z_READ_BASE_HI, 5);
2196       radeon_emit(cmd_buffer->cs, ds->db_z_read_base >> 32);
2197       radeon_emit(cmd_buffer->cs, ds->db_stencil_read_base >> 32);
2198       radeon_emit(cmd_buffer->cs, ds->db_z_read_base >> 32);
2199       radeon_emit(cmd_buffer->cs, ds->db_stencil_read_base >> 32);
2200       radeon_emit(cmd_buffer->cs, ds->db_htile_data_base >> 32);
2201    } else if (cmd_buffer->device->physical_device->rad_info.gfx_level == GFX9) {
2202       radeon_set_context_reg_seq(cmd_buffer->cs, R_028014_DB_HTILE_DATA_BASE, 3);
2203       radeon_emit(cmd_buffer->cs, ds->db_htile_data_base);
2204       radeon_emit(cmd_buffer->cs, S_028018_BASE_HI(ds->db_htile_data_base >> 32));
2205       radeon_emit(cmd_buffer->cs, ds->db_depth_size);
2206 
2207       radeon_set_context_reg_seq(cmd_buffer->cs, R_028038_DB_Z_INFO, 10);
2208       radeon_emit(cmd_buffer->cs, db_z_info);          /* DB_Z_INFO */
2209       radeon_emit(cmd_buffer->cs, db_stencil_info);    /* DB_STENCIL_INFO */
2210       radeon_emit(cmd_buffer->cs, ds->db_z_read_base); /* DB_Z_READ_BASE */
2211       radeon_emit(cmd_buffer->cs,
2212                   S_028044_BASE_HI(ds->db_z_read_base >> 32)); /* DB_Z_READ_BASE_HI */
2213       radeon_emit(cmd_buffer->cs, ds->db_stencil_read_base);   /* DB_STENCIL_READ_BASE */
2214       radeon_emit(cmd_buffer->cs,
2215                   S_02804C_BASE_HI(ds->db_stencil_read_base >> 32)); /* DB_STENCIL_READ_BASE_HI */
2216       radeon_emit(cmd_buffer->cs, ds->db_z_write_base);              /* DB_Z_WRITE_BASE */
2217       radeon_emit(cmd_buffer->cs,
2218                   S_028054_BASE_HI(ds->db_z_write_base >> 32)); /* DB_Z_WRITE_BASE_HI */
2219       radeon_emit(cmd_buffer->cs, ds->db_stencil_write_base);   /* DB_STENCIL_WRITE_BASE */
2220       radeon_emit(cmd_buffer->cs,
2221                   S_02805C_BASE_HI(ds->db_stencil_write_base >> 32)); /* DB_STENCIL_WRITE_BASE_HI */
2222 
2223       radeon_set_context_reg_seq(cmd_buffer->cs, R_028068_DB_Z_INFO2, 2);
2224       radeon_emit(cmd_buffer->cs, ds->db_z_info2);
2225       radeon_emit(cmd_buffer->cs, ds->db_stencil_info2);
2226    } else {
2227       radeon_set_context_reg(cmd_buffer->cs, R_028014_DB_HTILE_DATA_BASE, ds->db_htile_data_base);
2228 
2229       radeon_set_context_reg_seq(cmd_buffer->cs, R_02803C_DB_DEPTH_INFO, 9);
2230       radeon_emit(cmd_buffer->cs, ds->db_depth_info);         /* R_02803C_DB_DEPTH_INFO */
2231       radeon_emit(cmd_buffer->cs, db_z_info);                 /* R_028040_DB_Z_INFO */
2232       radeon_emit(cmd_buffer->cs, db_stencil_info);           /* R_028044_DB_STENCIL_INFO */
2233       radeon_emit(cmd_buffer->cs, ds->db_z_read_base);        /* R_028048_DB_Z_READ_BASE */
2234       radeon_emit(cmd_buffer->cs, ds->db_stencil_read_base);  /* R_02804C_DB_STENCIL_READ_BASE */
2235       radeon_emit(cmd_buffer->cs, ds->db_z_write_base);       /* R_028050_DB_Z_WRITE_BASE */
2236       radeon_emit(cmd_buffer->cs, ds->db_stencil_write_base); /* R_028054_DB_STENCIL_WRITE_BASE */
2237       radeon_emit(cmd_buffer->cs, ds->db_depth_size);         /* R_028058_DB_DEPTH_SIZE */
2238       radeon_emit(cmd_buffer->cs, ds->db_depth_slice);        /* R_02805C_DB_DEPTH_SLICE */
2239    }
2240 
2241    /* Update the ZRANGE_PRECISION value for the TC-compat bug. */
2242    radv_update_zrange_precision(cmd_buffer, ds, iview, layout, in_render_loop, true);
2243 
2244    radeon_set_context_reg(cmd_buffer->cs, R_028B78_PA_SU_POLY_OFFSET_DB_FMT_CNTL,
2245                           ds->pa_su_poly_offset_db_fmt_cntl);
2246 }
2247 
2248 /**
2249  * Update the fast clear depth/stencil values if the image is bound as a
2250  * depth/stencil buffer.
2251  */
2252 static void
2253 radv_update_bound_fast_clear_ds(struct radv_cmd_buffer *cmd_buffer,
2254                                 const struct radv_image_view *iview,
2255                                 VkClearDepthStencilValue ds_clear_value, VkImageAspectFlags aspects)
2256 {
2257    const struct radv_subpass *subpass = cmd_buffer->state.subpass;
2258    const struct radv_image *image = iview->image;
2259    struct radeon_cmdbuf *cs = cmd_buffer->cs;
2260    uint32_t att_idx;
2261 
2262    if (!cmd_buffer->state.attachments || !subpass)
2263       return;
2264 
2265    if (!subpass->depth_stencil_attachment)
2266       return;
2267 
2268    att_idx = subpass->depth_stencil_attachment->attachment;
2269    if (cmd_buffer->state.attachments[att_idx].iview->image != image)
2270       return;
2271 
2272    if (aspects == (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT)) {
2273       radeon_set_context_reg_seq(cs, R_028028_DB_STENCIL_CLEAR, 2);
2274       radeon_emit(cs, ds_clear_value.stencil);
2275       radeon_emit(cs, fui(ds_clear_value.depth));
2276    } else if (aspects == VK_IMAGE_ASPECT_DEPTH_BIT) {
2277       radeon_set_context_reg(cs, R_02802C_DB_DEPTH_CLEAR, fui(ds_clear_value.depth));
2278    } else {
2279       assert(aspects == VK_IMAGE_ASPECT_STENCIL_BIT);
2280       radeon_set_context_reg(cs, R_028028_DB_STENCIL_CLEAR, ds_clear_value.stencil);
2281    }
2282 
2283    /* Update the ZRANGE_PRECISION value for the TC-compat bug. This is
2284     * only needed when clearing Z to 0.0.
2285     */
2286    if ((aspects & VK_IMAGE_ASPECT_DEPTH_BIT) && ds_clear_value.depth == 0.0) {
2287       VkImageLayout layout = subpass->depth_stencil_attachment->layout;
2288       bool in_render_loop = subpass->depth_stencil_attachment->in_render_loop;
2289 
2290       radv_update_zrange_precision(cmd_buffer, &cmd_buffer->state.attachments[att_idx].ds, iview,
2291                                    layout, in_render_loop, false);
2292    }
2293 
2294    cmd_buffer->state.context_roll_without_scissor_emitted = true;
2295 }
2296 
2297 /**
2298  * Set the clear depth/stencil values to the image's metadata.
2299  */
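/* The per-level metadata is laid out as a {stencil, depth} dword pair,
 * which is why the combined-aspect path writes 2 * level_count dwords with
 * one WRITE_DATA packet, while single-aspect updates need one packet per
 * level (the depth dword lives at va + 4).
 */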
2300 static void
2301 radv_set_ds_clear_metadata(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image,
2302                            const VkImageSubresourceRange *range,
2303                            VkClearDepthStencilValue ds_clear_value, VkImageAspectFlags aspects)
2304 {
2305    struct radeon_cmdbuf *cs = cmd_buffer->cs;
2306    uint32_t level_count = radv_get_levelCount(image, range);
2307 
2308    if (aspects == (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT)) {
2309       uint64_t va = radv_get_ds_clear_value_va(image, range->baseMipLevel);
2310 
2311       /* Use the fastest way when both aspects are used. */
2312       radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 2 + 2 * level_count, cmd_buffer->state.predicating));
2313       radeon_emit(cs, S_370_DST_SEL(V_370_MEM) | S_370_WR_CONFIRM(1) | S_370_ENGINE_SEL(V_370_PFP));
2314       radeon_emit(cs, va);
2315       radeon_emit(cs, va >> 32);
2316 
2317       for (uint32_t l = 0; l < level_count; l++) {
2318          radeon_emit(cs, ds_clear_value.stencil);
2319          radeon_emit(cs, fui(ds_clear_value.depth));
2320       }
2321    } else {
2322       /* Otherwise we need one WRITE_DATA packet per level. */
2323       for (uint32_t l = 0; l < level_count; l++) {
2324          uint64_t va = radv_get_ds_clear_value_va(image, range->baseMipLevel + l);
2325          unsigned value;
2326 
2327          if (aspects == VK_IMAGE_ASPECT_DEPTH_BIT) {
2328             value = fui(ds_clear_value.depth);
2329             va += 4;
2330          } else {
2331             assert(aspects == VK_IMAGE_ASPECT_STENCIL_BIT);
2332             value = ds_clear_value.stencil;
2333          }
2334 
2335          radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 3, cmd_buffer->state.predicating));
2336          radeon_emit(cs,
2337                      S_370_DST_SEL(V_370_MEM) | S_370_WR_CONFIRM(1) | S_370_ENGINE_SEL(V_370_PFP));
2338          radeon_emit(cs, va);
2339          radeon_emit(cs, va >> 32);
2340          radeon_emit(cs, value);
2341       }
2342    }
2343 }
2344 
2345 /**
2346  * Update the TC-compat metadata value for this image.
2347  */
2348 static void
2349 radv_set_tc_compat_zrange_metadata(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image,
2350                                    const VkImageSubresourceRange *range, uint32_t value)
2351 {
2352    struct radeon_cmdbuf *cs = cmd_buffer->cs;
2353 
2354    if (!cmd_buffer->device->physical_device->rad_info.has_tc_compat_zrange_bug)
2355       return;
2356 
2357    uint64_t va = radv_get_tc_compat_zrange_va(image, range->baseMipLevel);
2358    uint32_t level_count = radv_get_levelCount(image, range);
2359 
2360    radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 2 + level_count, cmd_buffer->state.predicating));
2361    radeon_emit(cs, S_370_DST_SEL(V_370_MEM) | S_370_WR_CONFIRM(1) | S_370_ENGINE_SEL(V_370_PFP));
2362    radeon_emit(cs, va);
2363    radeon_emit(cs, va >> 32);
2364 
2365    for (uint32_t l = 0; l < level_count; l++)
2366       radeon_emit(cs, value);
2367 }
2368 
2369 static void
2370 radv_update_tc_compat_zrange_metadata(struct radv_cmd_buffer *cmd_buffer,
2371                                       const struct radv_image_view *iview,
2372                                       VkClearDepthStencilValue ds_clear_value)
2373 {
2374    VkImageSubresourceRange range = {
2375       .aspectMask = iview->vk.aspects,
2376       .baseMipLevel = iview->vk.base_mip_level,
2377       .levelCount = iview->vk.level_count,
2378       .baseArrayLayer = iview->vk.base_array_layer,
2379       .layerCount = iview->vk.layer_count,
2380    };
2381    uint32_t cond_val;
2382 
2383    /* Conditionally set DB_Z_INFO.ZRANGE_PRECISION to 0 when the last
2384     * depth clear value is 0.0f.
2385     */
2386    cond_val = ds_clear_value.depth == 0.0f ? UINT_MAX : 0;
2387 
2388    radv_set_tc_compat_zrange_metadata(cmd_buffer, iview->image, &range, cond_val);
2389 }
2390 
2391 /**
2392  * Update the clear depth/stencil values for this image.
2393  */
2394 void
2395 radv_update_ds_clear_metadata(struct radv_cmd_buffer *cmd_buffer,
2396                               const struct radv_image_view *iview,
2397                               VkClearDepthStencilValue ds_clear_value, VkImageAspectFlags aspects)
2398 {
2399    VkImageSubresourceRange range = {
2400       .aspectMask = iview->vk.aspects,
2401       .baseMipLevel = iview->vk.base_mip_level,
2402       .levelCount = iview->vk.level_count,
2403       .baseArrayLayer = iview->vk.base_array_layer,
2404       .layerCount = iview->vk.layer_count,
2405    };
2406    struct radv_image *image = iview->image;
2407 
2408    assert(radv_htile_enabled(image, range.baseMipLevel));
2409 
2410    radv_set_ds_clear_metadata(cmd_buffer, iview->image, &range, ds_clear_value, aspects);
2411 
2412    if (radv_image_is_tc_compat_htile(image) && (aspects & VK_IMAGE_ASPECT_DEPTH_BIT)) {
2413       radv_update_tc_compat_zrange_metadata(cmd_buffer, iview, ds_clear_value);
2414    }
2415 
2416    radv_update_bound_fast_clear_ds(cmd_buffer, iview, ds_clear_value, aspects);
2417 }
2418 
2419 /**
2420  * Load the clear depth/stencil values from the image's metadata.
2421  */
2422 static void
radv_load_ds_clear_metadata(struct radv_cmd_buffer * cmd_buffer,const struct radv_image_view * iview)2423 radv_load_ds_clear_metadata(struct radv_cmd_buffer *cmd_buffer, const struct radv_image_view *iview)
2424 {
2425    struct radeon_cmdbuf *cs = cmd_buffer->cs;
2426    const struct radv_image *image = iview->image;
2427    VkImageAspectFlags aspects = vk_format_aspects(image->vk.format);
2428    uint64_t va = radv_get_ds_clear_value_va(image, iview->vk.base_mip_level);
2429    unsigned reg_offset = 0, reg_count = 0;
2430 
2431    assert(radv_image_has_htile(image));
2432 
2433    if (aspects & VK_IMAGE_ASPECT_STENCIL_BIT) {
2434       ++reg_count;
2435    } else {
2436       ++reg_offset;
2437       va += 4;
2438    }
2439    if (aspects & VK_IMAGE_ASPECT_DEPTH_BIT)
2440       ++reg_count;
2441 
2442    uint32_t reg = R_028028_DB_STENCIL_CLEAR + 4 * reg_offset;
2443 
2444    if (cmd_buffer->device->physical_device->rad_info.has_load_ctx_reg_pkt) {
2445       radeon_emit(cs, PKT3(PKT3_LOAD_CONTEXT_REG_INDEX, 3, 0));
2446       radeon_emit(cs, va);
2447       radeon_emit(cs, va >> 32);
2448       radeon_emit(cs, (reg - SI_CONTEXT_REG_OFFSET) >> 2);
2449       radeon_emit(cs, reg_count);
2450    } else {
2451       radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
2452       radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_SRC_MEM) | COPY_DATA_DST_SEL(COPY_DATA_REG) |
2453                          (reg_count == 2 ? COPY_DATA_COUNT_SEL : 0));
2454       radeon_emit(cs, va);
2455       radeon_emit(cs, va >> 32);
2456       radeon_emit(cs, reg >> 2);
2457       radeon_emit(cs, 0);
2458 
2459       radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0));
2460       radeon_emit(cs, 0);
2461    }
2462 }
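
/* Worked example of the register window above (illustrative; the metadata
 * layout is inferred from the "va += 4" adjustment): DB_STENCIL_CLEAR and
 * DB_DEPTH_CLEAR are adjacent context registers, and the per-image clear
 * value metadata stores the stencil dword first, then the depth dword.
 *
 *   aspects present    reg_offset  reg_count  registers loaded
 *   stencil only       0           1          DB_STENCIL_CLEAR
 *   depth only         1           1          DB_DEPTH_CLEAR (va skips 4 bytes)
 *   depth + stencil    0           2          DB_STENCIL_CLEAR, DB_DEPTH_CLEAR
 */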

/*
 * With DCC some colors don't require CMASK elimination before being
 * used as a texture. This sets a predicate value to determine if the
 * cmask eliminate is required.
 */
void
radv_update_fce_metadata(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image,
                         const VkImageSubresourceRange *range, bool value)
{
   if (!image->fce_pred_offset)
      return;

   uint64_t pred_val = value;
   uint64_t va = radv_image_get_fce_pred_va(image, range->baseMipLevel);
   uint32_t level_count = radv_get_levelCount(image, range);
   uint32_t count = 2 * level_count;

   radeon_emit(cmd_buffer->cs, PKT3(PKT3_WRITE_DATA, 2 + count, 0));
   radeon_emit(cmd_buffer->cs,
               S_370_DST_SEL(V_370_MEM) | S_370_WR_CONFIRM(1) | S_370_ENGINE_SEL(V_370_PFP));
   radeon_emit(cmd_buffer->cs, va);
   radeon_emit(cmd_buffer->cs, va >> 32);

   for (uint32_t l = 0; l < level_count; l++) {
      radeon_emit(cmd_buffer->cs, pred_val);
      radeon_emit(cmd_buffer->cs, pred_val >> 32);
   }
}

/**
 * Update the DCC predicate to reflect the compression state.
 */
void
radv_update_dcc_metadata(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image,
                         const VkImageSubresourceRange *range, bool value)
{
   if (image->dcc_pred_offset == 0)
      return;

   uint64_t pred_val = value;
   uint64_t va = radv_image_get_dcc_pred_va(image, range->baseMipLevel);
   uint32_t level_count = radv_get_levelCount(image, range);
   uint32_t count = 2 * level_count;

   assert(radv_dcc_enabled(image, range->baseMipLevel));

   radeon_emit(cmd_buffer->cs, PKT3(PKT3_WRITE_DATA, 2 + count, 0));
   radeon_emit(cmd_buffer->cs,
               S_370_DST_SEL(V_370_MEM) | S_370_WR_CONFIRM(1) | S_370_ENGINE_SEL(V_370_PFP));
   radeon_emit(cmd_buffer->cs, va);
   radeon_emit(cmd_buffer->cs, va >> 32);

   for (uint32_t l = 0; l < level_count; l++) {
      radeon_emit(cmd_buffer->cs, pred_val);
      radeon_emit(cmd_buffer->cs, pred_val >> 32);
   }
}

/**
 * Update the fast clear color values if the image is bound as a color buffer.
 */
static void
radv_update_bound_fast_clear_color(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image,
                                   int cb_idx, uint32_t color_values[2])
{
   const struct radv_subpass *subpass = cmd_buffer->state.subpass;
   struct radeon_cmdbuf *cs = cmd_buffer->cs;
   uint32_t att_idx;

   if (!cmd_buffer->state.attachments || !subpass)
      return;

   att_idx = subpass->color_attachments[cb_idx].attachment;
   if (att_idx == VK_ATTACHMENT_UNUSED)
      return;

   if (cmd_buffer->state.attachments[att_idx].iview->image != image)
      return;

   radeon_set_context_reg_seq(cs, R_028C8C_CB_COLOR0_CLEAR_WORD0 + cb_idx * 0x3c, 2);
   radeon_emit(cs, color_values[0]);
   radeon_emit(cs, color_values[1]);

   cmd_buffer->state.context_roll_without_scissor_emitted = true;
}
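
/* Addressing sketch (illustrative): each CB_COLORn register block sits 0x3c
 * bytes after the previous one, so the sequence above lands on the right MRT
 * by offsetting from CB_COLOR0. For cb_idx == 2 it starts at
 * R_028C8C_CB_COLOR0_CLEAR_WORD0 + 0x78, i.e. the CLEAR_WORD0 of CB_COLOR2,
 * and writes the two clear-value dwords for that render target.
 */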

/**
 * Set the clear color values to the image's metadata.
 */
static void
radv_set_color_clear_metadata(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image,
                              const VkImageSubresourceRange *range, uint32_t color_values[2])
{
   struct radeon_cmdbuf *cs = cmd_buffer->cs;
   uint32_t level_count = radv_get_levelCount(image, range);
   uint32_t count = 2 * level_count;

   assert(radv_image_has_cmask(image) || radv_dcc_enabled(image, range->baseMipLevel));

   if (radv_image_has_clear_value(image)) {
      uint64_t va = radv_image_get_fast_clear_va(image, range->baseMipLevel);

      radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 2 + count, cmd_buffer->state.predicating));
      radeon_emit(cs, S_370_DST_SEL(V_370_MEM) | S_370_WR_CONFIRM(1) | S_370_ENGINE_SEL(V_370_PFP));
      radeon_emit(cs, va);
      radeon_emit(cs, va >> 32);

      for (uint32_t l = 0; l < level_count; l++) {
         radeon_emit(cs, color_values[0]);
         radeon_emit(cs, color_values[1]);
      }
   } else {
      /* Some default value we can set in the update. */
      assert(color_values[0] == 0 && color_values[1] == 0);
   }
}

/**
 * Update the clear color values for this image.
 */
void
radv_update_color_clear_metadata(struct radv_cmd_buffer *cmd_buffer,
                                 const struct radv_image_view *iview, int cb_idx,
                                 uint32_t color_values[2])
{
   struct radv_image *image = iview->image;
   VkImageSubresourceRange range = {
      .aspectMask = iview->vk.aspects,
      .baseMipLevel = iview->vk.base_mip_level,
      .levelCount = iview->vk.level_count,
      .baseArrayLayer = iview->vk.base_array_layer,
      .layerCount = iview->vk.layer_count,
   };

   assert(radv_image_has_cmask(image) || radv_dcc_enabled(image, iview->vk.base_mip_level));

   /* There is no need to update the clear value for images that are fast cleared with the
    * comp-to-single mode because the hardware gets the value from the image directly.
    */
   if (iview->image->support_comp_to_single)
      return;

   radv_set_color_clear_metadata(cmd_buffer, image, &range, color_values);

   radv_update_bound_fast_clear_color(cmd_buffer, image, cb_idx, color_values);
}

/**
 * Load the clear color values from the image's metadata.
 */
static void
radv_load_color_clear_metadata(struct radv_cmd_buffer *cmd_buffer, struct radv_image_view *iview,
                               int cb_idx)
{
   struct radeon_cmdbuf *cs = cmd_buffer->cs;
   struct radv_image *image = iview->image;

   if (!radv_image_has_cmask(image) && !radv_dcc_enabled(image, iview->vk.base_mip_level))
      return;

   if (iview->image->support_comp_to_single)
      return;

   if (!radv_image_has_clear_value(image)) {
      uint32_t color_values[2] = {0, 0};
      radv_update_bound_fast_clear_color(cmd_buffer, image, cb_idx, color_values);
      return;
   }

   uint64_t va = radv_image_get_fast_clear_va(image, iview->vk.base_mip_level);
   uint32_t reg = R_028C8C_CB_COLOR0_CLEAR_WORD0 + cb_idx * 0x3c;

   if (cmd_buffer->device->physical_device->rad_info.has_load_ctx_reg_pkt) {
      radeon_emit(cs, PKT3(PKT3_LOAD_CONTEXT_REG_INDEX, 3, cmd_buffer->state.predicating));
      radeon_emit(cs, va);
      radeon_emit(cs, va >> 32);
      radeon_emit(cs, (reg - SI_CONTEXT_REG_OFFSET) >> 2);
      radeon_emit(cs, 2);
   } else {
      radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, cmd_buffer->state.predicating));
      radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_SRC_MEM) | COPY_DATA_DST_SEL(COPY_DATA_REG) |
                         COPY_DATA_COUNT_SEL);
      radeon_emit(cs, va);
      radeon_emit(cs, va >> 32);
      radeon_emit(cs, reg >> 2);
      radeon_emit(cs, 0);

      radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, cmd_buffer->state.predicating));
      radeon_emit(cs, 0);
   }
}

/* GFX9+ metadata cache flushing workaround: metadata cache coherency is
 * broken if the CB caches data of multiple mips of the same image at the
 * same time.
 *
 * Insert some flushes to avoid this.
 */
static void
radv_emit_fb_mip_change_flush(struct radv_cmd_buffer *cmd_buffer)
{
   struct vk_framebuffer *framebuffer = cmd_buffer->state.framebuffer;
   const struct radv_subpass *subpass = cmd_buffer->state.subpass;
   bool color_mip_changed = false;

   /* Entire workaround is not applicable before GFX9 */
   if (cmd_buffer->device->physical_device->rad_info.gfx_level < GFX9)
      return;

   if (!framebuffer)
      return;

   for (int i = 0; i < subpass->color_count; ++i) {
      int idx = subpass->color_attachments[i].attachment;
      if (idx == VK_ATTACHMENT_UNUSED)
         continue;

      struct radv_image_view *iview = cmd_buffer->state.attachments[idx].iview;

      if ((radv_image_has_CB_metadata(iview->image) ||
           radv_dcc_enabled(iview->image, iview->vk.base_mip_level) ||
           radv_dcc_enabled(iview->image, cmd_buffer->state.cb_mip[i])) &&
          cmd_buffer->state.cb_mip[i] != iview->vk.base_mip_level)
         color_mip_changed = true;

      cmd_buffer->state.cb_mip[i] = iview->vk.base_mip_level;
   }

   if (color_mip_changed) {
      cmd_buffer->state.flush_bits |=
         RADV_CMD_FLAG_FLUSH_AND_INV_CB | RADV_CMD_FLAG_FLUSH_AND_INV_CB_META;
   }
}

/* This function does the flushes for mip changes if the levels are not zero for
 * all render targets. This way we can assume at the start of the next cmd_buffer
 * that rendering to mip 0 doesn't need any flushes. Since that is the most common
 * case, this saves some flushes. */
static void
radv_emit_mip_change_flush_default(struct radv_cmd_buffer *cmd_buffer)
{
   /* Entire workaround is not applicable before GFX9 */
   if (cmd_buffer->device->physical_device->rad_info.gfx_level < GFX9)
      return;

   bool need_color_mip_flush = false;
   for (unsigned i = 0; i < 8; ++i) {
      if (cmd_buffer->state.cb_mip[i]) {
         need_color_mip_flush = true;
         break;
      }
   }

   if (need_color_mip_flush) {
      cmd_buffer->state.flush_bits |=
         RADV_CMD_FLAG_FLUSH_AND_INV_CB | RADV_CMD_FLAG_FLUSH_AND_INV_CB_META;
   }

   memset(cmd_buffer->state.cb_mip, 0, sizeof(cmd_buffer->state.cb_mip));
}

static struct radv_image *
radv_cmd_buffer_get_vrs_image(struct radv_cmd_buffer *cmd_buffer)
{
   struct radv_device *device = cmd_buffer->device;

   if (!device->vrs.image) {
      VkResult result;

      /* The global VRS state is initialized on-demand to avoid wasting VRAM. */
      result = radv_device_init_vrs_state(device);
      if (result != VK_SUCCESS) {
         cmd_buffer->record_result = result;
         return NULL;
      }
   }

   return device->vrs.image;
}

static void
radv_emit_framebuffer_state(struct radv_cmd_buffer *cmd_buffer)
{
   int i;
   struct vk_framebuffer *framebuffer = cmd_buffer->state.framebuffer;
   const struct radv_subpass *subpass = cmd_buffer->state.subpass;
   bool disable_constant_encode_ac01 = false;
   unsigned color_invalid = cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX11
                            ? G_028C70_FORMAT_GFX11(V_028C70_COLOR_INVALID)
                            : G_028C70_FORMAT_GFX6(V_028C70_COLOR_INVALID);

   for (i = 0; i < subpass->color_count; ++i) {
      if (subpass->color_attachments[i].attachment == VK_ATTACHMENT_UNUSED) {
         radeon_set_context_reg(cmd_buffer->cs, R_028C70_CB_COLOR0_INFO + i * 0x3C, color_invalid);
         continue;
      }

      int idx = subpass->color_attachments[i].attachment;
      struct radv_image_view *iview = cmd_buffer->state.attachments[idx].iview;
      VkImageLayout layout = subpass->color_attachments[i].layout;
      bool in_render_loop = subpass->color_attachments[i].in_render_loop;

      radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, iview->image->bindings[0].bo);

      assert(iview->vk.aspects & (VK_IMAGE_ASPECT_COLOR_BIT | VK_IMAGE_ASPECT_PLANE_0_BIT |
                                   VK_IMAGE_ASPECT_PLANE_1_BIT | VK_IMAGE_ASPECT_PLANE_2_BIT));

      if (iview->image->disjoint && iview->vk.aspects == VK_IMAGE_ASPECT_COLOR_BIT) {
         for (uint32_t plane_id = 0; plane_id < iview->image->plane_count; plane_id++) {
            radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs,
                  iview->image->bindings[plane_id].bo);
         }
      } else {
         uint32_t plane_id = iview->image->disjoint ? iview->plane_id : 0;
         radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs,
               iview->image->bindings[plane_id].bo);
      }

      radv_emit_fb_color_state(cmd_buffer, i, &cmd_buffer->state.attachments[idx].cb, iview, layout,
                               in_render_loop);

      radv_load_color_clear_metadata(cmd_buffer, iview, i);

      if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX9 &&
          iview->image->dcc_sign_reinterpret) {
         /* Disable constant encoding with a clear value of 1 when the DCC signedness is
          * reinterpreted, because the hardware would fill in 1 instead of the clear value.
          */
         disable_constant_encode_ac01 = true;
      }
   }
   for (; i < cmd_buffer->state.last_subpass_color_count; i++) {
      radeon_set_context_reg(cmd_buffer->cs, R_028C70_CB_COLOR0_INFO + i * 0x3C, color_invalid);
   }
   cmd_buffer->state.last_subpass_color_count = subpass->color_count;

   if (subpass->depth_stencil_attachment) {
      int idx = subpass->depth_stencil_attachment->attachment;
      VkImageLayout layout = subpass->depth_stencil_attachment->layout;
      bool in_render_loop = subpass->depth_stencil_attachment->in_render_loop;
      struct radv_image_view *iview = cmd_buffer->state.attachments[idx].iview;
      radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs,
                         cmd_buffer->state.attachments[idx].iview->image->bindings[0].bo);

      radv_emit_fb_ds_state(cmd_buffer, &cmd_buffer->state.attachments[idx].ds, iview, layout,
                            in_render_loop);

      if (radv_layout_is_htile_compressed(
             cmd_buffer->device, iview->image, layout, in_render_loop,
             radv_image_queue_family_mask(iview->image, cmd_buffer->qf,
                                          cmd_buffer->qf))) {
         /* Only load the depth/stencil fast clear values when
          * compressed rendering is enabled.
          */
         radv_load_ds_clear_metadata(cmd_buffer, iview);
      }
   } else if (subpass->vrs_attachment && radv_cmd_buffer_get_vrs_image(cmd_buffer)) {
      /* When a subpass uses a VRS attachment without binding a depth/stencil attachment, we have to
       * bind our internal depth buffer that contains the VRS data as part of HTILE.
       */
      VkImageLayout layout = VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL;
      struct radv_buffer *htile_buffer = cmd_buffer->device->vrs.buffer;
      struct radv_image *image = cmd_buffer->device->vrs.image;
      struct radv_ds_buffer_info ds;
      struct radv_image_view iview;

      radv_image_view_init(&iview, cmd_buffer->device,
                           &(VkImageViewCreateInfo){
                              .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO,
                              .image = radv_image_to_handle(image),
                              .viewType = radv_meta_get_view_type(image),
                              .format = image->vk.format,
                              .subresourceRange =
                                 {
                                    .aspectMask = VK_IMAGE_ASPECT_DEPTH_BIT,
                                    .baseMipLevel = 0,
                                    .levelCount = 1,
                                    .baseArrayLayer = 0,
                                    .layerCount = 1,
                                 },
                           },
                           0, NULL);

      radv_initialise_vrs_surface(image, htile_buffer, &ds);

      radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, htile_buffer->bo);

      radv_emit_fb_ds_state(cmd_buffer, &ds, &iview, layout, false);

      radv_image_view_finish(&iview);
   } else {
      unsigned num_samples = 0;

      /* On GFX11, DB_Z_INFO.NUM_SAMPLES should always match the framebuffer samples. It affects
       * VRS and occlusion queries if depth and stencil are not bound.
       */
      if (cmd_buffer->device->physical_device->rad_info.gfx_level == GFX11)
         num_samples = util_logbase2(subpass->max_sample_count);

      if (cmd_buffer->device->physical_device->rad_info.gfx_level == GFX9)
         radeon_set_context_reg_seq(cmd_buffer->cs, R_028038_DB_Z_INFO, 2);
      else
         radeon_set_context_reg_seq(cmd_buffer->cs, R_028040_DB_Z_INFO, 2);

      radeon_emit(cmd_buffer->cs, S_028040_FORMAT(V_028040_Z_INVALID) |       /* DB_Z_INFO */
                                  S_028040_NUM_SAMPLES(num_samples));
      radeon_emit(cmd_buffer->cs, S_028044_FORMAT(V_028044_STENCIL_INVALID)); /* DB_STENCIL_INFO */
   }
   radeon_set_context_reg(cmd_buffer->cs, R_028208_PA_SC_WINDOW_SCISSOR_BR,
                          S_028208_BR_X(framebuffer->width) | S_028208_BR_Y(framebuffer->height));

   if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX8) {
      bool disable_constant_encode =
         cmd_buffer->device->physical_device->rad_info.has_dcc_constant_encode;
      enum amd_gfx_level gfx_level = cmd_buffer->device->physical_device->rad_info.gfx_level;
      uint8_t watermark = gfx_level >= GFX10 ? 6 : 4;

      if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX11) {
         radeon_set_context_reg(cmd_buffer->cs, R_028424_CB_FDCC_CONTROL,
                                S_028424_SAMPLE_MASK_TRACKER_WATERMARK(watermark));
      } else {
         radeon_set_context_reg(cmd_buffer->cs, R_028424_CB_DCC_CONTROL,
                                S_028424_OVERWRITE_COMBINER_MRT_SHARING_DISABLE(gfx_level <= GFX9) |
                                S_028424_OVERWRITE_COMBINER_WATERMARK(watermark) |
                                S_028424_DISABLE_CONSTANT_ENCODE_AC01(disable_constant_encode_ac01) |
                                S_028424_DISABLE_CONSTANT_ENCODE_REG(disable_constant_encode));
      }
   }

   cmd_buffer->state.dirty &= ~RADV_CMD_DIRTY_FRAMEBUFFER;
}

static void
radv_emit_index_buffer(struct radv_cmd_buffer *cmd_buffer, bool indirect)
{
   struct radeon_cmdbuf *cs = cmd_buffer->cs;
   struct radv_cmd_state *state = &cmd_buffer->state;

   /* With indirect generated commands the index buffer bind may be part of the
    * indirect command buffer, in which case the app may not have bound any yet. */
   if (state->index_type < 0)
      return;

   /* For direct indexed draws we use DRAW_INDEX_2, which includes
    * the index_va and max_index_count already. */
   if (!indirect)
      return;

   if (state->max_index_count ||
       !cmd_buffer->device->physical_device->rad_info.has_zero_index_buffer_bug) {
      radeon_emit(cs, PKT3(PKT3_INDEX_BASE, 1, 0));
      radeon_emit(cs, state->index_va);
      radeon_emit(cs, state->index_va >> 32);

      radeon_emit(cs, PKT3(PKT3_INDEX_BUFFER_SIZE, 0, 0));
      radeon_emit(cs, state->max_index_count);
   }

   cmd_buffer->state.dirty &= ~RADV_CMD_DIRTY_INDEX_BUFFER;
}

void
radv_set_db_count_control(struct radv_cmd_buffer *cmd_buffer, bool enable_occlusion_queries)
{
   bool has_perfect_queries = cmd_buffer->state.perfect_occlusion_queries_enabled;
   struct radv_graphics_pipeline *pipeline = cmd_buffer->state.graphics_pipeline;
   uint32_t pa_sc_mode_cntl_1 = pipeline ? pipeline->ms.pa_sc_mode_cntl_1 : 0;
   uint32_t db_count_control;

   if (!enable_occlusion_queries) {
      if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX7) {
         if (G_028A4C_OUT_OF_ORDER_PRIMITIVE_ENABLE(pa_sc_mode_cntl_1) &&
             pipeline->disable_out_of_order_rast_for_occlusion && has_perfect_queries) {
            /* Re-enable out-of-order rasterization if the
             * bound pipeline supports it and if it has
             * been disabled before starting any perfect
             * occlusion queries.
             */
            radeon_set_context_reg(cmd_buffer->cs, R_028A4C_PA_SC_MODE_CNTL_1, pa_sc_mode_cntl_1);
         }
      }
      db_count_control = S_028004_ZPASS_INCREMENT_DISABLE(1);
   } else {
      const struct radv_subpass *subpass = cmd_buffer->state.subpass;
      uint32_t sample_rate = subpass ? util_logbase2(subpass->max_sample_count) : 0;
      bool gfx10_perfect =
         cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX10 && has_perfect_queries;

      if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX7) {
         /* Always enable PERFECT_ZPASS_COUNTS due to issues with partially
          * covered tiles, discards, and early depth testing. For more details,
          * see https://gitlab.freedesktop.org/mesa/mesa/-/issues/3218 */
         db_count_control = S_028004_PERFECT_ZPASS_COUNTS(1) |
                            S_028004_DISABLE_CONSERVATIVE_ZPASS_COUNTS(gfx10_perfect) |
                            S_028004_SAMPLE_RATE(sample_rate) | S_028004_ZPASS_ENABLE(1) |
                            S_028004_SLICE_EVEN_ENABLE(1) | S_028004_SLICE_ODD_ENABLE(1);

         if (G_028A4C_OUT_OF_ORDER_PRIMITIVE_ENABLE(pa_sc_mode_cntl_1) &&
             pipeline->disable_out_of_order_rast_for_occlusion && has_perfect_queries) {
            /* If the bound pipeline has enabled
             * out-of-order rasterization, we should
             * disable it before starting any perfect
             * occlusion queries.
             */
            pa_sc_mode_cntl_1 &= C_028A4C_OUT_OF_ORDER_PRIMITIVE_ENABLE;

            radeon_set_context_reg(cmd_buffer->cs, R_028A4C_PA_SC_MODE_CNTL_1, pa_sc_mode_cntl_1);
         }
      } else {
         db_count_control = S_028004_PERFECT_ZPASS_COUNTS(1) | S_028004_SAMPLE_RATE(sample_rate);
      }
   }

   radeon_set_context_reg(cmd_buffer->cs, R_028004_DB_COUNT_CONTROL, db_count_control);

   cmd_buffer->state.context_roll_without_scissor_emitted = true;
}

unsigned
radv_instance_rate_prolog_index(unsigned num_attributes, uint32_t instance_rate_inputs)
{
   /* instance_rate_vs_prologs is a flattened array of arrays of arrays of different sizes, or
    * equivalently a single array sorted in ascending order by:
    * - total number of attributes
    * - number of instanced attributes
    * - index of first instanced attribute
    */

   /* From total number of attributes to offset. */
   static const uint16_t total_to_offset[16] = {0,   1,   4,   10,  20,  35,  56,  84,
                                                120, 165, 220, 286, 364, 455, 560, 680};
   unsigned start_index = total_to_offset[num_attributes - 1];

   /* From number of instanced attributes to offset. This would require a different LUT depending on
    * the total number of attributes, but we can exploit a pattern to use just the LUT for 16 total
    * attributes.
    */
   static const uint8_t count_to_offset_total16[16] = {0,   16,  31,  45,  58,  70,  81,  91,
                                                       100, 108, 115, 121, 126, 130, 133, 135};
   unsigned count = util_bitcount(instance_rate_inputs);
   unsigned offset_from_start_index =
      count_to_offset_total16[count - 1] - ((16 - num_attributes) * (count - 1));

   unsigned first = ffs(instance_rate_inputs) - 1;
   return start_index + offset_from_start_index + first;
}
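
/* A brute-force cross-check of the LUT arithmetic above, kept disabled and
 * purely illustrative: it enumerates (total, count, first) keys in the same
 * ascending order the flattened array uses and counts how many precede the
 * requested one. Tracing the fast path for num_attributes = 4 and
 * instance_rate_inputs = 0b0110 gives 10 + (16 - 12) + 1 = 15, which this
 * enumeration reproduces.
 */
#if 0
static unsigned
radv_instance_rate_prolog_index_slow(unsigned num_attributes, uint32_t instance_rate_inputs)
{
   const unsigned count = util_bitcount(instance_rate_inputs);
   const unsigned first = ffs(instance_rate_inputs) - 1;
   unsigned index = 0;

   for (unsigned total = 1; total <= 16; total++) {
      for (unsigned c = 1; c <= total; c++) {
         for (unsigned f = 0; f + c <= total; f++) {
            if (total == num_attributes && c == count && f == first)
               return index;
            index++;
         }
      }
   }
   unreachable("invalid instance-rate prolog key");
}
#endif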

union vs_prolog_key_header {
   struct {
      uint32_t key_size : 8;
      uint32_t num_attributes : 6;
      uint32_t as_ls : 1;
      uint32_t is_ngg : 1;
      uint32_t wave32 : 1;
      uint32_t next_stage : 3;
      uint32_t instance_rate_inputs : 1;
      uint32_t alpha_adjust_lo : 1;
      uint32_t alpha_adjust_hi : 1;
      uint32_t misaligned_mask : 1;
      uint32_t post_shuffle : 1;
      uint32_t nontrivial_divisors : 1;
      uint32_t zero_divisors : 1;
      /* We need this to ensure the padding is zero. It's useful even if it's unused. */
      uint32_t padding0 : 5;
   };
   uint32_t v;
};

uint32_t
radv_hash_vs_prolog(const void *key_)
{
   const uint32_t *key = key_;
   union vs_prolog_key_header header;
   header.v = key[0];
   return _mesa_hash_data(key, header.key_size);
}

bool
radv_cmp_vs_prolog(const void *a_, const void *b_)
{
   const uint32_t *a = a_;
   const uint32_t *b = b_;
   if (a[0] != b[0])
      return false;

   union vs_prolog_key_header header;
   header.v = a[0];
   return memcmp(a, b, header.key_size) == 0;
}

static struct radv_shader_part *
lookup_vs_prolog(struct radv_cmd_buffer *cmd_buffer, struct radv_shader *vs_shader,
                 uint32_t *nontrivial_divisors)
{
   STATIC_ASSERT(sizeof(union vs_prolog_key_header) == 4);
   assert(vs_shader->info.vs.dynamic_inputs);

   const struct radv_vs_input_state *state = &cmd_buffer->state.dynamic_vs_input;
   struct radv_graphics_pipeline *pipeline = cmd_buffer->state.graphics_pipeline;
   struct radv_device *device = cmd_buffer->device;

   unsigned num_attributes = pipeline->last_vertex_attrib_bit;
   uint32_t attribute_mask = BITFIELD_MASK(num_attributes);

   uint32_t instance_rate_inputs = state->instance_rate_inputs & attribute_mask;
   uint32_t zero_divisors = state->zero_divisors & attribute_mask;
   *nontrivial_divisors = state->nontrivial_divisors & attribute_mask;
   uint32_t misaligned_mask = cmd_buffer->state.vbo_misaligned_mask;
   if (cmd_buffer->state.vbo_misaligned_mask_invalid) {
      assert(device->physical_device->rad_info.gfx_level == GFX6 ||
             device->physical_device->rad_info.gfx_level >= GFX10);

      u_foreach_bit (index, cmd_buffer->state.vbo_misaligned_mask_invalid & attribute_mask) {
         uint8_t binding = state->bindings[index];
         if (!(cmd_buffer->state.vbo_bound_mask & BITFIELD_BIT(binding)))
            continue;
         uint8_t req = state->format_align_req_minus_1[index];
         struct radv_vertex_binding *vb = &cmd_buffer->vertex_bindings[binding];
         VkDeviceSize offset = vb->offset + state->offsets[index];
         if ((offset & req) || (vb->stride & req))
            misaligned_mask |= BITFIELD_BIT(index);
      }
      cmd_buffer->state.vbo_misaligned_mask = misaligned_mask;
      cmd_buffer->state.vbo_misaligned_mask_invalid &= ~attribute_mask;
   }

   /* try to use a pre-compiled prolog first */
   struct radv_shader_part *prolog = NULL;
   if (pipeline->can_use_simple_input &&
       (!vs_shader->info.vs.as_ls || !instance_rate_inputs) &&
       !misaligned_mask && !state->alpha_adjust_lo && !state->alpha_adjust_hi) {
      if (!instance_rate_inputs) {
         prolog = device->simple_vs_prologs[num_attributes - 1];
      } else if (num_attributes <= 16 && !*nontrivial_divisors && !zero_divisors &&
                 util_bitcount(instance_rate_inputs) ==
                    (util_last_bit(instance_rate_inputs) - ffs(instance_rate_inputs) + 1)) {
         unsigned index = radv_instance_rate_prolog_index(num_attributes, instance_rate_inputs);
         prolog = device->instance_rate_vs_prologs[index];
      }
   }
   if (prolog)
      return prolog;

   /* if we couldn't use a pre-compiled prolog, find one in the cache or create one */
   uint32_t key_words[17];
   unsigned key_size = 1;

   struct radv_vs_prolog_key key;
   key.state = state;
   key.num_attributes = num_attributes;
   key.misaligned_mask = misaligned_mask;
   /* The instance ID input VGPR is placed differently when as_ls=true. */
   key.as_ls = vs_shader->info.vs.as_ls && instance_rate_inputs;
   key.is_ngg = vs_shader->info.is_ngg;
   key.wave32 = vs_shader->info.wave_size == 32;
   key.next_stage = pipeline->next_vertex_stage;

   union vs_prolog_key_header header;
   header.v = 0;
   header.num_attributes = num_attributes;
   header.as_ls = key.as_ls;
   header.is_ngg = key.is_ngg;
   header.wave32 = key.wave32;
   header.next_stage = key.next_stage;

   if (instance_rate_inputs & ~*nontrivial_divisors) {
      header.instance_rate_inputs = true;
      key_words[key_size++] = instance_rate_inputs;
   }
   if (*nontrivial_divisors) {
      header.nontrivial_divisors = true;
      key_words[key_size++] = *nontrivial_divisors;
   }
   if (zero_divisors) {
      header.zero_divisors = true;
      key_words[key_size++] = zero_divisors;
   }
   if (misaligned_mask) {
      header.misaligned_mask = true;
      key_words[key_size++] = misaligned_mask;

      uint8_t *formats = (uint8_t *)&key_words[key_size];
      unsigned num_formats = 0;
      u_foreach_bit(index, misaligned_mask) formats[num_formats++] = state->formats[index];
      while (num_formats & 0x3)
         formats[num_formats++] = 0;
      key_size += num_formats / 4u;

      if (state->post_shuffle & attribute_mask) {
         header.post_shuffle = true;
         key_words[key_size++] = state->post_shuffle & attribute_mask;
      }
   }
   if (state->alpha_adjust_lo & attribute_mask) {
      header.alpha_adjust_lo = true;
      key_words[key_size++] = state->alpha_adjust_lo & attribute_mask;
   }
   if (state->alpha_adjust_hi & attribute_mask) {
      header.alpha_adjust_hi = true;
      key_words[key_size++] = state->alpha_adjust_hi & attribute_mask;
   }

   header.key_size = key_size * sizeof(key_words[0]);
   key_words[0] = header.v;

   uint32_t hash = radv_hash_vs_prolog(key_words);

   if (cmd_buffer->state.emitted_vs_prolog &&
       cmd_buffer->state.emitted_vs_prolog_key_hash == hash &&
       radv_cmp_vs_prolog(key_words, cmd_buffer->state.emitted_vs_prolog_key))
      return cmd_buffer->state.emitted_vs_prolog;

   u_rwlock_rdlock(&device->vs_prologs_lock);
   struct hash_entry *prolog_entry =
      _mesa_hash_table_search_pre_hashed(device->vs_prologs, hash, key_words);
   u_rwlock_rdunlock(&device->vs_prologs_lock);

   if (!prolog_entry) {
      u_rwlock_wrlock(&device->vs_prologs_lock);
      prolog_entry = _mesa_hash_table_search_pre_hashed(device->vs_prologs, hash, key_words);
      if (prolog_entry) {
         u_rwlock_wrunlock(&device->vs_prologs_lock);
         return prolog_entry->data;
      }

      prolog = radv_create_vs_prolog(device, &key);
      uint32_t *key2 = malloc(key_size * 4);
      if (!prolog || !key2) {
         radv_shader_part_destroy(device, prolog);
         free(key2);
         u_rwlock_wrunlock(&device->vs_prologs_lock);
         return NULL;
      }
      memcpy(key2, key_words, key_size * 4);
      _mesa_hash_table_insert_pre_hashed(device->vs_prologs, hash, key2, prolog);

      u_rwlock_wrunlock(&device->vs_prologs_lock);
      return prolog;
   }

   return prolog_entry->data;
}

static void
emit_prolog_regs(struct radv_cmd_buffer *cmd_buffer, struct radv_shader *vs_shader,
                 struct radv_shader_part *prolog, bool pipeline_is_dirty)
{
   /* no need to re-emit anything in this case */
   if (cmd_buffer->state.emitted_vs_prolog == prolog && !pipeline_is_dirty)
      return;

   enum amd_gfx_level chip = cmd_buffer->device->physical_device->rad_info.gfx_level;
   struct radv_graphics_pipeline *pipeline = cmd_buffer->state.graphics_pipeline;
   uint64_t prolog_va = radv_buffer_get_va(prolog->bo) + prolog->alloc->offset;

   assert(cmd_buffer->state.emitted_graphics_pipeline == cmd_buffer->state.graphics_pipeline);

   uint32_t rsrc1 = vs_shader->config.rsrc1;
   if (chip < GFX10 && G_00B228_SGPRS(prolog->rsrc1) > G_00B228_SGPRS(vs_shader->config.rsrc1))
      rsrc1 = (rsrc1 & C_00B228_SGPRS) | (prolog->rsrc1 & ~C_00B228_SGPRS);

   /* The main shader must not use less VGPRs than the prolog, otherwise shared vgprs might not
    * work.
    */
   assert(G_00B848_VGPRS(vs_shader->config.rsrc1) >= G_00B848_VGPRS(prolog->rsrc1));

   unsigned pgm_lo_reg = R_00B120_SPI_SHADER_PGM_LO_VS;
   unsigned rsrc1_reg = R_00B128_SPI_SHADER_PGM_RSRC1_VS;
   if (vs_shader->info.is_ngg || pipeline->base.shaders[MESA_SHADER_GEOMETRY] == vs_shader) {
      pgm_lo_reg = chip >= GFX10 ? R_00B320_SPI_SHADER_PGM_LO_ES : R_00B210_SPI_SHADER_PGM_LO_ES;
      rsrc1_reg = R_00B228_SPI_SHADER_PGM_RSRC1_GS;
   } else if (pipeline->base.shaders[MESA_SHADER_TESS_CTRL] == vs_shader) {
      pgm_lo_reg = chip >= GFX10 ? R_00B520_SPI_SHADER_PGM_LO_LS : R_00B410_SPI_SHADER_PGM_LO_LS;
      rsrc1_reg = R_00B428_SPI_SHADER_PGM_RSRC1_HS;
   } else if (vs_shader->info.vs.as_ls) {
      pgm_lo_reg = R_00B520_SPI_SHADER_PGM_LO_LS;
      rsrc1_reg = R_00B528_SPI_SHADER_PGM_RSRC1_LS;
   } else if (vs_shader->info.vs.as_es) {
      pgm_lo_reg = R_00B320_SPI_SHADER_PGM_LO_ES;
      rsrc1_reg = R_00B328_SPI_SHADER_PGM_RSRC1_ES;
   }

   radeon_set_sh_reg(cmd_buffer->cs, pgm_lo_reg, prolog_va >> 8);

   if (chip < GFX10)
      radeon_set_sh_reg(cmd_buffer->cs, rsrc1_reg, rsrc1);
   else
      assert(rsrc1 == vs_shader->config.rsrc1);

   radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, prolog->bo);
}

static void
emit_prolog_inputs(struct radv_cmd_buffer *cmd_buffer, struct radv_shader *vs_shader,
                   uint32_t nontrivial_divisors, bool pipeline_is_dirty)
{
   /* no need to re-emit anything in this case */
   if (!nontrivial_divisors && !pipeline_is_dirty && cmd_buffer->state.emitted_vs_prolog &&
       !cmd_buffer->state.emitted_vs_prolog->nontrivial_divisors)
      return;

   const struct radv_vs_input_state *state = &cmd_buffer->state.dynamic_vs_input;
   uint64_t input_va = radv_shader_get_va(vs_shader);

   if (nontrivial_divisors) {
      unsigned inputs_offset;
      uint32_t *inputs;
      unsigned size = 8 + util_bitcount(nontrivial_divisors) * 8;
      if (!radv_cmd_buffer_upload_alloc(cmd_buffer, size, &inputs_offset, (void **)&inputs))
         return;

      *(inputs++) = input_va;
      *(inputs++) = input_va >> 32;

      u_foreach_bit(index, nontrivial_divisors)
      {
         uint32_t div = state->divisors[index];
         if (div == 0) {
            *(inputs++) = 0;
            *(inputs++) = 1;
         } else if (util_is_power_of_two_or_zero(div)) {
            *(inputs++) = util_logbase2(div) | (1 << 8);
            *(inputs++) = 0xffffffffu;
         } else {
            struct util_fast_udiv_info info = util_compute_fast_udiv_info(div, 32, 32);
            *(inputs++) = info.pre_shift | (info.increment << 8) | (info.post_shift << 16);
            *(inputs++) = info.multiplier;
         }
      }

      input_va = radv_buffer_get_va(cmd_buffer->upload.upload_bo) + inputs_offset;
   }

   struct radv_userdata_info *loc =
      &vs_shader->info.user_sgprs_locs.shader_data[AC_UD_VS_PROLOG_INPUTS];
   uint32_t base_reg = cmd_buffer->state.graphics_pipeline->base.user_data_0[MESA_SHADER_VERTEX];
   assert(loc->sgpr_idx != -1);
   assert(loc->num_sgprs == 2);
   radv_emit_shader_pointer(cmd_buffer->device, cmd_buffer->cs, base_reg + loc->sgpr_idx * 4,
                            input_va, true);
}
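
/* Worked example of the divisor encoding above (a sketch; the exact decode
 * lives in the prolog, but the encoding follows the standard round-up
 * magic-number scheme of util_compute_fast_udiv_info()). For a nontrivial
 * divisor of 3 the helper yields multiplier = 0xAAAAAAAB, pre_shift = 0,
 * increment = 0 and post_shift = 1, so the two uploaded dwords are
 * 0x00010000 and 0xAAAAAAAB, and the prolog computes roughly
 *
 *    instance_index / 3 == (uint64_t)instance_index * 0xAAAAAAAB >> 33
 *
 * Power-of-two divisors take the (1 << 8) path instead: an all-ones
 * multiplier with increment = 1 reduces to a plain right shift by
 * log2(div). A zero divisor uploads {0, 1}, whose 64-bit product always
 * shifts down to 0, matching Vulkan's rule that divisor 0 makes every
 * instance read the first element.
 */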

static void
radv_emit_vertex_input(struct radv_cmd_buffer *cmd_buffer, bool pipeline_is_dirty)
{
   struct radv_graphics_pipeline *pipeline = cmd_buffer->state.graphics_pipeline;
   struct radv_shader *vs_shader = radv_get_shader(&pipeline->base, MESA_SHADER_VERTEX);

   assert(!cmd_buffer->state.mesh_shading);

   if (!vs_shader->info.vs.has_prolog)
      return;

   uint32_t nontrivial_divisors;
   struct radv_shader_part *prolog =
      lookup_vs_prolog(cmd_buffer, vs_shader, &nontrivial_divisors);
   if (!prolog) {
      cmd_buffer->record_result = VK_ERROR_OUT_OF_HOST_MEMORY;
      return;
   }
   emit_prolog_regs(cmd_buffer, vs_shader, prolog, pipeline_is_dirty);
   emit_prolog_inputs(cmd_buffer, vs_shader, nontrivial_divisors, pipeline_is_dirty);

   cmd_buffer->state.emitted_vs_prolog = prolog;

   if (unlikely(cmd_buffer->device->trace_bo))
      radv_save_vs_prolog(cmd_buffer, prolog);
}

static void
radv_cmd_buffer_flush_dynamic_state(struct radv_cmd_buffer *cmd_buffer, bool pipeline_is_dirty)
{
   uint64_t states =
      cmd_buffer->state.dirty & cmd_buffer->state.emitted_graphics_pipeline->needed_dynamic_state;

   if (states & (RADV_CMD_DIRTY_DYNAMIC_VIEWPORT))
      radv_emit_viewport(cmd_buffer);

   if (states & (RADV_CMD_DIRTY_DYNAMIC_SCISSOR | RADV_CMD_DIRTY_DYNAMIC_VIEWPORT) &&
       !cmd_buffer->device->physical_device->rad_info.has_gfx9_scissor_bug)
      radv_emit_scissor(cmd_buffer);

   if (states & RADV_CMD_DIRTY_DYNAMIC_LINE_WIDTH)
      radv_emit_line_width(cmd_buffer);

   if (states & RADV_CMD_DIRTY_DYNAMIC_BLEND_CONSTANTS)
      radv_emit_blend_constants(cmd_buffer);

   if (states &
       (RADV_CMD_DIRTY_DYNAMIC_STENCIL_REFERENCE | RADV_CMD_DIRTY_DYNAMIC_STENCIL_WRITE_MASK |
        RADV_CMD_DIRTY_DYNAMIC_STENCIL_COMPARE_MASK))
      radv_emit_stencil(cmd_buffer);

   if (states & RADV_CMD_DIRTY_DYNAMIC_DEPTH_BOUNDS)
      radv_emit_depth_bounds(cmd_buffer);

   if (states & RADV_CMD_DIRTY_DYNAMIC_DEPTH_BIAS)
      radv_emit_depth_bias(cmd_buffer);

   if (states & RADV_CMD_DIRTY_DYNAMIC_DISCARD_RECTANGLE)
      radv_emit_discard_rectangle(cmd_buffer);

   if (states & RADV_CMD_DIRTY_DYNAMIC_SAMPLE_LOCATIONS)
      radv_emit_sample_locations(cmd_buffer);

   if (states & (RADV_CMD_DIRTY_DYNAMIC_LINE_STIPPLE))
      radv_emit_line_stipple(cmd_buffer);

   if (states & (RADV_CMD_DIRTY_DYNAMIC_CULL_MODE | RADV_CMD_DIRTY_DYNAMIC_FRONT_FACE |
                 RADV_CMD_DIRTY_DYNAMIC_DEPTH_BIAS_ENABLE))
      radv_emit_culling(cmd_buffer, states);

   if (states & RADV_CMD_DIRTY_DYNAMIC_PRIMITIVE_TOPOLOGY)
      radv_emit_primitive_topology(cmd_buffer);

   if (states &
       (RADV_CMD_DIRTY_DYNAMIC_DEPTH_TEST_ENABLE | RADV_CMD_DIRTY_DYNAMIC_DEPTH_WRITE_ENABLE |
        RADV_CMD_DIRTY_DYNAMIC_DEPTH_COMPARE_OP | RADV_CMD_DIRTY_DYNAMIC_DEPTH_BOUNDS_TEST_ENABLE |
        RADV_CMD_DIRTY_DYNAMIC_STENCIL_TEST_ENABLE | RADV_CMD_DIRTY_DYNAMIC_STENCIL_OP))
      radv_emit_depth_control(cmd_buffer, states);

   if (states & RADV_CMD_DIRTY_DYNAMIC_STENCIL_OP)
      radv_emit_stencil_control(cmd_buffer);

   if (states & RADV_CMD_DIRTY_DYNAMIC_FRAGMENT_SHADING_RATE)
      radv_emit_fragment_shading_rate(cmd_buffer);

   if (states & RADV_CMD_DIRTY_DYNAMIC_PRIMITIVE_RESTART_ENABLE)
      radv_emit_primitive_restart_enable(cmd_buffer);

   if (states & RADV_CMD_DIRTY_DYNAMIC_RASTERIZER_DISCARD_ENABLE)
      radv_emit_rasterizer_discard_enable(cmd_buffer);

   if (states & RADV_CMD_DIRTY_DYNAMIC_LOGIC_OP)
      radv_emit_logic_op(cmd_buffer);

   if (states & RADV_CMD_DIRTY_DYNAMIC_COLOR_WRITE_ENABLE)
      radv_emit_color_write_enable(cmd_buffer);

   if (states & RADV_CMD_DIRTY_DYNAMIC_VERTEX_INPUT)
      radv_emit_vertex_input(cmd_buffer, pipeline_is_dirty);

   cmd_buffer->state.dirty &= ~states;
}

static void
radv_flush_push_descriptors(struct radv_cmd_buffer *cmd_buffer, VkPipelineBindPoint bind_point)
{
   struct radv_descriptor_state *descriptors_state =
      radv_get_descriptors_state(cmd_buffer, bind_point);
   struct radv_descriptor_set *set = (struct radv_descriptor_set *)&descriptors_state->push_set.set;
   unsigned bo_offset;

   if (!radv_cmd_buffer_upload_data(cmd_buffer, set->header.size, set->header.mapped_ptr,
                                    &bo_offset))
      return;

   set->header.va = radv_buffer_get_va(cmd_buffer->upload.upload_bo);
   set->header.va += bo_offset;
}

static void
radv_flush_indirect_descriptor_sets(struct radv_cmd_buffer *cmd_buffer,
                                    struct radv_pipeline *pipeline, VkPipelineBindPoint bind_point)
{
   struct radv_descriptor_state *descriptors_state =
      radv_get_descriptors_state(cmd_buffer, bind_point);
   uint32_t size = MAX_SETS * 4;
   uint32_t offset;
   void *ptr;

   if (!radv_cmd_buffer_upload_alloc(cmd_buffer, size, &offset, &ptr))
      return;

   for (unsigned i = 0; i < MAX_SETS; i++) {
      uint32_t *uptr = ((uint32_t *)ptr) + i;
      uint64_t set_va = 0;
      struct radv_descriptor_set *set = descriptors_state->sets[i];
      if (descriptors_state->valid & (1u << i))
         set_va = set->header.va;
      uptr[0] = set_va & 0xffffffff;
   }

   struct radeon_cmdbuf *cs = cmd_buffer->cs;
   struct radv_device *device = cmd_buffer->device;
   uint64_t va = radv_buffer_get_va(cmd_buffer->upload.upload_bo);
   va += offset;

   if (bind_point == VK_PIPELINE_BIND_POINT_GRAPHICS) {
      struct radv_graphics_pipeline *graphics_pipeline = radv_pipeline_to_graphics(pipeline);

      if (pipeline->shaders[MESA_SHADER_VERTEX])
         radv_emit_userdata_address(device, cs, pipeline, MESA_SHADER_VERTEX,
                                    AC_UD_INDIRECT_DESCRIPTOR_SETS, va);

      if (pipeline->shaders[MESA_SHADER_FRAGMENT])
         radv_emit_userdata_address(device, cs, pipeline, MESA_SHADER_FRAGMENT,
                                    AC_UD_INDIRECT_DESCRIPTOR_SETS, va);

      if (radv_pipeline_has_stage(graphics_pipeline, MESA_SHADER_MESH))
         radv_emit_userdata_address(device, cs, pipeline, MESA_SHADER_MESH,
                                    AC_UD_INDIRECT_DESCRIPTOR_SETS, va);

      if (radv_pipeline_has_stage(graphics_pipeline, MESA_SHADER_TASK))
         radv_emit_userdata_address(device, cmd_buffer->ace_internal.cs, pipeline, MESA_SHADER_TASK,
                                    AC_UD_INDIRECT_DESCRIPTOR_SETS, va);

      if (radv_pipeline_has_stage(graphics_pipeline, MESA_SHADER_GEOMETRY))
         radv_emit_userdata_address(device, cs, pipeline, MESA_SHADER_GEOMETRY,
                                    AC_UD_INDIRECT_DESCRIPTOR_SETS, va);

      if (radv_pipeline_has_stage(graphics_pipeline, MESA_SHADER_TESS_CTRL))
         radv_emit_userdata_address(device, cs, pipeline, MESA_SHADER_TESS_CTRL,
                                    AC_UD_INDIRECT_DESCRIPTOR_SETS, va);

      if (radv_pipeline_has_stage(graphics_pipeline, MESA_SHADER_TESS_EVAL))
         radv_emit_userdata_address(device, cs, pipeline, MESA_SHADER_TESS_EVAL,
                                    AC_UD_INDIRECT_DESCRIPTOR_SETS, va);
   } else {
      radv_emit_userdata_address(device, cs, pipeline, MESA_SHADER_COMPUTE,
                                 AC_UD_INDIRECT_DESCRIPTOR_SETS, va);
   }
}

static void
radv_flush_descriptors(struct radv_cmd_buffer *cmd_buffer, VkShaderStageFlags stages,
                       struct radv_pipeline *pipeline, VkPipelineBindPoint bind_point)
{
   struct radv_descriptor_state *descriptors_state =
      radv_get_descriptors_state(cmd_buffer, bind_point);
   struct radv_device *device = cmd_buffer->device;
   struct radeon_cmdbuf *cs = cmd_buffer->cs;
   bool flush_indirect_descriptors;

   if (!descriptors_state->dirty)
      return;

   if (descriptors_state->push_dirty)
      radv_flush_push_descriptors(cmd_buffer, bind_point);

   flush_indirect_descriptors = pipeline->need_indirect_descriptor_sets;

   if (flush_indirect_descriptors)
      radv_flush_indirect_descriptor_sets(cmd_buffer, pipeline, bind_point);

   ASSERTED unsigned cdw_max =
      radeon_check_space(device->ws, cs, MAX_SETS * MESA_VULKAN_SHADER_STAGES * 4);

   if (stages & VK_SHADER_STAGE_COMPUTE_BIT) {
      radv_emit_descriptor_pointers(device, cs, pipeline, descriptors_state, MESA_SHADER_COMPUTE);
   } else {
      radv_foreach_stage(stage, stages & ~VK_SHADER_STAGE_TASK_BIT_NV)
      {
         if (!cmd_buffer->state.graphics_pipeline->base.shaders[stage])
            continue;

         radv_emit_descriptor_pointers(device, cs, pipeline, descriptors_state, stage);
      }

      if (stages & VK_SHADER_STAGE_TASK_BIT_NV) {
         radv_emit_descriptor_pointers(device, cmd_buffer->ace_internal.cs, pipeline,
                                       descriptors_state, MESA_SHADER_TASK);
      }
   }

   descriptors_state->dirty = 0;
   descriptors_state->push_dirty = false;

   assert(cmd_buffer->cs->cdw <= cdw_max);

   if (unlikely(cmd_buffer->device->trace_bo))
      radv_save_descriptors(cmd_buffer, bind_point);
}

static bool
radv_shader_loads_push_constants(struct radv_pipeline *pipeline, gl_shader_stage stage)
{
   struct radv_userdata_info *loc =
      radv_lookup_user_sgpr(pipeline, stage, AC_UD_PUSH_CONSTANTS);
   return loc->sgpr_idx != -1;
}

static void
radv_emit_all_inline_push_consts(struct radv_device *device, struct radeon_cmdbuf *cs,
                                 struct radv_pipeline *pipeline, gl_shader_stage stage,
                                 uint32_t *values, bool *need_push_constants)
{
   const struct radv_shader *shader = radv_get_shader(pipeline, stage);
   if (!shader)
      return;

   *need_push_constants |= radv_shader_loads_push_constants(pipeline, stage);

   const uint64_t mask = shader->info.inline_push_constant_mask;
   if (!mask)
      return;

   const uint8_t base = ffs(mask) - 1;
   if (mask == u_bit_consecutive64(base, util_last_bit64(mask) - base)) {
      /* consecutive inline push constants */
      radv_emit_inline_push_consts(device, cs, pipeline, stage, AC_UD_INLINE_PUSH_CONSTANTS,
                                   values + base);
   } else {
      /* sparse inline push constants */
      uint32_t consts[AC_MAX_INLINE_PUSH_CONSTS];
      unsigned num_consts = 0;
      u_foreach_bit64 (idx, mask)
         consts[num_consts++] = values[idx];
      radv_emit_inline_push_consts(device, cs, pipeline, stage, AC_UD_INLINE_PUSH_CONSTANTS,
                                   consts);
   }
}
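
/* Worked example of the mask test above: for inline push-constant mask 0x38
 * (dwords 3..5), base = 3 and util_last_bit64() = 6, so
 * u_bit_consecutive64(3, 3) rebuilds 0x38 and the consecutive path uploads
 * values[3..5] straight out of the push-constant buffer. A mask with a hole,
 * e.g. 0x28 (dwords 3 and 5), fails the comparison and takes the sparse
 * path, which first gathers values[3] and values[5] into a packed array.
 */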
3576 
3577 static void
radv_flush_constants(struct radv_cmd_buffer * cmd_buffer,VkShaderStageFlags stages,struct radv_pipeline * pipeline,VkPipelineBindPoint bind_point)3578 radv_flush_constants(struct radv_cmd_buffer *cmd_buffer, VkShaderStageFlags stages,
3579                      struct radv_pipeline *pipeline, VkPipelineBindPoint bind_point)
3580 {
3581    struct radv_device *device = cmd_buffer->device;
3582    struct radeon_cmdbuf *cs = cmd_buffer->cs;
3583    struct radv_descriptor_state *descriptors_state =
3584       radv_get_descriptors_state(cmd_buffer, bind_point);
3585    struct radv_shader *shader, *prev_shader;
3586    bool need_push_constants = false;
3587    unsigned offset;
3588    void *ptr;
3589    uint64_t va;
3590    uint32_t internal_stages;
3591    uint32_t dirty_stages = 0;
3592 
3593    stages &= cmd_buffer->push_constant_stages;
3594    if (!stages || (!pipeline->push_constant_size && !pipeline->dynamic_offset_count))
3595       return;
3596 
3597    internal_stages = stages;
3598    switch (bind_point) {
3599    case VK_PIPELINE_BIND_POINT_GRAPHICS:
3600       break;
3601    case VK_PIPELINE_BIND_POINT_COMPUTE:
3602       dirty_stages = RADV_RT_STAGE_BITS;
3603       break;
3604    case VK_PIPELINE_BIND_POINT_RAY_TRACING_KHR:
3605       internal_stages = VK_SHADER_STAGE_COMPUTE_BIT;
3606       dirty_stages = VK_SHADER_STAGE_COMPUTE_BIT;
3607       break;
3608    default:
3609       unreachable("Unhandled bind point");
3610    }
3611 
3612    radv_foreach_stage(stage, internal_stages & ~VK_SHADER_STAGE_TASK_BIT_NV)
3613    {
3614       radv_emit_all_inline_push_consts(
3615          device, cs, pipeline, stage, (uint32_t *)cmd_buffer->push_constants, &need_push_constants);
3616    }
3617 
3618    if (internal_stages & VK_SHADER_STAGE_TASK_BIT_NV) {
3619       radv_emit_all_inline_push_consts(device, cmd_buffer->ace_internal.cs, pipeline,
3620                                        MESA_SHADER_TASK, (uint32_t *)cmd_buffer->push_constants,
3621                                        &need_push_constants);
3622    }
3623 
3624    if (need_push_constants) {
3625       if (!radv_cmd_buffer_upload_alloc(
3626              cmd_buffer, pipeline->push_constant_size + 16 * pipeline->dynamic_offset_count, &offset,
3627              &ptr))
3628          return;
3629 
3630       memcpy(ptr, cmd_buffer->push_constants, pipeline->push_constant_size);
3631       memcpy((char *)ptr + pipeline->push_constant_size, descriptors_state->dynamic_buffers,
3632              16 * pipeline->dynamic_offset_count);
3633 
3634       va = radv_buffer_get_va(cmd_buffer->upload.upload_bo);
3635       va += offset;
3636 
3637       ASSERTED unsigned cdw_max =
3638          radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, MESA_VULKAN_SHADER_STAGES * 4);
3639 
3640       prev_shader = NULL;
3641       radv_foreach_stage(stage, internal_stages & ~VK_SHADER_STAGE_TASK_BIT_NV)
3642       {
3643          shader = radv_get_shader(pipeline, stage);
3644 
3645          /* Avoid redundantly emitting the address for merged stages. */
3646          if (shader && shader != prev_shader) {
3647             radv_emit_userdata_address(device, cs, pipeline, stage, AC_UD_PUSH_CONSTANTS, va);
3648 
3649             prev_shader = shader;
3650          }
3651       }
3652 
3653       if (internal_stages & VK_SHADER_STAGE_TASK_BIT_NV) {
3654          radv_emit_userdata_address(device, cmd_buffer->ace_internal.cs, pipeline, MESA_SHADER_TASK,
3655                                     AC_UD_PUSH_CONSTANTS, va);
3656       }
3657 
3658       assert(cmd_buffer->cs->cdw <= cdw_max);
3659    }
3660 
3661    cmd_buffer->push_constant_stages &= ~stages;
3662    cmd_buffer->push_constant_stages |= dirty_stages;
3663 }
3664 
3665 enum radv_dst_sel {
3666    DST_SEL_0001 = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_0) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_0) |
3667                   S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_0) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_1),
3668    DST_SEL_X001 = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_0) |
3669                   S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_0) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_1),
3670    DST_SEL_XY01 = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
3671                   S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_0) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_1),
3672    DST_SEL_XYZ1 = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
3673                   S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_1),
3674    DST_SEL_XYZW = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
3675                   S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W),
3676    DST_SEL_ZYXW = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
3677                   S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W),
3678 };
3679 
3680 static const uint32_t data_format_dst_sel[] = {
3681    [V_008F0C_BUF_DATA_FORMAT_INVALID] = DST_SEL_0001,
3682    [V_008F0C_BUF_DATA_FORMAT_8] = DST_SEL_X001,
3683    [V_008F0C_BUF_DATA_FORMAT_16] = DST_SEL_X001,
3684    [V_008F0C_BUF_DATA_FORMAT_8_8] = DST_SEL_XY01,
3685    [V_008F0C_BUF_DATA_FORMAT_32] = DST_SEL_X001,
3686    [V_008F0C_BUF_DATA_FORMAT_16_16] = DST_SEL_XY01,
3687    [V_008F0C_BUF_DATA_FORMAT_10_11_11] = DST_SEL_XYZ1,
3688    [V_008F0C_BUF_DATA_FORMAT_11_11_10] = DST_SEL_XYZ1,
3689    [V_008F0C_BUF_DATA_FORMAT_10_10_10_2] = DST_SEL_XYZW,
3690    [V_008F0C_BUF_DATA_FORMAT_2_10_10_10] = DST_SEL_XYZW,
3691    [V_008F0C_BUF_DATA_FORMAT_8_8_8_8] = DST_SEL_XYZW,
3692    [V_008F0C_BUF_DATA_FORMAT_32_32] = DST_SEL_XY01,
3693    [V_008F0C_BUF_DATA_FORMAT_16_16_16_16] = DST_SEL_XYZW,
3694    [V_008F0C_BUF_DATA_FORMAT_32_32_32] = DST_SEL_XYZ1,
3695    [V_008F0C_BUF_DATA_FORMAT_32_32_32_32] = DST_SEL_XYZW,
3696 };
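/* Illustrative mapping: a two-channel format such as BUF_DATA_FORMAT_16_16 uses
 * DST_SEL_XY01, i.e. X and Y are fetched from memory while Z and W read back as
 * the constants 0 and 1, giving the (x, y, 0, 1) expansion expected for missing
 * vertex components.
 */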
3697 
3698 void
3699 radv_write_vertex_descriptors(const struct radv_cmd_buffer *cmd_buffer,
3700                               const struct radv_graphics_pipeline *pipeline,
3701                               bool full_null_descriptors, void *vb_ptr)
3702 {
3703    struct radv_shader *vs_shader = radv_get_shader(&pipeline->base, MESA_SHADER_VERTEX);
3704    enum amd_gfx_level chip = cmd_buffer->device->physical_device->rad_info.gfx_level;
3705    unsigned desc_index = 0;
3706    uint32_t mask = pipeline->vb_desc_usage_mask;
3707    uint64_t va;
3708    const struct radv_vs_input_state *vs_state =
3709       vs_shader->info.vs.dynamic_inputs ? &cmd_buffer->state.dynamic_vs_input : NULL;
3710    assert(!vs_state || pipeline->use_per_attribute_vb_descs);
3711 
3712    while (mask) {
3713       unsigned i = u_bit_scan(&mask);
3714       uint32_t *desc = &((uint32_t *)vb_ptr)[desc_index++ * 4];
3715       uint32_t offset, rsrc_word3;
3716       unsigned binding =
3717          vs_state ? cmd_buffer->state.dynamic_vs_input.bindings[i]
3718                   : (pipeline->use_per_attribute_vb_descs ? pipeline->attrib_bindings[i] : i);
3719       struct radv_buffer *buffer = cmd_buffer->vertex_binding_buffers[binding];
3720       unsigned num_records;
3721       unsigned stride;
3722 
3723       if (vs_state) {
3724          unsigned format = vs_state->formats[i];
3725          unsigned dfmt = format & 0xf;
3726          unsigned nfmt = (format >> 4) & 0x7;
3727 
3728          rsrc_word3 = vs_state->post_shuffle & (1u << i) ? DST_SEL_ZYXW : data_format_dst_sel[dfmt];
3729 
3730          if (chip >= GFX10)
3731             rsrc_word3 |= S_008F0C_FORMAT(ac_get_tbuffer_format(chip, dfmt, nfmt));
3732          else
3733             rsrc_word3 |= S_008F0C_NUM_FORMAT(nfmt) | S_008F0C_DATA_FORMAT(dfmt);
3734       } else {
3735          if (chip >= GFX10)
3736             rsrc_word3 = DST_SEL_XYZW | S_008F0C_FORMAT(V_008F0C_GFX10_FORMAT_32_UINT);
3737          else
3738             rsrc_word3 = DST_SEL_XYZW | S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_UINT) |
3739                          S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
3740       }
3741 
3742       if (pipeline->uses_dynamic_stride) {
3743          stride = cmd_buffer->vertex_bindings[binding].stride;
3744       } else {
3745          stride = pipeline->binding_stride[binding];
3746       }
3747 
3748       if (!buffer) {
3749          if (full_null_descriptors) {
3750             /* Put all the info in for the DGC generation shader in case the VBO gets overridden. */
3751             desc[0] = 0;
3752             desc[1] = S_008F04_STRIDE(stride);
3753             desc[2] = 0;
3754             desc[3] = rsrc_word3;
3755          } else if (vs_state) {
3756             /* Stride needs to be non-zero on GFX9, or else bounds checking is disabled. We need
3757              * to include the format/word3 so that the alpha channel is 1 for formats without an
3758              * alpha channel.
3759              */
3760             desc[0] = 0;
3761             desc[1] = S_008F04_STRIDE(16);
3762             desc[2] = 0;
3763             desc[3] = rsrc_word3;
3764          } else {
3765             memset(desc, 0, 4 * 4);
3766          }
3767 
3768          continue;
3769       }
3770 
3771       va = radv_buffer_get_va(buffer->bo);
3772 
3773       offset = cmd_buffer->vertex_bindings[binding].offset;
3774       va += offset + buffer->offset;
3775       if (vs_state)
3776          va += vs_state->offsets[i];
3777 
3778       if (cmd_buffer->vertex_bindings[binding].size) {
3779          num_records = cmd_buffer->vertex_bindings[binding].size;
3780       } else {
3781          num_records = vk_buffer_range(&buffer->vk, offset, VK_WHOLE_SIZE);
3782       }
3783 
3784       if (pipeline->use_per_attribute_vb_descs) {
3785          uint32_t attrib_end =
3786             vs_state ? vs_state->offsets[i] + vs_state->format_sizes[i] : pipeline->attrib_ends[i];
3787 
3788          if (num_records < attrib_end) {
3789             num_records = 0; /* not enough space for one vertex */
3790          } else if (stride == 0) {
3791             num_records = 1; /* only one vertex */
3792          } else {
3793             num_records = (num_records - attrib_end) / stride + 1;
3794             /* If attrib_offset>stride, then the compiler will increase the vertex index by
3795              * attrib_offset/stride and decrease the offset by attrib_offset%stride. This is
3796              * only allowed with static strides.
3797              */
3798             num_records += pipeline->attrib_index_offset[i];
3799          }
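         /* Worked example (illustrative): with num_records = 100 bytes, attrib_end = 8
          * and stride = 16, (100 - 8) / 16 + 1 = 6 vertices fit: vertex 5 ends at byte
          * 5 * 16 + 8 = 88 <= 100, while vertex 6 would end at byte 104.
          */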
3800 
3801          /* GFX10 uses OOB_SELECT_RAW if stride==0, so convert num_records from elements
3802           * into bytes in that case. GFX8 always uses bytes.
3803           */
3804          if (num_records && (chip == GFX8 || (chip != GFX9 && !stride))) {
3805             num_records = (num_records - 1) * stride + attrib_end;
3806          } else if (!num_records) {
3807             /* On GFX9, it seems bounds checking is disabled if both
3808              * num_records and stride are zero. This doesn't seem necessary on GFX8, GFX10 and
3809              * GFX10.3 but it doesn't hurt.
3810              */
3811             if (full_null_descriptors) {
3812                /* Put all the info in for the DGC generation shader in case the VBO gets overridden.
3813                 */
3814                desc[0] = 0;
3815                desc[1] = S_008F04_STRIDE(stride);
3816                desc[2] = 0;
3817                desc[3] = rsrc_word3;
3818             } else if (vs_state) {
3819                desc[0] = 0;
3820                desc[1] = S_008F04_STRIDE(16);
3821                desc[2] = 0;
3822                desc[3] = rsrc_word3;
3823             } else {
3824                memset(desc, 0, 16);
3825             }
3826 
3827             continue;
3828          }
3829       } else {
3830          if (chip != GFX8 && stride)
3831             num_records = DIV_ROUND_UP(num_records, stride);
3832       }
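      /* e.g. a 100-byte range with stride 16 yields DIV_ROUND_UP(100, 16) = 7
       * records; on GFX10+ structured fetches are then bounds-checked against
       * index >= 7 (see the OOB_SELECT comment below).
       */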
3833 
3834       if (chip >= GFX10) {
3835          /* OOB_SELECT chooses the out-of-bounds check:
3836           * - 1: index >= NUM_RECORDS (Structured)
3837           * - 3: offset >= NUM_RECORDS (Raw)
3838           */
3839          int oob_select = stride ? V_008F0C_OOB_SELECT_STRUCTURED : V_008F0C_OOB_SELECT_RAW;
3840          rsrc_word3 |= S_008F0C_OOB_SELECT(oob_select) | S_008F0C_RESOURCE_LEVEL(chip < GFX11);
3841       }
3842 
3843       desc[0] = va;
3844       desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32) | S_008F04_STRIDE(stride);
3845       desc[2] = num_records;
3846       desc[3] = rsrc_word3;
3847    }
3848 }
3849 
3850 static void
3851 radv_flush_vertex_descriptors(struct radv_cmd_buffer *cmd_buffer, bool pipeline_is_dirty)
3852 {
3853    if ((pipeline_is_dirty || (cmd_buffer->state.dirty & RADV_CMD_DIRTY_VERTEX_BUFFER)) &&
3854        cmd_buffer->state.graphics_pipeline->vb_desc_usage_mask) {
3855       /* Mesh shaders don't have vertex descriptors. */
3856       assert(!cmd_buffer->state.mesh_shading);
3857 
3858       struct radv_graphics_pipeline *pipeline = cmd_buffer->state.graphics_pipeline;
3859       unsigned vb_offset;
3860       void *vb_ptr;
3861       uint64_t va;
3862 
3863       /* Allocate some descriptor state for vertex buffers. */
3864       if (!radv_cmd_buffer_upload_alloc(cmd_buffer, pipeline->vb_desc_alloc_size, &vb_offset,
3865                                         &vb_ptr))
3866          return;
3867 
3868       radv_write_vertex_descriptors(cmd_buffer, pipeline, false, vb_ptr);
3869 
3870       va = radv_buffer_get_va(cmd_buffer->upload.upload_bo);
3871       va += vb_offset;
3872 
3873       radv_emit_userdata_address(cmd_buffer->device, cmd_buffer->cs, &pipeline->base,
3874                                  MESA_SHADER_VERTEX, AC_UD_VS_VERTEX_BUFFERS, va);
3875 
3876       cmd_buffer->state.vb_va = va;
3877       cmd_buffer->state.prefetch_L2_mask |= RADV_PREFETCH_VBO_DESCRIPTORS;
3878 
3879       if (unlikely(cmd_buffer->device->trace_bo))
3880          radv_save_vertex_descriptors(cmd_buffer, (uintptr_t)vb_ptr);
3881    }
3882    cmd_buffer->state.dirty &= ~RADV_CMD_DIRTY_VERTEX_BUFFER;
3883 }
3884 
3885 static void
3886 radv_emit_streamout_buffers(struct radv_cmd_buffer *cmd_buffer, uint64_t va)
3887 {
3888    struct radv_graphics_pipeline *pipeline = cmd_buffer->state.graphics_pipeline;
3889    struct radv_userdata_info *loc;
3890    uint32_t base_reg;
3891 
3892    for (unsigned stage = 0; stage < MESA_VULKAN_SHADER_STAGES; ++stage) {
3893       if (!radv_get_shader(&pipeline->base, stage))
3894          continue;
3895 
3896       loc = radv_lookup_user_sgpr(&pipeline->base, stage, AC_UD_STREAMOUT_BUFFERS);
3897       if (loc->sgpr_idx == -1)
3898          continue;
3899 
3900       base_reg = pipeline->base.user_data_0[stage];
3901 
3902       radv_emit_shader_pointer(cmd_buffer->device, cmd_buffer->cs, base_reg + loc->sgpr_idx * 4, va,
3903                                false);
3904    }
3905 
3906    if (radv_pipeline_has_gs_copy_shader(&pipeline->base)) {
3907       loc = &pipeline->base.gs_copy_shader->info.user_sgprs_locs.shader_data[AC_UD_STREAMOUT_BUFFERS];
3908       if (loc->sgpr_idx != -1) {
3909          base_reg = R_00B130_SPI_SHADER_USER_DATA_VS_0;
3910 
3911          radv_emit_shader_pointer(cmd_buffer->device, cmd_buffer->cs, base_reg + loc->sgpr_idx * 4,
3912                                   va, false);
3913       }
3914    }
3915 }
3916 
3917 static void
3918 radv_flush_streamout_descriptors(struct radv_cmd_buffer *cmd_buffer)
3919 {
3920    if (cmd_buffer->state.dirty & RADV_CMD_DIRTY_STREAMOUT_BUFFER) {
3921       struct radv_streamout_binding *sb = cmd_buffer->streamout_bindings;
3922       struct radv_streamout_state *so = &cmd_buffer->state.streamout;
3923       unsigned so_offset;
3924       void *so_ptr;
3925       uint64_t va;
3926 
3927       /* Allocate some descriptor state for streamout buffers. */
3928       if (!radv_cmd_buffer_upload_alloc(cmd_buffer, MAX_SO_BUFFERS * 16, &so_offset, &so_ptr))
3929          return;
3930 
3931       for (uint32_t i = 0; i < MAX_SO_BUFFERS; i++) {
3932          struct radv_buffer *buffer = sb[i].buffer;
3933          uint32_t *desc = &((uint32_t *)so_ptr)[i * 4];
3934 
3935          if (!(so->enabled_mask & (1 << i)))
3936             continue;
3937 
3938          va = radv_buffer_get_va(buffer->bo) + buffer->offset;
3939 
3940          va += sb[i].offset;
3941 
3942          /* Set the descriptor.
3943           *
3944           * On GFX8, the format must be non-INVALID, otherwise
3945           * the buffer will be considered not bound and store
3946           * instructions will be no-ops.
3947           */
3948          uint32_t size = 0xffffffff;
3949 
3950          /* Compute the correct buffer size for NGG streamout
3951           * because it's used to determine the max emit per
3952           * buffer.
3953           */
3954          if (cmd_buffer->device->physical_device->use_ngg_streamout)
3955             size = buffer->vk.size - sb[i].offset;
3956 
3957          uint32_t rsrc_word3 =
3958             S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
3959             S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W);
3960 
3961          if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX11) {
3962             rsrc_word3 |= S_008F0C_FORMAT(V_008F0C_GFX11_FORMAT_32_FLOAT) |
3963                           S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW);
3964          } else if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX10) {
3965             rsrc_word3 |= S_008F0C_FORMAT(V_008F0C_GFX10_FORMAT_32_FLOAT) |
3966                           S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) | S_008F0C_RESOURCE_LEVEL(1);
3967          } else {
3968             rsrc_word3 |= S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
3969          }
3970 
3971          desc[0] = va;
3972          desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32);
3973          desc[2] = size;
3974          desc[3] = rsrc_word3;
3975       }
3976 
3977       va = radv_buffer_get_va(cmd_buffer->upload.upload_bo);
3978       va += so_offset;
3979 
3980       radv_emit_streamout_buffers(cmd_buffer, va);
3981    }
3982 
3983    cmd_buffer->state.dirty &= ~RADV_CMD_DIRTY_STREAMOUT_BUFFER;
3984 }
3985 
3986 static void
3987 radv_flush_ngg_query_state(struct radv_cmd_buffer *cmd_buffer)
3988 {
3989    struct radv_graphics_pipeline *pipeline = cmd_buffer->state.graphics_pipeline;
3990    const unsigned stage = pipeline->last_vgt_api_stage;
3991    struct radv_userdata_info *loc;
3992    uint32_t ngg_query_state = 0;
3993    uint32_t base_reg;
3994 
3995    loc = radv_lookup_user_sgpr(&pipeline->base, stage, AC_UD_NGG_QUERY_STATE);
3996    if (loc->sgpr_idx == -1)
3997       return;
3998 
3999    assert(pipeline->is_ngg);
4000 
4001    /* By default NGG queries are disabled but they are enabled if the command buffer has active GDS
4002     * queries or if it's a secondary command buffer that inherits the number of generated
4003     * primitives.
4004     */
4005    if (cmd_buffer->state.active_pipeline_gds_queries ||
4006        (cmd_buffer->state.inherited_pipeline_statistics &
4007         VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_PRIMITIVES_BIT))
4008       ngg_query_state = 1;
4009 
4010    base_reg = pipeline->base.user_data_0[stage];
4011    assert(loc->sgpr_idx != -1);
4012 
4013    radeon_set_sh_reg(cmd_buffer->cs, base_reg + loc->sgpr_idx * 4, ngg_query_state);
4014 }
4015 
4016 static void
4017 radv_flush_force_vrs_state(struct radv_cmd_buffer *cmd_buffer)
4018 {
4019    struct radv_graphics_pipeline *pipeline = cmd_buffer->state.graphics_pipeline;
4020    enum amd_gfx_level gfx_level = pipeline->base.device->physical_device->rad_info.gfx_level;
4021    const unsigned stage = pipeline->last_vgt_api_stage;
4022    struct radv_userdata_info *loc;
4023    uint32_t vrs_rates = 0;
4024    uint32_t base_reg;
4025 
4026    if (!pipeline->force_vrs_per_vertex) {
4027       /* Un-set the SGPR index so we know to re-emit it later. */
4028       cmd_buffer->state.last_vrs_rates_sgpr_idx = -1;
4029       return;
4030    }
4031 
4032    loc = radv_lookup_user_sgpr(&pipeline->base, stage, AC_UD_FORCE_VRS_RATES);
4033    assert(loc->sgpr_idx != -1);
4034 
4035    base_reg = pipeline->base.user_data_0[stage];
4036 
4037    switch (cmd_buffer->device->force_vrs) {
4038    case RADV_FORCE_VRS_2x2:
4039       vrs_rates = gfx_level >= GFX11 ? V_0283D0_VRS_SHADING_RATE_2X2 : (1u << 2) | (1u << 4);
4040       break;
4041    case RADV_FORCE_VRS_2x1:
4042       vrs_rates = gfx_level >= GFX11 ? V_0283D0_VRS_SHADING_RATE_2X1 : (1u << 2) | (0u << 4);
4043       break;
4044    case RADV_FORCE_VRS_1x2:
4045       vrs_rates = gfx_level >= GFX11 ? V_0283D0_VRS_SHADING_RATE_1X2 : (0u << 2) | (1u << 4);
4046       break;
4047    default:
4048       break;
4049    }
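   /* Note (inferred from the shifts above, not from documentation): on pre-GFX11
    * chips the value appears to pack log2(X rate) in bits [3:2] and log2(Y rate)
    * in bits [5:4], so 2x2 = 0x14, 2x1 = 0x04 and 1x2 = 0x10.
    */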
4050 
4051    if (cmd_buffer->state.last_vrs_rates != vrs_rates ||
4052        cmd_buffer->state.last_vrs_rates_sgpr_idx != loc->sgpr_idx) {
4053       radeon_set_sh_reg(cmd_buffer->cs, base_reg + loc->sgpr_idx * 4, vrs_rates);
4054    }
4055 
4056    cmd_buffer->state.last_vrs_rates = vrs_rates;
4057    cmd_buffer->state.last_vrs_rates_sgpr_idx = loc->sgpr_idx;
4058 }
4059 
4060 static void
4061 radv_upload_graphics_shader_descriptors(struct radv_cmd_buffer *cmd_buffer, bool pipeline_is_dirty)
4062 {
4063    struct radv_graphics_pipeline *pipeline = cmd_buffer->state.graphics_pipeline;
4064 
4065    radv_flush_vertex_descriptors(cmd_buffer, pipeline_is_dirty);
4066    radv_flush_streamout_descriptors(cmd_buffer);
4067 
4068    VkShaderStageFlags stages = VK_SHADER_STAGE_ALL_GRAPHICS | VK_SHADER_STAGE_MESH_BIT_NV;
4069    radv_flush_descriptors(cmd_buffer, stages, &pipeline->base, VK_PIPELINE_BIND_POINT_GRAPHICS);
4070    radv_flush_constants(cmd_buffer, stages, &pipeline->base, VK_PIPELINE_BIND_POINT_GRAPHICS);
4071    radv_flush_ngg_query_state(cmd_buffer);
4072    radv_flush_force_vrs_state(cmd_buffer);
4073 }
4074 
4075 struct radv_draw_info {
4076    /**
4077     * Number of vertices.
4078     */
4079    uint32_t count;
4080 
4081    /**
4082     * First instance id.
4083     */
4084    uint32_t first_instance;
4085 
4086    /**
4087     * Number of instances.
4088     */
4089    uint32_t instance_count;
4090 
4091    /**
4092     * Whether it's an indexed draw.
4093     */
4094    bool indexed;
4095 
4096    /**
4097     * Indirect draw parameters resource.
4098     */
4099    struct radv_buffer *indirect;
4100    uint64_t indirect_offset;
4101    uint32_t stride;
4102 
4103    /**
4104     * Draw count parameters resource.
4105     */
4106    struct radv_buffer *count_buffer;
4107    uint64_t count_buffer_offset;
4108 
4109    /**
4110     * Stream output parameters resource.
4111     */
4112    struct radv_buffer *strmout_buffer;
4113    uint64_t strmout_buffer_offset;
4114 };
4115 
4116 static uint32_t
4117 radv_get_primitive_reset_index(struct radv_cmd_buffer *cmd_buffer)
4118 {
4119    uint32_t index_type = G_028A7C_INDEX_TYPE(cmd_buffer->state.index_type);
4120    switch (index_type) {
4121    case V_028A7C_VGT_INDEX_8:
4122       return 0xffu;
4123    case V_028A7C_VGT_INDEX_16:
4124       return 0xffffu;
4125    case V_028A7C_VGT_INDEX_32:
4126       return 0xffffffffu;
4127    default:
4128       unreachable("invalid index type");
4129    }
4130 }
4131 
4132 static void
4133 si_emit_ia_multi_vgt_param(struct radv_cmd_buffer *cmd_buffer, bool instanced_draw,
4134                            bool indirect_draw, bool count_from_stream_output,
4135                            uint32_t draw_vertex_count)
4136 {
4137    struct radeon_info *info = &cmd_buffer->device->physical_device->rad_info;
4138    struct radv_cmd_state *state = &cmd_buffer->state;
4139    unsigned topology = state->dynamic.primitive_topology;
4140    bool prim_restart_enable = state->dynamic.primitive_restart_enable;
4141    struct radeon_cmdbuf *cs = cmd_buffer->cs;
4142    unsigned ia_multi_vgt_param;
4143 
4144    ia_multi_vgt_param =
4145       si_get_ia_multi_vgt_param(cmd_buffer, instanced_draw, indirect_draw, count_from_stream_output,
4146                                 draw_vertex_count, topology, prim_restart_enable);
4147 
4148    if (state->last_ia_multi_vgt_param != ia_multi_vgt_param) {
4149       if (info->gfx_level == GFX9) {
4150          radeon_set_uconfig_reg_idx(cmd_buffer->device->physical_device, cs,
4151                                     R_030960_IA_MULTI_VGT_PARAM, 4, ia_multi_vgt_param);
4152       } else if (info->gfx_level >= GFX7) {
4153          radeon_set_context_reg_idx(cs, R_028AA8_IA_MULTI_VGT_PARAM, 1, ia_multi_vgt_param);
4154       } else {
4155          radeon_set_context_reg(cs, R_028AA8_IA_MULTI_VGT_PARAM, ia_multi_vgt_param);
4156       }
4157       state->last_ia_multi_vgt_param = ia_multi_vgt_param;
4158    }
4159 }
4160 
4161 static void
4162 radv_emit_draw_registers(struct radv_cmd_buffer *cmd_buffer, const struct radv_draw_info *draw_info)
4163 {
4164    struct radeon_info *info = &cmd_buffer->device->physical_device->rad_info;
4165    struct radv_cmd_state *state = &cmd_buffer->state;
4166    struct radeon_cmdbuf *cs = cmd_buffer->cs;
4167    uint32_t topology = state->dynamic.primitive_topology;
4168    bool disable_instance_packing = false;
4169 
4170    /* Draw state. */
4171    if (info->gfx_level < GFX10) {
4172       si_emit_ia_multi_vgt_param(cmd_buffer, draw_info->instance_count > 1, draw_info->indirect,
4173                                  !!draw_info->strmout_buffer,
4174                                  draw_info->indirect ? 0 : draw_info->count);
4175    }
4176 
4177    if (state->dynamic.primitive_restart_enable) {
4178       uint32_t primitive_reset_index = radv_get_primitive_reset_index(cmd_buffer);
4179 
4180       if (primitive_reset_index != state->last_primitive_reset_index) {
4181          radeon_set_context_reg(cs, R_02840C_VGT_MULTI_PRIM_IB_RESET_INDX, primitive_reset_index);
4182          state->last_primitive_reset_index = primitive_reset_index;
4183       }
4184    }
4185 
4186    if (draw_info->strmout_buffer) {
4187       uint64_t va = radv_buffer_get_va(draw_info->strmout_buffer->bo);
4188 
4189       va += draw_info->strmout_buffer->offset + draw_info->strmout_buffer_offset;
4190 
4191       radeon_set_context_reg(cs, R_028B30_VGT_STRMOUT_DRAW_OPAQUE_VERTEX_STRIDE, draw_info->stride);
4192 
4193       if (info->gfx_level >= GFX10) {
4194          /* Emitting a COPY_DATA packet should be enough because RADV doesn't support preemption
4195           * (shadow memory), but for unknown reasons it can lead to GPU hangs on GFX10+.
4196           */
4197          radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0));
4198          radeon_emit(cs, 0);
4199 
4200          radeon_emit(cs, PKT3(PKT3_LOAD_CONTEXT_REG_INDEX, 3, 0));
4201          radeon_emit(cs, va);
4202          radeon_emit(cs, va >> 32);
4203          radeon_emit(cs, (R_028B2C_VGT_STRMOUT_DRAW_OPAQUE_BUFFER_FILLED_SIZE - SI_CONTEXT_REG_OFFSET) >> 2);
4204          radeon_emit(cs, 1); /* 1 DWORD */
4205       } else {
4206          radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
4207          radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_SRC_MEM) | COPY_DATA_DST_SEL(COPY_DATA_REG) |
4208                          COPY_DATA_WR_CONFIRM);
4209          radeon_emit(cs, va);
4210          radeon_emit(cs, va >> 32);
4211          radeon_emit(cs, R_028B2C_VGT_STRMOUT_DRAW_OPAQUE_BUFFER_FILLED_SIZE >> 2);
4212          radeon_emit(cs, 0); /* unused */
4213       }
4214 
4215       radv_cs_add_buffer(cmd_buffer->device->ws, cs, draw_info->strmout_buffer->bo);
4216    }
4217 
4218    /* RDNA2 is affected by a hardware bug: when instance packing is enabled for adjacent primitive
4219     * topologies and instance_count > 1, the pipeline stats generated by the GE are incorrect. The
4220     * workaround needs to be applied to both indexed and non-indexed draws.
4221     */
4222    if (info->gfx_level == GFX10_3 && state->active_pipeline_queries > 0 &&
4223        (draw_info->instance_count > 1 || draw_info->indirect) &&
4224        (topology == V_008958_DI_PT_LINELIST_ADJ || topology == V_008958_DI_PT_LINESTRIP_ADJ ||
4225         topology == V_008958_DI_PT_TRILIST_ADJ || topology == V_008958_DI_PT_TRISTRIP_ADJ)) {
4226       disable_instance_packing = true;
4227    }
4228 
4229    if ((draw_info->indexed && state->index_type != state->last_index_type) ||
4230        (info->gfx_level == GFX10_3 &&
4231         (state->last_index_type == -1 ||
4232          disable_instance_packing != G_028A7C_DISABLE_INSTANCE_PACKING(state->last_index_type)))) {
4233       uint32_t index_type = state->index_type | S_028A7C_DISABLE_INSTANCE_PACKING(disable_instance_packing);
4234 
4235       if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX9) {
4236          radeon_set_uconfig_reg_idx(cmd_buffer->device->physical_device, cs,
4237                                     R_03090C_VGT_INDEX_TYPE, 2, index_type);
4238       } else {
4239          radeon_emit(cs, PKT3(PKT3_INDEX_TYPE, 0, 0));
4240          radeon_emit(cs, index_type);
4241       }
4242 
4243       state->last_index_type = index_type;
4244    }
4245 }
4246 
4247 static void
4248 radv_stage_flush(struct radv_cmd_buffer *cmd_buffer, VkPipelineStageFlags2 src_stage_mask)
4249 {
4250    /* For simplicity, if the barrier wants to wait for the task shader,
4251     * just make it wait for the mesh shader too.
4252     */
4253    if (src_stage_mask & VK_PIPELINE_STAGE_2_TASK_SHADER_BIT_NV)
4254       src_stage_mask |= VK_PIPELINE_STAGE_2_MESH_SHADER_BIT_NV;
4255 
4256    if (src_stage_mask & (VK_PIPELINE_STAGE_2_COPY_BIT |
4257                          VK_PIPELINE_STAGE_2_RESOLVE_BIT |
4258                          VK_PIPELINE_STAGE_2_BLIT_BIT |
4259                          VK_PIPELINE_STAGE_2_CLEAR_BIT)) {
4260       /* Be conservative for now. */
4261       src_stage_mask |= VK_PIPELINE_STAGE_2_ALL_TRANSFER_BIT;
4262    }
4263 
4264    if (src_stage_mask &
4265        (VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT | VK_PIPELINE_STAGE_2_ALL_TRANSFER_BIT |
4266         VK_PIPELINE_STAGE_2_ACCELERATION_STRUCTURE_BUILD_BIT_KHR |
4267         VK_PIPELINE_STAGE_2_ACCELERATION_STRUCTURE_COPY_BIT_KHR |
4268         VK_PIPELINE_STAGE_2_RAY_TRACING_SHADER_BIT_KHR | VK_PIPELINE_STAGE_2_BOTTOM_OF_PIPE_BIT |
4269         VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT)) {
4270       cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_CS_PARTIAL_FLUSH;
4271    }
4272 
4273    if (src_stage_mask &
4274        (VK_PIPELINE_STAGE_2_FRAGMENT_SHADER_BIT | VK_PIPELINE_STAGE_2_EARLY_FRAGMENT_TESTS_BIT |
4275         VK_PIPELINE_STAGE_2_LATE_FRAGMENT_TESTS_BIT | VK_PIPELINE_STAGE_2_COLOR_ATTACHMENT_OUTPUT_BIT |
4276         VK_PIPELINE_STAGE_2_ALL_TRANSFER_BIT | VK_PIPELINE_STAGE_2_BOTTOM_OF_PIPE_BIT |
4277         VK_PIPELINE_STAGE_2_ALL_GRAPHICS_BIT | VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT)) {
4278       cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_PS_PARTIAL_FLUSH;
4279    } else if (src_stage_mask &
4280               (VK_PIPELINE_STAGE_2_DRAW_INDIRECT_BIT | VK_PIPELINE_STAGE_2_VERTEX_INPUT_BIT |
4281                VK_PIPELINE_STAGE_2_VERTEX_SHADER_BIT |
4282                VK_PIPELINE_STAGE_2_TESSELLATION_CONTROL_SHADER_BIT |
4283                VK_PIPELINE_STAGE_2_TESSELLATION_EVALUATION_SHADER_BIT |
4284                VK_PIPELINE_STAGE_2_GEOMETRY_SHADER_BIT |
4285                VK_PIPELINE_STAGE_2_MESH_SHADER_BIT_NV |
4286                VK_PIPELINE_STAGE_2_TRANSFORM_FEEDBACK_BIT_EXT |
4287                VK_PIPELINE_STAGE_2_PRE_RASTERIZATION_SHADERS_BIT)) {
4288       cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_VS_PARTIAL_FLUSH;
4289    }
4290 }
4291 
4292 static bool
4293 can_skip_buffer_l2_flushes(struct radv_device *device)
4294 {
4295    return device->physical_device->rad_info.gfx_level == GFX9 ||
4296           (device->physical_device->rad_info.gfx_level >= GFX10 &&
4297            !device->physical_device->rad_info.tcc_rb_non_coherent);
4298 }
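/* A reading of the check above: on GFX9, and on GFX10+ parts whose RB is
 * L2-coherent (no tcc_rb_non_coherent), CB/DB output already lands in L2, so
 * buffer reads served through L2 need no extra flush.
 */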
4299 
4300 /*
4301  * In Vulkan, barriers consist of two kinds of operations:
4302  *
4303  * - availability (implemented with radv_src_access_flush)
4304  * - visibility (implemented with radv_dst_access_flush)
4305  *
4306  * For a memory operation to observe the result of a previous memory operation,
4307  * an availability operation must be performed on the source memory and then a
4308  * visibility operation on the target memory.
4309  *
4310  * The complication is that the availability and visibility operations do not
4311  * need to be in the same barrier.
4312  *
4313  * The cleanest way to implement this is to define the availability operation
4314  * as bringing the caches to a "state of rest", in which none of the caches
4315  * below that level are dirty.
4316  *
4317  * For GFX8 and earlier this would be VRAM/GTT with none of the caches dirty.
4318  *
4319  * For GFX9+ we can define the state at rest to be L2 instead of VRAM for all
4320  * buffers and for images marked as coherent, and VRAM/GTT for non-coherent
4321  * images. However, given the existence of memory barriers which do not specify
4322  * the image/buffer, it often devolves to just VRAM/GTT anyway.
4323  *
4324  * To help reduce invalidations on GPUs that have L2 coherency between the
4325  * RB and the shader caches, we always invalidate L2 on the src side, as we can
4326  * use our knowledge of past usage to optimize flushes away.
4327  */
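/* Example (illustrative): for a barrier from color-attachment writes to texture
 * reads, the two halves combine exactly as radv_emit_subpass_barrier() does:
 *
 *    cmd_buffer->state.flush_bits |=
 *       radv_src_access_flush(cmd_buffer, VK_ACCESS_2_COLOR_ATTACHMENT_WRITE_BIT, image) |
 *       radv_dst_access_flush(cmd_buffer, VK_ACCESS_2_SHADER_READ_BIT, image);
 *
 * which yields FLUSH_AND_INV_CB (plus CB_META when present) on the source side
 * and VCACHE/L2 invalidations on the destination side.
 */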
4328 
4329 enum radv_cmd_flush_bits
4330 radv_src_access_flush(struct radv_cmd_buffer *cmd_buffer, VkAccessFlags2 src_flags,
4331                       const struct radv_image *image)
4332 {
4333    bool has_CB_meta = true, has_DB_meta = true;
4334    bool image_is_coherent = image ? image->l2_coherent : false;
4335    enum radv_cmd_flush_bits flush_bits = 0;
4336 
4337    if (image) {
4338       if (!radv_image_has_CB_metadata(image))
4339          has_CB_meta = false;
4340       if (!radv_image_has_htile(image))
4341          has_DB_meta = false;
4342    }
4343 
4344    u_foreach_bit64(b, src_flags)
4345    {
4346       switch ((VkAccessFlags2)(1ull << b)) {
4347       case VK_ACCESS_2_SHADER_WRITE_BIT:
4348       case VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT:
4349          /* Since the STORAGE bit isn't set, we know that this is a meta operation.
4350           * On the dst flush side we skip CB/DB flushes without the STORAGE bit, so
4351           * emit the CB/DB flushes here. */
4352          if (image && !(image->vk.usage & VK_IMAGE_USAGE_STORAGE_BIT)) {
4353             if (vk_format_is_depth_or_stencil(image->vk.format)) {
4354                flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB;
4355             } else {
4356                flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB;
4357             }
4358          }
4359 
4360          if (!image_is_coherent)
4361             flush_bits |= RADV_CMD_FLAG_INV_L2;
4362          break;
4363       case VK_ACCESS_2_ACCELERATION_STRUCTURE_WRITE_BIT_KHR:
4364       case VK_ACCESS_2_TRANSFORM_FEEDBACK_WRITE_BIT_EXT:
4365       case VK_ACCESS_2_TRANSFORM_FEEDBACK_COUNTER_WRITE_BIT_EXT:
4366          if (!image_is_coherent)
4367             flush_bits |= RADV_CMD_FLAG_WB_L2;
4368          break;
4369       case VK_ACCESS_2_COLOR_ATTACHMENT_WRITE_BIT:
4370          flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB;
4371          if (has_CB_meta)
4372             flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB_META;
4373          break;
4374       case VK_ACCESS_2_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT:
4375          flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB;
4376          if (has_DB_meta)
4377             flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB_META;
4378          break;
4379       case VK_ACCESS_2_TRANSFER_WRITE_BIT:
4380          flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB | RADV_CMD_FLAG_FLUSH_AND_INV_DB;
4381 
4382          if (!image_is_coherent)
4383             flush_bits |= RADV_CMD_FLAG_INV_L2;
4384          if (has_CB_meta)
4385             flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB_META;
4386          if (has_DB_meta)
4387             flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB_META;
4388          break;
4389       case VK_ACCESS_2_MEMORY_WRITE_BIT:
4390          flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB | RADV_CMD_FLAG_FLUSH_AND_INV_DB;
4391 
4392          if (!image_is_coherent)
4393             flush_bits |= RADV_CMD_FLAG_INV_L2;
4394          if (has_CB_meta)
4395             flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB_META;
4396          if (has_DB_meta)
4397             flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB_META;
4398          break;
4399       default:
4400          break;
4401       }
4402    }
4403    return flush_bits;
4404 }
4405 
4406 enum radv_cmd_flush_bits
4407 radv_dst_access_flush(struct radv_cmd_buffer *cmd_buffer, VkAccessFlags2 dst_flags,
4408                       const struct radv_image *image)
4409 {
4410    bool has_CB_meta = true, has_DB_meta = true;
4411    enum radv_cmd_flush_bits flush_bits = 0;
4412    bool flush_CB = true, flush_DB = true;
4413    bool image_is_coherent = image ? image->l2_coherent : false;
4414 
4415    if (image) {
4416       if (!(image->vk.usage & VK_IMAGE_USAGE_STORAGE_BIT)) {
4417          flush_CB = false;
4418          flush_DB = false;
4419       }
4420 
4421       if (!radv_image_has_CB_metadata(image))
4422          has_CB_meta = false;
4423       if (!radv_image_has_htile(image))
4424          has_DB_meta = false;
4425    }
4426 
4427    /* None of the L2 invalidations below target the CB/DB caches. So if no incoherent images
4428     * left data in L2 through the CB/DB, that data is already usable from all the other L2 clients. */
4429    image_is_coherent |=
4430       can_skip_buffer_l2_flushes(cmd_buffer->device) && !cmd_buffer->state.rb_noncoherent_dirty;
4431 
4432    u_foreach_bit64(b, dst_flags)
4433    {
4434       switch ((VkAccessFlags2)(1ull << b)) {
4435       case VK_ACCESS_2_INDIRECT_COMMAND_READ_BIT:
4436          /* SMEM loads are used to read compute dispatch size in shaders */
4437          if (!cmd_buffer->device->load_grid_size_from_user_sgpr)
4438             flush_bits |= RADV_CMD_FLAG_INV_SCACHE;
4439 
4440          /* Ensure the DGC meta shader can read the commands. */
4441          if (cmd_buffer->device->uses_device_generated_commands) {
4442             flush_bits |= RADV_CMD_FLAG_INV_SCACHE | RADV_CMD_FLAG_INV_VCACHE;
4443 
4444             if (cmd_buffer->device->physical_device->rad_info.gfx_level < GFX9)
4445                flush_bits |= RADV_CMD_FLAG_INV_L2;
4446          }
4447 
4448          break;
4449       case VK_ACCESS_2_INDEX_READ_BIT:
4450       case VK_ACCESS_2_TRANSFORM_FEEDBACK_COUNTER_WRITE_BIT_EXT:
4451          break;
4452       case VK_ACCESS_2_UNIFORM_READ_BIT:
4453          flush_bits |= RADV_CMD_FLAG_INV_VCACHE | RADV_CMD_FLAG_INV_SCACHE;
4454          break;
4455       case VK_ACCESS_2_VERTEX_ATTRIBUTE_READ_BIT:
4456       case VK_ACCESS_2_INPUT_ATTACHMENT_READ_BIT:
4457       case VK_ACCESS_2_TRANSFER_READ_BIT:
4458       case VK_ACCESS_2_TRANSFER_WRITE_BIT:
4459          flush_bits |= RADV_CMD_FLAG_INV_VCACHE;
4460 
4461          if (has_CB_meta || has_DB_meta)
4462             flush_bits |= RADV_CMD_FLAG_INV_L2_METADATA;
4463          if (!image_is_coherent)
4464             flush_bits |= RADV_CMD_FLAG_INV_L2;
4465          break;
4466       case VK_ACCESS_2_SHADER_BINDING_TABLE_READ_BIT_KHR:
4467       case VK_ACCESS_2_SHADER_READ_BIT:
4468       case VK_ACCESS_2_SHADER_STORAGE_READ_BIT:
4469          flush_bits |= RADV_CMD_FLAG_INV_VCACHE;
4470          /* Unlike LLVM, ACO uses SMEM for SSBOs and we have to
4471           * invalidate the scalar cache. */
4472          if (!cmd_buffer->device->physical_device->use_llvm && !image)
4473             flush_bits |= RADV_CMD_FLAG_INV_SCACHE;
4474 
4475          if (has_CB_meta || has_DB_meta)
4476             flush_bits |= RADV_CMD_FLAG_INV_L2_METADATA;
4477          if (!image_is_coherent)
4478             flush_bits |= RADV_CMD_FLAG_INV_L2;
4479          break;
4480       case VK_ACCESS_2_ACCELERATION_STRUCTURE_READ_BIT_KHR:
4481          flush_bits |= RADV_CMD_FLAG_INV_VCACHE;
4482          if (cmd_buffer->device->physical_device->rad_info.gfx_level < GFX9)
4483             flush_bits |= RADV_CMD_FLAG_INV_L2;
4484          break;
4485       case VK_ACCESS_2_SHADER_WRITE_BIT:
4486       case VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT:
4487       case VK_ACCESS_2_ACCELERATION_STRUCTURE_WRITE_BIT_KHR:
4488          break;
4489       case VK_ACCESS_2_COLOR_ATTACHMENT_READ_BIT:
4490       case VK_ACCESS_2_COLOR_ATTACHMENT_WRITE_BIT:
4491          if (flush_CB)
4492             flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB;
4493          if (has_CB_meta)
4494             flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB_META;
4495          break;
4496       case VK_ACCESS_2_DEPTH_STENCIL_ATTACHMENT_READ_BIT:
4497       case VK_ACCESS_2_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT:
4498          if (flush_DB)
4499             flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB;
4500          if (has_DB_meta)
4501             flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB_META;
4502          break;
4503       case VK_ACCESS_2_MEMORY_READ_BIT:
4504       case VK_ACCESS_2_MEMORY_WRITE_BIT:
4505          flush_bits |= RADV_CMD_FLAG_INV_VCACHE | RADV_CMD_FLAG_INV_SCACHE;
4506          if (!image_is_coherent)
4507             flush_bits |= RADV_CMD_FLAG_INV_L2;
4508          if (flush_CB)
4509             flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB;
4510          if (has_CB_meta)
4511             flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB_META;
4512          if (flush_DB)
4513             flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB;
4514          if (has_DB_meta)
4515             flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB_META;
4516          break;
4517       default:
4518          break;
4519       }
4520    }
4521    return flush_bits;
4522 }
4523 
4524 void
4525 radv_emit_subpass_barrier(struct radv_cmd_buffer *cmd_buffer,
4526                           const struct radv_subpass_barrier *barrier)
4527 {
4528    struct radv_render_pass *pass = cmd_buffer->state.pass;
4529 
4530    for (uint32_t i = 0; i < pass->attachment_count; i++) {
4531       struct radv_image_view *iview = cmd_buffer->state.attachments[i].iview;
4532 
4533       cmd_buffer->state.flush_bits |=
4534          radv_src_access_flush(cmd_buffer, barrier->src_access_mask, iview->image);
4535    }
4536 
4537    radv_stage_flush(cmd_buffer, barrier->src_stage_mask);
4538 
4539    for (uint32_t i = 0; i < pass->attachment_count; i++) {
4540       struct radv_image_view *iview = cmd_buffer->state.attachments[i].iview;
4541 
4542       cmd_buffer->state.flush_bits |=
4543          radv_dst_access_flush(cmd_buffer, barrier->dst_access_mask, iview->image);
4544    }
4545 
4546    radv_ace_internal_barrier(cmd_buffer, barrier->src_stage_mask, barrier->dst_stage_mask);
4547 }
4548 
4549 uint32_t
4550 radv_get_subpass_id(struct radv_cmd_buffer *cmd_buffer)
4551 {
4552    struct radv_cmd_state *state = &cmd_buffer->state;
4553    uint32_t subpass_id = state->subpass - state->pass->subpasses;
4554 
4555    /* The id of this subpass shouldn't exceed the number of subpasses in
4556     * this render pass minus 1.
4557     */
4558    assert(subpass_id < state->pass->subpass_count);
4559    return subpass_id;
4560 }
4561 
4562 static struct radv_sample_locations_state *
4563 radv_get_attachment_sample_locations(struct radv_cmd_buffer *cmd_buffer, uint32_t att_idx,
4564                                      bool begin_subpass)
4565 {
4566    struct radv_cmd_state *state = &cmd_buffer->state;
4567    uint32_t subpass_id = radv_get_subpass_id(cmd_buffer);
4568    struct radv_image_view *view = state->attachments[att_idx].iview;
4569 
4570    if (view->image->info.samples == 1)
4571       return NULL;
4572 
4573    if (state->pass->attachments[att_idx].first_subpass_idx == subpass_id) {
4574       /* Return the initial sample locations if this is the initial
4575        * layout transition of the given subpass attachment.
4576        */
4577       if (state->attachments[att_idx].sample_location.count > 0)
4578          return &state->attachments[att_idx].sample_location;
4579    } else {
4580       /* Otherwise return the subpass sample locations if defined. */
4581       if (state->subpass_sample_locs) {
4582          /* Because the driver sets the current subpass before
4583           * initial layout transitions, we should use the sample
4584           * locations from the previous subpass to avoid an
4585           * off-by-one problem. Otherwise, use the sample
4586           * locations for the current subpass for final layout
4587           * transitions.
4588           */
4589          if (begin_subpass)
4590             subpass_id--;
4591 
4592          for (uint32_t i = 0; i < state->num_subpass_sample_locs; i++) {
4593             if (state->subpass_sample_locs[i].subpass_idx == subpass_id)
4594                return &state->subpass_sample_locs[i].sample_location;
4595          }
4596       }
4597    }
4598 
4599    return NULL;
4600 }
4601 
4602 static void
4603 radv_handle_subpass_image_transition(struct radv_cmd_buffer *cmd_buffer,
4604                                      struct radv_subpass_attachment att, bool begin_subpass)
4605 {
4606    unsigned idx = att.attachment;
4607    struct radv_image_view *view = cmd_buffer->state.attachments[idx].iview;
4608    struct radv_sample_locations_state *sample_locs;
4609    VkImageSubresourceRange range;
4610    range.aspectMask = view->vk.aspects;
4611    range.baseMipLevel = view->vk.base_mip_level;
4612    range.levelCount = 1;
4613    range.baseArrayLayer = view->vk.base_array_layer;
4614    range.layerCount = cmd_buffer->state.framebuffer->layers;
4615 
4616    if (cmd_buffer->state.subpass->view_mask) {
4617       /* If the current subpass uses multiview, the driver might have
4618        * performed a fast color/depth clear to the whole image
4619        * (including all layers). To make sure the driver will
4620        * decompress the image correctly (if needed), we have to
4621        * account for the "real" number of layers. If the view mask is
4622        * sparse, this will decompress more layers than needed.
4623        */
4624       range.layerCount = util_last_bit(cmd_buffer->state.subpass->view_mask);
4625    }
4626 
4627    /* Get the subpass sample locations for the given attachment, if NULL
4628     * is returned the driver will use the default HW locations.
4629     */
4630    sample_locs = radv_get_attachment_sample_locations(cmd_buffer, idx, begin_subpass);
4631 
4632    /* Determine if the subpass uses separate depth/stencil layouts. */
4633    bool uses_separate_depth_stencil_layouts = false;
4634    if ((cmd_buffer->state.attachments[idx].current_layout !=
4635         cmd_buffer->state.attachments[idx].current_stencil_layout) ||
4636        (att.layout != att.stencil_layout)) {
4637       uses_separate_depth_stencil_layouts = true;
4638    }
4639 
4640    /* For separate layouts, perform depth and stencil transitions
4641     * separately.
4642     */
4643    if (uses_separate_depth_stencil_layouts &&
4644        (range.aspectMask == (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT))) {
4645       /* Depth-only transitions. */
4646       range.aspectMask = VK_IMAGE_ASPECT_DEPTH_BIT;
4647       radv_handle_image_transition(cmd_buffer, view->image,
4648                                    cmd_buffer->state.attachments[idx].current_layout,
4649                                    cmd_buffer->state.attachments[idx].current_in_render_loop,
4650                                    att.layout, att.in_render_loop, 0, 0, &range, sample_locs);
4651 
4652       /* Stencil-only transitions. */
4653       range.aspectMask = VK_IMAGE_ASPECT_STENCIL_BIT;
4654       radv_handle_image_transition(
4655          cmd_buffer, view->image, cmd_buffer->state.attachments[idx].current_stencil_layout,
4656          cmd_buffer->state.attachments[idx].current_in_render_loop, att.stencil_layout,
4657          att.in_render_loop, 0, 0, &range, sample_locs);
4658    } else {
4659       radv_handle_image_transition(cmd_buffer, view->image,
4660                                    cmd_buffer->state.attachments[idx].current_layout,
4661                                    cmd_buffer->state.attachments[idx].current_in_render_loop,
4662                                    att.layout, att.in_render_loop, 0, 0, &range, sample_locs);
4663    }
4664 
4665    cmd_buffer->state.attachments[idx].current_layout = att.layout;
4666    cmd_buffer->state.attachments[idx].current_stencil_layout = att.stencil_layout;
4667    cmd_buffer->state.attachments[idx].current_in_render_loop = att.in_render_loop;
4668 }
4669 
4670 void
4671 radv_cmd_buffer_set_subpass(struct radv_cmd_buffer *cmd_buffer, const struct radv_subpass *subpass)
4672 {
4673    cmd_buffer->state.subpass = subpass;
4674 
4675    cmd_buffer->state.dirty |= RADV_CMD_DIRTY_FRAMEBUFFER;
4676 }
4677 
4678 static VkResult
4679 radv_cmd_state_setup_sample_locations(struct radv_cmd_buffer *cmd_buffer,
4680                                       struct radv_render_pass *pass,
4681                                       const VkRenderPassBeginInfo *info)
4682 {
4683    const struct VkRenderPassSampleLocationsBeginInfoEXT *sample_locs =
4684       vk_find_struct_const(info->pNext, RENDER_PASS_SAMPLE_LOCATIONS_BEGIN_INFO_EXT);
4685    struct radv_cmd_state *state = &cmd_buffer->state;
4686 
4687    if (!sample_locs) {
4688       state->subpass_sample_locs = NULL;
4689       return VK_SUCCESS;
4690    }
4691 
4692    for (uint32_t i = 0; i < sample_locs->attachmentInitialSampleLocationsCount; i++) {
4693       const VkAttachmentSampleLocationsEXT *att_sample_locs =
4694          &sample_locs->pAttachmentInitialSampleLocations[i];
4695       uint32_t att_idx = att_sample_locs->attachmentIndex;
4696       struct radv_image *image = cmd_buffer->state.attachments[att_idx].iview->image;
4697 
4698       assert(vk_format_is_depth_or_stencil(image->vk.format));
4699 
4700       /* From the Vulkan spec 1.1.108:
4701        *
4702        * "If the image referenced by the framebuffer attachment at
4703        *  index attachmentIndex was not created with
4704        *  VK_IMAGE_CREATE_SAMPLE_LOCATIONS_COMPATIBLE_DEPTH_BIT_EXT
4705        *  then the values specified in sampleLocationsInfo are
4706        *  ignored."
4707        */
4708       if (!(image->vk.create_flags & VK_IMAGE_CREATE_SAMPLE_LOCATIONS_COMPATIBLE_DEPTH_BIT_EXT))
4709          continue;
4710 
4711       const VkSampleLocationsInfoEXT *sample_locs_info = &att_sample_locs->sampleLocationsInfo;
4712 
4713       state->attachments[att_idx].sample_location.per_pixel =
4714          sample_locs_info->sampleLocationsPerPixel;
4715       state->attachments[att_idx].sample_location.grid_size =
4716          sample_locs_info->sampleLocationGridSize;
4717       state->attachments[att_idx].sample_location.count = sample_locs_info->sampleLocationsCount;
4718       typed_memcpy(&state->attachments[att_idx].sample_location.locations[0],
4719                    sample_locs_info->pSampleLocations, sample_locs_info->sampleLocationsCount);
4720    }
4721 
4722    state->subpass_sample_locs =
4723       vk_alloc(&cmd_buffer->pool->vk.alloc,
4724                sample_locs->postSubpassSampleLocationsCount * sizeof(state->subpass_sample_locs[0]),
4725                8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
4726    if (state->subpass_sample_locs == NULL) {
4727       cmd_buffer->record_result = VK_ERROR_OUT_OF_HOST_MEMORY;
4728       return cmd_buffer->record_result;
4729    }
4730 
4731    state->num_subpass_sample_locs = sample_locs->postSubpassSampleLocationsCount;
4732 
4733    for (uint32_t i = 0; i < sample_locs->postSubpassSampleLocationsCount; i++) {
4734       const VkSubpassSampleLocationsEXT *subpass_sample_locs_info =
4735          &sample_locs->pPostSubpassSampleLocations[i];
4736       const VkSampleLocationsInfoEXT *sample_locs_info =
4737          &subpass_sample_locs_info->sampleLocationsInfo;
4738 
4739       state->subpass_sample_locs[i].subpass_idx = subpass_sample_locs_info->subpassIndex;
4740       state->subpass_sample_locs[i].sample_location.per_pixel =
4741          sample_locs_info->sampleLocationsPerPixel;
4742       state->subpass_sample_locs[i].sample_location.grid_size =
4743          sample_locs_info->sampleLocationGridSize;
4744       state->subpass_sample_locs[i].sample_location.count = sample_locs_info->sampleLocationsCount;
4745       typed_memcpy(&state->subpass_sample_locs[i].sample_location.locations[0],
4746                    sample_locs_info->pSampleLocations, sample_locs_info->sampleLocationsCount);
4747    }
4748 
4749    return VK_SUCCESS;
4750 }
4751 
4752 static VkResult
4753 radv_cmd_state_setup_attachments(struct radv_cmd_buffer *cmd_buffer, struct radv_render_pass *pass,
4754                                  const VkRenderPassBeginInfo *info)
4755 {
4756    struct radv_cmd_state *state = &cmd_buffer->state;
4757    const struct VkRenderPassAttachmentBeginInfo *attachment_info = NULL;
4758 
4759    if (info) {
4760       attachment_info = vk_find_struct_const(info->pNext, RENDER_PASS_ATTACHMENT_BEGIN_INFO);
4761    }
4762 
4763    if (pass->attachment_count == 0) {
4764       state->attachments = NULL;
4765       return VK_SUCCESS;
4766    }
4767 
4768    state->attachments =
4769       vk_alloc(&cmd_buffer->pool->vk.alloc, pass->attachment_count * sizeof(state->attachments[0]),
4770                8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
4771    if (state->attachments == NULL) {
4772       cmd_buffer->record_result = VK_ERROR_OUT_OF_HOST_MEMORY;
4773       return cmd_buffer->record_result;
4774    }
4775 
4776    for (uint32_t i = 0; i < pass->attachment_count; ++i) {
4777       struct radv_render_pass_attachment *att = &pass->attachments[i];
4778       VkImageAspectFlags att_aspects = vk_format_aspects(att->format);
4779       VkImageAspectFlags clear_aspects = 0;
4780 
4781       if (att_aspects == VK_IMAGE_ASPECT_COLOR_BIT) {
4782          /* color attachment */
4783          if (att->load_op == VK_ATTACHMENT_LOAD_OP_CLEAR) {
4784             clear_aspects |= VK_IMAGE_ASPECT_COLOR_BIT;
4785          }
4786       } else {
4787          /* depthstencil attachment */
4788          if ((att_aspects & VK_IMAGE_ASPECT_DEPTH_BIT) &&
4789              att->load_op == VK_ATTACHMENT_LOAD_OP_CLEAR) {
4790             clear_aspects |= VK_IMAGE_ASPECT_DEPTH_BIT;
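            /* Apparent rationale: since depth is cleared anyway, clearing a
             * DONT_CARE stencil alongside it is free (DONT_CARE permits any
             * value) and lets one combined clear handle both aspects.
             */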
4791             if ((att_aspects & VK_IMAGE_ASPECT_STENCIL_BIT) &&
4792                 att->stencil_load_op == VK_ATTACHMENT_LOAD_OP_DONT_CARE)
4793                clear_aspects |= VK_IMAGE_ASPECT_STENCIL_BIT;
4794          }
4795          if ((att_aspects & VK_IMAGE_ASPECT_STENCIL_BIT) &&
4796              att->stencil_load_op == VK_ATTACHMENT_LOAD_OP_CLEAR) {
4797             clear_aspects |= VK_IMAGE_ASPECT_STENCIL_BIT;
4798          }
4799       }
4800 
4801       state->attachments[i].pending_clear_aspects = clear_aspects;
4802       state->attachments[i].cleared_views = 0;
4803       if (clear_aspects && info) {
4804          assert(info->clearValueCount > i);
4805          state->attachments[i].clear_value = info->pClearValues[i];
4806       }
4807 
4808       state->attachments[i].current_layout = att->initial_layout;
4809       state->attachments[i].current_in_render_loop = false;
4810       state->attachments[i].current_stencil_layout = att->stencil_initial_layout;
4811       state->attachments[i].sample_location.count = 0;
4812 
4813       struct radv_image_view *iview;
4814       if (attachment_info && attachment_info->attachmentCount > i) {
4815          iview = radv_image_view_from_handle(attachment_info->pAttachments[i]);
4816       } else {
4817          iview = radv_image_view_from_handle(state->framebuffer->attachments[i]);
4818       }
4819 
4820       state->attachments[i].iview = iview;
4821       if (iview->vk.aspects & (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT)) {
4822          radv_initialise_ds_surface(cmd_buffer->device, &state->attachments[i].ds, iview);
4823       } else {
4824          radv_initialise_color_surface(cmd_buffer->device, &state->attachments[i].cb, iview);
4825       }
4826    }
4827 
4828    return VK_SUCCESS;
4829 }
4830 
4831 VKAPI_ATTR VkResult VKAPI_CALL
4832 radv_AllocateCommandBuffers(VkDevice _device, const VkCommandBufferAllocateInfo *pAllocateInfo,
4833                             VkCommandBuffer *pCommandBuffers)
4834 {
4835    RADV_FROM_HANDLE(radv_device, device, _device);
4836    RADV_FROM_HANDLE(radv_cmd_pool, pool, pAllocateInfo->commandPool);
4837 
4838    VkResult result = VK_SUCCESS;
4839    uint32_t i;
4840 
4841    for (i = 0; i < pAllocateInfo->commandBufferCount; i++) {
4842 
4843       if (!list_is_empty(&pool->free_cmd_buffers)) {
4844          struct radv_cmd_buffer *cmd_buffer =
4845             list_first_entry(&pool->free_cmd_buffers, struct radv_cmd_buffer, pool_link);
4846 
4847          list_del(&cmd_buffer->pool_link);
4848          list_addtail(&cmd_buffer->pool_link, &pool->cmd_buffers);
4849 
4850          result = radv_reset_cmd_buffer(cmd_buffer);
4851          vk_command_buffer_finish(&cmd_buffer->vk);
4852          VkResult init_result =
4853             vk_command_buffer_init(&cmd_buffer->vk, &pool->vk, pAllocateInfo->level);
4854          if (init_result != VK_SUCCESS)
4855             result = init_result;
4856 
4857          pCommandBuffers[i] = radv_cmd_buffer_to_handle(cmd_buffer);
4858       } else {
4859          result = radv_create_cmd_buffer(device, pool, pAllocateInfo->level, &pCommandBuffers[i]);
4860       }
4861       if (result != VK_SUCCESS)
4862          break;
4863    }
4864 
4865    if (result != VK_SUCCESS) {
4866       radv_FreeCommandBuffers(_device, pAllocateInfo->commandPool, i, pCommandBuffers);
4867 
4868       /* From the Vulkan 1.0.66 spec:
4869        *
4870        * "vkAllocateCommandBuffers can be used to create multiple
4871        *  command buffers. If the creation of any of those command
4872        *  buffers fails, the implementation must destroy all
4873        *  successfully created command buffer objects from this
4874        *  command, set all entries of the pCommandBuffers array to
4875        *  NULL and return the error."
4876        */
4877       memset(pCommandBuffers, 0, sizeof(*pCommandBuffers) * pAllocateInfo->commandBufferCount);
4878    }
4879 
4880    return result;
4881 }
4882 
4883 VKAPI_ATTR void VKAPI_CALL
4884 radv_FreeCommandBuffers(VkDevice device, VkCommandPool commandPool, uint32_t commandBufferCount,
4885                         const VkCommandBuffer *pCommandBuffers)
4886 {
4887    RADV_FROM_HANDLE(radv_cmd_pool, pool, commandPool);
4888 
4889    for (uint32_t i = 0; i < commandBufferCount; i++) {
4890       RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, pCommandBuffers[i]);
4891 
4892       if (!cmd_buffer)
4893          continue;
4894       assert(cmd_buffer->pool == pool);
4895 
4896       list_del(&cmd_buffer->pool_link);
4897       list_addtail(&cmd_buffer->pool_link, &pool->free_cmd_buffers);
4898    }
4899 }
4900 
4901 VKAPI_ATTR VkResult VKAPI_CALL
4902 radv_ResetCommandBuffer(VkCommandBuffer commandBuffer, VkCommandBufferResetFlags flags)
4903 {
4904    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
4905    return radv_reset_cmd_buffer(cmd_buffer);
4906 }
4907 
4908 static void
4909 radv_inherit_dynamic_rendering(struct radv_cmd_buffer *cmd_buffer,
4910                                const VkCommandBufferInheritanceInfo *inherit_info,
4911                                const VkCommandBufferInheritanceRenderingInfo *dyn_info)
4912 {
4913    const VkAttachmentSampleCountInfoAMD *sample_info =
4914       vk_find_struct_const(inherit_info->pNext, ATTACHMENT_SAMPLE_COUNT_INFO_AMD);
4915    VkResult result;
4916    /* (normal + resolve) for each color attachment, (normal + resolve) for depth/stencil, plus a VRS attachment */
4917    VkAttachmentDescription2 att_desc[MAX_RTS * 2 + 3];
4918    VkAttachmentReference2 color_refs[MAX_RTS], ds_ref;
4919    unsigned att_count = 0;
4920 
4921    VkSubpassDescription2 subpass = {
4922       .sType = VK_STRUCTURE_TYPE_SUBPASS_DESCRIPTION_2,
4923       .pipelineBindPoint = VK_PIPELINE_BIND_POINT_GRAPHICS,
4924       .viewMask = dyn_info->viewMask,
4925       .colorAttachmentCount = dyn_info->colorAttachmentCount,
4926       .pColorAttachments = color_refs,
4927    };
4928 
4929    for (unsigned i = 0; i < dyn_info->colorAttachmentCount; ++i) {
4930       if (dyn_info->pColorAttachmentFormats[i] == VK_FORMAT_UNDEFINED) {
4931          color_refs[i] = (VkAttachmentReference2){
4932             .sType = VK_STRUCTURE_TYPE_ATTACHMENT_REFERENCE_2,
4933             .attachment = VK_ATTACHMENT_UNUSED,
4934          };
4935          continue;
4936       }
4937 
4938       color_refs[i] = (VkAttachmentReference2){
4939          .sType = VK_STRUCTURE_TYPE_ATTACHMENT_REFERENCE_2,
4940          .attachment = att_count,
4941          .layout = VK_IMAGE_LAYOUT_GENERAL, /* Shouldn't be used */
4942          .aspectMask = 0,                   /* Shouldn't be used */
4943       };
4944 
4945       VkAttachmentDescription2 *att = att_desc + att_count++;
4946       memset(att, 0, sizeof(*att));
4947       att->sType = VK_STRUCTURE_TYPE_ATTACHMENT_DESCRIPTION_2;
4948       att->format = dyn_info->pColorAttachmentFormats[i];
4949       att->samples =
4950          sample_info ? sample_info->pColorAttachmentSamples[i] : dyn_info->rasterizationSamples;
4951       att->loadOp = VK_ATTACHMENT_LOAD_OP_LOAD;
4952       att->storeOp = VK_ATTACHMENT_STORE_OP_STORE;
4953       att->initialLayout = VK_IMAGE_LAYOUT_GENERAL;
4954       att->finalLayout = VK_IMAGE_LAYOUT_GENERAL;
4955    }
4956 
4957    if (dyn_info->depthAttachmentFormat != VK_FORMAT_UNDEFINED ||
4958        dyn_info->stencilAttachmentFormat != VK_FORMAT_UNDEFINED) {
4959       VkFormat fmt = dyn_info->depthAttachmentFormat != VK_FORMAT_UNDEFINED
4960                         ? dyn_info->depthAttachmentFormat
4961                         : dyn_info->stencilAttachmentFormat;
4962 
4963       ds_ref = (VkAttachmentReference2){
4964          .sType = VK_STRUCTURE_TYPE_ATTACHMENT_REFERENCE_2,
4965          .attachment = att_count,
4966          .layout = VK_IMAGE_LAYOUT_GENERAL, /* Shouldn't be used */
4967          .aspectMask = 0,                   /* Shouldn't be used */
4968       };
4969       subpass.pDepthStencilAttachment = &ds_ref;
4970 
4971       VkAttachmentDescription2 *att = att_desc + att_count++;
4972 
4973       memset(att, 0, sizeof(*att));
4974       att->sType = VK_STRUCTURE_TYPE_ATTACHMENT_DESCRIPTION_2;
4975       att->format = fmt;
4976       att->samples =
4977          sample_info ? sample_info->depthStencilAttachmentSamples : dyn_info->rasterizationSamples;
4978       att->loadOp = VK_ATTACHMENT_LOAD_OP_LOAD;
4979       att->storeOp = VK_ATTACHMENT_STORE_OP_STORE;
4980       att->stencilLoadOp = VK_ATTACHMENT_LOAD_OP_LOAD;
4981       att->stencilStoreOp = VK_ATTACHMENT_STORE_OP_STORE;
4982    }
4983 
4984    VkRenderPassCreateInfo2 rp_create_info = {
4985       .sType = VK_STRUCTURE_TYPE_RENDER_PASS_CREATE_INFO_2,
4986       .attachmentCount = att_count,
4987       .pAttachments = att_desc,
4988       .subpassCount = 1,
4989       .pSubpasses = &subpass,
4990    };
4991 
4992    VkRenderPass rp;
4993    result =
4994       radv_CreateRenderPass2(radv_device_to_handle(cmd_buffer->device), &rp_create_info, NULL, &rp);
4995    if (result != VK_SUCCESS) {
4996       cmd_buffer->record_result = result;
4997       return;
4998    }
4999 
5000    cmd_buffer->state.pass = radv_render_pass_from_handle(rp);
5001    cmd_buffer->state.own_render_pass = true;
5002 }
5003 
5004 VKAPI_ATTR VkResult VKAPI_CALL
5005 radv_BeginCommandBuffer(VkCommandBuffer commandBuffer, const VkCommandBufferBeginInfo *pBeginInfo)
5006 {
5007    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5008    VkResult result = VK_SUCCESS;
5009 
5010    if (cmd_buffer->status != RADV_CMD_BUFFER_STATUS_INITIAL) {
5011       /* If the command buffer has already been reset with
5012        * vkResetCommandBuffer, there is no need to do it again.
5013        */
5014       result = radv_reset_cmd_buffer(cmd_buffer);
5015       if (result != VK_SUCCESS)
5016          return result;
5017    }
5018 
5019    memset(&cmd_buffer->state, 0, sizeof(cmd_buffer->state));
5020    cmd_buffer->state.last_primitive_reset_en = -1;
5021    cmd_buffer->state.last_index_type = -1;
5022    cmd_buffer->state.last_num_instances = -1;
5023    cmd_buffer->state.last_vertex_offset = -1;
5024    cmd_buffer->state.last_first_instance = -1;
5025    cmd_buffer->state.last_drawid = -1;
5026    cmd_buffer->state.last_subpass_color_count = MAX_RTS;
5027    cmd_buffer->state.predication_type = -1;
5028    cmd_buffer->state.last_sx_ps_downconvert = -1;
5029    cmd_buffer->state.last_sx_blend_opt_epsilon = -1;
5030    cmd_buffer->state.last_sx_blend_opt_control = -1;
5031    cmd_buffer->state.last_nggc_settings = -1;
5032    cmd_buffer->state.last_nggc_settings_sgpr_idx = -1;
5033    cmd_buffer->state.mesh_shading = false;
5034    cmd_buffer->state.last_vrs_rates = -1;
5035    cmd_buffer->state.last_vrs_rates_sgpr_idx = -1;
5036    cmd_buffer->usage_flags = pBeginInfo->flags;
5037 
5038    if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY &&
5039        (pBeginInfo->flags & VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT)) {
5040       struct radv_subpass *subpass = NULL;
5041 
5042       assert(pBeginInfo->pInheritanceInfo);
5043 
5044       cmd_buffer->state.framebuffer =
5045          vk_framebuffer_from_handle(pBeginInfo->pInheritanceInfo->framebuffer);
5046 
5047       if (pBeginInfo->pInheritanceInfo->renderPass) {
5048          cmd_buffer->state.pass =
5049             radv_render_pass_from_handle(pBeginInfo->pInheritanceInfo->renderPass);
5050          assert(pBeginInfo->pInheritanceInfo->subpass < cmd_buffer->state.pass->subpass_count);
5051          subpass = &cmd_buffer->state.pass->subpasses[pBeginInfo->pInheritanceInfo->subpass];
5052       } else {
5053          const VkCommandBufferInheritanceRenderingInfo *dyn_info =
5054             vk_find_struct_const(pBeginInfo->pInheritanceInfo->pNext,
5055                                  COMMAND_BUFFER_INHERITANCE_RENDERING_INFO);
5056          if (dyn_info) {
5057             radv_inherit_dynamic_rendering(cmd_buffer, pBeginInfo->pInheritanceInfo, dyn_info);
5058             subpass = &cmd_buffer->state.pass->subpasses[0];
5059          }
5060       }
5061 
5062       if (cmd_buffer->state.framebuffer) {
5063          result = radv_cmd_state_setup_attachments(cmd_buffer, cmd_buffer->state.pass, NULL);
5064          if (result != VK_SUCCESS)
5065             return result;
5066       }
5067 
5068       cmd_buffer->state.inherited_pipeline_statistics =
5069          pBeginInfo->pInheritanceInfo->pipelineStatistics;
5070 
5071       if (cmd_buffer->state.pass) {
5072          cmd_buffer->state.subpass = subpass;
5073          if (cmd_buffer->state.framebuffer)
5074             cmd_buffer->state.dirty |= RADV_CMD_DIRTY_FRAMEBUFFER;
5075       }
5076    }
5077 
5078    if (unlikely(cmd_buffer->device->trace_bo))
5079       radv_cmd_buffer_trace_emit(cmd_buffer);
5080 
5081    radv_describe_begin_cmd_buffer(cmd_buffer);
5082 
5083    cmd_buffer->status = RADV_CMD_BUFFER_STATUS_RECORDING;
5084 
5085    return result;
5086 }
5087 
5088 VKAPI_ATTR void VKAPI_CALL
5089 radv_CmdBindVertexBuffers2(VkCommandBuffer commandBuffer, uint32_t firstBinding,
5090                            uint32_t bindingCount, const VkBuffer *pBuffers,
5091                            const VkDeviceSize *pOffsets, const VkDeviceSize *pSizes,
5092                            const VkDeviceSize *pStrides)
5093 {
5094    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5095    struct radv_vertex_binding *vb = cmd_buffer->vertex_bindings;
5096    const struct radv_vs_input_state *state = &cmd_buffer->state.dynamic_vs_input;
5097 
5098    /* We have to defer setting up the vertex buffers since we need the
5099     * buffer stride from the pipeline. */
5100 
5101    assert(firstBinding + bindingCount <= MAX_VBS);
5102    enum amd_gfx_level chip = cmd_buffer->device->physical_device->rad_info.gfx_level;
5103 
5104    if (firstBinding + bindingCount > cmd_buffer->used_vertex_bindings)
5105       cmd_buffer->used_vertex_bindings = firstBinding + bindingCount;
5106 
5107    uint32_t misaligned_mask_invalid = 0;
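   /* A binding is re-checked for alignment when its offset or stride changes
    * modulo 4, likely because GFX6 and GFX10+ (see the check below) fetch
    * vertices with typed buffer loads that care about dword alignment. */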
5108 
5109    for (uint32_t i = 0; i < bindingCount; i++) {
5110       RADV_FROM_HANDLE(radv_buffer, buffer, pBuffers[i]);
5111       uint32_t idx = firstBinding + i;
5112       VkDeviceSize size = pSizes ? pSizes[i] : 0;
5113       /* If pStrides is NULL, this must not overwrite the strides specified by CmdSetVertexInputEXT. */
5114       VkDeviceSize stride = pStrides ? pStrides[i] : vb[idx].stride;
5115 
5116       if (!!cmd_buffer->vertex_binding_buffers[idx] != !!buffer ||
5117           (buffer && ((vb[idx].offset & 0x3) != (pOffsets[i] & 0x3) ||
5118                       (vb[idx].stride & 0x3) != (stride & 0x3)))) {
5119          misaligned_mask_invalid |= state->bindings_match_attrib ? BITFIELD_BIT(idx) : 0xffffffff;
5120       }
5121 
5122       cmd_buffer->vertex_binding_buffers[idx] = buffer;
5123       vb[idx].offset = pOffsets[i];
5124       vb[idx].size = size;
5125       vb[idx].stride = stride;
5126 
5127       uint32_t bit = BITFIELD_BIT(idx);
5128       if (buffer) {
5129          radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, cmd_buffer->vertex_binding_buffers[idx]->bo);
5130          cmd_buffer->state.vbo_bound_mask |= bit;
5131       } else {
5132          cmd_buffer->state.vbo_bound_mask &= ~bit;
5133       }
5134    }
5135 
5136    if ((chip == GFX6 || chip >= GFX10) && misaligned_mask_invalid) {
5137       cmd_buffer->state.vbo_misaligned_mask_invalid = misaligned_mask_invalid;
5138       cmd_buffer->state.vbo_misaligned_mask &= ~misaligned_mask_invalid;
5139    }
5140 
5141    cmd_buffer->state.dirty |= RADV_CMD_DIRTY_VERTEX_BUFFER |
5142                               RADV_CMD_DIRTY_DYNAMIC_VERTEX_INPUT;
5143 }
5144 
5145 static uint32_t
5146 vk_to_index_type(VkIndexType type)
5147 {
5148    switch (type) {
5149    case VK_INDEX_TYPE_UINT8_EXT:
5150       return V_028A7C_VGT_INDEX_8;
5151    case VK_INDEX_TYPE_UINT16:
5152       return V_028A7C_VGT_INDEX_16;
5153    case VK_INDEX_TYPE_UINT32:
5154       return V_028A7C_VGT_INDEX_32;
5155    default:
5156       unreachable("invalid index type");
5157    }
5158 }
5159 
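/* Inverse of the mapping above: given a VGT index-type value, return the index
 * size in bytes. For example, vk_to_index_type(VK_INDEX_TYPE_UINT16) yields
 * V_028A7C_VGT_INDEX_16, which this function maps back to 2 bytes. */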
5160 uint32_t
5161 radv_get_vgt_index_size(uint32_t type)
5162 {
5163    uint32_t index_type = G_028A7C_INDEX_TYPE(type);
5164    switch (index_type) {
5165    case V_028A7C_VGT_INDEX_8:
5166       return 1;
5167    case V_028A7C_VGT_INDEX_16:
5168       return 2;
5169    case V_028A7C_VGT_INDEX_32:
5170       return 4;
5171    default:
5172       unreachable("invalid index type");
5173    }
5174 }
5175 
5176 VKAPI_ATTR void VKAPI_CALL
5177 radv_CmdBindIndexBuffer(VkCommandBuffer commandBuffer, VkBuffer buffer, VkDeviceSize offset,
5178                         VkIndexType indexType)
5179 {
5180    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5181    RADV_FROM_HANDLE(radv_buffer, index_buffer, buffer);
5182 
5183    cmd_buffer->state.index_buffer = index_buffer;
5184    cmd_buffer->state.index_offset = offset;
5185    cmd_buffer->state.index_type = vk_to_index_type(indexType);
5186    cmd_buffer->state.index_va = radv_buffer_get_va(index_buffer->bo);
5187    cmd_buffer->state.index_va += index_buffer->offset + offset;
5188 
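   /* Bound the index count by the bytes remaining in the buffer from 'offset',
    * so indexed draws can later be bounds-checked against the buffer size. */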
5189    int index_size = radv_get_vgt_index_size(vk_to_index_type(indexType));
5190    cmd_buffer->state.max_index_count =
5191       (vk_buffer_range(&index_buffer->vk, offset, VK_WHOLE_SIZE)) / index_size;
5192    cmd_buffer->state.dirty |= RADV_CMD_DIRTY_INDEX_BUFFER;
5193    radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, index_buffer->bo);
5194 }
5195 
5196 static void
5197 radv_bind_descriptor_set(struct radv_cmd_buffer *cmd_buffer, VkPipelineBindPoint bind_point,
5198                          struct radv_descriptor_set *set, unsigned idx)
5199 {
5200    struct radeon_winsys *ws = cmd_buffer->device->ws;
5201 
5202    radv_set_descriptor_set(cmd_buffer, bind_point, set, idx);
5203 
5204    assert(set);
5205    assert(!(set->header.layout->flags & VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR));
5206 
5207    if (!cmd_buffer->device->use_global_bo_list) {
5208       for (unsigned j = 0; j < set->header.buffer_count; ++j)
5209          if (set->descriptors[j])
5210             radv_cs_add_buffer(ws, cmd_buffer->cs, set->descriptors[j]);
5211    }
5212 
5213    if (set->header.bo)
5214       radv_cs_add_buffer(ws, cmd_buffer->cs, set->header.bo);
5215 }
5216 
5217 VKAPI_ATTR void VKAPI_CALL
5218 radv_CmdBindDescriptorSets(VkCommandBuffer commandBuffer, VkPipelineBindPoint pipelineBindPoint,
5219                            VkPipelineLayout _layout, uint32_t firstSet, uint32_t descriptorSetCount,
5220                            const VkDescriptorSet *pDescriptorSets, uint32_t dynamicOffsetCount,
5221                            const uint32_t *pDynamicOffsets)
5222 {
5223    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5224    RADV_FROM_HANDLE(radv_pipeline_layout, layout, _layout);
5225    unsigned dyn_idx = 0;
5226 
5227    const bool no_dynamic_bounds =
5228       cmd_buffer->device->instance->debug_flags & RADV_DEBUG_NO_DYNAMIC_BOUNDS;
5229    struct radv_descriptor_state *descriptors_state =
5230       radv_get_descriptors_state(cmd_buffer, pipelineBindPoint);
5231 
5232    for (unsigned i = 0; i < descriptorSetCount; ++i) {
5233       unsigned set_idx = i + firstSet;
5234       RADV_FROM_HANDLE(radv_descriptor_set, set, pDescriptorSets[i]);
5235 
5236       if (!set) {
5237          /* From the Vulkan spec 1.3.211:
5238           *
5239           * "VUID-vkCmdBindDescriptorSets-layout-06564
5240           *  If layout was not created with VK_PIPELINE_LAYOUT_CREATE_INDEPENDENT_SETS_BIT_EXT, each
5241           *  element of pDescriptorSets must be a valid VkDescriptorSet"
5242           */
5243          assert(layout->independent_sets);
5244          continue;
5245       }
5246 
5247       /* If the set is already bound we only need to update the
5248        * (potentially changed) dynamic offsets. */
5249       if (descriptors_state->sets[set_idx] != set ||
5250           !(descriptors_state->valid & (1u << set_idx))) {
5251          radv_bind_descriptor_set(cmd_buffer, pipelineBindPoint, set, set_idx);
5252       }
5253 
5254       for (unsigned j = 0; j < set->header.layout->dynamic_offset_count; ++j, ++dyn_idx) {
5255          unsigned idx = j + layout->set[i + firstSet].dynamic_offset_start;
5256          uint32_t *dst = descriptors_state->dynamic_buffers + idx * 4;
5257          assert(dyn_idx < dynamicOffsetCount);
5258 
5259          struct radv_descriptor_range *range = set->header.dynamic_descriptors + j;
5260 
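         /* Each dynamic buffer slot holds a 4-dword buffer descriptor: dword 0
          * is the low VA bits, dword 1 the high VA bits, dword 2 the size in
          * bytes, and dword 3 the dst_sel/format word, which differs per gfx
          * level as written below. */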
5261          if (!range->va) {
5262             memset(dst, 0, 4 * 4);
5263          } else {
5264             uint64_t va = range->va + pDynamicOffsets[dyn_idx];
5265             dst[0] = va;
5266             dst[1] = S_008F04_BASE_ADDRESS_HI(va >> 32);
5267             dst[2] = no_dynamic_bounds ? 0xffffffffu : range->size;
5268             dst[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
5269                      S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W);
5270 
5271             if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX11) {
5272                dst[3] |= S_008F0C_FORMAT(V_008F0C_GFX11_FORMAT_32_FLOAT) |
5273                          S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW);
5274             } else if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX10) {
5275                dst[3] |= S_008F0C_FORMAT(V_008F0C_GFX10_FORMAT_32_FLOAT) |
5276                          S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) | S_008F0C_RESOURCE_LEVEL(1);
5277             } else {
5278                dst[3] |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
5279                          S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
5280             }
5281          }
5282 
5283          cmd_buffer->push_constant_stages |= set->header.layout->dynamic_shader_stages;
5284       }
5285    }
5286 }
5287 
5288 static bool
5289 radv_init_push_descriptor_set(struct radv_cmd_buffer *cmd_buffer, struct radv_descriptor_set *set,
5290                               struct radv_descriptor_set_layout *layout,
5291                               VkPipelineBindPoint bind_point)
5292 {
5293    struct radv_descriptor_state *descriptors_state =
5294       radv_get_descriptors_state(cmd_buffer, bind_point);
5295    set->header.size = layout->size;
5296 
5297    if (set->header.layout != layout) {
5298       if (set->header.layout)
5299          vk_descriptor_set_layout_unref(&cmd_buffer->device->vk, &set->header.layout->vk);
5300       vk_descriptor_set_layout_ref(&layout->vk);
5301       set->header.layout = layout;
5302    }
5303 
5304    if (descriptors_state->push_set.capacity < set->header.size) {
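      /* Grow geometrically with a 1 KiB floor, capped at the worst case of
       * MAX_PUSH_DESCRIPTORS descriptors of (assumed) 96 bytes each, the
       * largest single descriptor size used here. */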
5305       size_t new_size = MAX2(set->header.size, 1024);
5306       new_size = MAX2(new_size, 2 * descriptors_state->push_set.capacity);
5307       new_size = MIN2(new_size, 96 * MAX_PUSH_DESCRIPTORS);
5308 
5309       free(set->header.mapped_ptr);
5310       set->header.mapped_ptr = malloc(new_size);
5311 
5312       if (!set->header.mapped_ptr) {
5313          descriptors_state->push_set.capacity = 0;
5314          cmd_buffer->record_result = VK_ERROR_OUT_OF_HOST_MEMORY;
5315          return false;
5316       }
5317 
5318       descriptors_state->push_set.capacity = new_size;
5319    }
5320 
5321    return true;
5322 }
5323 
5324 void
5325 radv_meta_push_descriptor_set(struct radv_cmd_buffer *cmd_buffer,
5326                               VkPipelineBindPoint pipelineBindPoint, VkPipelineLayout _layout,
5327                               uint32_t set, uint32_t descriptorWriteCount,
5328                               const VkWriteDescriptorSet *pDescriptorWrites)
5329 {
5330    RADV_FROM_HANDLE(radv_pipeline_layout, layout, _layout);
5331    struct radv_descriptor_set *push_set =
5332       (struct radv_descriptor_set *)&cmd_buffer->meta_push_descriptors;
5333    unsigned bo_offset;
5334 
5335    assert(set == 0);
5336    assert(layout->set[set].layout->flags & VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR);
5337 
5338    push_set->header.size = layout->set[set].layout->size;
5339    push_set->header.layout = layout->set[set].layout;
5340 
5341    if (!radv_cmd_buffer_upload_alloc(cmd_buffer, push_set->header.size, &bo_offset,
5342                                      (void **)&push_set->header.mapped_ptr))
5343       return;
5344 
5345    push_set->header.va = radv_buffer_get_va(cmd_buffer->upload.upload_bo);
5346    push_set->header.va += bo_offset;
5347 
5348    radv_cmd_update_descriptor_sets(cmd_buffer->device, cmd_buffer,
5349                                    radv_descriptor_set_to_handle(push_set), descriptorWriteCount,
5350                                    pDescriptorWrites, 0, NULL);
5351 
5352    radv_set_descriptor_set(cmd_buffer, pipelineBindPoint, push_set, set);
5353 }
5354 
5355 VKAPI_ATTR void VKAPI_CALL
5356 radv_CmdPushDescriptorSetKHR(VkCommandBuffer commandBuffer, VkPipelineBindPoint pipelineBindPoint,
5357                              VkPipelineLayout _layout, uint32_t set, uint32_t descriptorWriteCount,
5358                              const VkWriteDescriptorSet *pDescriptorWrites)
5359 {
5360    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5361    RADV_FROM_HANDLE(radv_pipeline_layout, layout, _layout);
5362    struct radv_descriptor_state *descriptors_state =
5363       radv_get_descriptors_state(cmd_buffer, pipelineBindPoint);
5364    struct radv_descriptor_set *push_set =
5365       (struct radv_descriptor_set *)&descriptors_state->push_set.set;
5366 
5367    assert(layout->set[set].layout->flags & VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR);
5368 
5369    if (!radv_init_push_descriptor_set(cmd_buffer, push_set, layout->set[set].layout,
5370                                       pipelineBindPoint))
5371       return;
5372 
5373    /* Check that there are no inline uniform block updates when calling vkCmdPushDescriptorSetKHR()
5374     * because that is invalid according to the Vulkan spec.
5375     */
5376    for (int i = 0; i < descriptorWriteCount; i++) {
5377       ASSERTED const VkWriteDescriptorSet *writeset = &pDescriptorWrites[i];
5378       assert(writeset->descriptorType != VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK);
5379    }
5380 
5381    radv_cmd_update_descriptor_sets(cmd_buffer->device, cmd_buffer,
5382                                    radv_descriptor_set_to_handle(push_set), descriptorWriteCount,
5383                                    pDescriptorWrites, 0, NULL);
5384 
5385    radv_set_descriptor_set(cmd_buffer, pipelineBindPoint, push_set, set);
5386    descriptors_state->push_dirty = true;
5387 }
5388 
5389 VKAPI_ATTR void VKAPI_CALL
5390 radv_CmdPushDescriptorSetWithTemplateKHR(VkCommandBuffer commandBuffer,
5391                                          VkDescriptorUpdateTemplate descriptorUpdateTemplate,
5392                                          VkPipelineLayout _layout, uint32_t set, const void *pData)
5393 {
5394    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5395    RADV_FROM_HANDLE(radv_pipeline_layout, layout, _layout);
5396    RADV_FROM_HANDLE(radv_descriptor_update_template, templ, descriptorUpdateTemplate);
5397    struct radv_descriptor_state *descriptors_state =
5398       radv_get_descriptors_state(cmd_buffer, templ->bind_point);
5399    struct radv_descriptor_set *push_set =
5400       (struct radv_descriptor_set *)&descriptors_state->push_set.set;
5401 
5402    assert(layout->set[set].layout->flags & VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR);
5403 
5404    if (!radv_init_push_descriptor_set(cmd_buffer, push_set, layout->set[set].layout,
5405                                       templ->bind_point))
5406       return;
5407 
5408    radv_cmd_update_descriptor_set_with_template(cmd_buffer->device, cmd_buffer, push_set,
5409                                                 descriptorUpdateTemplate, pData);
5410 
5411    radv_set_descriptor_set(cmd_buffer, templ->bind_point, push_set, set);
5412    descriptors_state->push_dirty = true;
5413 }
5414 
5415 VKAPI_ATTR void VKAPI_CALL
5416 radv_CmdPushConstants(VkCommandBuffer commandBuffer, VkPipelineLayout layout,
5417                       VkShaderStageFlags stageFlags, uint32_t offset, uint32_t size,
5418                       const void *pValues)
5419 {
5420    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
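   /* Valid usage guarantees that offset + size stay within the layout's push
    * constant range, so copying straight into the shadow buffer is safe. */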
5421    memcpy(cmd_buffer->push_constants + offset, pValues, size);
5422    cmd_buffer->push_constant_stages |= stageFlags;
5423 }
5424 
5425 VKAPI_ATTR VkResult VKAPI_CALL
5426 radv_EndCommandBuffer(VkCommandBuffer commandBuffer)
5427 {
5428    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5429 
5430    radv_emit_mip_change_flush_default(cmd_buffer);
5431 
5432    if (cmd_buffer->qf != RADV_QUEUE_TRANSFER) {
5433       if (cmd_buffer->device->physical_device->rad_info.gfx_level == GFX6)
5434          cmd_buffer->state.flush_bits |=
5435             RADV_CMD_FLAG_CS_PARTIAL_FLUSH | RADV_CMD_FLAG_PS_PARTIAL_FLUSH | RADV_CMD_FLAG_WB_L2;
5436 
5437       /* Make sure to sync all pending active queries at the end of
5438        * the command buffer.
5439        */
5440       cmd_buffer->state.flush_bits |= cmd_buffer->active_query_flush_bits;
5441 
5442       /* Flush noncoherent images on GFX9+ so we can assume they're clean at the start of a
5443        * command buffer.
5444        */
5445       if (cmd_buffer->state.rb_noncoherent_dirty && can_skip_buffer_l2_flushes(cmd_buffer->device))
5446          cmd_buffer->state.flush_bits |= radv_src_access_flush(
5447             cmd_buffer,
5448             VK_ACCESS_2_COLOR_ATTACHMENT_WRITE_BIT |
5449             VK_ACCESS_2_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT,
5450             NULL);
5451 
5452       /* Since NGG streamout uses GDS, we need to make GDS idle when
5453        * we leave the IB, otherwise another process might overwrite
5454        * it while our shaders are busy.
5455        */
5456       if (cmd_buffer->gds_needed)
5457          cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_PS_PARTIAL_FLUSH;
5458 
5459       /* Finalize the internal compute command stream, if it exists. */
5460       if (cmd_buffer->ace_internal.cs) {
5461          VkResult result = radv_ace_internal_finalize(cmd_buffer);
5462          if (result != VK_SUCCESS)
5463             return vk_error(cmd_buffer, result);
5464       }
5465 
5466       si_emit_cache_flush(cmd_buffer);
5467    }
5468 
5469    /* Make sure CP DMA is idle at the end of IBs because the kernel
5470     * doesn't wait for it.
5471     */
5472    si_cp_dma_wait_for_idle(cmd_buffer);
5473 
5474    radv_describe_end_cmd_buffer(cmd_buffer);
5475 
5476    vk_free(&cmd_buffer->pool->vk.alloc, cmd_buffer->state.attachments);
5477    vk_free(&cmd_buffer->pool->vk.alloc, cmd_buffer->state.subpass_sample_locs);
5478 
5479    VkResult result = cmd_buffer->device->ws->cs_finalize(cmd_buffer->cs);
5480    if (result != VK_SUCCESS)
5481       return vk_error(cmd_buffer, result);
5482 
5483    cmd_buffer->status = RADV_CMD_BUFFER_STATUS_EXECUTABLE;
5484 
5485    return cmd_buffer->record_result;
5486 }
5487 
5488 static void
5489 radv_emit_compute_pipeline(struct radv_cmd_buffer *cmd_buffer,
5490                            struct radv_compute_pipeline *pipeline)
5491 {
5492    if (pipeline == cmd_buffer->state.emitted_compute_pipeline)
5493       return;
5494 
5495    assert(!pipeline->base.ctx_cs.cdw);
5496 
5497    cmd_buffer->state.emitted_compute_pipeline = pipeline;
5498 
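   /* The register writes for this pipeline were pre-baked into base.cs at
    * pipeline creation; binding only has to copy that stream verbatim. */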
5499    radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, pipeline->base.cs.cdw);
5500    radeon_emit_array(cmd_buffer->cs, pipeline->base.cs.buf, pipeline->base.cs.cdw);
5501 
5502    cmd_buffer->compute_scratch_size_per_wave_needed =
5503       MAX2(cmd_buffer->compute_scratch_size_per_wave_needed, pipeline->base.scratch_bytes_per_wave);
5504    cmd_buffer->compute_scratch_waves_wanted =
5505       MAX2(cmd_buffer->compute_scratch_waves_wanted, pipeline->base.max_waves);
5506 
5507    radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, pipeline->base.slab_bo);
5508 
5509    if (unlikely(cmd_buffer->device->trace_bo))
5510       radv_save_pipeline(cmd_buffer, &pipeline->base);
5511 }
5512 
5513 static void
5514 radv_mark_descriptor_sets_dirty(struct radv_cmd_buffer *cmd_buffer, VkPipelineBindPoint bind_point)
5515 {
5516    struct radv_descriptor_state *descriptors_state =
5517       radv_get_descriptors_state(cmd_buffer, bind_point);
5518 
5519    descriptors_state->dirty |= descriptors_state->valid;
5520 }
5521 
5522 VKAPI_ATTR void VKAPI_CALL
5523 radv_CmdBindPipeline(VkCommandBuffer commandBuffer, VkPipelineBindPoint pipelineBindPoint,
5524                      VkPipeline _pipeline)
5525 {
5526    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5527    RADV_FROM_HANDLE(radv_pipeline, pipeline, _pipeline);
5528 
5529    switch (pipelineBindPoint) {
5530    case VK_PIPELINE_BIND_POINT_COMPUTE: {
5531       struct radv_compute_pipeline *compute_pipeline = radv_pipeline_to_compute(pipeline);
5532 
5533       if (cmd_buffer->state.compute_pipeline == compute_pipeline)
5534          return;
5535       radv_mark_descriptor_sets_dirty(cmd_buffer, pipelineBindPoint);
5536 
5537       cmd_buffer->state.compute_pipeline = compute_pipeline;
5538       cmd_buffer->push_constant_stages |= VK_SHADER_STAGE_COMPUTE_BIT;
5539       cmd_buffer->task_rings_needed |=
5540          pipeline->shaders[MESA_SHADER_COMPUTE]->info.cs.uses_task_rings;
5541       break;
5542    }
5543    case VK_PIPELINE_BIND_POINT_RAY_TRACING_KHR: {
5544       struct radv_compute_pipeline *compute_pipeline = radv_pipeline_to_compute(pipeline);
5545 
5546       if (cmd_buffer->state.rt_pipeline == compute_pipeline)
5547          return;
5548       radv_mark_descriptor_sets_dirty(cmd_buffer, pipelineBindPoint);
5549 
5550       cmd_buffer->state.rt_pipeline = compute_pipeline;
5551       cmd_buffer->push_constant_stages |= RADV_RT_STAGE_BITS;
5552       if (compute_pipeline->dynamic_stack_size)
5553          radv_set_rt_stack_size(cmd_buffer, cmd_buffer->state.rt_stack_size);
5554       break;
5555    }
5556    case VK_PIPELINE_BIND_POINT_GRAPHICS: {
5557       struct radv_graphics_pipeline *graphics_pipeline =
5558          pipeline ? radv_pipeline_to_graphics(pipeline) : NULL;
5559 
5560       if (cmd_buffer->state.graphics_pipeline == graphics_pipeline)
5561          return;
5562       radv_mark_descriptor_sets_dirty(cmd_buffer, pipelineBindPoint);
5563 
5564       bool vtx_emit_count_changed =
5565          !pipeline || !cmd_buffer->state.graphics_pipeline ||
5566          cmd_buffer->state.graphics_pipeline->vtx_emit_num != graphics_pipeline->vtx_emit_num ||
5567          cmd_buffer->state.graphics_pipeline->vtx_base_sgpr != graphics_pipeline->vtx_base_sgpr;
5568       cmd_buffer->state.graphics_pipeline = graphics_pipeline;
5569       if (!pipeline)
5570          break;
5571 
5572       bool mesh_shading = radv_pipeline_has_stage(graphics_pipeline, MESA_SHADER_MESH);
5573       if (mesh_shading != cmd_buffer->state.mesh_shading) {
5574          /* Re-emit VRS state because the combiner is different (vertex vs primitive).
5575           * Re-emit primitive topology because the mesh shading pipeline clobbered it.
5576           */
5577          cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_FRAGMENT_SHADING_RATE |
5578                                     RADV_CMD_DIRTY_DYNAMIC_PRIMITIVE_TOPOLOGY;
5579       }
5580 
5581       cmd_buffer->state.mesh_shading = mesh_shading;
5582       cmd_buffer->state.dirty |= RADV_CMD_DIRTY_PIPELINE | RADV_CMD_DIRTY_DYNAMIC_VERTEX_INPUT;
5583       cmd_buffer->push_constant_stages |= graphics_pipeline->active_stages;
5584 
5585       /* The new vertex shader might not have the same user SGPRs. */
5586       if (vtx_emit_count_changed) {
5587          cmd_buffer->state.last_first_instance = -1;
5588          cmd_buffer->state.last_vertex_offset = -1;
5589          cmd_buffer->state.last_drawid = -1;
5590       }
5591 
5592       /* Prefetch all pipeline shaders at first draw time. */
5593       cmd_buffer->state.prefetch_L2_mask |= RADV_PREFETCH_SHADERS;
5594 
5595       if (cmd_buffer->device->physical_device->rad_info.has_vgt_flush_ngg_legacy_bug &&
5596           cmd_buffer->state.emitted_graphics_pipeline &&
5597           cmd_buffer->state.emitted_graphics_pipeline->is_ngg &&
5598           !cmd_buffer->state.graphics_pipeline->is_ngg) {
5599          /* Transitioning from NGG to legacy GS requires
5600           * VGT_FLUSH on GFX10 and Navi21. VGT_FLUSH
5601           * is also emitted at the beginning of IBs when legacy
5602           * GS ring pointers are set.
5603           */
5604          cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_VGT_FLUSH;
5605       }
5606 
5607       radv_bind_dynamic_state(cmd_buffer, &graphics_pipeline->dynamic_state);
5608 
5609       if (graphics_pipeline->esgs_ring_size > cmd_buffer->esgs_ring_size_needed)
5610          cmd_buffer->esgs_ring_size_needed = graphics_pipeline->esgs_ring_size;
5611       if (graphics_pipeline->gsvs_ring_size > cmd_buffer->gsvs_ring_size_needed)
5612          cmd_buffer->gsvs_ring_size_needed = graphics_pipeline->gsvs_ring_size;
5613 
5614       if (radv_pipeline_has_stage(graphics_pipeline, MESA_SHADER_TESS_CTRL))
5615          cmd_buffer->tess_rings_needed = true;
5616       if (mesh_shading)
5617          cmd_buffer->mesh_scratch_ring_needed |=
5618             pipeline->shaders[MESA_SHADER_MESH]->info.ms.needs_ms_scratch_ring;
5619 
5620       if (radv_pipeline_has_stage(graphics_pipeline, MESA_SHADER_TASK)) {
5621          if (!cmd_buffer->ace_internal.cs) {
5622             cmd_buffer->ace_internal.cs = radv_ace_internal_create(cmd_buffer);
5623             if (!cmd_buffer->ace_internal.cs)
5624                return;
5625          }
5626 
5627          cmd_buffer->task_rings_needed = true;
5628       }
5629       break;
5630    }
5631    default:
5632       assert(!"invalid bind point");
5633       break;
5634    }
5635 }
5636 
5637 VKAPI_ATTR void VKAPI_CALL
5638 radv_CmdSetViewport(VkCommandBuffer commandBuffer, uint32_t firstViewport, uint32_t viewportCount,
5639                     const VkViewport *pViewports)
5640 {
5641    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5642    struct radv_cmd_state *state = &cmd_buffer->state;
5643    ASSERTED const uint32_t total_count = firstViewport + viewportCount;
5644 
5645    assert(firstViewport < MAX_VIEWPORTS);
5646    assert(total_count >= 1 && total_count <= MAX_VIEWPORTS);
5647 
5648    if (state->dynamic.viewport.count < total_count)
5649       state->dynamic.viewport.count = total_count;
5650 
5651    memcpy(state->dynamic.viewport.viewports + firstViewport, pViewports,
5652           viewportCount * sizeof(*pViewports));
5653    for (unsigned i = 0; i < viewportCount; i++) {
5654       radv_get_viewport_xform(&pViewports[i],
5655                               state->dynamic.viewport.xform[i + firstViewport].scale,
5656                               state->dynamic.viewport.xform[i + firstViewport].translate);
5657    }
5658 
5659    state->dirty |= RADV_CMD_DIRTY_DYNAMIC_VIEWPORT;
5660 }
5661 
5662 VKAPI_ATTR void VKAPI_CALL
5663 radv_CmdSetScissor(VkCommandBuffer commandBuffer, uint32_t firstScissor, uint32_t scissorCount,
5664                    const VkRect2D *pScissors)
5665 {
5666    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5667    struct radv_cmd_state *state = &cmd_buffer->state;
5668    ASSERTED const uint32_t total_count = firstScissor + scissorCount;
5669 
5670    assert(firstScissor < MAX_SCISSORS);
5671    assert(total_count >= 1 && total_count <= MAX_SCISSORS);
5672 
5673    if (state->dynamic.scissor.count < total_count)
5674       state->dynamic.scissor.count = total_count;
5675 
5676    memcpy(state->dynamic.scissor.scissors + firstScissor, pScissors,
5677           scissorCount * sizeof(*pScissors));
5678 
5679    state->dirty |= RADV_CMD_DIRTY_DYNAMIC_SCISSOR;
5680 }
5681 
5682 VKAPI_ATTR void VKAPI_CALL
5683 radv_CmdSetLineWidth(VkCommandBuffer commandBuffer, float lineWidth)
5684 {
5685    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5686 
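   /* Wide lines extend beyond their vertices and feed into the guardband
    * computed at scissor-emit time, which is presumably why a line width
    * change also re-emits scissor state below. */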
5687    if (cmd_buffer->state.dynamic.line_width != lineWidth)
5688       cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_SCISSOR;
5689 
5690    cmd_buffer->state.dynamic.line_width = lineWidth;
5691    cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_LINE_WIDTH;
5692 }
5693 
5694 VKAPI_ATTR void VKAPI_CALL
5695 radv_CmdSetDepthBias(VkCommandBuffer commandBuffer, float depthBiasConstantFactor,
5696                      float depthBiasClamp, float depthBiasSlopeFactor)
5697 {
5698    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5699    struct radv_cmd_state *state = &cmd_buffer->state;
5700 
5701    state->dynamic.depth_bias.bias = depthBiasConstantFactor;
5702    state->dynamic.depth_bias.clamp = depthBiasClamp;
5703    state->dynamic.depth_bias.slope = depthBiasSlopeFactor;
5704 
5705    state->dirty |= RADV_CMD_DIRTY_DYNAMIC_DEPTH_BIAS;
5706 }
5707 
5708 VKAPI_ATTR void VKAPI_CALL
5709 radv_CmdSetBlendConstants(VkCommandBuffer commandBuffer, const float blendConstants[4])
5710 {
5711    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5712    struct radv_cmd_state *state = &cmd_buffer->state;
5713 
5714    memcpy(state->dynamic.blend_constants, blendConstants, sizeof(float) * 4);
5715 
5716    state->dirty |= RADV_CMD_DIRTY_DYNAMIC_BLEND_CONSTANTS;
5717 }
5718 
5719 VKAPI_ATTR void VKAPI_CALL
5720 radv_CmdSetDepthBounds(VkCommandBuffer commandBuffer, float minDepthBounds, float maxDepthBounds)
5721 {
5722    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5723    struct radv_cmd_state *state = &cmd_buffer->state;
5724 
5725    state->dynamic.depth_bounds.min = minDepthBounds;
5726    state->dynamic.depth_bounds.max = maxDepthBounds;
5727 
5728    state->dirty |= RADV_CMD_DIRTY_DYNAMIC_DEPTH_BOUNDS;
5729 }
5730 
5731 VKAPI_ATTR void VKAPI_CALL
5732 radv_CmdSetStencilCompareMask(VkCommandBuffer commandBuffer, VkStencilFaceFlags faceMask,
5733                               uint32_t compareMask)
5734 {
5735    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5736    struct radv_cmd_state *state = &cmd_buffer->state;
5737 
5738    if (faceMask & VK_STENCIL_FACE_FRONT_BIT)
5739       state->dynamic.stencil_compare_mask.front = compareMask;
5740    if (faceMask & VK_STENCIL_FACE_BACK_BIT)
5741       state->dynamic.stencil_compare_mask.back = compareMask;
5742 
5743    state->dirty |= RADV_CMD_DIRTY_DYNAMIC_STENCIL_COMPARE_MASK;
5744 }
5745 
5746 VKAPI_ATTR void VKAPI_CALL
5747 radv_CmdSetStencilWriteMask(VkCommandBuffer commandBuffer, VkStencilFaceFlags faceMask,
5748                             uint32_t writeMask)
5749 {
5750    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5751    struct radv_cmd_state *state = &cmd_buffer->state;
5752 
5753    if (faceMask & VK_STENCIL_FACE_FRONT_BIT)
5754       state->dynamic.stencil_write_mask.front = writeMask;
5755    if (faceMask & VK_STENCIL_FACE_BACK_BIT)
5756       state->dynamic.stencil_write_mask.back = writeMask;
5757 
5758    state->dirty |= RADV_CMD_DIRTY_DYNAMIC_STENCIL_WRITE_MASK;
5759 }
5760 
5761 VKAPI_ATTR void VKAPI_CALL
5762 radv_CmdSetStencilReference(VkCommandBuffer commandBuffer, VkStencilFaceFlags faceMask,
5763                             uint32_t reference)
5764 {
5765    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5766 
5767    if (faceMask & VK_STENCIL_FACE_FRONT_BIT)
5768       cmd_buffer->state.dynamic.stencil_reference.front = reference;
5769    if (faceMask & VK_STENCIL_FACE_BACK_BIT)
5770       cmd_buffer->state.dynamic.stencil_reference.back = reference;
5771 
5772    cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_STENCIL_REFERENCE;
5773 }
5774 
5775 VKAPI_ATTR void VKAPI_CALL
5776 radv_CmdSetDiscardRectangleEXT(VkCommandBuffer commandBuffer, uint32_t firstDiscardRectangle,
5777                                uint32_t discardRectangleCount, const VkRect2D *pDiscardRectangles)
5778 {
5779    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5780    struct radv_cmd_state *state = &cmd_buffer->state;
5781    ASSERTED const uint32_t total_count = firstDiscardRectangle + discardRectangleCount;
5782 
5783    assert(firstDiscardRectangle < MAX_DISCARD_RECTANGLES);
5784    assert(total_count >= 1 && total_count <= MAX_DISCARD_RECTANGLES);
5785 
5786    typed_memcpy(&state->dynamic.discard_rectangle.rectangles[firstDiscardRectangle],
5787                 pDiscardRectangles, discardRectangleCount);
5788 
5789    state->dirty |= RADV_CMD_DIRTY_DYNAMIC_DISCARD_RECTANGLE;
5790 }
5791 
5792 VKAPI_ATTR void VKAPI_CALL
5793 radv_CmdSetSampleLocationsEXT(VkCommandBuffer commandBuffer,
5794                               const VkSampleLocationsInfoEXT *pSampleLocationsInfo)
5795 {
5796    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5797    struct radv_cmd_state *state = &cmd_buffer->state;
5798 
5799    assert(pSampleLocationsInfo->sampleLocationsCount <= MAX_SAMPLE_LOCATIONS);
5800 
5801    state->dynamic.sample_location.per_pixel = pSampleLocationsInfo->sampleLocationsPerPixel;
5802    state->dynamic.sample_location.grid_size = pSampleLocationsInfo->sampleLocationGridSize;
5803    state->dynamic.sample_location.count = pSampleLocationsInfo->sampleLocationsCount;
5804    typed_memcpy(&state->dynamic.sample_location.locations[0],
5805                 pSampleLocationsInfo->pSampleLocations, pSampleLocationsInfo->sampleLocationsCount);
5806 
5807    state->dirty |= RADV_CMD_DIRTY_DYNAMIC_SAMPLE_LOCATIONS;
5808 }
5809 
5810 VKAPI_ATTR void VKAPI_CALL
5811 radv_CmdSetLineStippleEXT(VkCommandBuffer commandBuffer, uint32_t lineStippleFactor,
5812                           uint16_t lineStipplePattern)
5813 {
5814    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5815    struct radv_cmd_state *state = &cmd_buffer->state;
5816 
5817    state->dynamic.line_stipple.factor = lineStippleFactor;
5818    state->dynamic.line_stipple.pattern = lineStipplePattern;
5819 
5820    state->dirty |= RADV_CMD_DIRTY_DYNAMIC_LINE_STIPPLE;
5821 }
5822 
5823 VKAPI_ATTR void VKAPI_CALL
5824 radv_CmdSetCullMode(VkCommandBuffer commandBuffer, VkCullModeFlags cullMode)
5825 {
5826    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5827    struct radv_cmd_state *state = &cmd_buffer->state;
5828 
5829    state->dynamic.cull_mode = cullMode;
5830 
5831    state->dirty |= RADV_CMD_DIRTY_DYNAMIC_CULL_MODE;
5832 }
5833 
5834 VKAPI_ATTR void VKAPI_CALL
5835 radv_CmdSetFrontFace(VkCommandBuffer commandBuffer, VkFrontFace frontFace)
5836 {
5837    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5838    struct radv_cmd_state *state = &cmd_buffer->state;
5839 
5840    state->dynamic.front_face = frontFace;
5841 
5842    state->dirty |= RADV_CMD_DIRTY_DYNAMIC_FRONT_FACE;
5843 }
5844 
5845 VKAPI_ATTR void VKAPI_CALL
5846 radv_CmdSetPrimitiveTopology(VkCommandBuffer commandBuffer, VkPrimitiveTopology primitiveTopology)
5847 {
5848    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5849    struct radv_cmd_state *state = &cmd_buffer->state;
5850    unsigned primitive_topology = si_translate_prim(primitiveTopology);
5851 
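   /* Line stipple setup and the scissor/guardband computation both depend on
    * the topology class, so crossing the line-strip or points/lines boundary
    * dirties those states as well. */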
5852    if ((state->dynamic.primitive_topology == V_008958_DI_PT_LINESTRIP) !=
5853        (primitive_topology == V_008958_DI_PT_LINESTRIP))
5854       state->dirty |= RADV_CMD_DIRTY_DYNAMIC_LINE_STIPPLE;
5855 
5856    if (radv_prim_is_points_or_lines(state->dynamic.primitive_topology) !=
5857        radv_prim_is_points_or_lines(primitive_topology))
5858       state->dirty |= RADV_CMD_DIRTY_DYNAMIC_SCISSOR;
5859 
5860    state->dynamic.primitive_topology = primitive_topology;
5861 
5862    state->dirty |= RADV_CMD_DIRTY_DYNAMIC_PRIMITIVE_TOPOLOGY;
5863 }
5864 
5865 VKAPI_ATTR void VKAPI_CALL
5866 radv_CmdSetViewportWithCount(VkCommandBuffer commandBuffer, uint32_t viewportCount,
5867                              const VkViewport *pViewports)
5868 {
5869    radv_CmdSetViewport(commandBuffer, 0, viewportCount, pViewports);
5870 }
5871 
5872 VKAPI_ATTR void VKAPI_CALL
5873 radv_CmdSetScissorWithCount(VkCommandBuffer commandBuffer, uint32_t scissorCount,
5874                             const VkRect2D *pScissors)
5875 {
5876    radv_CmdSetScissor(commandBuffer, 0, scissorCount, pScissors);
5877 }
5878 
5879 VKAPI_ATTR void VKAPI_CALL
5880 radv_CmdSetDepthTestEnable(VkCommandBuffer commandBuffer, VkBool32 depthTestEnable)
5881 
5882 {
5883    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5884    struct radv_cmd_state *state = &cmd_buffer->state;
5885 
5886    state->dynamic.depth_test_enable = depthTestEnable;
5887 
5888    state->dirty |= RADV_CMD_DIRTY_DYNAMIC_DEPTH_TEST_ENABLE;
5889 }
5890 
5891 VKAPI_ATTR void VKAPI_CALL
5892 radv_CmdSetDepthWriteEnable(VkCommandBuffer commandBuffer, VkBool32 depthWriteEnable)
5893 {
5894    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5895    struct radv_cmd_state *state = &cmd_buffer->state;
5896 
5897    state->dynamic.depth_write_enable = depthWriteEnable;
5898 
5899    state->dirty |= RADV_CMD_DIRTY_DYNAMIC_DEPTH_WRITE_ENABLE;
5900 }
5901 
5902 VKAPI_ATTR void VKAPI_CALL
5903 radv_CmdSetDepthCompareOp(VkCommandBuffer commandBuffer, VkCompareOp depthCompareOp)
5904 {
5905    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5906    struct radv_cmd_state *state = &cmd_buffer->state;
5907 
5908    state->dynamic.depth_compare_op = depthCompareOp;
5909 
5910    state->dirty |= RADV_CMD_DIRTY_DYNAMIC_DEPTH_COMPARE_OP;
5911 }
5912 
5913 VKAPI_ATTR void VKAPI_CALL
5914 radv_CmdSetDepthBoundsTestEnable(VkCommandBuffer commandBuffer, VkBool32 depthBoundsTestEnable)
5915 {
5916    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5917    struct radv_cmd_state *state = &cmd_buffer->state;
5918 
5919    state->dynamic.depth_bounds_test_enable = depthBoundsTestEnable;
5920 
5921    state->dirty |= RADV_CMD_DIRTY_DYNAMIC_DEPTH_BOUNDS_TEST_ENABLE;
5922 }
5923 
5924 VKAPI_ATTR void VKAPI_CALL
5925 radv_CmdSetStencilTestEnable(VkCommandBuffer commandBuffer, VkBool32 stencilTestEnable)
5926 {
5927    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5928    struct radv_cmd_state *state = &cmd_buffer->state;
5929 
5930    state->dynamic.stencil_test_enable = stencilTestEnable;
5931 
5932    state->dirty |= RADV_CMD_DIRTY_DYNAMIC_STENCIL_TEST_ENABLE;
5933 }
5934 
5935 VKAPI_ATTR void VKAPI_CALL
5936 radv_CmdSetStencilOp(VkCommandBuffer commandBuffer, VkStencilFaceFlags faceMask,
5937                      VkStencilOp failOp, VkStencilOp passOp, VkStencilOp depthFailOp,
5938                      VkCompareOp compareOp)
5939 {
5940    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5941    struct radv_cmd_state *state = &cmd_buffer->state;
5942 
5943    if (faceMask & VK_STENCIL_FACE_FRONT_BIT) {
5944       state->dynamic.stencil_op.front.fail_op = failOp;
5945       state->dynamic.stencil_op.front.pass_op = passOp;
5946       state->dynamic.stencil_op.front.depth_fail_op = depthFailOp;
5947       state->dynamic.stencil_op.front.compare_op = compareOp;
5948    }
5949 
5950    if (faceMask & VK_STENCIL_FACE_BACK_BIT) {
5951       state->dynamic.stencil_op.back.fail_op = failOp;
5952       state->dynamic.stencil_op.back.pass_op = passOp;
5953       state->dynamic.stencil_op.back.depth_fail_op = depthFailOp;
5954       state->dynamic.stencil_op.back.compare_op = compareOp;
5955    }
5956 
5957    state->dirty |= RADV_CMD_DIRTY_DYNAMIC_STENCIL_OP;
5958 }
5959 
5960 VKAPI_ATTR void VKAPI_CALL
5961 radv_CmdSetFragmentShadingRateKHR(VkCommandBuffer commandBuffer, const VkExtent2D *pFragmentSize,
5962                                   const VkFragmentShadingRateCombinerOpKHR combinerOps[2])
5963 {
5964    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5965    struct radv_cmd_state *state = &cmd_buffer->state;
5966 
5967    state->dynamic.fragment_shading_rate.size = *pFragmentSize;
5968    for (unsigned i = 0; i < 2; i++)
5969       state->dynamic.fragment_shading_rate.combiner_ops[i] = combinerOps[i];
5970 
5971    state->dirty |= RADV_CMD_DIRTY_DYNAMIC_FRAGMENT_SHADING_RATE;
5972 }
5973 
5974 VKAPI_ATTR void VKAPI_CALL
5975 radv_CmdSetDepthBiasEnable(VkCommandBuffer commandBuffer, VkBool32 depthBiasEnable)
5976 {
5977    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5978    struct radv_cmd_state *state = &cmd_buffer->state;
5979 
5980    state->dynamic.depth_bias_enable = depthBiasEnable;
5981 
5982    state->dirty |= RADV_CMD_DIRTY_DYNAMIC_DEPTH_BIAS_ENABLE;
5983 }
5984 
5985 VKAPI_ATTR void VKAPI_CALL
5986 radv_CmdSetPrimitiveRestartEnable(VkCommandBuffer commandBuffer, VkBool32 primitiveRestartEnable)
5987 {
5988    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5989    struct radv_cmd_state *state = &cmd_buffer->state;
5990 
5991    state->dynamic.primitive_restart_enable = primitiveRestartEnable;
5992 
5993    state->dirty |= RADV_CMD_DIRTY_DYNAMIC_PRIMITIVE_RESTART_ENABLE;
5994 }
5995 
5996 VKAPI_ATTR void VKAPI_CALL
5997 radv_CmdSetRasterizerDiscardEnable(VkCommandBuffer commandBuffer, VkBool32 rasterizerDiscardEnable)
5998 {
5999    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
6000    struct radv_cmd_state *state = &cmd_buffer->state;
6001 
6002    state->dynamic.rasterizer_discard_enable = rasterizerDiscardEnable;
6003 
6004    state->dirty |= RADV_CMD_DIRTY_DYNAMIC_RASTERIZER_DISCARD_ENABLE;
6005 }
6006 
6007 VKAPI_ATTR void VKAPI_CALL
6008 radv_CmdSetPatchControlPointsEXT(VkCommandBuffer commandBuffer, uint32_t patchControlPoints)
6009 {
6010    /* not implemented */
6011 }
6012 
VKAPI_ATTR void VKAPI_CALL
radv_CmdSetLogicOpEXT(VkCommandBuffer commandBuffer, VkLogicOp logicOp)
{
   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   struct radv_cmd_state *state = &cmd_buffer->state;
   unsigned logic_op = si_translate_blend_logic_op(logicOp);

   state->dynamic.logic_op = logic_op;

   state->dirty |= RADV_CMD_DIRTY_DYNAMIC_LOGIC_OP;
}

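/* Pack the per-attachment color write enables into a single mask with one
 * 4-bit (RGBA) group per attachment, which is how the dynamic state tracks it.
 */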
VKAPI_ATTR void VKAPI_CALL
radv_CmdSetColorWriteEnableEXT(VkCommandBuffer commandBuffer, uint32_t attachmentCount,
                               const VkBool32 *pColorWriteEnables)
{
   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   struct radv_cmd_state *state = &cmd_buffer->state;
   uint32_t color_write_enable = 0;

   assert(attachmentCount <= MAX_RTS);

   for (uint32_t i = 0; i < attachmentCount; i++) {
      color_write_enable |= pColorWriteEnables[i] ? (0xfu << (i * 4)) : 0;
   }

   state->dynamic.color_write_enable = color_write_enable;

   state->dirty |= RADV_CMD_DIRTY_DYNAMIC_COLOR_WRITE_ENABLE;
}

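/* Set the dynamic vertex input state. Translated formats are cached in
 * cmd_buffer->cached_vertex_formats so repeated calls with the same attribute
 * formats skip radv_translate_vertex_format(), and on chips where unaligned
 * vertex fetches are a problem (GFX6 and GFX10+) the misaligned attributes of
 * the currently bound vertex buffers are tracked as well.
 */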
VKAPI_ATTR void VKAPI_CALL
radv_CmdSetVertexInputEXT(VkCommandBuffer commandBuffer, uint32_t vertexBindingDescriptionCount,
                          const VkVertexInputBindingDescription2EXT *pVertexBindingDescriptions,
                          uint32_t vertexAttributeDescriptionCount,
                          const VkVertexInputAttributeDescription2EXT *pVertexAttributeDescriptions)
{
   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   struct radv_vs_input_state *state = &cmd_buffer->state.dynamic_vs_input;

   const VkVertexInputBindingDescription2EXT *bindings[MAX_VBS];
   for (unsigned i = 0; i < vertexBindingDescriptionCount; i++)
      bindings[pVertexBindingDescriptions[i].binding] = &pVertexBindingDescriptions[i];

   cmd_buffer->state.vbo_misaligned_mask = 0;
   cmd_buffer->state.vbo_misaligned_mask_invalid = 0;

   memset(state, 0, sizeof(*state));
   state->bindings_match_attrib = true;

   enum amd_gfx_level chip = cmd_buffer->device->physical_device->rad_info.gfx_level;
   for (unsigned i = 0; i < vertexAttributeDescriptionCount; i++) {
      const VkVertexInputAttributeDescription2EXT *attrib = &pVertexAttributeDescriptions[i];
      const VkVertexInputBindingDescription2EXT *binding = bindings[attrib->binding];
      unsigned loc = attrib->location;

      state->attribute_mask |= 1u << loc;
      state->bindings[loc] = attrib->binding;
      if (attrib->binding != loc)
         state->bindings_match_attrib = false;
      if (binding->inputRate == VK_VERTEX_INPUT_RATE_INSTANCE) {
         state->instance_rate_inputs |= 1u << loc;
         state->divisors[loc] = binding->divisor;
         if (binding->divisor == 0) {
            state->zero_divisors |= 1u << loc;
         } else if (binding->divisor > 1) {
            state->nontrivial_divisors |= 1u << loc;
         }
      }
      cmd_buffer->vertex_bindings[attrib->binding].stride = binding->stride;
      state->offsets[loc] = attrib->offset;

      struct dynamic_vertex_format_cache *found = NULL;
      util_dynarray_foreach(&cmd_buffer->cached_vertex_formats,
                            struct dynamic_vertex_format_cache,
                            vf) {
         if (vf->format == attrib->format) {
            found = vf;
            break;
         }
      }
      if (!found) {
         unsigned nfmt, dfmt;
         bool post_shuffle;
         enum radv_vs_input_alpha_adjust alpha_adjust;
         const struct util_format_description *format_desc = vk_format_description(attrib->format);

         found = util_dynarray_grow(&cmd_buffer->cached_vertex_formats,
                                    struct dynamic_vertex_format_cache, 1);
         radv_translate_vertex_format(cmd_buffer->device->physical_device, attrib->format, format_desc,
                                      &dfmt, &nfmt, &post_shuffle, &alpha_adjust);
         found->format = attrib->format;
         found->hw_fmt = dfmt | (nfmt << 4);
         const uint8_t format_align_req_minus_1 = format_desc->channel[0].size >= 32 ? 3 :
            (format_desc->block.bits / 8u - 1);
         found->fmt_align_req_minus_1 = format_align_req_minus_1;
         found->fmt_size = format_desc->block.bits / 8u;
         found->post_shuffle = post_shuffle;
         found->alpha_adjust_lo = alpha_adjust & 0x1;
         found->alpha_adjust_hi = (alpha_adjust >> 1) & 0x1;
      }

      state->formats[loc] = found->hw_fmt;
      state->format_align_req_minus_1[loc] = found->fmt_align_req_minus_1;
      state->format_sizes[loc] = found->fmt_size;
      state->alpha_adjust_lo |= found->alpha_adjust_lo << loc;
      state->alpha_adjust_hi |= found->alpha_adjust_hi << loc;
      if (found->post_shuffle)
         state->post_shuffle |= 1u << loc;

      if ((chip == GFX6 || chip >= GFX10) &&
          cmd_buffer->state.vbo_bound_mask & BITFIELD_BIT(attrib->binding)) {
         if (binding->stride & found->fmt_align_req_minus_1) {
            cmd_buffer->state.vbo_misaligned_mask |= BITFIELD_BIT(loc);
         } else if ((cmd_buffer->vertex_bindings[attrib->binding].offset + state->offsets[loc]) &
                    found->fmt_align_req_minus_1) {
            cmd_buffer->state.vbo_misaligned_mask |= BITFIELD_BIT(loc);
         }
      }
   }

   cmd_buffer->state.dirty |= RADV_CMD_DIRTY_VERTEX_BUFFER |
                              RADV_CMD_DIRTY_DYNAMIC_VERTEX_INPUT;
}

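/* Execute the secondary command buffers on the primary one (as IB2s where the
 * queue/chip allows it) and propagate their ring/scratch requirements and
 * tracked draw state back to the primary.
 */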
VKAPI_ATTR void VKAPI_CALL
radv_CmdExecuteCommands(VkCommandBuffer commandBuffer, uint32_t commandBufferCount,
                        const VkCommandBuffer *pCmdBuffers)
{
   RADV_FROM_HANDLE(radv_cmd_buffer, primary, commandBuffer);

   assert(commandBufferCount > 0);

   radv_emit_mip_change_flush_default(primary);

   /* Emit pending flushes on the primary prior to executing the secondaries. */
   si_emit_cache_flush(primary);

   /* Make sure CP DMA is idle on the primary prior to executing the secondaries. */
   si_cp_dma_wait_for_idle(primary);

   for (uint32_t i = 0; i < commandBufferCount; i++) {
      RADV_FROM_HANDLE(radv_cmd_buffer, secondary, pCmdBuffers[i]);
      bool allow_ib2 = true;

      if (secondary->device->physical_device->rad_info.gfx_level == GFX7 &&
          secondary->state.uses_draw_indirect_multi) {
         /* Do not launch an IB2 for secondary command buffers that contain
          * DRAW_{INDEX}_INDIRECT_MULTI on GFX7, because it's illegal and hangs the GPU.
          */
         allow_ib2 = false;
      }

      if (secondary->qf == RADV_QUEUE_COMPUTE) {
         /* IB2 packets are not supported on compute queues according to PAL. */
         allow_ib2 = false;
      }

      primary->scratch_size_per_wave_needed =
         MAX2(primary->scratch_size_per_wave_needed, secondary->scratch_size_per_wave_needed);
      primary->scratch_waves_wanted =
         MAX2(primary->scratch_waves_wanted, secondary->scratch_waves_wanted);
      primary->compute_scratch_size_per_wave_needed =
         MAX2(primary->compute_scratch_size_per_wave_needed,
              secondary->compute_scratch_size_per_wave_needed);
      primary->compute_scratch_waves_wanted =
         MAX2(primary->compute_scratch_waves_wanted, secondary->compute_scratch_waves_wanted);

      if (secondary->esgs_ring_size_needed > primary->esgs_ring_size_needed)
         primary->esgs_ring_size_needed = secondary->esgs_ring_size_needed;
      if (secondary->gsvs_ring_size_needed > primary->gsvs_ring_size_needed)
         primary->gsvs_ring_size_needed = secondary->gsvs_ring_size_needed;
      if (secondary->tess_rings_needed)
         primary->tess_rings_needed = true;
      if (secondary->task_rings_needed)
         primary->task_rings_needed = true;
      if (secondary->mesh_scratch_ring_needed)
         primary->mesh_scratch_ring_needed = true;
      if (secondary->sample_positions_needed)
         primary->sample_positions_needed = true;
      if (secondary->gds_needed)
         primary->gds_needed = true;

      if (!secondary->state.framebuffer && primary->state.pass && (primary->state.dirty & RADV_CMD_DIRTY_FRAMEBUFFER)) {
         /* Emit the framebuffer state from the primary if the secondary
          * has been recorded without a framebuffer, otherwise
          * fast color/depth clears can't work.
          */
         radv_emit_fb_mip_change_flush(primary);
         radv_emit_framebuffer_state(primary);
      }

      if (secondary->ace_internal.cs) {
         if (!primary->ace_internal.cs) {
            primary->ace_internal.cs = radv_ace_internal_create(primary);
            if (!primary->ace_internal.cs)
               return;
         }

         struct radeon_cmdbuf *ace_primary = primary->ace_internal.cs;
         struct radeon_cmdbuf *ace_secondary = secondary->ace_internal.cs;

         /* Emit pending flushes on the primary prior to executing the secondary. */
         radv_ace_internal_cache_flush(primary);

         /* Wait for the primary GFX->ACE semaphore, if necessary. */
         if (radv_flush_gfx2ace_semaphore(primary))
            radv_wait_gfx2ace_semaphore(primary);

         /* Execute the secondary compute cmdbuf.
          * Don't use IB2 packets because they are not supported on compute queues.
          */
         primary->device->ws->cs_execute_secondary(ace_primary, ace_secondary, false);
      }

      /* Update pending ACE internal flush bits from the secondary cmdbuf. */
      primary->ace_internal.flush_bits |= secondary->ace_internal.flush_bits;

      /* Increment the primary semaphore if the secondary was dirty.
       * This happens when the secondary cmdbuf has a barrier which
       * isn't consumed by a draw call.
       */
      if (radv_ace_internal_sem_dirty(secondary))
         primary->ace_internal.sem.gfx2ace_value++;

      primary->device->ws->cs_execute_secondary(primary->cs, secondary->cs, allow_ib2);

      /* When the secondary command buffer is compute only, we don't
       * need to re-emit the current graphics pipeline.
       */
      if (secondary->state.emitted_graphics_pipeline) {
         primary->state.emitted_graphics_pipeline = secondary->state.emitted_graphics_pipeline;
      }

      /* When the secondary command buffer is graphics only, we don't
       * need to re-emit the current compute pipeline.
       */
      if (secondary->state.emitted_compute_pipeline) {
         primary->state.emitted_compute_pipeline = secondary->state.emitted_compute_pipeline;
      }

      /* Only re-emit the draw packets when needed. */
      if (secondary->state.last_primitive_reset_en != -1) {
         primary->state.last_primitive_reset_en = secondary->state.last_primitive_reset_en;
      }

      if (secondary->state.last_primitive_reset_index) {
         primary->state.last_primitive_reset_index = secondary->state.last_primitive_reset_index;
      }

      if (secondary->state.last_ia_multi_vgt_param) {
         primary->state.last_ia_multi_vgt_param = secondary->state.last_ia_multi_vgt_param;
      }

      primary->state.last_first_instance = secondary->state.last_first_instance;
      primary->state.last_num_instances = secondary->state.last_num_instances;
      primary->state.last_drawid = secondary->state.last_drawid;
      primary->state.last_subpass_color_count = secondary->state.last_subpass_color_count;
      primary->state.last_vertex_offset = secondary->state.last_vertex_offset;
      primary->state.last_sx_ps_downconvert = secondary->state.last_sx_ps_downconvert;
      primary->state.last_sx_blend_opt_epsilon = secondary->state.last_sx_blend_opt_epsilon;
      primary->state.last_sx_blend_opt_control = secondary->state.last_sx_blend_opt_control;

      if (secondary->state.last_index_type != -1) {
         primary->state.last_index_type = secondary->state.last_index_type;
      }

      primary->state.last_nggc_settings = secondary->state.last_nggc_settings;
      primary->state.last_nggc_settings_sgpr_idx = secondary->state.last_nggc_settings_sgpr_idx;
      primary->state.last_nggc_skip = secondary->state.last_nggc_skip;

      primary->state.last_vrs_rates = secondary->state.last_vrs_rates;
      primary->state.last_vrs_rates_sgpr_idx = secondary->state.last_vrs_rates_sgpr_idx;
   }

   /* After executing commands from the secondary buffers, we have to
    * mark some states dirty again.
    */
   primary->state.dirty |=
      RADV_CMD_DIRTY_PIPELINE | RADV_CMD_DIRTY_INDEX_BUFFER | RADV_CMD_DIRTY_DYNAMIC_ALL;
   radv_mark_descriptor_sets_dirty(primary, VK_PIPELINE_BIND_POINT_GRAPHICS);
   radv_mark_descriptor_sets_dirty(primary, VK_PIPELINE_BIND_POINT_COMPUTE);
}

VKAPI_ATTR VkResult VKAPI_CALL
radv_CreateCommandPool(VkDevice _device, const VkCommandPoolCreateInfo *pCreateInfo,
                       const VkAllocationCallbacks *pAllocator, VkCommandPool *pCmdPool)
{
   RADV_FROM_HANDLE(radv_device, device, _device);
   struct radv_cmd_pool *pool;

   pool =
      vk_alloc2(&device->vk.alloc, pAllocator, sizeof(*pool), 8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
   if (pool == NULL)
      return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);

   VkResult result = vk_command_pool_init(&pool->vk, &device->vk, pCreateInfo, pAllocator);
   if (result != VK_SUCCESS) {
      vk_free2(&device->vk.alloc, pAllocator, pool);
      return result;
   }

   list_inithead(&pool->cmd_buffers);
   list_inithead(&pool->free_cmd_buffers);

   *pCmdPool = radv_cmd_pool_to_handle(pool);

   return VK_SUCCESS;
}

VKAPI_ATTR void VKAPI_CALL
radv_DestroyCommandPool(VkDevice _device, VkCommandPool commandPool,
                        const VkAllocationCallbacks *pAllocator)
{
   RADV_FROM_HANDLE(radv_device, device, _device);
   RADV_FROM_HANDLE(radv_cmd_pool, pool, commandPool);

   if (!pool)
      return;

   list_for_each_entry_safe(struct radv_cmd_buffer, cmd_buffer, &pool->cmd_buffers, pool_link)
   {
      radv_destroy_cmd_buffer(cmd_buffer);
   }

   list_for_each_entry_safe(struct radv_cmd_buffer, cmd_buffer, &pool->free_cmd_buffers, pool_link)
   {
      radv_destroy_cmd_buffer(cmd_buffer);
   }

   vk_command_pool_finish(&pool->vk);
   vk_free2(&device->vk.alloc, pAllocator, pool);
}

VKAPI_ATTR VkResult VKAPI_CALL
radv_ResetCommandPool(VkDevice device, VkCommandPool commandPool, VkCommandPoolResetFlags flags)
{
   RADV_FROM_HANDLE(radv_cmd_pool, pool, commandPool);
   VkResult result;

   list_for_each_entry(struct radv_cmd_buffer, cmd_buffer, &pool->cmd_buffers, pool_link)
   {
      result = radv_reset_cmd_buffer(cmd_buffer);
      if (result != VK_SUCCESS)
         return result;
   }

   return VK_SUCCESS;
}

VKAPI_ATTR void VKAPI_CALL
radv_TrimCommandPool(VkDevice device, VkCommandPool commandPool, VkCommandPoolTrimFlags flags)
{
   RADV_FROM_HANDLE(radv_cmd_pool, pool, commandPool);

   list_for_each_entry_safe(struct radv_cmd_buffer, cmd_buffer, &pool->free_cmd_buffers, pool_link)
   {
      radv_destroy_cmd_buffer(cmd_buffer);
   }
}

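/* Begin the given subpass: emit the start barrier, perform the subpass image
 * layout transitions and clears, and, if the subpass uses a VRS attachment,
 * copy the VRS rates into the HTILE buffer of the depth/stencil attachment
 * (or into the internal VRS HTILE buffer when no depth/stencil is bound).
 */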
static void
radv_cmd_buffer_begin_subpass(struct radv_cmd_buffer *cmd_buffer, uint32_t subpass_id)
{
   struct radv_cmd_state *state = &cmd_buffer->state;
   struct radv_subpass *subpass = &state->pass->subpasses[subpass_id];

   ASSERTED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 4096);

   radv_emit_subpass_barrier(cmd_buffer, &subpass->start_barrier);

   radv_cmd_buffer_set_subpass(cmd_buffer, subpass);

   radv_describe_barrier_start(cmd_buffer, RGP_BARRIER_EXTERNAL_RENDER_PASS_SYNC);

   for (uint32_t i = 0; i < subpass->attachment_count; ++i) {
      const uint32_t a = subpass->attachments[i].attachment;
      if (a == VK_ATTACHMENT_UNUSED)
         continue;

      radv_handle_subpass_image_transition(cmd_buffer, subpass->attachments[i], true);
   }

   radv_ace_internal_barrier(cmd_buffer, 0, 0);
   radv_describe_barrier_end(cmd_buffer);

   radv_cmd_buffer_clear_subpass(cmd_buffer);

   if (subpass->vrs_attachment) {
      int idx = subpass->vrs_attachment->attachment;
      struct radv_image_view *vrs_iview = cmd_buffer->state.attachments[idx].iview;

      if (subpass->depth_stencil_attachment) {
         /* When a subpass uses a VRS attachment and a depth/stencil attachment, we just need to
          * copy the VRS rates to the HTILE buffer of the attachment.
          */
         int ds_idx = subpass->depth_stencil_attachment->attachment;
         struct radv_image_view *ds_iview = cmd_buffer->state.attachments[ds_idx].iview;
         struct radv_image *ds_image = ds_iview->image;
         uint32_t level = ds_iview->vk.base_mip_level;

         VkExtent2D extent = {
            .width = radv_minify(ds_image->info.width, level),
            .height = radv_minify(ds_image->info.height, level),
         };

         /* HTILE buffer */
         uint64_t htile_offset = ds_image->bindings[0].offset + ds_image->planes[0].surface.meta_offset +
                                 ds_image->planes[0].surface.u.gfx9.meta_levels[level].offset;
         uint64_t htile_size = ds_image->planes[0].surface.u.gfx9.meta_levels[level].size;
         struct radv_buffer htile_buffer;

         radv_buffer_init(&htile_buffer, cmd_buffer->device, ds_image->bindings[0].bo, htile_size, htile_offset);

         /* Copy the VRS rates to the HTILE buffer. */
         radv_copy_vrs_htile(cmd_buffer, vrs_iview->image, &extent, ds_image, &htile_buffer, true);

         radv_buffer_finish(&htile_buffer);
      } else {
         /* When a subpass uses a VRS attachment without binding a depth/stencil attachment, we have
          * to copy the VRS rates to our internal HTILE buffer.
          */
         struct vk_framebuffer *fb = cmd_buffer->state.framebuffer;
         struct radv_image *ds_image = radv_cmd_buffer_get_vrs_image(cmd_buffer);

         if (ds_image) {
            /* HTILE buffer */
            struct radv_buffer *htile_buffer = cmd_buffer->device->vrs.buffer;

            VkExtent2D extent = {
               .width = MIN2(fb->width, ds_image->info.width),
               .height = MIN2(fb->height, ds_image->info.height),
            };

            /* Copy the VRS rates to the HTILE buffer. */
            radv_copy_vrs_htile(cmd_buffer, vrs_iview->image, &extent, ds_image, htile_buffer, false);
         }
      }
   }

   assert(cmd_buffer->cs->cdw <= cdw_max);
}

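/* Mark the framebuffer (rb_noncoherent_dirty) as non-coherent with L2 if any
 * subpass attachment is not L2 coherent, so that the corresponding cache
 * flushes are not skipped later on.
 */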
static void
radv_mark_noncoherent_rb(struct radv_cmd_buffer *cmd_buffer)
{
   const struct radv_subpass *subpass = cmd_buffer->state.subpass;

   /* Have to be conservative in cmdbuffers with inherited attachments. */
   if (!cmd_buffer->state.attachments) {
      cmd_buffer->state.rb_noncoherent_dirty = true;
      return;
   }

   for (uint32_t i = 0; i < subpass->color_count; ++i) {
      const uint32_t a = subpass->color_attachments[i].attachment;
      if (a == VK_ATTACHMENT_UNUSED)
         continue;
      if (!cmd_buffer->state.attachments[a].iview->image->l2_coherent) {
         cmd_buffer->state.rb_noncoherent_dirty = true;
         return;
      }
   }
   if (subpass->depth_stencil_attachment &&
       !cmd_buffer->state.attachments[subpass->depth_stencil_attachment->attachment]
           .iview->image->l2_coherent)
      cmd_buffer->state.rb_noncoherent_dirty = true;
}

void
radv_cmd_buffer_restore_subpass(struct radv_cmd_buffer *cmd_buffer,
                                const struct radv_subpass *subpass)
{
   radv_mark_noncoherent_rb(cmd_buffer);
   radv_cmd_buffer_set_subpass(cmd_buffer, subpass);
}

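/* End the current subpass: resolve its attachments, then transition every
 * attachment whose last use is this subpass to its final layout.
 */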
static void
radv_cmd_buffer_end_subpass(struct radv_cmd_buffer *cmd_buffer)
{
   struct radv_cmd_state *state = &cmd_buffer->state;
   const struct radv_subpass *subpass = state->subpass;
   uint32_t subpass_id = radv_get_subpass_id(cmd_buffer);

   radv_cmd_buffer_resolve_subpass(cmd_buffer);

   radv_describe_barrier_start(cmd_buffer, RGP_BARRIER_EXTERNAL_RENDER_PASS_SYNC);

   for (uint32_t i = 0; i < subpass->attachment_count; ++i) {
      const uint32_t a = subpass->attachments[i].attachment;
      if (a == VK_ATTACHMENT_UNUSED)
         continue;

      if (state->pass->attachments[a].last_subpass_idx != subpass_id)
         continue;

      VkImageLayout layout = state->pass->attachments[a].final_layout;
      VkImageLayout stencil_layout = state->pass->attachments[a].stencil_final_layout;
      struct radv_subpass_attachment att = {a, layout, stencil_layout};
      radv_handle_subpass_image_transition(cmd_buffer, att, false);
   }

   radv_ace_internal_barrier(cmd_buffer, 0, 0);
   radv_describe_barrier_end(cmd_buffer);
}

VKAPI_ATTR void VKAPI_CALL
radv_CmdBeginRenderPass2(VkCommandBuffer commandBuffer,
                         const VkRenderPassBeginInfo *pRenderPassBeginInfo,
                         const VkSubpassBeginInfo *pSubpassBeginInfo)
{
   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   RADV_FROM_HANDLE(radv_render_pass, pass, pRenderPassBeginInfo->renderPass);
   RADV_FROM_HANDLE(vk_framebuffer, framebuffer, pRenderPassBeginInfo->framebuffer);
   VkResult result;

   cmd_buffer->state.framebuffer = framebuffer;
   cmd_buffer->state.pass = pass;
   cmd_buffer->state.render_area = pRenderPassBeginInfo->renderArea;

   result = radv_cmd_state_setup_attachments(cmd_buffer, pass, pRenderPassBeginInfo);
   if (result != VK_SUCCESS)
      return;

   result = radv_cmd_state_setup_sample_locations(cmd_buffer, pass, pRenderPassBeginInfo);
   if (result != VK_SUCCESS)
      return;

   radv_cmd_buffer_begin_subpass(cmd_buffer, 0);
}

VKAPI_ATTR void VKAPI_CALL
radv_CmdNextSubpass2(VkCommandBuffer commandBuffer, const VkSubpassBeginInfo *pSubpassBeginInfo,
                     const VkSubpassEndInfo *pSubpassEndInfo)
{
   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);

   radv_mark_noncoherent_rb(cmd_buffer);

   uint32_t prev_subpass = radv_get_subpass_id(cmd_buffer);
   radv_cmd_buffer_end_subpass(cmd_buffer);
   radv_cmd_buffer_begin_subpass(cmd_buffer, prev_subpass + 1);
}

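/* Multiview: write the current view index to the AC_UD_VIEW_INDEX user SGPR
 * of every active stage (and of the GS copy shader and task shader when
 * present).
 */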
static void
radv_emit_view_index_per_stage(struct radeon_cmdbuf *cs, struct radv_graphics_pipeline *pipeline,
                               unsigned stage, unsigned index)
{
   struct radv_userdata_info *loc = radv_lookup_user_sgpr(&pipeline->base, stage, AC_UD_VIEW_INDEX);
   if (loc->sgpr_idx == -1)
      return;
   uint32_t base_reg = pipeline->base.user_data_0[stage];
   radeon_set_sh_reg(cs, base_reg + loc->sgpr_idx * 4, index);
}

static void
radv_emit_view_index(struct radv_cmd_buffer *cmd_buffer, unsigned index)
{
   struct radv_graphics_pipeline *pipeline = cmd_buffer->state.graphics_pipeline;

   radv_foreach_stage(stage, pipeline->active_stages & ~VK_SHADER_STAGE_TASK_BIT_NV) {
      radv_emit_view_index_per_stage(cmd_buffer->cs, pipeline, stage, index);
   }
   if (radv_pipeline_has_gs_copy_shader(&pipeline->base)) {
      struct radv_userdata_info *loc =
         &pipeline->base.gs_copy_shader->info.user_sgprs_locs.shader_data[AC_UD_VIEW_INDEX];
      if (loc->sgpr_idx != -1) {
         uint32_t base_reg = R_00B130_SPI_SHADER_USER_DATA_VS_0;
         radeon_set_sh_reg(cmd_buffer->cs, base_reg + loc->sgpr_idx * 4, index);
      }
   }
   if (pipeline->active_stages & VK_SHADER_STAGE_TASK_BIT_NV) {
      radv_emit_view_index_per_stage(cmd_buffer->ace_internal.cs, pipeline, MESA_SHADER_TASK,
                                     index);
   }
}

/**
 * Emulates predication for MEC using COND_EXEC.
 * When the current command buffer is predicating, emit a COND_EXEC packet
 * so that the MEC skips the next few dwords worth of packets.
 *
 * To make it work with inverted conditional rendering, we allocate
 * space in the upload BO and emit some packets to invert the condition.
 */
static void
radv_cs_emit_compute_predication(struct radv_cmd_state *state, struct radeon_cmdbuf *cs,
                                 uint64_t inv_va, bool *inv_emitted, unsigned dwords)
{
   if (!state->predicating)
      return;

   uint64_t va = state->predication_va;

   if (!state->predication_type) {
      /* Invert the condition the first time it is needed. */
      if (!*inv_emitted) {
         *inv_emitted = true;

         /* Write 1 to the inverted predication VA. */
         radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
         radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_IMM) | COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) |
                            COPY_DATA_WR_CONFIRM);
         radeon_emit(cs, 1);
         radeon_emit(cs, 0);
         radeon_emit(cs, inv_va);
         radeon_emit(cs, inv_va >> 32);

         /* If the API predication VA == 0, skip the next command. */
         radeon_emit(cs, PKT3(PKT3_COND_EXEC, 3, 0));
         radeon_emit(cs, va);
         radeon_emit(cs, va >> 32);
         radeon_emit(cs, 0);
         radeon_emit(cs, 6); /* 1x COPY_DATA size */

         /* Write 0 to the new predication VA (when the API condition != 0). */
         radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
         radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_IMM) | COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) |
                            COPY_DATA_WR_CONFIRM);
         radeon_emit(cs, 0);
         radeon_emit(cs, 0);
         radeon_emit(cs, inv_va);
         radeon_emit(cs, inv_va >> 32);
      }

      va = inv_va;
   }

   radeon_emit(cs, PKT3(PKT3_COND_EXEC, 3, 0));
   radeon_emit(cs, va);
   radeon_emit(cs, va >> 32);
   radeon_emit(cs, 0); /* Cache policy */
   radeon_emit(cs, dwords); /* Size of the predicated packet(s) in DWORDs. */
}

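/* Emit a direct (non-indexed) draw packet; the indices are auto-generated by
 * the hardware.
 */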
static void
radv_cs_emit_draw_packet(struct radv_cmd_buffer *cmd_buffer, uint32_t vertex_count,
                         uint32_t use_opaque)
{
   radeon_emit(cmd_buffer->cs, PKT3(PKT3_DRAW_INDEX_AUTO, 1, cmd_buffer->state.predicating));
   radeon_emit(cmd_buffer->cs, vertex_count);
   radeon_emit(cmd_buffer->cs, V_0287F0_DI_SRC_SEL_AUTO_INDEX | use_opaque);
}

/**
 * Emit a PKT3_DRAW_INDEX_2 packet to render "index_count" vertices.
 *
 * The starting address "index_va" may point anywhere within the index buffer. The number of
 * indexes allocated in the index buffer *past that point* is specified by "max_index_count".
 * Hardware uses this information to return 0 for out-of-bounds reads.
 */
static void
radv_cs_emit_draw_indexed_packet(struct radv_cmd_buffer *cmd_buffer, uint64_t index_va,
                                 uint32_t max_index_count, uint32_t index_count, bool not_eop)
{
   radeon_emit(cmd_buffer->cs, PKT3(PKT3_DRAW_INDEX_2, 4, cmd_buffer->state.predicating));
   radeon_emit(cmd_buffer->cs, max_index_count);
   radeon_emit(cmd_buffer->cs, index_va);
   radeon_emit(cmd_buffer->cs, index_va >> 32);
   radeon_emit(cmd_buffer->cs, index_count);
   /* NOT_EOP allows merging multiple draws into 1 wave, but only user VGPRs
    * can be changed between draws and GS fast launch must be disabled.
    * NOT_EOP doesn't work on gfx9 and older.
    */
   radeon_emit(cmd_buffer->cs, V_0287F0_DI_SRC_SEL_DMA | S_0287F0_NOT_EOP(not_eop));
}

/* MUST inline this function to avoid massive perf loss in drawoverhead */
ALWAYS_INLINE static void
radv_cs_emit_indirect_draw_packet(struct radv_cmd_buffer *cmd_buffer, bool indexed,
                                  uint32_t draw_count, uint64_t count_va, uint32_t stride)
{
   struct radeon_cmdbuf *cs = cmd_buffer->cs;
   const unsigned di_src_sel = indexed ? V_0287F0_DI_SRC_SEL_DMA : V_0287F0_DI_SRC_SEL_AUTO_INDEX;
   bool draw_id_enable = cmd_buffer->state.graphics_pipeline->uses_drawid;
   uint32_t base_reg = cmd_buffer->state.graphics_pipeline->vtx_base_sgpr;
   uint32_t vertex_offset_reg, start_instance_reg = 0, draw_id_reg = 0;
   bool predicating = cmd_buffer->state.predicating;
   bool mesh = cmd_buffer->state.mesh_shading;
   assert(base_reg);

   /* just reset draw state for vertex data */
   cmd_buffer->state.last_first_instance = -1;
   cmd_buffer->state.last_num_instances = -1;
   cmd_buffer->state.last_drawid = -1;
   cmd_buffer->state.last_vertex_offset = -1;

   vertex_offset_reg = (base_reg - SI_SH_REG_OFFSET) >> 2;
   if (cmd_buffer->state.graphics_pipeline->uses_baseinstance)
      start_instance_reg = ((base_reg + (draw_id_enable ? 8 : 4)) - SI_SH_REG_OFFSET) >> 2;
   if (draw_id_enable)
      draw_id_reg = ((base_reg + mesh * 12 + 4) - SI_SH_REG_OFFSET) >> 2;

   if (draw_count == 1 && !count_va && !draw_id_enable) {
      radeon_emit(cs,
                  PKT3(indexed ? PKT3_DRAW_INDEX_INDIRECT : PKT3_DRAW_INDIRECT, 3, predicating));
      radeon_emit(cs, 0);
      radeon_emit(cs, vertex_offset_reg);
      radeon_emit(cs, start_instance_reg);
      radeon_emit(cs, di_src_sel);
   } else {
      radeon_emit(cs, PKT3(indexed ? PKT3_DRAW_INDEX_INDIRECT_MULTI : PKT3_DRAW_INDIRECT_MULTI, 8,
                           predicating));
      radeon_emit(cs, 0);
      radeon_emit(cs, vertex_offset_reg);
      radeon_emit(cs, start_instance_reg);
      radeon_emit(cs, draw_id_reg | S_2C3_DRAW_INDEX_ENABLE(draw_id_enable) |
                         S_2C3_COUNT_INDIRECT_ENABLE(!!count_va));
      radeon_emit(cs, draw_count); /* count */
      radeon_emit(cs, count_va);   /* count_addr */
      radeon_emit(cs, count_va >> 32);
      radeon_emit(cs, stride); /* stride */
      radeon_emit(cs, di_src_sel);

      cmd_buffer->state.uses_draw_indirect_multi = true;
   }
}

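/* Emit a direct task shader dispatch on the internal ACE cmdbuf. The packet
 * is told which user SGPR receives the task ring entry.
 */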
ALWAYS_INLINE static void
radv_cs_emit_dispatch_taskmesh_direct_ace_packet(struct radv_cmd_buffer *cmd_buffer,
                                                 const uint32_t x, const uint32_t y,
                                                 const uint32_t z)
{
   struct radv_pipeline *pipeline = &cmd_buffer->state.graphics_pipeline->base;
   struct radv_shader *compute_shader = radv_get_shader(pipeline, MESA_SHADER_TASK);
   struct radeon_cmdbuf *cs = cmd_buffer->ace_internal.cs;
   const bool predicating = cmd_buffer->state.predicating;
   const uint32_t dispatch_initiator = cmd_buffer->device->dispatch_initiator_task |
                                       S_00B800_CS_W32_EN(compute_shader->info.wave_size == 32);

   struct radv_userdata_info *ring_entry_loc =
      radv_lookup_user_sgpr(pipeline, MESA_SHADER_TASK, AC_UD_TASK_RING_ENTRY);
   assert(ring_entry_loc && ring_entry_loc->sgpr_idx != -1 && ring_entry_loc->num_sgprs == 1);

   uint32_t ring_entry_reg =
      (R_00B900_COMPUTE_USER_DATA_0 + ring_entry_loc->sgpr_idx * 4 - SI_SH_REG_OFFSET) >> 2;

   radeon_emit(cs, PKT3(PKT3_DISPATCH_TASKMESH_DIRECT_ACE, 4, predicating) | PKT3_SHADER_TYPE_S(1));
   radeon_emit(cs, x);
   radeon_emit(cs, y);
   radeon_emit(cs, z);
   radeon_emit(cs, dispatch_initiator);
   radeon_emit(cs, ring_entry_reg & 0xFFFF);
}

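/* Emit an indirect (and optionally count-based) task shader dispatch on the
 * internal ACE cmdbuf. Grid size, draw id and task ring entry are passed to
 * the task shader through the user SGPRs located here.
 */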
ALWAYS_INLINE static void
radv_cs_emit_dispatch_taskmesh_indirect_multi_ace_packet(struct radv_cmd_buffer *cmd_buffer,
                                                         uint64_t data_va, uint32_t draw_count,
                                                         uint64_t count_va, uint32_t stride)
{
   assert((data_va & 0x03) == 0);
   assert((count_va & 0x03) == 0);

   struct radv_pipeline *pipeline = &cmd_buffer->state.graphics_pipeline->base;
   struct radv_shader *compute_shader = radv_get_shader(pipeline, MESA_SHADER_TASK);
   struct radeon_cmdbuf *cs = cmd_buffer->ace_internal.cs;

   const uint32_t count_indirect_enable = !!count_va;
   const uint32_t xyz_dim_enable = compute_shader->info.cs.uses_grid_size;
   const uint32_t draw_id_enable = compute_shader->info.vs.needs_draw_id;
   const uint32_t dispatch_initiator = cmd_buffer->device->dispatch_initiator_task |
                                       S_00B800_CS_W32_EN(compute_shader->info.wave_size == 32);

   const struct radv_userdata_info *ring_entry_loc =
      radv_lookup_user_sgpr(pipeline, MESA_SHADER_TASK, AC_UD_TASK_RING_ENTRY);
   const struct radv_userdata_info *xyz_dim_loc =
      radv_lookup_user_sgpr(pipeline, MESA_SHADER_TASK, AC_UD_CS_GRID_SIZE);
   const struct radv_userdata_info *draw_id_loc =
      radv_lookup_user_sgpr(pipeline, MESA_SHADER_TASK, AC_UD_CS_TASK_DRAW_ID);

   assert(ring_entry_loc->sgpr_idx != -1 && ring_entry_loc->num_sgprs == 1);
   assert(!xyz_dim_enable || (xyz_dim_loc->sgpr_idx != -1 && xyz_dim_loc->num_sgprs == 3));
   assert(!draw_id_enable || (draw_id_loc->sgpr_idx != -1 && draw_id_loc->num_sgprs == 1));

   const uint32_t ring_entry_reg =
      (R_00B900_COMPUTE_USER_DATA_0 + ring_entry_loc->sgpr_idx * 4 - SI_SH_REG_OFFSET) >> 2;
   const uint32_t xyz_dim_reg =
      !xyz_dim_enable
         ? 0
         : (R_00B900_COMPUTE_USER_DATA_0 + xyz_dim_loc->sgpr_idx * 4 - SI_SH_REG_OFFSET) >> 2;
   const uint32_t draw_id_reg =
      !draw_id_enable
         ? 0
         : (R_00B900_COMPUTE_USER_DATA_0 + draw_id_loc->sgpr_idx * 4 - SI_SH_REG_OFFSET) >> 2;

   radeon_emit(cs, PKT3(PKT3_DISPATCH_TASKMESH_INDIRECT_MULTI_ACE, 9, 0) | PKT3_SHADER_TYPE_S(1));
   radeon_emit(cs, data_va);
   radeon_emit(cs, data_va >> 32);
   radeon_emit(cs, ring_entry_reg & 0xFFFF);
   radeon_emit(cs, (count_indirect_enable << 1) | (draw_id_enable << 2) | (xyz_dim_enable << 3) |
                      (draw_id_reg << 16));
   radeon_emit(cs, xyz_dim_reg & 0xFFFF);
   radeon_emit(cs, draw_count);
   radeon_emit(cs, count_va);
   radeon_emit(cs, count_va >> 32);
   radeon_emit(cs, stride);
   radeon_emit(cs, dispatch_initiator);
}

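/* Emit the GFX-side half of a task+mesh dispatch. This consumes the task
 * shader results from the task ring; the ACE side is emitted separately.
 */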
ALWAYS_INLINE static void
radv_cs_emit_dispatch_taskmesh_gfx_packet(struct radv_cmd_buffer *cmd_buffer)
{
   struct radv_pipeline *pipeline = &cmd_buffer->state.graphics_pipeline->base;
   struct radeon_cmdbuf *cs = cmd_buffer->cs;
   bool predicating = cmd_buffer->state.predicating;

   struct radv_userdata_info *ring_entry_loc =
      radv_lookup_user_sgpr(pipeline, MESA_SHADER_MESH, AC_UD_TASK_RING_ENTRY);

   assert(ring_entry_loc && ring_entry_loc->sgpr_idx != -1);

   uint32_t base_reg = cmd_buffer->state.graphics_pipeline->vtx_base_sgpr;
   uint32_t xyz_dim_reg = ((base_reg + 4) - SI_SH_REG_OFFSET) >> 2;
   uint32_t ring_entry_reg = ((base_reg + ring_entry_loc->sgpr_idx * 4) - SI_SH_REG_OFFSET) >> 2;

   radeon_emit(cs, PKT3(PKT3_DISPATCH_TASKMESH_GFX, 2, predicating));
   radeon_emit(cs, (ring_entry_reg << 16) | (xyz_dim_reg & 0xFFFF));
   radeon_emit(cs, 0);
   radeon_emit(cs, V_0287F0_DI_SRC_SEL_AUTO_INDEX);
}

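/* Write the vertex offset (and the draw id / first instance when the pipeline
 * uses them) user SGPRs, and remember what was emitted so redundant updates
 * can be skipped.
 */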
static inline void
radv_emit_userdata_vertex_internal(struct radv_cmd_buffer *cmd_buffer,
                                   const struct radv_draw_info *info, const uint32_t vertex_offset)
{
   struct radv_cmd_state *state = &cmd_buffer->state;
   struct radeon_cmdbuf *cs = cmd_buffer->cs;
   const bool uses_baseinstance = state->graphics_pipeline->uses_baseinstance;
   const bool uses_drawid = state->graphics_pipeline->uses_drawid;

   radeon_set_sh_reg_seq(cs, state->graphics_pipeline->vtx_base_sgpr, state->graphics_pipeline->vtx_emit_num);

   radeon_emit(cs, vertex_offset);
   state->last_vertex_offset = vertex_offset;
   if (uses_drawid) {
      radeon_emit(cs, 0);
      state->last_drawid = 0;
   }
   if (uses_baseinstance) {
      radeon_emit(cs, info->first_instance);
      state->last_first_instance = info->first_instance;
   }
}

ALWAYS_INLINE static void
radv_emit_userdata_vertex(struct radv_cmd_buffer *cmd_buffer, const struct radv_draw_info *info,
                          const uint32_t vertex_offset)
{
   const struct radv_cmd_state *state = &cmd_buffer->state;
   const bool uses_baseinstance = state->graphics_pipeline->uses_baseinstance;
   const bool uses_drawid = state->graphics_pipeline->uses_drawid;

   /* This looks very dumb, but it allows the compiler to optimize better and
    * yields a ~3-4% perf increase in drawoverhead.
    */
   if (vertex_offset != state->last_vertex_offset) {
      radv_emit_userdata_vertex_internal(cmd_buffer, info, vertex_offset);
   } else if (uses_drawid && 0 != state->last_drawid) {
      radv_emit_userdata_vertex_internal(cmd_buffer, info, vertex_offset);
   } else if (uses_baseinstance && info->first_instance != state->last_first_instance) {
      radv_emit_userdata_vertex_internal(cmd_buffer, info, vertex_offset);
   }
}

ALWAYS_INLINE static void
radv_emit_userdata_vertex_drawid(struct radv_cmd_buffer *cmd_buffer, uint32_t vertex_offset, uint32_t drawid)
{
   struct radv_cmd_state *state = &cmd_buffer->state;
   struct radeon_cmdbuf *cs = cmd_buffer->cs;
   radeon_set_sh_reg_seq(cs, state->graphics_pipeline->vtx_base_sgpr, 1 + !!drawid);
   radeon_emit(cs, vertex_offset);
   state->last_vertex_offset = vertex_offset;
   if (drawid)
      radeon_emit(cs, drawid);
}

ALWAYS_INLINE static void
radv_emit_userdata_mesh(struct radv_cmd_buffer *cmd_buffer,
                        const uint32_t x, const uint32_t y, const uint32_t z,
                        const uint32_t first_task)
{
   struct radv_cmd_state *state = &cmd_buffer->state;
   struct radeon_cmdbuf *cs = cmd_buffer->cs;
   const bool uses_drawid = state->graphics_pipeline->uses_drawid;

   radeon_set_sh_reg_seq(cs, state->graphics_pipeline->vtx_base_sgpr, state->graphics_pipeline->vtx_emit_num);
   radeon_emit(cs, first_task);
   radeon_emit(cs, x);
   radeon_emit(cs, y);
   radeon_emit(cs, z);

   if (uses_drawid) {
      radeon_emit(cs, 0);
      state->last_drawid = 0;
   }
}

ALWAYS_INLINE static void
radv_emit_userdata_mesh_first_task_0_draw_id_0(struct radv_cmd_buffer *cmd_buffer)
{
   struct radv_cmd_state *state = &cmd_buffer->state;
   struct radeon_cmdbuf *cs = cmd_buffer->cs;
   struct radv_graphics_pipeline *pipeline = state->graphics_pipeline;
   const bool uses_drawid = pipeline->uses_drawid;

   radeon_set_sh_reg_seq(cs, pipeline->vtx_base_sgpr, 1);
   radeon_emit(cs, 0);

   if (uses_drawid) {
      radeon_set_sh_reg_seq(cs, pipeline->vtx_base_sgpr + (pipeline->vtx_emit_num - 1) * 4, 1);
      radeon_emit(cs, 0);
   }
}

ALWAYS_INLINE static void
radv_emit_userdata_task_ib_only(struct radv_cmd_buffer *cmd_buffer, uint64_t ib_va,
                                uint32_t ib_stride)
{
   struct radv_pipeline *pipeline = &cmd_buffer->state.graphics_pipeline->base;
   struct radeon_cmdbuf *cs = cmd_buffer->ace_internal.cs;

   struct radv_userdata_info *task_ib_loc =
      radv_lookup_user_sgpr(pipeline, MESA_SHADER_TASK, AC_UD_CS_TASK_IB);

   if (task_ib_loc->sgpr_idx != -1) {
      assert(task_ib_loc->num_sgprs == 3);
      unsigned task_ib_reg = R_00B900_COMPUTE_USER_DATA_0 + task_ib_loc->sgpr_idx * 4;

      radeon_set_sh_reg_seq(cs, task_ib_reg, 3);
      radeon_emit(cs, ib_va);
      radeon_emit(cs, ib_va >> 32);
      radeon_emit(cs, ib_stride);
   }
}

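/* Write the task shader user SGPRs: grid size, draw id, and the IB used for
 * emulating firstTask (see radv_emit_direct_taskmesh_draw_packets).
 */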
ALWAYS_INLINE static void
radv_emit_userdata_task(struct radv_cmd_buffer *cmd_buffer, uint32_t x, uint32_t y, uint32_t z,
                        uint32_t draw_id, uint32_t first_task, uint64_t ib_va)
{
   struct radv_pipeline *pipeline = &cmd_buffer->state.graphics_pipeline->base;
   struct radeon_cmdbuf *cs = cmd_buffer->ace_internal.cs;

   struct radv_userdata_info *xyz_loc =
      radv_lookup_user_sgpr(pipeline, MESA_SHADER_TASK, AC_UD_CS_GRID_SIZE);
   struct radv_userdata_info *draw_id_loc =
      radv_lookup_user_sgpr(pipeline, MESA_SHADER_TASK, AC_UD_CS_TASK_DRAW_ID);

   if (xyz_loc->sgpr_idx != -1) {
      assert(xyz_loc->num_sgprs == 3);
      unsigned xyz_reg = R_00B900_COMPUTE_USER_DATA_0 + xyz_loc->sgpr_idx * 4;

      radeon_set_sh_reg_seq(cs, xyz_reg, 3);
      radeon_emit(cs, x);
      radeon_emit(cs, y);
      radeon_emit(cs, z);
   }

   if (draw_id_loc->sgpr_idx != -1) {
      assert(draw_id_loc->num_sgprs == 1);
      unsigned draw_id_reg = R_00B900_COMPUTE_USER_DATA_0 + draw_id_loc->sgpr_idx * 4;

      radeon_set_sh_reg_seq(cs, draw_id_reg, 1);
      radeon_emit(cs, draw_id);
   }

   radv_emit_userdata_task_ib_only(cmd_buffer, ib_va, first_task ? 8 : 0);
}

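/* Emit the draw packets for an indexed multi-draw. The loops are duplicated
 * on purpose: splitting on uses_drawid and on whether a common vertexOffset
 * is provided keeps the per-draw inner loops as small as possible.
 */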
ALWAYS_INLINE static void
radv_emit_draw_packets_indexed(struct radv_cmd_buffer *cmd_buffer,
                               const struct radv_draw_info *info,
                               uint32_t drawCount, const VkMultiDrawIndexedInfoEXT *minfo,
                               uint32_t stride,
                               const int32_t *vertexOffset)
{
   struct radv_cmd_state *state = &cmd_buffer->state;
   struct radeon_cmdbuf *cs = cmd_buffer->cs;
   const int index_size = radv_get_vgt_index_size(state->index_type);
   unsigned i = 0;
   const bool uses_drawid = state->graphics_pipeline->uses_drawid;
   const bool can_eop =
      !uses_drawid && cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX10;

   if (uses_drawid) {
      if (vertexOffset) {
         radv_emit_userdata_vertex(cmd_buffer, info, *vertexOffset);
         vk_foreach_multi_draw_indexed(draw, i, minfo, drawCount, stride) {
            const uint32_t remaining_indexes = MAX2(state->max_index_count, draw->firstIndex) - draw->firstIndex;

            /* Skip draw calls with 0-sized index buffers if the GPU can't handle them. */
            if (!remaining_indexes &&
                cmd_buffer->device->physical_device->rad_info.has_zero_index_buffer_bug)
               continue;

            if (i > 0)
               radeon_set_sh_reg(cs, state->graphics_pipeline->vtx_base_sgpr + sizeof(uint32_t), i);

            const uint64_t index_va = state->index_va + draw->firstIndex * index_size;

            if (!state->subpass->view_mask) {
               radv_cs_emit_draw_indexed_packet(cmd_buffer, index_va, remaining_indexes, draw->indexCount, false);
            } else {
               u_foreach_bit(view, state->subpass->view_mask) {
                  radv_emit_view_index(cmd_buffer, view);

                  radv_cs_emit_draw_indexed_packet(cmd_buffer, index_va, remaining_indexes, draw->indexCount, false);
               }
            }
         }
      } else {
         vk_foreach_multi_draw_indexed(draw, i, minfo, drawCount, stride) {
            const uint32_t remaining_indexes = MAX2(state->max_index_count, draw->firstIndex) - draw->firstIndex;

            /* Skip draw calls with 0-sized index buffers if the GPU can't handle them. */
            if (!remaining_indexes &&
                cmd_buffer->device->physical_device->rad_info.has_zero_index_buffer_bug)
               continue;

            if (i > 0) {
               if (state->last_vertex_offset != draw->vertexOffset)
                  radv_emit_userdata_vertex_drawid(cmd_buffer, draw->vertexOffset, i);
               else
                  radeon_set_sh_reg(cs, state->graphics_pipeline->vtx_base_sgpr + sizeof(uint32_t), i);
            } else
               radv_emit_userdata_vertex(cmd_buffer, info, draw->vertexOffset);

            const uint64_t index_va = state->index_va + draw->firstIndex * index_size;

            if (!state->subpass->view_mask) {
               radv_cs_emit_draw_indexed_packet(cmd_buffer, index_va, remaining_indexes, draw->indexCount, false);
            } else {
               u_foreach_bit(view, state->subpass->view_mask) {
                  radv_emit_view_index(cmd_buffer, view);

                  radv_cs_emit_draw_indexed_packet(cmd_buffer, index_va, remaining_indexes, draw->indexCount, false);
               }
            }
         }
      }
      if (drawCount > 1) {
         state->last_drawid = drawCount - 1;
      }
   } else {
      if (vertexOffset) {
         if (cmd_buffer->device->physical_device->rad_info.gfx_level == GFX10) {
            /* GFX10 has a bug which requires that the last draw in a sequence
             * of NOT_EOP draw packets (the one emitted without NOT_EOP) must
             * not have count == 0, so drop trailing 0-sized draws.
             */
            while (drawCount > 1) {
               const VkMultiDrawIndexedInfoEXT *last = (const VkMultiDrawIndexedInfoEXT*)(((const uint8_t*)minfo) + (drawCount - 1) * stride);
               if (last->indexCount)
                  break;
               drawCount--;
            }
         }

         radv_emit_userdata_vertex(cmd_buffer, info, *vertexOffset);
         vk_foreach_multi_draw_indexed(draw, i, minfo, drawCount, stride) {
            const uint32_t remaining_indexes = MAX2(state->max_index_count, draw->firstIndex) - draw->firstIndex;

            /* Skip draw calls with 0-sized index buffers if the GPU can't handle them. */
            if (!remaining_indexes &&
                cmd_buffer->device->physical_device->rad_info.has_zero_index_buffer_bug)
               continue;

            const uint64_t index_va = state->index_va + draw->firstIndex * index_size;

            if (!state->subpass->view_mask) {
               radv_cs_emit_draw_indexed_packet(cmd_buffer, index_va, remaining_indexes, draw->indexCount, can_eop && i < drawCount - 1);
            } else {
               u_foreach_bit(view, state->subpass->view_mask) {
                  radv_emit_view_index(cmd_buffer, view);

                  radv_cs_emit_draw_indexed_packet(cmd_buffer, index_va, remaining_indexes, draw->indexCount, false);
               }
            }
         }
      } else {
         vk_foreach_multi_draw_indexed(draw, i, minfo, drawCount, stride) {
            const uint32_t remaining_indexes = MAX2(state->max_index_count, draw->firstIndex) - draw->firstIndex;

            /* Skip draw calls with 0-sized index buffers if the GPU can't handle them. */
            if (!remaining_indexes &&
                cmd_buffer->device->physical_device->rad_info.has_zero_index_buffer_bug)
               continue;

            const VkMultiDrawIndexedInfoEXT *next = (const VkMultiDrawIndexedInfoEXT*)(i < drawCount - 1 ? ((uint8_t*)draw + stride) : NULL);
            const bool offset_changes = next && next->vertexOffset != draw->vertexOffset;
            radv_emit_userdata_vertex(cmd_buffer, info, draw->vertexOffset);

            const uint64_t index_va = state->index_va + draw->firstIndex * index_size;

            if (!state->subpass->view_mask) {
               radv_cs_emit_draw_indexed_packet(cmd_buffer, index_va, remaining_indexes, draw->indexCount, can_eop && !offset_changes && i < drawCount - 1);
            } else {
               u_foreach_bit(view, state->subpass->view_mask) {
                  radv_emit_view_index(cmd_buffer, view);

                  radv_cs_emit_draw_indexed_packet(cmd_buffer, index_va, remaining_indexes, draw->indexCount, false);
               }
            }
         }
      }
      if (drawCount > 1) {
         state->last_drawid = drawCount - 1;
      }
   }
}

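/* Emit the draw packets for a direct (non-indexed) multi-draw, replaying each
 * draw for every view when multiview is enabled.
 */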
ALWAYS_INLINE static void
radv_emit_direct_draw_packets(struct radv_cmd_buffer *cmd_buffer, const struct radv_draw_info *info,
                              uint32_t drawCount, const VkMultiDrawInfoEXT *minfo,
                              uint32_t use_opaque, uint32_t stride)
{
   unsigned i = 0;
   const uint32_t view_mask = cmd_buffer->state.subpass->view_mask;
   const bool uses_drawid = cmd_buffer->state.graphics_pipeline->uses_drawid;
   uint32_t last_start = 0;

   vk_foreach_multi_draw(draw, i, minfo, drawCount, stride) {
      if (!i)
         radv_emit_userdata_vertex(cmd_buffer, info, draw->firstVertex);
      else
         radv_emit_userdata_vertex_drawid(cmd_buffer, draw->firstVertex, uses_drawid ? i : 0);

      if (!view_mask) {
         radv_cs_emit_draw_packet(cmd_buffer, draw->vertexCount, use_opaque);
      } else {
         u_foreach_bit(view, view_mask) {
            radv_emit_view_index(cmd_buffer, view);
            radv_cs_emit_draw_packet(cmd_buffer, draw->vertexCount, use_opaque);
         }
      }
      last_start = draw->firstVertex;
   }
   if (drawCount > 1) {
      struct radv_cmd_state *state = &cmd_buffer->state;
      state->last_vertex_offset = last_start;
      if (uses_drawid)
         state->last_drawid = drawCount - 1;
   }
}

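/* Direct draw with a mesh shader but no task shader. The mesh workgroup
 * count is the product of the x/y/z dimensions; with multiview, each view
 * simply replays the same draw packet.
 */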
ALWAYS_INLINE static void
radv_emit_direct_mesh_draw_packet(struct radv_cmd_buffer *cmd_buffer,
                                  uint32_t x, uint32_t y, uint32_t z,
                                  uint32_t first_task)
{
   const uint32_t view_mask = cmd_buffer->state.subpass->view_mask;
   const uint32_t count = x * y * z;

   radv_emit_userdata_mesh(cmd_buffer, x, y, z, first_task);

   if (!view_mask) {
      radv_cs_emit_draw_packet(cmd_buffer, count, 0);
   } else {
      u_foreach_bit(view, view_mask) {
         radv_emit_view_index(cmd_buffer, view);
         radv_cs_emit_draw_packet(cmd_buffer, count, 0);
      }
   }
}

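/* Direct draw with both task and mesh shaders. Every GFX-side
 * DISPATCH_TASKMESH_GFX packet must have a matching ACE-side
 * DISPATCH_TASKMESH_DIRECT_ACE packet, and firstTask is emulated by
 * uploading a small fake indirect buffer for the task shader to read.
 */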
ALWAYS_INLINE static void
radv_emit_direct_taskmesh_draw_packets(struct radv_cmd_buffer *cmd_buffer, uint32_t x, uint32_t y,
                                       uint32_t z, uint32_t first_task)
{
   uint64_t fake_ib_va = 0;
   const uint32_t view_mask = cmd_buffer->state.subpass->view_mask;
   const unsigned num_views = MAX2(1, util_bitcount(view_mask));
   unsigned ace_predication_size = num_views * 6; /* DISPATCH_TASKMESH_DIRECT_ACE size */

   if (first_task) {
      /* Pass this as the IB to the shader for emulating firstTask in task shaders. */
      uint32_t fake_ib_dwords[2] = {x, first_task};
      unsigned fake_ib_offset;
      radv_cmd_buffer_upload_data(cmd_buffer, 8, fake_ib_dwords, &fake_ib_offset);
      fake_ib_va = radv_buffer_get_va(cmd_buffer->upload.upload_bo) + fake_ib_offset;
   }

   radv_emit_userdata_task(cmd_buffer, x, y, z, 0, first_task, fake_ib_va);
   radv_emit_userdata_mesh_first_task_0_draw_id_0(cmd_buffer);
   radv_cs_emit_compute_predication(&cmd_buffer->state, cmd_buffer->ace_internal.cs,
                                    cmd_buffer->mec_inv_pred_va, &cmd_buffer->mec_inv_pred_emitted,
                                    ace_predication_size);

   if (!view_mask) {
      radv_cs_emit_dispatch_taskmesh_direct_ace_packet(cmd_buffer, x, y, z);
      radv_cs_emit_dispatch_taskmesh_gfx_packet(cmd_buffer);
   } else {
      u_foreach_bit (view, view_mask) {
         radv_emit_view_index(cmd_buffer, view);
         radv_cs_emit_dispatch_taskmesh_direct_ace_packet(cmd_buffer, x, y, z);
         radv_cs_emit_dispatch_taskmesh_gfx_packet(cmd_buffer);
      }
   }
}

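/* Indirect draw with task and mesh shaders. The real dispatch is emitted to
 * the internal ACE command stream, while the GFX stream only receives the
 * matching DISPATCH_TASKMESH_GFX packets; see the MEC firmware workaround
 * below for why a zero count buffer needs special handling.
 */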
static void
radv_emit_indirect_taskmesh_draw_packets(struct radv_cmd_buffer *cmd_buffer,
                                         const struct radv_draw_info *info, uint64_t nv_ib_va,
                                         uint32_t nv_ib_stride)
{
   const uint32_t view_mask = cmd_buffer->state.subpass->view_mask;
   struct radeon_winsys *ws = cmd_buffer->device->ws;
   const unsigned num_views = MAX2(1, util_bitcount(view_mask));
   unsigned ace_predication_size = num_views * 11; /* DISPATCH_TASKMESH_INDIRECT_MULTI_ACE size */
   struct radeon_cmdbuf *ace_cs = cmd_buffer->ace_internal.cs;

   const uint64_t va =
      radv_buffer_get_va(info->indirect->bo) + info->indirect->offset + info->indirect_offset;
   const uint64_t count_va = !info->count_buffer
                                ? 0
                                : radv_buffer_get_va(info->count_buffer->bo) +
                                     info->count_buffer->offset + info->count_buffer_offset;
   uint64_t workaround_cond_va = 0;

   if (count_va) {
      radv_cs_add_buffer(ws, cmd_buffer->ace_internal.cs, info->count_buffer->bo);

      /* MEC firmware bug workaround.
       * When the count buffer contains zero, DISPATCH_TASKMESH_INDIRECT_MULTI_ACE hangs.
       * - We must ensure that DISPATCH_TASKMESH_INDIRECT_MULTI_ACE
       *   is only executed when the count buffer contains non-zero.
       * - Furthermore, we must also ensure that each DISPATCH_TASKMESH_GFX packet
       *   has a matching ACE packet.
       *
       * As a workaround:
       * - Reserve a dword in the upload buffer and initialize it to 1 for the workaround
       * - When count != 0, write 0 to the workaround BO and execute the indirect dispatch
       * - When workaround BO != 0 (count was 0), execute an empty direct dispatch
       */

      uint32_t workaround_cond_init = 0;
      uint32_t workaround_cond_off;
      if (!radv_cmd_buffer_upload_data(cmd_buffer, 4, &workaround_cond_init, &workaround_cond_off))
         cmd_buffer->record_result = VK_ERROR_OUT_OF_HOST_MEMORY;

      workaround_cond_va = radv_buffer_get_va(cmd_buffer->upload.upload_bo) + workaround_cond_off;

      radeon_emit(ace_cs, PKT3(PKT3_COPY_DATA, 4, 0));
      radeon_emit(ace_cs, COPY_DATA_SRC_SEL(COPY_DATA_IMM) | COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) |
                             COPY_DATA_WR_CONFIRM);
      radeon_emit(ace_cs, 1);
      radeon_emit(ace_cs, 0);
      radeon_emit(ace_cs, workaround_cond_va);
      radeon_emit(ace_cs, workaround_cond_va >> 32);

      /* 2x COND_EXEC + 1x COPY_DATA + Nx DISPATCH_TASKMESH_DIRECT_ACE */
      ace_predication_size += 2 * 5 + 6 + 6 * num_views;
   }

   radv_cs_add_buffer(ws, cmd_buffer->ace_internal.cs, info->indirect->bo);
   radv_emit_userdata_task_ib_only(cmd_buffer, nv_ib_va, nv_ib_stride);
   radv_emit_userdata_mesh_first_task_0_draw_id_0(cmd_buffer);
   radv_cs_emit_compute_predication(&cmd_buffer->state, cmd_buffer->ace_internal.cs,
                                    cmd_buffer->mec_inv_pred_va, &cmd_buffer->mec_inv_pred_emitted,
                                    ace_predication_size);

   if (workaround_cond_va) {
      radeon_emit(ace_cs, PKT3(PKT3_COND_EXEC, 3, 0));
      radeon_emit(ace_cs, count_va);
      radeon_emit(ace_cs, count_va >> 32);
      radeon_emit(ace_cs, 0);
      radeon_emit(ace_cs,
                  6 + 11 * num_views); /* 1x COPY_DATA + Nx DISPATCH_TASKMESH_INDIRECT_MULTI_ACE */

      radeon_emit(ace_cs, PKT3(PKT3_COPY_DATA, 4, 0));
      radeon_emit(ace_cs, COPY_DATA_SRC_SEL(COPY_DATA_IMM) | COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) |
                             COPY_DATA_WR_CONFIRM);
      radeon_emit(ace_cs, 0);
      radeon_emit(ace_cs, 0);
      radeon_emit(ace_cs, workaround_cond_va);
      radeon_emit(ace_cs, workaround_cond_va >> 32);
   }

   if (!view_mask) {
      radv_cs_emit_dispatch_taskmesh_indirect_multi_ace_packet(cmd_buffer, va, info->count,
                                                               count_va, info->stride);
      radv_cs_emit_dispatch_taskmesh_gfx_packet(cmd_buffer);
   } else {
      u_foreach_bit (view, view_mask) {
         radv_emit_view_index(cmd_buffer, view);
         radv_cs_emit_dispatch_taskmesh_indirect_multi_ace_packet(cmd_buffer, va, info->count,
                                                                  count_va, info->stride);
         radv_cs_emit_dispatch_taskmesh_gfx_packet(cmd_buffer);
      }
   }

   if (workaround_cond_va) {
      radeon_emit(ace_cs, PKT3(PKT3_COND_EXEC, 3, 0));
      radeon_emit(ace_cs, workaround_cond_va);
      radeon_emit(ace_cs, workaround_cond_va >> 32);
      radeon_emit(ace_cs, 0);
      radeon_emit(ace_cs, 6 * num_views); /* Nx DISPATCH_TASKMESH_DIRECT_ACE */

      for (unsigned v = 0; v < num_views; ++v) {
         radv_cs_emit_dispatch_taskmesh_direct_ace_packet(cmd_buffer, 0, 0, 0);
      }
   }
}

static void
radv_emit_indirect_draw_packets(struct radv_cmd_buffer *cmd_buffer,
                                const struct radv_draw_info *info)
{
   const struct radv_cmd_state *state = &cmd_buffer->state;
   struct radeon_winsys *ws = cmd_buffer->device->ws;
   struct radeon_cmdbuf *cs = cmd_buffer->cs;
   const uint64_t va =
      radv_buffer_get_va(info->indirect->bo) + info->indirect->offset + info->indirect_offset;
   const uint64_t count_va = info->count_buffer
                                ? radv_buffer_get_va(info->count_buffer->bo) +
                                     info->count_buffer->offset + info->count_buffer_offset
                                : 0;

   radv_cs_add_buffer(ws, cs, info->indirect->bo);

   radeon_emit(cs, PKT3(PKT3_SET_BASE, 2, 0));
   radeon_emit(cs, 1);
   radeon_emit(cs, va);
   radeon_emit(cs, va >> 32);

   if (info->count_buffer) {
      radv_cs_add_buffer(ws, cs, info->count_buffer->bo);
   }

   if (!state->subpass->view_mask) {
      radv_cs_emit_indirect_draw_packet(cmd_buffer, info->indexed, info->count, count_va,
                                        info->stride);
   } else {
      u_foreach_bit(i, state->subpass->view_mask)
      {
         radv_emit_view_index(cmd_buffer, i);

         radv_cs_emit_indirect_draw_packet(cmd_buffer, info->indexed, info->count, count_va,
                                           info->stride);
      }
   }
}

/*
 * Vega and Raven have a bug which triggers if there are multiple context
 * register contexts active at the same time with different scissor values.
 *
 * There are two possible workarounds:
 * 1) Wait for PS_PARTIAL_FLUSH every time the scissor is changed. That way
 *    there is only ever 1 active set of scissor values at the same time.
 *
 * 2) Whenever the hardware switches contexts we have to set the scissor
 *    registers again even if it is a noop. That way the new context gets
 *    the correct scissor values.
 *
 * This implements option 2. radv_need_late_scissor_emission needs to
 * return true on affected HW if radv_emit_all_graphics_states sets
 * any context registers.
 */
static bool
radv_need_late_scissor_emission(struct radv_cmd_buffer *cmd_buffer,
                                const struct radv_draw_info *info)
{
   struct radv_cmd_state *state = &cmd_buffer->state;

   if (!cmd_buffer->device->physical_device->rad_info.has_gfx9_scissor_bug)
      return false;

   if (cmd_buffer->state.context_roll_without_scissor_emitted || info->strmout_buffer)
      return true;

   uint64_t used_states =
      cmd_buffer->state.graphics_pipeline->needed_dynamic_state | ~RADV_CMD_DIRTY_DYNAMIC_ALL;

   /* Index, vertex and streamout buffers don't change context regs, and
    * the pipeline is already handled.
    */
   used_states &= ~(RADV_CMD_DIRTY_INDEX_BUFFER | RADV_CMD_DIRTY_VERTEX_BUFFER |
                    RADV_CMD_DIRTY_DYNAMIC_VERTEX_INPUT | RADV_CMD_DIRTY_STREAMOUT_BUFFER |
                    RADV_CMD_DIRTY_PIPELINE);

   if (cmd_buffer->state.dirty & used_states)
      return true;

   uint32_t primitive_reset_index = radv_get_primitive_reset_index(cmd_buffer);

   if (info->indexed && state->dynamic.primitive_restart_enable &&
       primitive_reset_index != state->last_primitive_reset_index)
      return true;

   return false;
}

ALWAYS_INLINE static bool
radv_skip_ngg_culling(bool has_tess, const unsigned vtx_cnt,
                      bool indirect)
{
   /* If we have to draw only a few vertices, we get better latency if
    * we disable NGG culling.
    *
    * When tessellation is used, what matters is the number of tessellated
    * vertices, so let's always assume it's not a small draw.
    */
   return !has_tess && !indirect && vtx_cnt < 128;
}

ALWAYS_INLINE static uint32_t
radv_get_ngg_culling_settings(struct radv_cmd_buffer *cmd_buffer, bool vp_y_inverted)
{
   const struct radv_graphics_pipeline *pipeline = cmd_buffer->state.graphics_pipeline;
   const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;

   /* Cull every triangle when rasterizer discard is enabled. */
   if (d->rasterizer_discard_enable ||
       G_028810_DX_RASTERIZATION_KILL(cmd_buffer->state.graphics_pipeline->pa_cl_clip_cntl))
      return radv_nggc_front_face | radv_nggc_back_face;

   uint32_t pa_su_sc_mode_cntl = cmd_buffer->state.graphics_pipeline->pa_su_sc_mode_cntl;
   uint32_t nggc_settings = radv_nggc_none;

   /* The culling code needs to know whether the face is CW or CCW. */
   bool ccw = (pipeline->needed_dynamic_state & RADV_DYNAMIC_FRONT_FACE)
              ? d->front_face == VK_FRONT_FACE_COUNTER_CLOCKWISE
              : G_028814_FACE(pa_su_sc_mode_cntl) == 0;

   /* Take inverted viewport into account. */
   ccw ^= vp_y_inverted;

   if (ccw)
      nggc_settings |= radv_nggc_face_is_ccw;

   /* Face culling settings. */
   if ((pipeline->needed_dynamic_state & RADV_DYNAMIC_CULL_MODE)
         ? (d->cull_mode & VK_CULL_MODE_FRONT_BIT)
         : G_028814_CULL_FRONT(pa_su_sc_mode_cntl))
      nggc_settings |= radv_nggc_front_face;
   if ((pipeline->needed_dynamic_state & RADV_DYNAMIC_CULL_MODE)
         ? (d->cull_mode & VK_CULL_MODE_BACK_BIT)
         : G_028814_CULL_BACK(pa_su_sc_mode_cntl))
      nggc_settings |= radv_nggc_back_face;

   /* Small primitive culling is only valid when conservative overestimation is not used. It's also
    * disabled for user sample locations because small primitive culling assumes a sample
    * position at (0.5, 0.5). */
   if (!pipeline->uses_conservative_overestimate && !pipeline->uses_user_sample_locations) {
      nggc_settings |= radv_nggc_small_primitives;

      /* small_prim_precision = num_samples / 2^subpixel_bits
       * num_samples is also always a power of two, so the small prim precision can only be
       * a power of two between 2^-2 and 2^-6, therefore it's enough to remember the exponent.
       */
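      /* Worked example: with 4x MSAA and the 2^8 subpixel precision below,
       * log2(4) - log2(256) = 2 - 8 = -6, i.e. a small prim precision of 2^-6.
       */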
      unsigned subpixel_bits = 256;
      int32_t small_prim_precision_log2 = util_logbase2(pipeline->ms.num_samples) - util_logbase2(subpixel_bits);
      nggc_settings |= ((uint32_t) small_prim_precision_log2 << 24u);
   }

   return nggc_settings;
}

static void
radv_emit_ngg_culling_state(struct radv_cmd_buffer *cmd_buffer, const struct radv_draw_info *draw_info)
{
   struct radv_graphics_pipeline *pipeline = cmd_buffer->state.graphics_pipeline;
   const unsigned stage = pipeline->last_vgt_api_stage;
   const bool nggc_supported = pipeline->has_ngg_culling;

   if (!nggc_supported && !cmd_buffer->state.last_nggc_settings) {
      /* Current shader doesn't support culling and culling was already disabled:
       * No further steps needed, just remember the SGPR's location is not set.
       */
      cmd_buffer->state.last_nggc_settings_sgpr_idx = -1;
      return;
   }

   /* Check dirty flags:
    * - Dirty pipeline: SGPR index may have changed (we have to re-emit if changed).
    * - Dirty dynamic flags: culling settings may have changed.
    */
   const bool dirty =
      cmd_buffer->state.dirty &
      (RADV_CMD_DIRTY_PIPELINE |
       RADV_CMD_DIRTY_DYNAMIC_CULL_MODE | RADV_CMD_DIRTY_DYNAMIC_FRONT_FACE |
       RADV_CMD_DIRTY_DYNAMIC_RASTERIZER_DISCARD_ENABLE | RADV_CMD_DIRTY_DYNAMIC_VIEWPORT);

   /* Check small draw status:
    * For small draw calls, we disable culling by setting the SGPR to 0.
    */
   const bool skip =
      radv_skip_ngg_culling(stage == MESA_SHADER_TESS_EVAL, draw_info->count, draw_info->indirect);

   /* See if anything changed. */
   if (!dirty && skip == cmd_buffer->state.last_nggc_skip)
      return;

   /* Remember small draw state. */
   cmd_buffer->state.last_nggc_skip = skip;
   const struct radv_shader *v = pipeline->base.shaders[stage];
   assert(v->info.has_ngg_culling == nggc_supported);

   /* Find the user SGPR. */
   const uint32_t base_reg = pipeline->base.user_data_0[stage];
   const int8_t nggc_sgpr_idx = v->info.user_sgprs_locs.shader_data[AC_UD_NGG_CULLING_SETTINGS].sgpr_idx;
   assert(!nggc_supported || nggc_sgpr_idx != -1);

   /* Get viewport transform. */
   float vp_scale[2], vp_translate[2];
   memcpy(vp_scale, cmd_buffer->state.dynamic.viewport.xform[0].scale, 2 * sizeof(float));
   memcpy(vp_translate, cmd_buffer->state.dynamic.viewport.xform[0].translate, 2 * sizeof(float));
   bool vp_y_inverted = (-vp_scale[1] + vp_translate[1]) > (vp_scale[1] + vp_translate[1]);

   /* Get current culling settings. */
   uint32_t nggc_settings = nggc_supported && !skip
                            ? radv_get_ngg_culling_settings(cmd_buffer, vp_y_inverted)
                            : radv_nggc_none;

   bool emit_viewport = nggc_settings &&
                        (cmd_buffer->state.dirty & RADV_CMD_DIRTY_DYNAMIC_VIEWPORT ||
                         cmd_buffer->state.last_nggc_settings_sgpr_idx != nggc_sgpr_idx ||
                         !cmd_buffer->state.last_nggc_settings);

   if (emit_viewport) {
      /* Correction for inverted Y */
      if (vp_y_inverted) {
         vp_scale[1] = -vp_scale[1];
         vp_translate[1] = -vp_translate[1];
      }

      /* Correction for number of samples per pixel. */
      for (unsigned i = 0; i < 2; ++i) {
         vp_scale[i] *= (float) pipeline->ms.num_samples;
         vp_translate[i] *= (float) pipeline->ms.num_samples;
      }

      uint32_t vp_reg_values[4] = {fui(vp_scale[0]), fui(vp_scale[1]), fui(vp_translate[0]), fui(vp_translate[1])};
      const int8_t vp_sgpr_idx = v->info.user_sgprs_locs.shader_data[AC_UD_NGG_VIEWPORT].sgpr_idx;
      assert(vp_sgpr_idx != -1);
      radeon_set_sh_reg_seq(cmd_buffer->cs, base_reg + vp_sgpr_idx * 4, 4);
      radeon_emit_array(cmd_buffer->cs, vp_reg_values, 4);
   }

   bool emit_settings = nggc_supported &&
                        (cmd_buffer->state.last_nggc_settings != nggc_settings ||
                         cmd_buffer->state.last_nggc_settings_sgpr_idx != nggc_sgpr_idx);

   /* This needs to be emitted when culling is turned on
    * and when it's already on but some settings change.
    */
   if (emit_settings) {
      assert(nggc_sgpr_idx >= 0);
      radeon_set_sh_reg(cmd_buffer->cs, base_reg + nggc_sgpr_idx * 4, nggc_settings);
   }

   /* These only need to be emitted when culling is turned on or off,
    * but not when it stays on and just some settings change.
    */
   if (!!cmd_buffer->state.last_nggc_settings != !!nggc_settings) {
      uint32_t rsrc2 = v->config.rsrc2;

      if (!nggc_settings) {
         /* Allocate less LDS when culling is disabled. (But GS always needs it.) */
         if (stage != MESA_SHADER_GEOMETRY)
            rsrc2 = (rsrc2 & C_00B22C_LDS_SIZE) | S_00B22C_LDS_SIZE(v->info.num_lds_blocks_when_not_culling);
      }

      /* When the pipeline is dirty and not yet emitted, don't write it here
       * because radv_emit_graphics_pipeline will overwrite this register.
       */
      if (!(cmd_buffer->state.dirty & RADV_CMD_DIRTY_PIPELINE) ||
          cmd_buffer->state.emitted_graphics_pipeline == pipeline) {
         radeon_set_sh_reg(cmd_buffer->cs, R_00B22C_SPI_SHADER_PGM_RSRC2_GS, rsrc2);
      }
   }

   cmd_buffer->state.last_nggc_settings = nggc_settings;
   cmd_buffer->state.last_nggc_settings_sgpr_idx = nggc_sgpr_idx;
}

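/* Emits all dirty graphics states before a draw. The ordering matters:
 * RB+ and NGG culling state are handled first, then the pipeline, and the
 * late scissor check has to run after any context roll has been recorded.
 */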
static void
radv_emit_all_graphics_states(struct radv_cmd_buffer *cmd_buffer, const struct radv_draw_info *info,
                              bool pipeline_is_dirty)
{
   bool late_scissor_emission;

   if ((cmd_buffer->state.dirty & RADV_CMD_DIRTY_FRAMEBUFFER) ||
       cmd_buffer->state.emitted_graphics_pipeline != cmd_buffer->state.graphics_pipeline)
      radv_emit_rbplus_state(cmd_buffer);

   if (cmd_buffer->device->physical_device->use_ngg_culling &&
       cmd_buffer->state.graphics_pipeline->is_ngg)
      radv_emit_ngg_culling_state(cmd_buffer, info);

   if (cmd_buffer->state.dirty & RADV_CMD_DIRTY_PIPELINE)
      radv_emit_graphics_pipeline(cmd_buffer);

   /* This should be before the cmd_buffer->state.dirty is cleared
    * (excluding RADV_CMD_DIRTY_PIPELINE) and after
    * cmd_buffer->state.context_roll_without_scissor_emitted is set. */
   late_scissor_emission = radv_need_late_scissor_emission(cmd_buffer, info);

   if (cmd_buffer->state.dirty & RADV_CMD_DIRTY_FRAMEBUFFER)
      radv_emit_framebuffer_state(cmd_buffer);

   if (info->indexed) {
      if (cmd_buffer->state.dirty & RADV_CMD_DIRTY_INDEX_BUFFER)
         radv_emit_index_buffer(cmd_buffer, info->indirect);
   } else {
      /* On GFX7 and later, non-indexed draws overwrite VGT_INDEX_TYPE,
       * so the state must be re-emitted before the next indexed
       * draw.
       */
      if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX7) {
         cmd_buffer->state.last_index_type = -1;
         cmd_buffer->state.dirty |= RADV_CMD_DIRTY_INDEX_BUFFER;
      }
   }

   if (cmd_buffer->device->force_vrs != RADV_FORCE_VRS_1x1) {
      struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
      uint64_t dynamic_states =
         cmd_buffer->state.dirty & cmd_buffer->state.emitted_graphics_pipeline->needed_dynamic_state;

      if ((dynamic_states & RADV_CMD_DIRTY_DYNAMIC_FRAGMENT_SHADING_RATE) &&
          d->fragment_shading_rate.size.width == 1 &&
          d->fragment_shading_rate.size.height == 1 &&
          d->fragment_shading_rate.combiner_ops[0] == VK_FRAGMENT_SHADING_RATE_COMBINER_OP_KEEP_KHR &&
          d->fragment_shading_rate.combiner_ops[1] == VK_FRAGMENT_SHADING_RATE_COMBINER_OP_KEEP_KHR) {
         /* When per-vertex VRS is forced and the dynamic fragment shading rate is a no-op, ignore
          * it. This is needed for vkd3d-proton because it always declares per-draw VRS as dynamic.
          */
         cmd_buffer->state.dirty &= ~RADV_CMD_DIRTY_DYNAMIC_FRAGMENT_SHADING_RATE;
      }
   }

   radv_cmd_buffer_flush_dynamic_state(cmd_buffer, pipeline_is_dirty);

   radv_emit_draw_registers(cmd_buffer, info);

   if (late_scissor_emission)
      radv_emit_scissor(cmd_buffer);
}

/* MUST inline this function to avoid massive perf loss in drawoverhead */
ALWAYS_INLINE static bool
radv_before_draw(struct radv_cmd_buffer *cmd_buffer, const struct radv_draw_info *info, uint32_t drawCount)
{
   const bool has_prefetch = cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX7;
   const bool pipeline_is_dirty = (cmd_buffer->state.dirty & RADV_CMD_DIRTY_PIPELINE) &&
                                  cmd_buffer->state.graphics_pipeline != cmd_buffer->state.emitted_graphics_pipeline;

   ASSERTED const unsigned cdw_max =
      radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 4096 + 128 * (drawCount - 1));

   if (likely(!info->indirect)) {
      /* GFX6-GFX7 treat instance_count==0 as instance_count==1. There is
       * no workaround for indirect draws, but we can at least skip
       * direct draws.
       */
      if (unlikely(!info->instance_count))
         return false;

      /* Handle count == 0. */
      if (unlikely(!info->count && !info->strmout_buffer))
         return false;
   }

   /* Need to apply this workaround early as it can set flush flags. */
   if (cmd_buffer->state.dirty & RADV_CMD_DIRTY_FRAMEBUFFER)
      radv_emit_fb_mip_change_flush(cmd_buffer);

   /* Use optimal packet order based on whether we need to sync the
    * pipeline.
    */
   if (cmd_buffer->state.flush_bits &
       (RADV_CMD_FLAG_FLUSH_AND_INV_CB | RADV_CMD_FLAG_FLUSH_AND_INV_DB |
        RADV_CMD_FLAG_PS_PARTIAL_FLUSH | RADV_CMD_FLAG_CS_PARTIAL_FLUSH)) {
      /* If we have to wait for idle, set all states first, so that
       * all SET packets are processed in parallel with previous draw
       * calls. Then upload descriptors, set shader pointers, and
       * draw, and prefetch at the end. This ensures that the time
       * the CUs are idle is very short. (there are only SET_SH
       * packets between the wait and the draw)
       */
      radv_emit_all_graphics_states(cmd_buffer, info, pipeline_is_dirty);
      si_emit_cache_flush(cmd_buffer);
      /* <-- CUs are idle here --> */

      radv_upload_graphics_shader_descriptors(cmd_buffer, pipeline_is_dirty);
   } else {
      /* If we don't wait for idle, start prefetches first, then set
       * states, and draw at the end.
       */
      si_emit_cache_flush(cmd_buffer);

      if (has_prefetch && cmd_buffer->state.prefetch_L2_mask) {
         /* Only prefetch the vertex shader and VBO descriptors
          * in order to start the draw as soon as possible.
          */
         radv_emit_prefetch_L2(cmd_buffer, cmd_buffer->state.graphics_pipeline, true);
      }

      radv_upload_graphics_shader_descriptors(cmd_buffer, pipeline_is_dirty);

      radv_emit_all_graphics_states(cmd_buffer, info, pipeline_is_dirty);
   }

   radv_describe_draw(cmd_buffer);
   if (likely(!info->indirect)) {
      struct radv_cmd_state *state = &cmd_buffer->state;
      struct radeon_cmdbuf *cs = cmd_buffer->cs;
      assert(state->graphics_pipeline->vtx_base_sgpr);
      if (state->last_num_instances != info->instance_count) {
         radeon_emit(cs, PKT3(PKT3_NUM_INSTANCES, 0, false));
         radeon_emit(cs, info->instance_count);
         state->last_num_instances = info->instance_count;
      }
   }
   assert(cmd_buffer->cs->cdw <= cdw_max);

   return true;
}

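/* Task/mesh variant of radv_before_draw. In addition to the usual GFX-side
 * work, this emits the task shader state to the internal ACE command stream
 * and re-flushes descriptors and push constants that radv_before_draw
 * already marked as clean.
 */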
ALWAYS_INLINE static bool
radv_before_taskmesh_draw(struct radv_cmd_buffer *cmd_buffer, const struct radv_draw_info *info,
                          uint32_t drawCount)
{
   struct radv_descriptor_state *descriptors_state =
      radv_get_descriptors_state(cmd_buffer, VK_PIPELINE_BIND_POINT_GRAPHICS);
   const bool pipeline_is_dirty =
      cmd_buffer->state.dirty & RADV_CMD_DIRTY_PIPELINE &&
      cmd_buffer->state.graphics_pipeline != cmd_buffer->state.emitted_graphics_pipeline;
   const bool push_dirty = descriptors_state->push_dirty;
   const uint32_t desc_dirty = descriptors_state->dirty;

   const bool gfx_result = radv_before_draw(cmd_buffer, info, drawCount);
   struct radv_graphics_pipeline *pipeline = cmd_buffer->state.graphics_pipeline;
   struct radv_shader *task_shader = radv_get_shader(&pipeline->base, MESA_SHADER_TASK);

   /* If there is no task shader, no need to do anything special. */
   if (!task_shader)
      return gfx_result;

   /* Need to check the count even for indirect draws to work around
    * an issue with DISPATCH_TASKMESH_INDIRECT_MULTI_ACE.
    */
   if (!info->count || !gfx_result)
      return false;

   const bool need_task_semaphore = radv_flush_gfx2ace_semaphore(cmd_buffer);
   struct radv_physical_device *pdevice = cmd_buffer->device->physical_device;
   struct radeon_cmdbuf *ace_cs = cmd_buffer->ace_internal.cs;
   struct radeon_winsys *ws = cmd_buffer->device->ws;

   assert(ace_cs);
   ASSERTED const unsigned ace_cdw_max =
      radeon_check_space(ws, ace_cs, 4096 + 128 * (drawCount - 1));

   if (need_task_semaphore)
      radv_wait_gfx2ace_semaphore(cmd_buffer);

   if (pipeline_is_dirty) {
      radv_pipeline_emit_hw_cs(pdevice, ace_cs, task_shader);
      radv_pipeline_emit_compute_state(pdevice, ace_cs, task_shader);
   }

   radv_ace_internal_cache_flush(cmd_buffer);

   /* Restore the dirty state of the descriptors: they were marked non-dirty
    * in radv_before_draw, but they need to be re-emitted now to the ACE cmdbuf.
    */
   descriptors_state->push_dirty = push_dirty;
   descriptors_state->dirty = desc_dirty;

   /* Flush descriptors and push constants for task shaders. */
   radv_flush_descriptors(cmd_buffer, VK_SHADER_STAGE_TASK_BIT_NV, &pipeline->base,
                          VK_PIPELINE_BIND_POINT_GRAPHICS);
   radv_flush_constants(cmd_buffer, VK_SHADER_STAGE_TASK_BIT_NV, &pipeline->base,
                        VK_PIPELINE_BIND_POINT_GRAPHICS);

   assert(ace_cs->cdw <= ace_cdw_max);
   return true;
}

static void
radv_after_draw(struct radv_cmd_buffer *cmd_buffer)
{
   const struct radeon_info *rad_info = &cmd_buffer->device->physical_device->rad_info;
   bool has_prefetch = cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX7;
   /* Start prefetches after the draw has been started. Both will
    * run in parallel, but starting the draw first is more
    * important.
    */
   if (has_prefetch && cmd_buffer->state.prefetch_L2_mask) {
      radv_emit_prefetch_L2(cmd_buffer, cmd_buffer->state.graphics_pipeline, false);
   }

   /* Workaround for a VGT hang when streamout is enabled.
    * It must be done after drawing.
    */
   if (radv_is_streamout_enabled(cmd_buffer) &&
       (rad_info->family == CHIP_HAWAII || rad_info->family == CHIP_TONGA ||
        rad_info->family == CHIP_FIJI)) {
      cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_VGT_STREAMOUT_SYNC;
   }

   radv_cmd_buffer_after_draw(cmd_buffer, RADV_CMD_FLAG_PS_PARTIAL_FLUSH);
}

static struct radv_buffer
radv_nv_mesh_indirect_bo(struct radv_cmd_buffer *cmd_buffer,
                         struct radv_buffer *buffer, VkDeviceSize offset,
                         uint32_t draw_count, uint32_t stride)
{
   /* Translates the indirect BO format used by the NV_mesh_shader API
    * to the BO format used by DRAW_INDIRECT / DRAW_INDIRECT_MULTI.
    */

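   /* Field mapping (per the Vulkan headers):
    *   VkDrawMeshTasksIndirectCommandNV = { taskCount, firstTask }
    *   VkDrawIndirectCommand = { vertexCount, instanceCount, firstVertex, firstInstance }
    */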
   struct radeon_cmdbuf *cs = cmd_buffer->cs;
   struct radeon_winsys *ws = cmd_buffer->device->ws;

   const size_t src_stride = MAX2(stride, sizeof(VkDrawMeshTasksIndirectCommandNV));
   const size_t dst_stride = sizeof(VkDrawIndirectCommand);
   const size_t src_off_task_count = offsetof(VkDrawMeshTasksIndirectCommandNV, taskCount);
   const size_t src_off_first_task = offsetof(VkDrawMeshTasksIndirectCommandNV, firstTask);
   const size_t dst_off_vertex_count = offsetof(VkDrawIndirectCommand, vertexCount);
   const size_t dst_off_first_vertex = offsetof(VkDrawIndirectCommand, firstVertex);

   /* Fill the buffer with all zeroes except instanceCount = 1.
    * This helps emit fewer copy packets below.
    */
   VkDrawIndirectCommand *fill_data = (VkDrawIndirectCommand *) alloca(dst_stride * draw_count);
   const VkDrawIndirectCommand filler = { .instanceCount = 1 };
   for (unsigned i = 0; i < draw_count; ++i)
      fill_data[i] = filler;

   /* We'll have to copy data from the API BO. */
   uint64_t va = radv_buffer_get_va(buffer->bo) + buffer->offset + offset;
   radv_cs_add_buffer(ws, cs, buffer->bo);

   /* Allocate some space in the upload BO. */
   unsigned out_offset;
   radv_cmd_buffer_upload_data(cmd_buffer, dst_stride * draw_count, fill_data, &out_offset);
   const uint64_t new_va = radv_buffer_get_va(cmd_buffer->upload.upload_bo) + out_offset;

   ASSERTED unsigned cdw_max = radeon_check_space(ws, cs, 12 * draw_count + 2);

   /* Copy data from the API BO so that the format is suitable for the
    * indirect draw packet:
    * - vertexCount = taskCount (copied here)
    * - instanceCount = 1 (filled by CPU above)
    * - firstVertex = firstTask (copied here)
    * - firstInstance = 0 (filled by CPU above)
    */
   for (unsigned i = 0; i < draw_count; ++i) {
      const uint64_t src_task_count = va + i * src_stride + src_off_task_count;
      const uint64_t src_first_task = va + i * src_stride + src_off_first_task;
      const uint64_t dst_vertex_count = new_va + i * dst_stride + dst_off_vertex_count;
      const uint64_t dst_first_vertex = new_va + i * dst_stride + dst_off_first_vertex;

      radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, cmd_buffer->state.predicating));
      radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_SRC_MEM) | COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) |
                      COPY_DATA_WR_CONFIRM);
      radeon_emit(cs, src_task_count);
      radeon_emit(cs, src_task_count >> 32);
      radeon_emit(cs, dst_vertex_count);
      radeon_emit(cs, dst_vertex_count >> 32);

      radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, cmd_buffer->state.predicating));
      radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_SRC_MEM) | COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) |
                      COPY_DATA_WR_CONFIRM);
      radeon_emit(cs, src_first_task);
      radeon_emit(cs, src_first_task >> 32);
      radeon_emit(cs, dst_first_vertex);
      radeon_emit(cs, dst_first_vertex >> 32);
   }

   /* Wait for the copies to finish. */
   radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0));
   radeon_emit(cs, 0);

   /* The draw packet can now use this buffer: */
   struct radv_buffer buf = *buffer;
   buf.bo = cmd_buffer->upload.upload_bo;
   buf.offset = out_offset;

   assert(cmd_buffer->cs->cdw <= cdw_max);

   return buf;
}

static struct radv_buffer
radv_nv_task_indirect_bo(struct radv_cmd_buffer *cmd_buffer, struct radv_buffer *buffer,
                         VkDeviceSize offset, uint32_t draw_count, uint32_t stride)
{
   /* Translates the indirect BO format used by the NV_mesh_shader API
    * to the BO format used by DISPATCH_TASKMESH_INDIRECT_MULTI_ACE.
    */

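   /* Only taskCount needs to be copied from the API BO: it becomes the X
    * dimension of VkDispatchIndirectCommand, while Y and Z are pre-filled
    * with 1 below.
    */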
   assert(draw_count);
   static_assert(sizeof(VkDispatchIndirectCommand) == 12, "Incorrect size of taskmesh command.");

   struct radeon_cmdbuf *cs = cmd_buffer->ace_internal.cs;
   struct radeon_winsys *ws = cmd_buffer->device->ws;

   const size_t src_stride = MAX2(stride, sizeof(VkDrawMeshTasksIndirectCommandNV));
   const size_t dst_stride = sizeof(VkDispatchIndirectCommand);
   const size_t src_off_task_count = offsetof(VkDrawMeshTasksIndirectCommandNV, taskCount);
   const size_t dst_off_x = offsetof(VkDispatchIndirectCommand, x);

   const unsigned new_disp_size = dst_stride * draw_count;

   const uint64_t va = radv_buffer_get_va(buffer->bo) + buffer->offset + offset;
   radv_cs_add_buffer(ws, cs, buffer->bo);

   /* Fill the buffer with X=0, Y=1, Z=1. */
   VkDispatchIndirectCommand *fill_data = (VkDispatchIndirectCommand *)alloca(new_disp_size);
   for (unsigned i = 0; i < draw_count; ++i) {
      fill_data[i].x = 0;
      fill_data[i].y = 1;
      fill_data[i].z = 1;
   }

   /* Allocate space in the upload BO. */
   unsigned out_offset;
   ASSERTED bool uploaded =
      radv_cmd_buffer_upload_data(cmd_buffer, new_disp_size, fill_data, &out_offset);
   const uint64_t new_va = radv_buffer_get_va(cmd_buffer->upload.upload_bo) + out_offset;
   assert(uploaded);

   /* Clamp the draw count to fit the actual size of the buffer.
    * This avoids potential out-of-bounds copies (e.g. for draws with an indirect count buffer).
    * The remaining indirect draws stay filled with X=0, Y=1, Z=1, which is harmless.
    */
   draw_count = MIN2(draw_count, (buffer->vk.size - buffer->offset - offset) / src_stride);

   ASSERTED unsigned cdw_max = radeon_check_space(ws, cs, 6 * draw_count + 2);

   /* Copy taskCount from the NV API BO to the X dispatch size of the compatible BO. */
   for (unsigned i = 0; i < draw_count; ++i) {
      const uint64_t src_task_count = va + i * src_stride + src_off_task_count;
      const uint64_t dst_x = new_va + i * dst_stride + dst_off_x;

      radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, cmd_buffer->state.predicating));
      radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_SRC_MEM) | COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) |
                         COPY_DATA_WR_CONFIRM);
      radeon_emit(cs, src_task_count);
      radeon_emit(cs, src_task_count >> 32);
      radeon_emit(cs, dst_x);
      radeon_emit(cs, dst_x >> 32);
   }

   assert(cs->cdw <= cdw_max);

   /* The draw packet can now use this buffer: */
   struct radv_buffer buf = *buffer;
   buf.bo = cmd_buffer->upload.upload_bo;
   buf.offset = out_offset;

   return buf;
}

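/* Vulkan draw entry points. Each one fills a radv_draw_info, validates and
 * flushes state in radv_before_draw / radv_before_taskmesh_draw, emits the
 * draw packets, and finishes with radv_after_draw.
 */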
VKAPI_ATTR void VKAPI_CALL
radv_CmdDraw(VkCommandBuffer commandBuffer, uint32_t vertexCount, uint32_t instanceCount,
             uint32_t firstVertex, uint32_t firstInstance)
{
   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   struct radv_draw_info info;

   info.count = vertexCount;
   info.instance_count = instanceCount;
   info.first_instance = firstInstance;
   info.strmout_buffer = NULL;
   info.indirect = NULL;
   info.indexed = false;

   if (!radv_before_draw(cmd_buffer, &info, 1))
      return;
   const VkMultiDrawInfoEXT minfo = { firstVertex, vertexCount };
   radv_emit_direct_draw_packets(cmd_buffer, &info, 1, &minfo, 0, 0);
   radv_after_draw(cmd_buffer);
}

VKAPI_ATTR void VKAPI_CALL
radv_CmdDrawMultiEXT(VkCommandBuffer commandBuffer, uint32_t drawCount, const VkMultiDrawInfoEXT *pVertexInfo,
                     uint32_t instanceCount, uint32_t firstInstance, uint32_t stride)
{
   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   struct radv_draw_info info;

   if (!drawCount)
      return;

   info.count = pVertexInfo->vertexCount;
   info.instance_count = instanceCount;
   info.first_instance = firstInstance;
   info.strmout_buffer = NULL;
   info.indirect = NULL;
   info.indexed = false;

   if (!radv_before_draw(cmd_buffer, &info, drawCount))
      return;
   radv_emit_direct_draw_packets(cmd_buffer, &info, drawCount, pVertexInfo, 0, stride);
   radv_after_draw(cmd_buffer);
}

VKAPI_ATTR void VKAPI_CALL
radv_CmdDrawIndexed(VkCommandBuffer commandBuffer, uint32_t indexCount, uint32_t instanceCount,
                    uint32_t firstIndex, int32_t vertexOffset, uint32_t firstInstance)
{
   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   struct radv_draw_info info;

   info.indexed = true;
   info.count = indexCount;
   info.instance_count = instanceCount;
   info.first_instance = firstInstance;
   info.strmout_buffer = NULL;
   info.indirect = NULL;

   if (!radv_before_draw(cmd_buffer, &info, 1))
      return;
   const VkMultiDrawIndexedInfoEXT minfo = { firstIndex, indexCount, vertexOffset };
   radv_emit_draw_packets_indexed(cmd_buffer, &info, 1, &minfo, 0, NULL);
   radv_after_draw(cmd_buffer);
}

VKAPI_ATTR void VKAPI_CALL
radv_CmdDrawMultiIndexedEXT(VkCommandBuffer commandBuffer, uint32_t drawCount, const VkMultiDrawIndexedInfoEXT *pIndexInfo,
                            uint32_t instanceCount, uint32_t firstInstance, uint32_t stride, const int32_t *pVertexOffset)
{
   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   struct radv_draw_info info;

   if (!drawCount)
      return;

   const VkMultiDrawIndexedInfoEXT *minfo = pIndexInfo;
   info.indexed = true;
   info.count = minfo->indexCount;
   info.instance_count = instanceCount;
   info.first_instance = firstInstance;
   info.strmout_buffer = NULL;
   info.indirect = NULL;

   if (!radv_before_draw(cmd_buffer, &info, drawCount))
      return;
   radv_emit_draw_packets_indexed(cmd_buffer, &info, drawCount, pIndexInfo, stride, pVertexOffset);
   radv_after_draw(cmd_buffer);
}

VKAPI_ATTR void VKAPI_CALL
radv_CmdDrawIndirect(VkCommandBuffer commandBuffer, VkBuffer _buffer, VkDeviceSize offset,
                     uint32_t drawCount, uint32_t stride)
{
   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   RADV_FROM_HANDLE(radv_buffer, buffer, _buffer);
   struct radv_draw_info info;

   info.count = drawCount;
   info.indirect = buffer;
   info.indirect_offset = offset;
   info.stride = stride;
   info.strmout_buffer = NULL;
   info.count_buffer = NULL;
   info.indexed = false;
   info.instance_count = 0;

   if (!radv_before_draw(cmd_buffer, &info, 1))
      return;
   radv_emit_indirect_draw_packets(cmd_buffer, &info);
   radv_after_draw(cmd_buffer);
}

VKAPI_ATTR void VKAPI_CALL
radv_CmdDrawIndexedIndirect(VkCommandBuffer commandBuffer, VkBuffer _buffer, VkDeviceSize offset,
                            uint32_t drawCount, uint32_t stride)
{
   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   RADV_FROM_HANDLE(radv_buffer, buffer, _buffer);
   struct radv_draw_info info;

   info.indexed = true;
   info.count = drawCount;
   info.indirect = buffer;
   info.indirect_offset = offset;
   info.stride = stride;
   info.count_buffer = NULL;
   info.strmout_buffer = NULL;
   info.instance_count = 0;

   if (!radv_before_draw(cmd_buffer, &info, 1))
      return;
   radv_emit_indirect_draw_packets(cmd_buffer, &info);
   radv_after_draw(cmd_buffer);
}

VKAPI_ATTR void VKAPI_CALL
radv_CmdDrawIndirectCount(VkCommandBuffer commandBuffer, VkBuffer _buffer, VkDeviceSize offset,
                          VkBuffer _countBuffer, VkDeviceSize countBufferOffset,
                          uint32_t maxDrawCount, uint32_t stride)
{
   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   RADV_FROM_HANDLE(radv_buffer, buffer, _buffer);
   RADV_FROM_HANDLE(radv_buffer, count_buffer, _countBuffer);
   struct radv_draw_info info;

   info.count = maxDrawCount;
   info.indirect = buffer;
   info.indirect_offset = offset;
   info.count_buffer = count_buffer;
   info.count_buffer_offset = countBufferOffset;
   info.stride = stride;
   info.strmout_buffer = NULL;
   info.indexed = false;
   info.instance_count = 0;

   if (!radv_before_draw(cmd_buffer, &info, 1))
      return;
   radv_emit_indirect_draw_packets(cmd_buffer, &info);
   radv_after_draw(cmd_buffer);
}

VKAPI_ATTR void VKAPI_CALL
radv_CmdDrawIndexedIndirectCount(VkCommandBuffer commandBuffer, VkBuffer _buffer,
                                 VkDeviceSize offset, VkBuffer _countBuffer,
                                 VkDeviceSize countBufferOffset, uint32_t maxDrawCount,
                                 uint32_t stride)
{
   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   RADV_FROM_HANDLE(radv_buffer, buffer, _buffer);
   RADV_FROM_HANDLE(radv_buffer, count_buffer, _countBuffer);
   struct radv_draw_info info;

   info.indexed = true;
   info.count = maxDrawCount;
   info.indirect = buffer;
   info.indirect_offset = offset;
   info.count_buffer = count_buffer;
   info.count_buffer_offset = countBufferOffset;
   info.stride = stride;
   info.strmout_buffer = NULL;
   info.instance_count = 0;

   if (!radv_before_draw(cmd_buffer, &info, 1))
      return;
   radv_emit_indirect_draw_packets(cmd_buffer, &info);
   radv_after_draw(cmd_buffer);
}

VKAPI_ATTR void VKAPI_CALL
radv_CmdDrawMeshTasksNV(VkCommandBuffer commandBuffer, uint32_t taskCount, uint32_t firstTask)
{
   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   struct radv_graphics_pipeline *pipeline = cmd_buffer->state.graphics_pipeline;
   struct radv_draw_info info;

   info.count = taskCount;
   info.instance_count = 1;
   info.first_instance = 0;
   info.stride = 0;
   info.indexed = false;
   info.strmout_buffer = NULL;
   info.count_buffer = NULL;
   info.indirect = NULL;

   if (!radv_before_taskmesh_draw(cmd_buffer, &info, 1))
      return;

   if (radv_pipeline_has_stage(pipeline, MESA_SHADER_TASK)) {
      radv_emit_direct_taskmesh_draw_packets(cmd_buffer, taskCount, 1, 1, firstTask);
   } else {
      radv_emit_direct_mesh_draw_packet(cmd_buffer, taskCount, 1, 1, firstTask);
   }

   radv_after_draw(cmd_buffer);
}

VKAPI_ATTR void VKAPI_CALL
radv_CmdDrawMeshTasksIndirectNV(VkCommandBuffer commandBuffer, VkBuffer _buffer,
                                VkDeviceSize offset, uint32_t drawCount, uint32_t stride)
{
   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   RADV_FROM_HANDLE(radv_buffer, buffer, _buffer);

   struct radv_graphics_pipeline *pipeline = cmd_buffer->state.graphics_pipeline;
   struct radv_draw_info info;

   info.indirect = buffer;
   info.indirect_offset = offset;
   info.stride = stride;
   info.count = drawCount;
   info.strmout_buffer = NULL;
   info.count_buffer = NULL;
   info.indexed = false;
   info.instance_count = 0;

   if (!radv_before_taskmesh_draw(cmd_buffer, &info, drawCount))
      return;

   /* Indirect draw with mesh shader only:
    * Use DRAW_INDIRECT / DRAW_INDIRECT_MULTI like normal indirect draws.
    * Needed because DISPATCH_MESH_INDIRECT_MULTI doesn't support firstTask.
    *
    * Indirect draw with task + mesh shaders:
    * Use DISPATCH_TASKMESH_INDIRECT_MULTI_ACE + DISPATCH_TASKMESH_GFX.
    * These packets don't support firstTask so we implement that by
    * reading the NV command's indirect buffer in the shader.
    *
    * The indirect BO layout from the NV_mesh_shader API is incompatible
    * with AMD HW. To make it work, we allocate some space
    * in the upload buffer and copy the data to it.
    */

   if (radv_pipeline_has_stage(pipeline, MESA_SHADER_TASK)) {
      uint64_t nv_ib_va = radv_buffer_get_va(buffer->bo) + buffer->offset + offset;
      uint32_t nv_ib_stride = MAX2(stride, sizeof(VkDrawMeshTasksIndirectCommandNV));
      struct radv_buffer buf =
         radv_nv_task_indirect_bo(cmd_buffer, buffer, offset, drawCount, stride);
      info.indirect = &buf;
      info.indirect_offset = 0;
      info.stride = sizeof(VkDispatchIndirectCommand);

      radv_emit_indirect_taskmesh_draw_packets(cmd_buffer, &info, nv_ib_va, nv_ib_stride);
   } else {
      struct radv_buffer buf =
         radv_nv_mesh_indirect_bo(cmd_buffer, buffer, offset, drawCount, stride);
      info.indirect = &buf;
      info.indirect_offset = 0;
      info.stride = sizeof(VkDrawIndirectCommand);

      radv_emit_indirect_draw_packets(cmd_buffer, &info);
   }

   radv_after_draw(cmd_buffer);
}

VKAPI_ATTR void VKAPI_CALL
radv_CmdDrawMeshTasksIndirectCountNV(VkCommandBuffer commandBuffer, VkBuffer _buffer,
                                     VkDeviceSize offset, VkBuffer _countBuffer,
                                     VkDeviceSize countBufferOffset, uint32_t maxDrawCount,
                                     uint32_t stride)
{
   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   RADV_FROM_HANDLE(radv_buffer, buffer, _buffer);
   RADV_FROM_HANDLE(radv_buffer, count_buffer, _countBuffer);

   struct radv_graphics_pipeline *pipeline = cmd_buffer->state.graphics_pipeline;
   struct radv_draw_info info;

   info.indirect = buffer;
   info.indirect_offset = offset;
   info.stride = stride;
   info.count = maxDrawCount;
   info.strmout_buffer = NULL;
   info.count_buffer = count_buffer;
   info.count_buffer_offset = countBufferOffset;
   info.indexed = false;
   info.instance_count = 0;

   if (!radv_before_taskmesh_draw(cmd_buffer, &info, maxDrawCount))
      return;

   if (radv_pipeline_has_stage(pipeline, MESA_SHADER_TASK)) {
      uint64_t nv_ib_va = radv_buffer_get_va(buffer->bo) + buffer->offset + offset;
      uint32_t nv_ib_stride = MAX2(stride, sizeof(VkDrawMeshTasksIndirectCommandNV));
      struct radv_buffer buf =
         radv_nv_task_indirect_bo(cmd_buffer, buffer, offset, maxDrawCount, stride);
      info.indirect = &buf;
      info.indirect_offset = 0;
      info.stride = sizeof(VkDispatchIndirectCommand);

      radv_emit_indirect_taskmesh_draw_packets(cmd_buffer, &info, nv_ib_va, nv_ib_stride);
   } else {
      struct radv_buffer buf =
         radv_nv_mesh_indirect_bo(cmd_buffer, buffer, offset, maxDrawCount, stride);
      info.indirect = &buf;
      info.indirect_offset = 0;
      info.stride = sizeof(VkDrawIndirectCommand);

      radv_emit_indirect_draw_packets(cmd_buffer, &info);
   }

   radv_after_draw(cmd_buffer);
}

void
radv_CmdExecuteGeneratedCommandsNV(VkCommandBuffer commandBuffer, VkBool32 isPreprocessed,
                                   const VkGeneratedCommandsInfoNV *pGeneratedCommandsInfo)
{
   VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   VK_FROM_HANDLE(radv_indirect_command_layout, layout,
                  pGeneratedCommandsInfo->indirectCommandsLayout);
   VK_FROM_HANDLE(radv_buffer, prep_buffer, pGeneratedCommandsInfo->preprocessBuffer);

   /* The only actions that can be done are draws, so skip on other queues. */
   if (cmd_buffer->qf != RADV_QUEUE_GENERAL)
      return;

   /* Secondary command buffers are needed for the full extension but can't use
    * PKT3_INDIRECT_BUFFER_CIK.
    */
   assert(cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_PRIMARY);

   radv_prepare_dgc(cmd_buffer, pGeneratedCommandsInfo);

   struct radv_draw_info info;

   info.count = pGeneratedCommandsInfo->sequencesCount;
   info.indirect = prep_buffer; /* We're not really going to use it this way, but it's a good
                                   signal that this is not a direct draw. */
   info.indirect_offset = 0;
   info.stride = 0;
   info.strmout_buffer = NULL;
   info.count_buffer = NULL;
   info.indexed = layout->indexed;
   info.instance_count = 0;

   if (!radv_before_draw(cmd_buffer, &info, 1))
      return;

   uint32_t cmdbuf_size = radv_get_indirect_cmdbuf_size(pGeneratedCommandsInfo);
   uint64_t va = radv_buffer_get_va(prep_buffer->bo) + prep_buffer->offset +
                 pGeneratedCommandsInfo->preprocessOffset;
   const uint32_t view_mask = cmd_buffer->state.subpass->view_mask;

   radeon_emit(cmd_buffer->cs, PKT3(PKT3_PFP_SYNC_ME, 0, cmd_buffer->state.predicating));
   radeon_emit(cmd_buffer->cs, 0);

   if (!view_mask) {
      radeon_emit(cmd_buffer->cs, PKT3(PKT3_INDIRECT_BUFFER_CIK, 2, 0));
      radeon_emit(cmd_buffer->cs, va);
      radeon_emit(cmd_buffer->cs, va >> 32);
      radeon_emit(cmd_buffer->cs, cmdbuf_size >> 2);
   } else {
      u_foreach_bit (view, view_mask) {
         radv_emit_view_index(cmd_buffer, view);

         radeon_emit(cmd_buffer->cs, PKT3(PKT3_INDIRECT_BUFFER_CIK, 2, 0));
         radeon_emit(cmd_buffer->cs, va);
         radeon_emit(cmd_buffer->cs, va >> 32);
         radeon_emit(cmd_buffer->cs, cmdbuf_size >> 2);
      }
   }

   if (layout->binds_index_buffer) {
      cmd_buffer->state.last_index_type = -1;
      cmd_buffer->state.dirty |= RADV_CMD_DIRTY_INDEX_BUFFER;
   }

   if (layout->bind_vbo_mask)
      cmd_buffer->state.dirty |= RADV_CMD_DIRTY_VERTEX_BUFFER;

   if (layout->binds_state)
      cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_FRONT_FACE;

   cmd_buffer->push_constant_stages |= ~0;

   cmd_buffer->state.last_index_type = -1;
   cmd_buffer->state.last_num_instances = -1;
   cmd_buffer->state.last_vertex_offset = -1;
   cmd_buffer->state.last_first_instance = -1;
   cmd_buffer->state.last_drawid = -1;

   radv_after_draw(cmd_buffer);
}

8379 struct radv_dispatch_info {
8380    /**
8381     * Determine the layout of the grid (in block units) to be used.
8382     */
8383    uint32_t blocks[3];
8384 
8385    /**
8386     * A starting offset for the grid. Even if unaligned is set, the
8387     * offset must still be aligned to the workgroup size.
8388     */
8389    uint32_t offsets[3];
8390    /**
8391     * Whether it's an unaligned compute dispatch.
8392     */
8393    bool unaligned;
8394 
8395    /**
8396     * Indirect compute parameters resource.
8397     */
8398    struct radeon_winsys_bo *indirect;
8399    uint64_t va;
8400 };
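
/* A hedged sketch of how callers below fill this struct, mirroring
 * radv_CmdDispatchBase() and radv_CmdDispatchIndirect():
 *
 *    struct radv_dispatch_info info = {0};
 *    info.blocks[0] = x;   // direct dispatch: grid size in workgroups
 *    info.blocks[1] = y;
 *    info.blocks[2] = z;
 *
 *    // ...or, for an indirect dispatch, the GPU address of the arguments:
 *    info.indirect = buffer->bo;
 *    info.va = radv_buffer_get_va(buffer->bo) + buffer->offset + offset;
 */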
8401 
8402 static void
8403 radv_emit_dispatch_packets(struct radv_cmd_buffer *cmd_buffer,
8404                            struct radv_compute_pipeline *pipeline,
8405                            const struct radv_dispatch_info *info)
8406 {
8407    struct radv_shader *compute_shader = pipeline->base.shaders[MESA_SHADER_COMPUTE];
8408    unsigned dispatch_initiator = cmd_buffer->device->dispatch_initiator;
8409    struct radeon_winsys *ws = cmd_buffer->device->ws;
8410    bool predicating = cmd_buffer->state.predicating;
8411    struct radeon_cmdbuf *cs = cmd_buffer->cs;
8412    struct radv_userdata_info *loc;
8413 
8414    radv_describe_dispatch(cmd_buffer, info->blocks[0], info->blocks[1], info->blocks[2]);
8415 
8416    loc = radv_lookup_user_sgpr(&pipeline->base, MESA_SHADER_COMPUTE, AC_UD_CS_GRID_SIZE);
8417 
8418    ASSERTED unsigned cdw_max = radeon_check_space(ws, cs, 30);
8419 
8420    if (compute_shader->info.wave_size == 32) {
8421       assert(cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX10);
8422       dispatch_initiator |= S_00B800_CS_W32_EN(1);
8423    }
8424 
8425    if (info->va) {
8426       if (info->indirect)
8427          radv_cs_add_buffer(ws, cs, info->indirect);
8428 
8429       if (info->unaligned) {
8430          radeon_set_sh_reg_seq(cs, R_00B81C_COMPUTE_NUM_THREAD_X, 3);
8431          radeon_emit(cs, S_00B81C_NUM_THREAD_FULL(compute_shader->info.cs.block_size[0]));
8432          radeon_emit(cs, S_00B81C_NUM_THREAD_FULL(compute_shader->info.cs.block_size[1]));
8433          radeon_emit(cs, S_00B81C_NUM_THREAD_FULL(compute_shader->info.cs.block_size[2]));
8434 
8435          dispatch_initiator |= S_00B800_USE_THREAD_DIMENSIONS(1);
8436       }
8437 
8438       if (loc->sgpr_idx != -1) {
8439          unsigned reg = R_00B900_COMPUTE_USER_DATA_0 + loc->sgpr_idx * 4;
8440 
8441          if (cmd_buffer->device->load_grid_size_from_user_sgpr) {
8442             assert(cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX10_3);
8443             radeon_emit(cs, PKT3(PKT3_LOAD_SH_REG_INDEX, 3, 0));
8444             radeon_emit(cs, info->va);
8445             radeon_emit(cs, info->va >> 32);
8446             radeon_emit(cs, (reg - SI_SH_REG_OFFSET) >> 2);
8447             radeon_emit(cs, 3);
8448          } else {
8449             radv_emit_shader_pointer(cmd_buffer->device, cmd_buffer->cs, reg, info->va, true);
8450          }
8451       }
8452 
8453       if (radv_cmd_buffer_uses_mec(cmd_buffer)) {
8454          radv_cs_emit_compute_predication(&cmd_buffer->state, cs, cmd_buffer->mec_inv_pred_va,
8455                                           &cmd_buffer->mec_inv_pred_emitted,
8456                                           4 /* DISPATCH_INDIRECT size */);
8457          radeon_emit(cs, PKT3(PKT3_DISPATCH_INDIRECT, 2, 0) | PKT3_SHADER_TYPE_S(1));
8458          radeon_emit(cs, info->va);
8459          radeon_emit(cs, info->va >> 32);
8460          radeon_emit(cs, dispatch_initiator);
8461       } else {
8462          radeon_emit(cs, PKT3(PKT3_SET_BASE, 2, 0) | PKT3_SHADER_TYPE_S(1));
8463          radeon_emit(cs, 1);
8464          radeon_emit(cs, info->va);
8465          radeon_emit(cs, info->va >> 32);
8466 
8467          radeon_emit(cs, PKT3(PKT3_DISPATCH_INDIRECT, 1, predicating) | PKT3_SHADER_TYPE_S(1));
8468          radeon_emit(cs, 0);
8469          radeon_emit(cs, dispatch_initiator);
8470       }
8471    } else {
8472       unsigned blocks[3] = {info->blocks[0], info->blocks[1], info->blocks[2]};
8473       unsigned offsets[3] = {info->offsets[0], info->offsets[1], info->offsets[2]};
8474 
8475       if (info->unaligned) {
8476          unsigned *cs_block_size = compute_shader->info.cs.block_size;
8477          unsigned remainder[3];
8478 
8479          /* If a dimension is already aligned, the partial thread
8480           * group should be an entire block size, not 0.
8481           */
8482          remainder[0] = blocks[0] + cs_block_size[0] - align_u32_npot(blocks[0], cs_block_size[0]);
8483          remainder[1] = blocks[1] + cs_block_size[1] - align_u32_npot(blocks[1], cs_block_size[1]);
8484          remainder[2] = blocks[2] + cs_block_size[2] - align_u32_npot(blocks[2], cs_block_size[2]);
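         /* Worked example: blocks[0] = 100 threads with cs_block_size[0] = 64
          * gives align_u32_npot(100, 64) = 128, so remainder[0] =
          * 100 + 64 - 128 = 36 (the last thread group only runs 36 threads),
          * while the round_up_u32() calls below launch 2 thread groups.
          */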
8485 
8486          blocks[0] = round_up_u32(blocks[0], cs_block_size[0]);
8487          blocks[1] = round_up_u32(blocks[1], cs_block_size[1]);
8488          blocks[2] = round_up_u32(blocks[2], cs_block_size[2]);
8489 
8490          for (unsigned i = 0; i < 3; ++i) {
8491             assert(offsets[i] % cs_block_size[i] == 0);
8492             offsets[i] /= cs_block_size[i];
8493          }
8494 
8495          radeon_set_sh_reg_seq(cs, R_00B81C_COMPUTE_NUM_THREAD_X, 3);
8496          radeon_emit(cs, S_00B81C_NUM_THREAD_FULL(cs_block_size[0]) |
8497                             S_00B81C_NUM_THREAD_PARTIAL(remainder[0]));
8498          radeon_emit(cs, S_00B81C_NUM_THREAD_FULL(cs_block_size[1]) |
8499                             S_00B81C_NUM_THREAD_PARTIAL(remainder[1]));
8500          radeon_emit(cs, S_00B81C_NUM_THREAD_FULL(cs_block_size[2]) |
8501                             S_00B81C_NUM_THREAD_PARTIAL(remainder[2]));
8502 
8503          dispatch_initiator |= S_00B800_PARTIAL_TG_EN(1);
8504       }
8505 
8506       if (loc->sgpr_idx != -1) {
8507          if (cmd_buffer->device->load_grid_size_from_user_sgpr) {
8508             assert(loc->num_sgprs == 3);
8509 
8510             radeon_set_sh_reg_seq(cs, R_00B900_COMPUTE_USER_DATA_0 + loc->sgpr_idx * 4, 3);
8511             radeon_emit(cs, blocks[0]);
8512             radeon_emit(cs, blocks[1]);
8513             radeon_emit(cs, blocks[2]);
8514          } else {
8515             uint32_t offset;
8516             if (!radv_cmd_buffer_upload_data(cmd_buffer, 12, blocks, &offset))
8517                return;
8518 
8519             uint64_t va = radv_buffer_get_va(cmd_buffer->upload.upload_bo) + offset;
8520             radv_emit_shader_pointer(cmd_buffer->device, cmd_buffer->cs,
8521                                      R_00B900_COMPUTE_USER_DATA_0 + loc->sgpr_idx * 4, va, true);
8522          }
8523       }
8524 
8525       if (offsets[0] || offsets[1] || offsets[2]) {
8526          radeon_set_sh_reg_seq(cs, R_00B810_COMPUTE_START_X, 3);
8527          radeon_emit(cs, offsets[0]);
8528          radeon_emit(cs, offsets[1]);
8529          radeon_emit(cs, offsets[2]);
8530 
8531          /* The blocks in the packet are not counts but end values. */
8532          for (unsigned i = 0; i < 3; ++i)
8533             blocks[i] += offsets[i];
8534       } else {
8535          dispatch_initiator |= S_00B800_FORCE_START_AT_000(1);
8536       }
8537 
8538       if (radv_cmd_buffer_uses_mec(cmd_buffer)) {
8539          radv_cs_emit_compute_predication(&cmd_buffer->state, cs, cmd_buffer->mec_inv_pred_va,
8540                                           &cmd_buffer->mec_inv_pred_emitted,
8541                                           5 /* DISPATCH_DIRECT size */);
8542          predicating = false;
8543       }
8544 
8545       radeon_emit(cs, PKT3(PKT3_DISPATCH_DIRECT, 3, predicating) | PKT3_SHADER_TYPE_S(1));
8546       radeon_emit(cs, blocks[0]);
8547       radeon_emit(cs, blocks[1]);
8548       radeon_emit(cs, blocks[2]);
8549       radeon_emit(cs, dispatch_initiator);
8550    }
8551 
8552    assert(cmd_buffer->cs->cdw <= cdw_max);
8553 }
8554 
8555 static void
8556 radv_upload_compute_shader_descriptors(struct radv_cmd_buffer *cmd_buffer,
8557                                        struct radv_compute_pipeline *pipeline,
8558                                        VkPipelineBindPoint bind_point)
8559 {
8560    radv_flush_descriptors(cmd_buffer, VK_SHADER_STAGE_COMPUTE_BIT, &pipeline->base, bind_point);
8561    radv_flush_constants(cmd_buffer,
8562                         bind_point == VK_PIPELINE_BIND_POINT_RAY_TRACING_KHR
8563                            ? RADV_RT_STAGE_BITS
8564                            : VK_SHADER_STAGE_COMPUTE_BIT,
8565                         &pipeline->base, bind_point);
8566 }
8567 
8568 static void
8569 radv_dispatch(struct radv_cmd_buffer *cmd_buffer, const struct radv_dispatch_info *info,
8570               struct radv_compute_pipeline *pipeline, VkPipelineBindPoint bind_point)
8571 {
8572    bool has_prefetch = cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX7;
8573    bool pipeline_is_dirty = pipeline != cmd_buffer->state.emitted_compute_pipeline;
8574 
8575    if (pipeline->cs_regalloc_hang_bug)
8576       cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_PS_PARTIAL_FLUSH |
8577                                       RADV_CMD_FLAG_CS_PARTIAL_FLUSH;
8578 
8579    if (cmd_buffer->state.flush_bits &
8580        (RADV_CMD_FLAG_FLUSH_AND_INV_CB | RADV_CMD_FLAG_FLUSH_AND_INV_DB |
8581         RADV_CMD_FLAG_PS_PARTIAL_FLUSH | RADV_CMD_FLAG_CS_PARTIAL_FLUSH)) {
8582       /* If we have to wait for idle, set all states first, so that
8583        * all SET packets are processed in parallel with previous draw
8584        * calls. Then upload descriptors, set shader pointers, and
8585        * dispatch, and prefetch at the end. This ensures that the
8586        * time the CUs are idle is very short. (there are only SET_SH
8587        * packets between the wait and the draw)
8588        */
8589       radv_emit_compute_pipeline(cmd_buffer, pipeline);
8590       si_emit_cache_flush(cmd_buffer);
8591       /* <-- CUs are idle here --> */
8592 
8593       radv_upload_compute_shader_descriptors(cmd_buffer, pipeline, bind_point);
8594 
8595       radv_emit_dispatch_packets(cmd_buffer, pipeline, info);
8596       /* <-- CUs are busy here --> */
8597 
8598       /* Start prefetches after the dispatch has been started. Both
8599        * will run in parallel, but starting the dispatch first is
8600        * more important.
8601        */
8602       if (has_prefetch && pipeline_is_dirty) {
8603          radv_emit_shader_prefetch(cmd_buffer, pipeline->base.shaders[MESA_SHADER_COMPUTE]);
8604       }
8605    } else {
8606       /* If we don't wait for idle, start prefetches first, then set
8607        * states, and dispatch at the end.
8608        */
8609       si_emit_cache_flush(cmd_buffer);
8610 
8611       if (has_prefetch && pipeline_is_dirty) {
8612          radv_emit_shader_prefetch(cmd_buffer, pipeline->base.shaders[MESA_SHADER_COMPUTE]);
8613       }
8614 
8615       radv_upload_compute_shader_descriptors(cmd_buffer, pipeline, bind_point);
8616 
8617       radv_emit_compute_pipeline(cmd_buffer, pipeline);
8618       radv_emit_dispatch_packets(cmd_buffer, pipeline, info);
8619    }
8620 
8621    if (pipeline_is_dirty) {
8622       /* Raytracing uses compute shaders but has separate bind points and pipelines.
8623        * So if we set compute userdata & shader registers, we should dirty the
8624        * raytracing ones, and the other way around.
8625        *
8626        * We only need to do this when the pipeline is dirty because when we switch between
8627        * the two we always need to switch pipelines.
8628        */
8629       radv_mark_descriptor_sets_dirty(cmd_buffer, bind_point == VK_PIPELINE_BIND_POINT_COMPUTE
8630                                                      ? VK_PIPELINE_BIND_POINT_RAY_TRACING_KHR
8631                                                      : VK_PIPELINE_BIND_POINT_COMPUTE);
8632    }
8633 
8634    if (pipeline->cs_regalloc_hang_bug)
8635       cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_CS_PARTIAL_FLUSH;
8636 
8637    radv_cmd_buffer_after_draw(cmd_buffer, RADV_CMD_FLAG_CS_PARTIAL_FLUSH);
8638 }
8639 
8640 static void
8641 radv_compute_dispatch(struct radv_cmd_buffer *cmd_buffer, const struct radv_dispatch_info *info)
8642 {
8643    radv_dispatch(cmd_buffer, info, cmd_buffer->state.compute_pipeline,
8644                  VK_PIPELINE_BIND_POINT_COMPUTE);
8645 }
8646 
8647 VKAPI_ATTR void VKAPI_CALL
8648 radv_CmdDispatchBase(VkCommandBuffer commandBuffer, uint32_t base_x, uint32_t base_y,
8649                      uint32_t base_z, uint32_t x, uint32_t y, uint32_t z)
8650 {
8651    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
8652    struct radv_dispatch_info info = {0};
8653 
8654    info.blocks[0] = x;
8655    info.blocks[1] = y;
8656    info.blocks[2] = z;
8657 
8658    info.offsets[0] = base_x;
8659    info.offsets[1] = base_y;
8660    info.offsets[2] = base_z;
8661    radv_compute_dispatch(cmd_buffer, &info);
8662 }
8663 
8664 VKAPI_ATTR void VKAPI_CALL
8665 radv_CmdDispatch(VkCommandBuffer commandBuffer, uint32_t x, uint32_t y, uint32_t z)
8666 {
8667    radv_CmdDispatchBase(commandBuffer, 0, 0, 0, x, y, z);
8668 }
8669 
8670 VKAPI_ATTR void VKAPI_CALL
8671 radv_CmdDispatchIndirect(VkCommandBuffer commandBuffer, VkBuffer _buffer, VkDeviceSize offset)
8672 {
8673    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
8674    RADV_FROM_HANDLE(radv_buffer, buffer, _buffer);
8675    struct radv_dispatch_info info = {0};
8676 
8677    info.indirect = buffer->bo;
8678    info.va = radv_buffer_get_va(buffer->bo) + buffer->offset + offset;
8679 
8680    radv_compute_dispatch(cmd_buffer, &info);
8681 }
8682 
8683 void
8684 radv_unaligned_dispatch(struct radv_cmd_buffer *cmd_buffer, uint32_t x, uint32_t y, uint32_t z)
8685 {
8686    struct radv_dispatch_info info = {0};
8687 
8688    info.blocks[0] = x;
8689    info.blocks[1] = y;
8690    info.blocks[2] = z;
8691    info.unaligned = 1;
8692 
8693    radv_compute_dispatch(cmd_buffer, &info);
8694 }
8695 
8696 void
8697 radv_indirect_dispatch(struct radv_cmd_buffer *cmd_buffer, struct radeon_winsys_bo *bo, uint64_t va)
8698 {
8699    struct radv_dispatch_info info = {0};
8700 
8701    info.indirect = bo;
8702    info.va = va;
8703 
8704    radv_compute_dispatch(cmd_buffer, &info);
8705 }
8706 
8707 enum radv_rt_mode {
8708    radv_rt_mode_direct,
8709    radv_rt_mode_indirect,
8710    radv_rt_mode_indirect2,
8711 };
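
/* radv_rt_mode_direct backs vkCmdTraceRaysKHR (launch size known on the CPU),
 * radv_rt_mode_indirect backs vkCmdTraceRaysIndirectKHR (launch size read
 * from a GPU address, SBT still supplied by the CPU), and
 * radv_rt_mode_indirect2 backs vkCmdTraceRaysIndirect2KHR (SBT and launch
 * size both read from one GPU-side VkTraceRaysIndirectCommand2KHR).
 */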
8712 
8713 static void
8714 radv_trace_rays(struct radv_cmd_buffer *cmd_buffer, const VkTraceRaysIndirectCommand2KHR *tables,
8715                 uint64_t indirect_va, enum radv_rt_mode mode)
8716 {
8717    struct radv_compute_pipeline *pipeline = cmd_buffer->state.rt_pipeline;
8718    uint32_t base_reg = pipeline->base.user_data_0[MESA_SHADER_COMPUTE];
8719 
8720    struct radv_dispatch_info info = {0};
8721    info.unaligned = true;
8722 
8723    uint64_t launch_size_va;
8724    uint64_t sbt_va;
8725 
8726    if (mode != radv_rt_mode_indirect2) {
8727       uint32_t upload_size = mode == radv_rt_mode_direct
8728                                 ? sizeof(VkTraceRaysIndirectCommand2KHR)
8729                                 : offsetof(VkTraceRaysIndirectCommand2KHR, width);
8730 
8731       uint32_t offset;
8732       if (!radv_cmd_buffer_upload_data(cmd_buffer, upload_size, tables, &offset))
8733          return;
8734 
8735       uint64_t upload_va = radv_buffer_get_va(cmd_buffer->upload.upload_bo) + offset;
8736 
8737       launch_size_va = (mode == radv_rt_mode_direct)
8738                           ? upload_va + offsetof(VkTraceRaysIndirectCommand2KHR, width)
8739                           : indirect_va;
8740       sbt_va = upload_va;
8741    } else {
8742       launch_size_va = indirect_va + offsetof(VkTraceRaysIndirectCommand2KHR, width);
8743       sbt_va = indirect_va;
8744    }
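
   /* At this point sbt_va points at a VkTraceRaysIndirectCommand2KHR-layout
    * region holding the SBT addresses (either our upload or the app's
    * buffer), and launch_size_va at its width/height/depth words.
    */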
8745 
8746    if (mode == radv_rt_mode_direct) {
8747       info.blocks[0] = tables->width;
8748       info.blocks[1] = tables->height;
8749       info.blocks[2] = tables->depth;
8750    } else
8751       info.va = launch_size_va;
8752 
8753    struct radv_userdata_info *desc_loc =
8754       radv_lookup_user_sgpr(&pipeline->base, MESA_SHADER_COMPUTE, AC_UD_CS_SBT_DESCRIPTORS);
8755    if (desc_loc->sgpr_idx != -1) {
8756       radv_emit_shader_pointer(cmd_buffer->device, cmd_buffer->cs,
8757                                base_reg + desc_loc->sgpr_idx * 4, sbt_va, true);
8758    }
8759 
8760    struct radv_userdata_info *size_loc =
8761       radv_lookup_user_sgpr(&pipeline->base, MESA_SHADER_COMPUTE, AC_UD_CS_RAY_LAUNCH_SIZE_ADDR);
8762    if (size_loc->sgpr_idx != -1) {
8763       radv_emit_shader_pointer(cmd_buffer->device, cmd_buffer->cs,
8764                                base_reg + size_loc->sgpr_idx * 4, launch_size_va, true);
8765    }
8766 
8767    radv_dispatch(cmd_buffer, &info, pipeline, VK_PIPELINE_BIND_POINT_RAY_TRACING_KHR);
8768 }
8769 
8770 VKAPI_ATTR void VKAPI_CALL
8771 radv_CmdTraceRaysKHR(VkCommandBuffer commandBuffer,
8772                      const VkStridedDeviceAddressRegionKHR *pRaygenShaderBindingTable,
8773                      const VkStridedDeviceAddressRegionKHR *pMissShaderBindingTable,
8774                      const VkStridedDeviceAddressRegionKHR *pHitShaderBindingTable,
8775                      const VkStridedDeviceAddressRegionKHR *pCallableShaderBindingTable,
8776                      uint32_t width, uint32_t height, uint32_t depth)
8777 {
8778    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
8779 
8780    VkTraceRaysIndirectCommand2KHR tables = {
8781       .raygenShaderRecordAddress = pRaygenShaderBindingTable->deviceAddress,
8782       .raygenShaderRecordSize = pRaygenShaderBindingTable->size,
8783       .missShaderBindingTableAddress = pMissShaderBindingTable->deviceAddress,
8784       .missShaderBindingTableSize = pMissShaderBindingTable->size,
8785       .missShaderBindingTableStride = pMissShaderBindingTable->stride,
8786       .hitShaderBindingTableAddress = pHitShaderBindingTable->deviceAddress,
8787       .hitShaderBindingTableSize = pHitShaderBindingTable->size,
8788       .hitShaderBindingTableStride = pHitShaderBindingTable->stride,
8789       .callableShaderBindingTableAddress = pCallableShaderBindingTable->deviceAddress,
8790       .callableShaderBindingTableSize = pCallableShaderBindingTable->size,
8791       .callableShaderBindingTableStride = pCallableShaderBindingTable->stride,
8792       .width = width,
8793       .height = height,
8794       .depth = depth,
8795    };
8796 
8797    radv_trace_rays(cmd_buffer, &tables, 0, radv_rt_mode_direct);
8798 }
8799 
8800 VKAPI_ATTR void VKAPI_CALL
8801 radv_CmdTraceRaysIndirectKHR(VkCommandBuffer commandBuffer,
8802                              const VkStridedDeviceAddressRegionKHR *pRaygenShaderBindingTable,
8803                              const VkStridedDeviceAddressRegionKHR *pMissShaderBindingTable,
8804                              const VkStridedDeviceAddressRegionKHR *pHitShaderBindingTable,
8805                              const VkStridedDeviceAddressRegionKHR *pCallableShaderBindingTable,
8806                              VkDeviceAddress indirectDeviceAddress)
8807 {
8808    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
8809 
8810    assert(cmd_buffer->device->use_global_bo_list);
8811 
8812    VkTraceRaysIndirectCommand2KHR tables = {
8813       .raygenShaderRecordAddress = pRaygenShaderBindingTable->deviceAddress,
8814       .raygenShaderRecordSize = pRaygenShaderBindingTable->size,
8815       .missShaderBindingTableAddress = pMissShaderBindingTable->deviceAddress,
8816       .missShaderBindingTableSize = pMissShaderBindingTable->size,
8817       .missShaderBindingTableStride = pMissShaderBindingTable->stride,
8818       .hitShaderBindingTableAddress = pHitShaderBindingTable->deviceAddress,
8819       .hitShaderBindingTableSize = pHitShaderBindingTable->size,
8820       .hitShaderBindingTableStride = pHitShaderBindingTable->stride,
8821       .callableShaderBindingTableAddress = pCallableShaderBindingTable->deviceAddress,
8822       .callableShaderBindingTableSize = pCallableShaderBindingTable->size,
8823       .callableShaderBindingTableStride = pCallableShaderBindingTable->stride,
8824    };
8825 
8826    radv_trace_rays(cmd_buffer, &tables, indirectDeviceAddress, radv_rt_mode_indirect);
8827 }
8828 
8829 VKAPI_ATTR void VKAPI_CALL
8830 radv_CmdTraceRaysIndirect2KHR(VkCommandBuffer commandBuffer, VkDeviceAddress indirectDeviceAddress)
8831 {
8832    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
8833 
8834    assert(cmd_buffer->device->use_global_bo_list);
8835 
8836    radv_trace_rays(cmd_buffer, NULL, indirectDeviceAddress, radv_rt_mode_indirect2);
8837 }
8838 
8839 static void
8840 radv_set_rt_stack_size(struct radv_cmd_buffer *cmd_buffer, uint32_t size)
8841 {
8842    unsigned wave_size = 0;
8843    unsigned scratch_bytes_per_wave = 0;
8844 
8845    if (cmd_buffer->state.rt_pipeline) {
8846       scratch_bytes_per_wave = cmd_buffer->state.rt_pipeline->base.scratch_bytes_per_wave;
8847       wave_size = cmd_buffer->state.rt_pipeline->base.shaders[MESA_SHADER_COMPUTE]->info.wave_size;
8848    }
8849 
8850    /* The hardware register is specified as a multiple of 256 DWORDS. */
8851    scratch_bytes_per_wave += align(size * wave_size, 1024);
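   /* E.g. a stack size of 512 bytes per invocation with wave64 adds
    * align(512 * 64, 1024) = 32768 scratch bytes per wave.
    */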
8852 
8853    cmd_buffer->compute_scratch_size_per_wave_needed =
8854       MAX2(cmd_buffer->compute_scratch_size_per_wave_needed, scratch_bytes_per_wave);
8855 }
8856 
8857 VKAPI_ATTR void VKAPI_CALL
8858 radv_CmdSetRayTracingPipelineStackSizeKHR(VkCommandBuffer commandBuffer, uint32_t size)
8859 {
8860    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
8861 
8862    radv_set_rt_stack_size(cmd_buffer, size);
8863    cmd_buffer->state.rt_stack_size = size;
8864 }
8865 
8866 VKAPI_ATTR void VKAPI_CALL
8867 radv_CmdEndRenderPass2(VkCommandBuffer commandBuffer, const VkSubpassEndInfo *pSubpassEndInfo)
8868 {
8869    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
8870 
8871    radv_mark_noncoherent_rb(cmd_buffer);
8872 
8873    radv_emit_subpass_barrier(cmd_buffer, &cmd_buffer->state.pass->end_barrier);
8874 
8875    radv_cmd_buffer_end_subpass(cmd_buffer);
8876 
8877    vk_free(&cmd_buffer->pool->vk.alloc, cmd_buffer->state.attachments);
8878    vk_free(&cmd_buffer->pool->vk.alloc, cmd_buffer->state.subpass_sample_locs);
8879 
8880    cmd_buffer->state.pass = NULL;
8881    cmd_buffer->state.subpass = NULL;
8882    cmd_buffer->state.attachments = NULL;
8883    cmd_buffer->state.framebuffer = NULL;
8884    cmd_buffer->state.subpass_sample_locs = NULL;
8885 }
8886 
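/* radv_CmdBeginRendering() implements VK_KHR_dynamic_rendering by translating
 * the VkRenderingInfo into a transient VkRenderPass + VkFramebuffer pair and
 * replaying it through radv_CmdBeginRenderPass2(); radv_CmdEndRendering()
 * destroys both objects again. Attachment slots are assigned in order:
 * color (+ resolve) attachments, then depth/stencil (+ resolve), then the
 * fragment shading rate attachment.
 */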
8887 VKAPI_ATTR void VKAPI_CALL
8888 radv_CmdBeginRendering(VkCommandBuffer commandBuffer, const VkRenderingInfo *pRenderingInfo)
8889 {
8890    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
8891    const VkRenderingFragmentShadingRateAttachmentInfoKHR *vrs_info = vk_find_struct_const(
8892       pRenderingInfo->pNext, RENDERING_FRAGMENT_SHADING_RATE_ATTACHMENT_INFO_KHR);
8893    VkResult result;
8894    /* (normal + resolve) for each color attachment, (normal + resolve) for depth/stencil, and a VRS attachment */
8895    VkAttachmentDescription2 att_desc[MAX_RTS * 2 + 3];
8896    VkAttachmentDescriptionStencilLayout ds_stencil_att, ds_stencil_resolve_att;
8897    VkImageView iviews[MAX_RTS * 2 + 3];
8898    VkAttachmentReference2 color_refs[MAX_RTS], color_resolve_refs[MAX_RTS];
8899    VkAttachmentReference2 ds_ref, ds_resolve_ref, vrs_ref;
8900    VkAttachmentReferenceStencilLayout ds_stencil_ref, ds_stencil_resolve_ref;
8901    VkSubpassDescriptionDepthStencilResolve ds_resolve_info;
8902    VkFragmentShadingRateAttachmentInfoKHR vrs_subpass_info;
8903    VkClearValue clear_values[MAX_RTS * 2 + 3];
8904    unsigned att_count = 0;
8905 
8906    VkSubpassDescription2 subpass = {
8907       .sType = VK_STRUCTURE_TYPE_SUBPASS_DESCRIPTION_2,
8908       .pipelineBindPoint = VK_PIPELINE_BIND_POINT_GRAPHICS,
8909       .viewMask = pRenderingInfo->viewMask,
8910       .colorAttachmentCount = pRenderingInfo->colorAttachmentCount,
8911       .pColorAttachments = color_refs,
8912       .pResolveAttachments = color_resolve_refs,
8913    };
8914 
8915    for (unsigned i = 0; i < pRenderingInfo->colorAttachmentCount; ++i) {
8916       color_refs[i] = (VkAttachmentReference2){
8917          .sType = VK_STRUCTURE_TYPE_ATTACHMENT_REFERENCE_2,
8918          .attachment = VK_ATTACHMENT_UNUSED,
8919       };
8920       color_resolve_refs[i] = (VkAttachmentReference2){
8921          .sType = VK_STRUCTURE_TYPE_ATTACHMENT_REFERENCE_2,
8922          .attachment = VK_ATTACHMENT_UNUSED,
8923       };
8924 
8925       if (pRenderingInfo->pColorAttachments[i].imageView == VK_NULL_HANDLE)
8926          continue;
8927 
8928       const VkRenderingAttachmentInfo *info = &pRenderingInfo->pColorAttachments[i];
8929       RADV_FROM_HANDLE(radv_image_view, iview, info->imageView);
8930       color_refs[i] = (VkAttachmentReference2){.sType = VK_STRUCTURE_TYPE_ATTACHMENT_REFERENCE_2,
8931                                                .attachment = att_count,
8932                                                .layout = info->imageLayout,
8933                                                .aspectMask = iview->vk.aspects};
8934 
8935       iviews[att_count] = info->imageView;
8936       clear_values[att_count] = info->clearValue;
8937       VkAttachmentDescription2 *att = att_desc + att_count++;
8938 
8939       memset(att, 0, sizeof(*att));
8940       att->sType = VK_STRUCTURE_TYPE_ATTACHMENT_DESCRIPTION_2;
8941       att->format = iview->vk.format;
8942       att->samples = iview->image->info.samples;
8943       att->loadOp = info->loadOp;
8944       att->storeOp = info->storeOp;
8945       att->initialLayout = info->imageLayout;
8946       att->finalLayout = info->imageLayout;
8947 
8948       if (pRenderingInfo->flags & VK_RENDERING_RESUMING_BIT)
8949          att->loadOp = VK_ATTACHMENT_LOAD_OP_LOAD;
8950 
8951       if (pRenderingInfo->flags & VK_RENDERING_SUSPENDING_BIT)
8952          att->storeOp = VK_ATTACHMENT_STORE_OP_STORE;
8953 
8954       if (info->resolveMode != VK_RESOLVE_MODE_NONE &&
8955           !(pRenderingInfo->flags & VK_RENDERING_SUSPENDING_BIT)) {
8956          RADV_FROM_HANDLE(radv_image_view, resolve_iview, info->resolveImageView);
8957          color_resolve_refs[i] =
8958             (VkAttachmentReference2){.sType = VK_STRUCTURE_TYPE_ATTACHMENT_REFERENCE_2,
8959                                      .attachment = att_count,
8960                                      .layout = info->resolveImageLayout,
8961                                      .aspectMask = resolve_iview->vk.aspects};
8962 
8963          iviews[att_count] = info->resolveImageView;
8964          att = att_desc + att_count++;
8965 
8966          memset(att, 0, sizeof(*att));
8967          att->sType = VK_STRUCTURE_TYPE_ATTACHMENT_DESCRIPTION_2;
8968          att->format = resolve_iview->vk.format;
8969          att->samples = resolve_iview->image->info.samples;
8970          att->loadOp = VK_ATTACHMENT_LOAD_OP_DONT_CARE;
8971          att->storeOp = VK_ATTACHMENT_STORE_OP_STORE;
8972          att->initialLayout = info->resolveImageLayout;
8973          att->finalLayout = info->resolveImageLayout;
8974       }
8975    }
8976 
8977    if (pRenderingInfo->pDepthAttachment || pRenderingInfo->pStencilAttachment) {
8978       const VkRenderingAttachmentInfo *common_info = pRenderingInfo->pDepthAttachment
8979                                                            ? pRenderingInfo->pDepthAttachment
8980                                                            : pRenderingInfo->pStencilAttachment;
8981       RADV_FROM_HANDLE(radv_image_view, iview, common_info->imageView);
8982 
8983       if (common_info->imageView != VK_NULL_HANDLE) {
8984          ds_ref = (VkAttachmentReference2){
8985             .sType = VK_STRUCTURE_TYPE_ATTACHMENT_REFERENCE_2,
8986             .attachment = att_count,
8987             .layout = common_info->imageLayout,
8988             .aspectMask = (pRenderingInfo->pDepthAttachment ? VK_IMAGE_ASPECT_DEPTH_BIT : 0) |
8989                           (pRenderingInfo->pStencilAttachment ? VK_IMAGE_ASPECT_STENCIL_BIT : 0)};
8990          subpass.pDepthStencilAttachment = &ds_ref;
8991 
8992          iviews[att_count] = common_info->imageView;
8993          if (pRenderingInfo->pDepthAttachment)
8994             clear_values[att_count].depthStencil.depth =
8995                pRenderingInfo->pDepthAttachment->clearValue.depthStencil.depth;
8996          if (pRenderingInfo->pStencilAttachment)
8997             clear_values[att_count].depthStencil.stencil =
8998                pRenderingInfo->pStencilAttachment->clearValue.depthStencil.stencil;
8999          VkAttachmentDescription2 *att = att_desc + att_count++;
9000 
9001          memset(att, 0, sizeof(*att));
9002          att->sType = VK_STRUCTURE_TYPE_ATTACHMENT_DESCRIPTION_2;
9003          att->format = iview->vk.format;
9004          att->samples = iview->image->info.samples;
9005 
9006          if (pRenderingInfo->pDepthAttachment) {
9007             att->loadOp = pRenderingInfo->pDepthAttachment->loadOp;
9008             att->storeOp = pRenderingInfo->pDepthAttachment->storeOp;
9009          } else {
9010             att->loadOp = VK_ATTACHMENT_LOAD_OP_LOAD;
9011             att->storeOp = VK_ATTACHMENT_STORE_OP_STORE;
9012          }
9013 
9014          if (pRenderingInfo->pStencilAttachment) {
9015             att->stencilLoadOp = pRenderingInfo->pStencilAttachment->loadOp;
9016             att->stencilStoreOp = pRenderingInfo->pStencilAttachment->storeOp;
9017          } else {
9018             att->stencilLoadOp = VK_ATTACHMENT_LOAD_OP_LOAD;
9019             att->stencilStoreOp = VK_ATTACHMENT_STORE_OP_STORE;
9020          }
9021 
9022          if (pRenderingInfo->flags & VK_RENDERING_RESUMING_BIT) {
9023             att->loadOp = VK_ATTACHMENT_LOAD_OP_LOAD;
9024             att->stencilLoadOp = VK_ATTACHMENT_LOAD_OP_LOAD;
9025          }
9026 
9027          if (pRenderingInfo->flags & VK_RENDERING_SUSPENDING_BIT) {
9028             att->storeOp = VK_ATTACHMENT_STORE_OP_STORE;
9029             att->stencilStoreOp = VK_ATTACHMENT_STORE_OP_STORE;
9030          }
9031 
9032          att->initialLayout = common_info->imageLayout;
9033          att->finalLayout = common_info->imageLayout;
9034 
9035          if (pRenderingInfo->pDepthAttachment && pRenderingInfo->pStencilAttachment) {
9036             ds_ref.pNext = &ds_stencil_ref;
9037             ds_stencil_ref = (VkAttachmentReferenceStencilLayout){
9038                .sType = VK_STRUCTURE_TYPE_ATTACHMENT_REFERENCE_STENCIL_LAYOUT,
9039                .stencilLayout = pRenderingInfo->pStencilAttachment->imageLayout};
9040 
9041             att->pNext = &ds_stencil_att;
9042             ds_stencil_att = (VkAttachmentDescriptionStencilLayout){
9043                .sType = VK_STRUCTURE_TYPE_ATTACHMENT_DESCRIPTION_STENCIL_LAYOUT,
9044                .stencilInitialLayout = pRenderingInfo->pStencilAttachment->imageLayout,
9045                .stencilFinalLayout = pRenderingInfo->pStencilAttachment->imageLayout,
9046             };
9047          }
9048 
9049          if (((pRenderingInfo->pDepthAttachment &&
9050               pRenderingInfo->pDepthAttachment->resolveMode != VK_RESOLVE_MODE_NONE) ||
9051              (pRenderingInfo->pStencilAttachment &&
9052               pRenderingInfo->pStencilAttachment->resolveMode != VK_RESOLVE_MODE_NONE)) &&
9053              !(pRenderingInfo->flags & VK_RENDERING_SUSPENDING_BIT)) {
9054             RADV_FROM_HANDLE(radv_image_view, resolve_iview, common_info->resolveImageView);
9055             ds_resolve_ref =
9056                (VkAttachmentReference2){.sType = VK_STRUCTURE_TYPE_ATTACHMENT_REFERENCE_2,
9057                                         .attachment = att_count,
9058                                         .layout = common_info->resolveImageLayout,
9059                                         .aspectMask = resolve_iview->vk.aspects};
9060 
9061             iviews[att_count] = common_info->resolveImageView;
9062             att = att_desc + att_count++;
9063 
9064             memset(att, 0, sizeof(*att));
9065             att->sType = VK_STRUCTURE_TYPE_ATTACHMENT_DESCRIPTION_2;
9066             att->format = resolve_iview->vk.format;
9067             att->samples = resolve_iview->image->info.samples;
9068             att->loadOp = VK_ATTACHMENT_LOAD_OP_DONT_CARE;
9069             att->storeOp = VK_ATTACHMENT_STORE_OP_STORE;
9070             att->initialLayout = common_info->resolveImageLayout;
9071             att->finalLayout = common_info->resolveImageLayout;
9072 
9073             ds_resolve_info = (VkSubpassDescriptionDepthStencilResolve){
9074                .sType = VK_STRUCTURE_TYPE_SUBPASS_DESCRIPTION_DEPTH_STENCIL_RESOLVE,
9075                .pNext = subpass.pNext,
9076                .depthResolveMode =
9077                   (pRenderingInfo->pDepthAttachment &&
9078                    pRenderingInfo->pDepthAttachment->resolveMode != VK_RESOLVE_MODE_NONE)
9079                      ? pRenderingInfo->pDepthAttachment->resolveMode
9080                      : VK_RESOLVE_MODE_NONE,
9081                .stencilResolveMode =
9082                   (pRenderingInfo->pStencilAttachment &&
9083                    pRenderingInfo->pStencilAttachment->resolveMode != VK_RESOLVE_MODE_NONE)
9084                      ? pRenderingInfo->pStencilAttachment->resolveMode
9085                      : VK_RESOLVE_MODE_NONE,
9086                .pDepthStencilResolveAttachment = &ds_resolve_ref};
9087             subpass.pNext = &ds_resolve_info;
9088 
9089             if (pRenderingInfo->pDepthAttachment && pRenderingInfo->pStencilAttachment &&
9090                 pRenderingInfo->pDepthAttachment->resolveMode != VK_RESOLVE_MODE_NONE &&
9091                 pRenderingInfo->pStencilAttachment->resolveMode != VK_RESOLVE_MODE_NONE) {
9092                ds_resolve_ref.pNext = &ds_stencil_resolve_ref;
9093                ds_stencil_resolve_ref = (VkAttachmentReferenceStencilLayout){
9094                   .sType = VK_STRUCTURE_TYPE_ATTACHMENT_REFERENCE_STENCIL_LAYOUT,
9095                   .stencilLayout = pRenderingInfo->pStencilAttachment->resolveImageLayout};
9096 
9097                att->pNext = &ds_stencil_resolve_att;
9098                ds_stencil_resolve_att = (VkAttachmentDescriptionStencilLayout){
9099                   .sType = VK_STRUCTURE_TYPE_ATTACHMENT_DESCRIPTION_STENCIL_LAYOUT,
9100                   .stencilInitialLayout = pRenderingInfo->pStencilAttachment->resolveImageLayout,
9101                   .stencilFinalLayout = pRenderingInfo->pStencilAttachment->resolveImageLayout,
9102                };
9103             }
9104          }
9105       }
9106    }
9107 
9108    if (vrs_info && vrs_info->imageView) {
9109       RADV_FROM_HANDLE(radv_image_view, iview, vrs_info->imageView);
9110       vrs_ref = (VkAttachmentReference2){.sType = VK_STRUCTURE_TYPE_ATTACHMENT_REFERENCE_2,
9111                                          .attachment = att_count,
9112                                          .layout = vrs_info->imageLayout,
9113                                          .aspectMask = iview->vk.aspects};
9114 
9115       iviews[att_count] = vrs_info->imageView;
9116       VkAttachmentDescription2 *att = att_desc + att_count++;
9117 
9118       memset(att, 0, sizeof(*att));
9119       att->sType = VK_STRUCTURE_TYPE_ATTACHMENT_DESCRIPTION_2;
9120       att->format = iview->vk.format;
9121       att->samples = iview->image->info.samples;
9122       att->loadOp = VK_ATTACHMENT_LOAD_OP_LOAD;
9123       att->storeOp = VK_ATTACHMENT_STORE_OP_DONT_CARE;
9124       att->initialLayout = vrs_info->imageLayout;
9125       att->finalLayout = vrs_info->imageLayout;
9126 
9127       vrs_subpass_info = (VkFragmentShadingRateAttachmentInfoKHR){
9128          .sType = VK_STRUCTURE_TYPE_FRAGMENT_SHADING_RATE_ATTACHMENT_INFO_KHR,
9129          .pNext = subpass.pNext,
9130          .pFragmentShadingRateAttachment = &vrs_ref,
9131          .shadingRateAttachmentTexelSize = vrs_info->shadingRateAttachmentTexelSize,
9132       };
9133       subpass.pNext = &vrs_subpass_info;
9134    }
9135 
9136    VkRenderPassCreateInfo2 rp_create_info = {
9137       .sType = VK_STRUCTURE_TYPE_RENDER_PASS_CREATE_INFO_2,
9138       .attachmentCount = att_count,
9139       .pAttachments = att_desc,
9140       .subpassCount = 1,
9141       .pSubpasses = &subpass,
9142    };
9143 
9144    VkRenderPass rp;
9145    result =
9146       radv_CreateRenderPass2(radv_device_to_handle(cmd_buffer->device), &rp_create_info, NULL, &rp);
9147    if (result != VK_SUCCESS) {
9148       cmd_buffer->record_result = result;
9149       return;
9150    }
9151 
9152    unsigned w = pRenderingInfo->renderArea.offset.x + pRenderingInfo->renderArea.extent.width;
9153    unsigned h = pRenderingInfo->renderArea.offset.y + pRenderingInfo->renderArea.extent.height;
9154    for (unsigned i = 0; i < att_count; ++i) {
9155       RADV_FROM_HANDLE(radv_image_view, iview, iviews[i]);
9156 
9157       if (vrs_info && vrs_info->imageView == iviews[i])
9158          continue;
9159 
9160       w = MIN2(w, iview->extent.width);
9161       h = MIN2(h, iview->extent.height);
9162    }
9163    VkFramebufferCreateInfo fb_create_info = {
9164       .sType = VK_STRUCTURE_TYPE_FRAMEBUFFER_CREATE_INFO,
9165       .renderPass = rp,
9166       .attachmentCount = att_count,
9167       .pAttachments = iviews,
9168       .width = w,
9169       .height = h,
9170       .layers = pRenderingInfo->layerCount,
9171    };
9172 
9173    VkFramebuffer fb;
9174    result =
9175       vk_common_CreateFramebuffer(radv_device_to_handle(cmd_buffer->device), &fb_create_info, NULL, &fb);
9176    if (result != VK_SUCCESS) {
9177       radv_DestroyRenderPass(radv_device_to_handle(cmd_buffer->device), rp, NULL);
9178       cmd_buffer->record_result = result;
9179       return;
9180    }
9181 
9182    VkRenderPassBeginInfo begin_info = {.sType = VK_STRUCTURE_TYPE_RENDER_PASS_BEGIN_INFO,
9183                                        .renderPass = rp,
9184                                        .framebuffer = fb,
9185                                        .renderArea = pRenderingInfo->renderArea,
9186                                        .clearValueCount = att_count,
9187                                        .pClearValues = clear_values};
9188 
9189    const VkSubpassBeginInfo pass_begin_info = {
9190       .sType = VK_STRUCTURE_TYPE_SUBPASS_BEGIN_INFO,
9191       .contents = (pRenderingInfo->flags & VK_RENDERING_CONTENTS_SECONDARY_COMMAND_BUFFERS_BIT)
9192                      ? VK_SUBPASS_CONTENTS_SECONDARY_COMMAND_BUFFERS
9193                      : VK_SUBPASS_CONTENTS_INLINE,
9194    };
9195 
9196    radv_CmdBeginRenderPass2(commandBuffer, &begin_info, &pass_begin_info);
9197 }
9198 
9199 VKAPI_ATTR void VKAPI_CALL
9200 radv_CmdEndRendering(VkCommandBuffer commandBuffer)
9201 {
9202    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
9203    struct radv_render_pass *pass = cmd_buffer->state.pass;
9204    struct vk_framebuffer *framebuffer = cmd_buffer->state.framebuffer;
9205 
9206    radv_CmdEndRenderPass2(commandBuffer, NULL);
9207 
9208    vk_common_DestroyFramebuffer(radv_device_to_handle(cmd_buffer->device),
9209                                 vk_framebuffer_to_handle(framebuffer), NULL);
9210    radv_DestroyRenderPass(radv_device_to_handle(cmd_buffer->device),
9211                           radv_render_pass_to_handle(pass), NULL);
9212 }
9213 
9214 /*
9215  * For HTILE we have the following interesting clear words:
9216  *   0xfffff30f: Uncompressed, full depth range, for depth+stencil HTILE
9217  *   0xfffc000f: Uncompressed, full depth range, for depth only HTILE.
9218  *   0xfffffff0: Clear depth to 1.0
9219  *   0x00000000: Clear depth to 0.0
9220  */
9221 static void
9222 radv_initialize_htile(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image,
9223                       const VkImageSubresourceRange *range)
9224 {
9225    struct radv_cmd_state *state = &cmd_buffer->state;
9226    uint32_t htile_value = radv_get_htile_initial_value(cmd_buffer->device, image);
9227    VkClearDepthStencilValue value = {0};
9228    struct radv_barrier_data barrier = {0};
9229 
9230    barrier.layout_transitions.init_mask_ram = 1;
9231    radv_describe_layout_transition(cmd_buffer, &barrier);
9232 
9233    /* When transitioning from the LAYOUT_UNDEFINED layout, not everyone is
9234     * consistent in considering previous rendering work for WAW hazards. */
9235    state->flush_bits |=
9236       radv_src_access_flush(cmd_buffer, VK_ACCESS_2_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT, image);
9237 
9238    if (image->planes[0].surface.has_stencil &&
9239        !(range->aspectMask == (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT))) {
9240       /* Flush caches before performing a separate aspect initialization because it's a
9241        * read-modify-write operation.
9242        */
9243       state->flush_bits |= radv_dst_access_flush(cmd_buffer, VK_ACCESS_2_SHADER_READ_BIT, image);
9244    }
9245 
9246    state->flush_bits |= radv_clear_htile(cmd_buffer, image, range, htile_value);
9247 
9248    radv_set_ds_clear_metadata(cmd_buffer, image, range, value, range->aspectMask);
9249 
9250    if (radv_image_is_tc_compat_htile(image) && (range->aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT)) {
9251       /* Initialize the TC-compat metadata value to 0 because by
9252        * default DB_Z_INFO.RANGE_PRECISION is set to 1, and we only
9253        * have to conditionally update its value when performing a
9254        * fast depth clear.
9255        */
9256       radv_set_tc_compat_zrange_metadata(cmd_buffer, image, range, 0);
9257    }
9258 }
9259 
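/* Summary of the HTILE transitions handled below: UNDEFINED -> anything and
 * uncompressed -> compressed both (re)initialize HTILE, while compressed ->
 * uncompressed decompresses via radv_expand_depth_stencil(), with DB flushes
 * on both sides of the expand.
 */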
9260 static void
9261 radv_handle_depth_image_transition(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image,
9262                                    VkImageLayout src_layout, bool src_render_loop,
9263                                    VkImageLayout dst_layout, bool dst_render_loop,
9264                                    unsigned src_queue_mask, unsigned dst_queue_mask,
9265                                    const VkImageSubresourceRange *range,
9266                                    struct radv_sample_locations_state *sample_locs)
9267 {
9268    struct radv_device *device = cmd_buffer->device;
9269 
9270    if (!radv_htile_enabled(image, range->baseMipLevel))
9271       return;
9272 
9273    if (src_layout == VK_IMAGE_LAYOUT_UNDEFINED) {
9274       radv_initialize_htile(cmd_buffer, image, range);
9275    } else if (!radv_layout_is_htile_compressed(device, image, src_layout, src_render_loop,
9276                                                src_queue_mask) &&
9277               radv_layout_is_htile_compressed(device, image, dst_layout, dst_render_loop,
9278                                               dst_queue_mask)) {
9279       radv_initialize_htile(cmd_buffer, image, range);
9280    } else if (radv_layout_is_htile_compressed(device, image, src_layout, src_render_loop,
9281                                               src_queue_mask) &&
9282               !radv_layout_is_htile_compressed(device, image, dst_layout, dst_render_loop,
9283                                                dst_queue_mask)) {
9284       cmd_buffer->state.flush_bits |=
9285          RADV_CMD_FLAG_FLUSH_AND_INV_DB | RADV_CMD_FLAG_FLUSH_AND_INV_DB_META;
9286 
9287       radv_expand_depth_stencil(cmd_buffer, image, range, sample_locs);
9288 
9289       cmd_buffer->state.flush_bits |=
9290          RADV_CMD_FLAG_FLUSH_AND_INV_DB | RADV_CMD_FLAG_FLUSH_AND_INV_DB_META;
9291    }
9292 }
9293 
9294 static uint32_t
9295 radv_init_cmask(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image,
9296                 const VkImageSubresourceRange *range, uint32_t value)
9297 {
9298    struct radv_barrier_data barrier = {0};
9299 
9300    barrier.layout_transitions.init_mask_ram = 1;
9301    radv_describe_layout_transition(cmd_buffer, &barrier);
9302 
9303    return radv_clear_cmask(cmd_buffer, image, range, value);
9304 }
9305 
9306 uint32_t
9307 radv_init_fmask(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image,
9308                 const VkImageSubresourceRange *range)
9309 {
9310    static const uint32_t fmask_clear_values[4] = {0x00000000, 0x02020202, 0xE4E4E4E4, 0x76543210};
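   /* These values appear to encode the identity sample->fragment mapping in
    * log2(samples)-bit fields: for 4x MSAA each byte is 0xE4 = 0b11100100
    * (samples 3..0 -> fragments 3..0), and for 8x each dword is the nibble
    * sequence 7..0.
    */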
9311    uint32_t log2_samples = util_logbase2(image->info.samples);
9312    uint32_t value = fmask_clear_values[log2_samples];
9313    struct radv_barrier_data barrier = {0};
9314 
9315    barrier.layout_transitions.init_mask_ram = 1;
9316    radv_describe_layout_transition(cmd_buffer, &barrier);
9317 
9318    return radv_clear_fmask(cmd_buffer, image, range, value);
9319 }
9320 
9321 uint32_t
9322 radv_init_dcc(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image,
9323               const VkImageSubresourceRange *range, uint32_t value)
9324 {
9325    struct radv_barrier_data barrier = {0};
9326    uint32_t flush_bits = 0;
9327    unsigned size = 0;
9328 
9329    barrier.layout_transitions.init_mask_ram = 1;
9330    radv_describe_layout_transition(cmd_buffer, &barrier);
9331 
9332    flush_bits |= radv_clear_dcc(cmd_buffer, image, range, value);
9333 
9334    if (cmd_buffer->device->physical_device->rad_info.gfx_level == GFX8) {
9335       /* When DCC is enabled with mipmaps, some levels might not
9336        * support fast clears and we have to initialize them as "fully
9337        * expanded".
9338        */
9339       /* Compute the size of all fast clearable DCC levels. */
9340       for (unsigned i = 0; i < image->planes[0].surface.num_meta_levels; i++) {
9341          struct legacy_surf_dcc_level *dcc_level = &image->planes[0].surface.u.legacy.color.dcc_level[i];
9342          unsigned dcc_fast_clear_size =
9343             dcc_level->dcc_slice_fast_clear_size * image->info.array_size;
9344 
9345          if (!dcc_fast_clear_size)
9346             break;
9347 
9348          size = dcc_level->dcc_offset + dcc_fast_clear_size;
9349       }
9350 
9351       /* Initialize the mipmap levels without DCC. */
9352       if (size != image->planes[0].surface.meta_size) {
9353          flush_bits |= radv_fill_buffer(cmd_buffer, image, image->bindings[0].bo,
9354                                         radv_buffer_get_va(image->bindings[0].bo) +
9355                                            image->bindings[0].offset +
9356                                            image->planes[0].surface.meta_offset + size,
9357                                         image->planes[0].surface.meta_size - size, 0xffffffff);
9358       }
9359    }
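   /* Net effect of the GFX8 path above: if only the first N mips support
    * fast clears, `size` ends just past mip N's fast-clearable DCC, and the
    * rest of the DCC surface is filled with 0xffffffff ("fully expanded")
    * so the remaining mips are valid without a fast clear.
    */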
9360 
9361    return flush_bits;
9362 }
9363 
9364 /**
9365  * Initialize DCC/FMASK/CMASK metadata for a color image.
9366  */
9367 static void
9368 radv_init_color_image_metadata(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image,
9369                                VkImageLayout src_layout, bool src_render_loop,
9370                                VkImageLayout dst_layout, bool dst_render_loop,
9371                                unsigned src_queue_mask, unsigned dst_queue_mask,
9372                                const VkImageSubresourceRange *range)
9373 {
9374    uint32_t flush_bits = 0;
9375 
9376    /* When transitioning from the LAYOUT_UNDEFINED layout, not everyone is
9377     * consistent in considering previous rendering work for WAW hazards.
9378     */
9379    cmd_buffer->state.flush_bits |=
9380       radv_src_access_flush(cmd_buffer, VK_ACCESS_2_COLOR_ATTACHMENT_WRITE_BIT, image);
9381 
9382    if (radv_image_has_cmask(image)) {
9383       uint32_t value;
9384 
9385       if (cmd_buffer->device->physical_device->rad_info.gfx_level == GFX9) {
9386          /* TODO: Fix clearing CMASK layers on GFX9. */
9387          if (radv_image_is_tc_compat_cmask(image) ||
9388              (radv_image_has_fmask(image) &&
9389               radv_layout_can_fast_clear(cmd_buffer->device, image, range->baseMipLevel, dst_layout,
9390                                          dst_render_loop, dst_queue_mask))) {
9391             value = 0xccccccccu;
9392          } else {
9393             value = 0xffffffffu;
9394          }
9395       } else {
9396          static const uint32_t cmask_clear_values[4] = {0xffffffff, 0xdddddddd, 0xeeeeeeee, 0xffffffff};
9397          uint32_t log2_samples = util_logbase2(image->info.samples);
9398 
9399          value = cmask_clear_values[log2_samples];
9400       }
9401 
9402       flush_bits |= radv_init_cmask(cmd_buffer, image, range, value);
9403    }
9404 
9405    if (radv_image_has_fmask(image)) {
9406       flush_bits |= radv_init_fmask(cmd_buffer, image, range);
9407    }
9408 
9409    if (radv_dcc_enabled(image, range->baseMipLevel)) {
9410       uint32_t value = 0xffffffffu; /* Fully expanded mode. */
9411 
9412       if (radv_layout_dcc_compressed(cmd_buffer->device, image, range->baseMipLevel,
9413                                      dst_layout, dst_render_loop, dst_queue_mask)) {
9414          value = 0u;
9415       }
9416 
9417       flush_bits |= radv_init_dcc(cmd_buffer, image, range, value);
9418    }
9419 
9420    if (radv_image_has_cmask(image) || radv_dcc_enabled(image, range->baseMipLevel)) {
9421       radv_update_fce_metadata(cmd_buffer, image, range, false);
9422 
9423       uint32_t color_values[2] = {0};
9424       radv_set_color_clear_metadata(cmd_buffer, image, range, color_values);
9425    }
9426 
9427    cmd_buffer->state.flush_bits |= flush_bits;
9428 }
9429 
9430 static void
9431 radv_retile_transition(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image,
9432                        VkImageLayout src_layout, VkImageLayout dst_layout, unsigned dst_queue_mask)
9433 {
9434    /* If the image is read-only, we don't have to retile DCC because it can't change. */
9435    if (!(image->vk.usage & RADV_IMAGE_USAGE_WRITE_BITS))
9436       return;
9437 
9438    if (src_layout != VK_IMAGE_LAYOUT_PRESENT_SRC_KHR &&
9439        (dst_layout == VK_IMAGE_LAYOUT_PRESENT_SRC_KHR ||
9440         (dst_queue_mask & (1u << RADV_QUEUE_FOREIGN))))
9441       radv_retile_dcc(cmd_buffer, image);
9442 }
9443 
9444 static bool
9445 radv_image_need_retile(const struct radv_image *image)
9446 {
9447    return image->planes[0].surface.display_dcc_offset &&
9448           image->planes[0].surface.display_dcc_offset != image->planes[0].surface.meta_offset;
9449 }
9450 
9451 /**
9452  * Handle color image transitions for DCC/FMASK/CMASK.
9453  */
9454 static void
9455 radv_handle_color_image_transition(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image,
9456                                    VkImageLayout src_layout, bool src_render_loop,
9457                                    VkImageLayout dst_layout, bool dst_render_loop,
9458                                    unsigned src_queue_mask, unsigned dst_queue_mask,
9459                                    const VkImageSubresourceRange *range)
9460 {
9461    bool dcc_decompressed = false, fast_clear_flushed = false;
9462 
9463    if (!radv_image_has_cmask(image) && !radv_image_has_fmask(image) &&
9464        !radv_dcc_enabled(image, range->baseMipLevel))
9465       return;
9466 
9467    if (src_layout == VK_IMAGE_LAYOUT_UNDEFINED) {
9468       radv_init_color_image_metadata(cmd_buffer, image, src_layout, src_render_loop, dst_layout,
9469                                      dst_render_loop, src_queue_mask, dst_queue_mask, range);
9470 
9471       if (radv_image_need_retile(image))
9472          radv_retile_transition(cmd_buffer, image, src_layout, dst_layout, dst_queue_mask);
9473       return;
9474    }
9475 
9476    if (radv_dcc_enabled(image, range->baseMipLevel)) {
9477       if (src_layout == VK_IMAGE_LAYOUT_PREINITIALIZED) {
9478          cmd_buffer->state.flush_bits |= radv_init_dcc(cmd_buffer, image, range, 0xffffffffu);
9479       } else if (radv_layout_dcc_compressed(cmd_buffer->device, image, range->baseMipLevel,
9480                                             src_layout, src_render_loop, src_queue_mask) &&
9481                  !radv_layout_dcc_compressed(cmd_buffer->device, image, range->baseMipLevel,
9482                                              dst_layout, dst_render_loop, dst_queue_mask)) {
9483          radv_decompress_dcc(cmd_buffer, image, range);
9484          dcc_decompressed = true;
9485       } else if (radv_layout_can_fast_clear(cmd_buffer->device, image, range->baseMipLevel,
9486                                             src_layout, src_render_loop, src_queue_mask) &&
9487                  !radv_layout_can_fast_clear(cmd_buffer->device, image, range->baseMipLevel,
9488                                              dst_layout, dst_render_loop, dst_queue_mask)) {
9489          radv_fast_clear_flush_image_inplace(cmd_buffer, image, range);
9490          fast_clear_flushed = true;
9491       }
9492 
9493       if (radv_image_need_retile(image))
9494          radv_retile_transition(cmd_buffer, image, src_layout, dst_layout, dst_queue_mask);
9495    } else if (radv_image_has_cmask(image) || radv_image_has_fmask(image)) {
9496       if (radv_layout_can_fast_clear(cmd_buffer->device, image, range->baseMipLevel,
9497                                      src_layout, src_render_loop, src_queue_mask) &&
9498           !radv_layout_can_fast_clear(cmd_buffer->device, image, range->baseMipLevel,
9499                                       dst_layout, dst_render_loop, dst_queue_mask)) {
9500          radv_fast_clear_flush_image_inplace(cmd_buffer, image, range);
9501          fast_clear_flushed = true;
9502       }
9503    }
9504 
9505    /* MSAA color decompress. */
9506    if (radv_image_has_fmask(image) &&
9507        (image->vk.usage & (VK_IMAGE_USAGE_STORAGE_BIT | VK_IMAGE_USAGE_TRANSFER_DST_BIT)) &&
9508        radv_layout_fmask_compressed(cmd_buffer->device, image, src_layout, src_queue_mask) &&
9509        !radv_layout_fmask_compressed(cmd_buffer->device, image, dst_layout, dst_queue_mask)) {
9510       if (radv_dcc_enabled(image, range->baseMipLevel) &&
9511           !radv_image_use_dcc_image_stores(cmd_buffer->device, image) && !dcc_decompressed) {
9512          /* A DCC decompress is required before expanding FMASK
9513           * when DCC stores aren't supported to avoid being in
9514           * a state where DCC is compressed and the main
9515           * surface is uncompressed.
9516           */
9517          radv_decompress_dcc(cmd_buffer, image, range);
9518       } else if (!fast_clear_flushed) {
9519          /* An FMASK decompress is required before expanding
9520           * FMASK.
9521           */
9522          radv_fast_clear_flush_image_inplace(cmd_buffer, image, range);
9523       }
9524 
9525       struct radv_barrier_data barrier = {0};
9526       barrier.layout_transitions.fmask_color_expand = 1;
9527       radv_describe_layout_transition(cmd_buffer, &barrier);
9528 
9529       radv_expand_fmask_image_inplace(cmd_buffer, image, range);
9530    }
9531 }
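
/* Summary of the color transitions handled above:
 *    UNDEFINED source          -> (re)initialize CMASK/FMASK/DCC metadata
 *    DCC compressed -> plain   -> full DCC decompress
 *    fast-clearable -> plain   -> flush fast clears in place (FCE)
 *    FMASK compressed -> plain -> FMASK color expand (MSAA decompress)
 */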
9532 
9533 static void
9534 radv_handle_image_transition(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image,
9535                              VkImageLayout src_layout, bool src_render_loop,
9536                              VkImageLayout dst_layout, bool dst_render_loop, uint32_t src_family_index,
9537                              uint32_t dst_family_index, const VkImageSubresourceRange *range,
9538                              struct radv_sample_locations_state *sample_locs)
9539 {
9540    enum radv_queue_family src_qf = vk_queue_to_radv(cmd_buffer->device->physical_device, src_family_index);
9541    enum radv_queue_family dst_qf = vk_queue_to_radv(cmd_buffer->device->physical_device, dst_family_index);
9542    if (image->exclusive && src_family_index != dst_family_index) {
9543       /* This is an acquire or a release operation and there will be
9544        * a corresponding release/acquire. Do the transition in the
9545        * most flexible queue. */
9546 
9547       assert(src_qf == cmd_buffer->qf ||
9548              dst_qf == cmd_buffer->qf);
9549 
9550       if (src_family_index == VK_QUEUE_FAMILY_EXTERNAL || src_family_index == VK_QUEUE_FAMILY_FOREIGN_EXT)
9551          return;
9552 
9553       if (cmd_buffer->qf == RADV_QUEUE_TRANSFER)
9554          return;
9555 
9556       if (cmd_buffer->qf == RADV_QUEUE_COMPUTE &&
9557           (src_qf == RADV_QUEUE_GENERAL || dst_qf == RADV_QUEUE_GENERAL))
9558          return;
9559    }
9560 
9561    unsigned src_queue_mask =
9562       radv_image_queue_family_mask(image, src_qf, cmd_buffer->qf);
9563    unsigned dst_queue_mask =
9564       radv_image_queue_family_mask(image, dst_qf, cmd_buffer->qf);
9565 
9566    if (src_layout == dst_layout && src_render_loop == dst_render_loop && src_queue_mask == dst_queue_mask)
9567       return;
9568 
9569    if (vk_format_has_depth(image->vk.format)) {
9570       radv_handle_depth_image_transition(cmd_buffer, image, src_layout, src_render_loop, dst_layout,
9571                                          dst_render_loop, src_queue_mask, dst_queue_mask, range,
9572                                          sample_locs);
9573    } else {
9574       radv_handle_color_image_transition(cmd_buffer, image, src_layout, src_render_loop, dst_layout,
9575                                          dst_render_loop, src_queue_mask, dst_queue_mask, range);
9576    }
9577 }
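
/* As a consequence of the checks above, a barrier that keeps the same
 * layout, render-loop state and queue masks (e.g. GENERAL -> GENERAL on
 * one queue family) returns early and performs no metadata work at all;
 * only real transitions reach the depth/color handlers.
 */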
9578 
9579 static void
9580 radv_cp_dma_wait_for_stages(struct radv_cmd_buffer *cmd_buffer, VkPipelineStageFlags2 stage_mask)
9581 {
9582    /* Make sure CP DMA is idle because the driver might have performed a DMA operation for copying a
9583     * buffer (or a MSAA image using FMASK). Note that updating a buffer is considered a clear
9584     * operation but it might also use a CP DMA copy in some rare situations. Other operations using
9585     * a CP DMA clear are implicitly synchronized (see CP_DMA_SYNC).
9586     */
9587    if (stage_mask & (VK_PIPELINE_STAGE_2_COPY_BIT | VK_PIPELINE_STAGE_2_CLEAR_BIT |
9588                      VK_PIPELINE_STAGE_2_ALL_TRANSFER_BIT | VK_PIPELINE_STAGE_2_BOTTOM_OF_PIPE_BIT |
9589                      VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT))
9590       si_cp_dma_wait_for_idle(cmd_buffer);
9591 }
9592 
9593 static void
9594 radv_barrier(struct radv_cmd_buffer *cmd_buffer, const VkDependencyInfo *dep_info,
9595              enum rgp_barrier_reason reason)
9596 {
9597    enum radv_cmd_flush_bits src_flush_bits = 0;
9598    enum radv_cmd_flush_bits dst_flush_bits = 0;
9599    VkPipelineStageFlags2 src_stage_mask = 0;
9600    VkPipelineStageFlags2 dst_stage_mask = 0;
9601 
9602    if (cmd_buffer->state.subpass)
9603       radv_mark_noncoherent_rb(cmd_buffer);
9604 
9605    radv_describe_barrier_start(cmd_buffer, reason);
9606 
9607    for (uint32_t i = 0; i < dep_info->memoryBarrierCount; i++) {
9608       src_stage_mask |= dep_info->pMemoryBarriers[i].srcStageMask;
9609       src_flush_bits |=
9610          radv_src_access_flush(cmd_buffer, dep_info->pMemoryBarriers[i].srcAccessMask, NULL);
9611       dst_stage_mask |= dep_info->pMemoryBarriers[i].dstStageMask;
9612       dst_flush_bits |=
9613          radv_dst_access_flush(cmd_buffer, dep_info->pMemoryBarriers[i].dstAccessMask, NULL);
9614    }
9615 
9616    for (uint32_t i = 0; i < dep_info->bufferMemoryBarrierCount; i++) {
9617       src_stage_mask |= dep_info->pBufferMemoryBarriers[i].srcStageMask;
9618       src_flush_bits |=
9619          radv_src_access_flush(cmd_buffer, dep_info->pBufferMemoryBarriers[i].srcAccessMask, NULL);
9620       dst_stage_mask |= dep_info->pBufferMemoryBarriers[i].dstStageMask;
9621       dst_flush_bits |=
9622          radv_dst_access_flush(cmd_buffer, dep_info->pBufferMemoryBarriers[i].dstAccessMask, NULL);
9623    }
9624 
9625    for (uint32_t i = 0; i < dep_info->imageMemoryBarrierCount; i++) {
9626       RADV_FROM_HANDLE(radv_image, image, dep_info->pImageMemoryBarriers[i].image);
9627 
9628       src_stage_mask |= dep_info->pImageMemoryBarriers[i].srcStageMask;
9629       src_flush_bits |=
9630          radv_src_access_flush(cmd_buffer, dep_info->pImageMemoryBarriers[i].srcAccessMask, image);
9631       dst_stage_mask |= dep_info->pImageMemoryBarriers[i].dstStageMask;
9632       dst_flush_bits |=
9633          radv_dst_access_flush(cmd_buffer, dep_info->pImageMemoryBarriers[i].dstAccessMask, image);
9634    }
9635 
9636    /* The Vulkan spec 1.1.98 says:
9637     *
9638     * "An execution dependency with only
9639     *  VK_PIPELINE_STAGE_2_BOTTOM_OF_PIPE_BIT in the destination stage mask
9640     *  will only prevent that stage from executing in subsequently
9641     *  submitted commands. As this stage does not perform any actual
9642     *  execution, this is not observable - in effect, it does not delay
9643     *  processing of subsequent commands. Similarly an execution dependency
9644     *  with only VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT in the source stage mask
9645     *  will effectively not wait for any prior commands to complete."
9646     */
9647    if (dst_stage_mask != VK_PIPELINE_STAGE_2_BOTTOM_OF_PIPE_BIT)
9648       radv_stage_flush(cmd_buffer, src_stage_mask);
9649    cmd_buffer->state.flush_bits |= src_flush_bits;
9650 
9651    radv_ace_internal_barrier(cmd_buffer, src_stage_mask, 0);
9652 
9653    for (uint32_t i = 0; i < dep_info->imageMemoryBarrierCount; i++) {
9654       RADV_FROM_HANDLE(radv_image, image, dep_info->pImageMemoryBarriers[i].image);
9655 
9656       const struct VkSampleLocationsInfoEXT *sample_locs_info =
9657          vk_find_struct_const(dep_info->pImageMemoryBarriers[i].pNext, SAMPLE_LOCATIONS_INFO_EXT);
9658       struct radv_sample_locations_state sample_locations;
9659 
9660       if (sample_locs_info) {
9661          assert(image->vk.create_flags & VK_IMAGE_CREATE_SAMPLE_LOCATIONS_COMPATIBLE_DEPTH_BIT_EXT);
9662          sample_locations.per_pixel = sample_locs_info->sampleLocationsPerPixel;
9663          sample_locations.grid_size = sample_locs_info->sampleLocationGridSize;
9664          sample_locations.count = sample_locs_info->sampleLocationsCount;
9665          typed_memcpy(&sample_locations.locations[0], sample_locs_info->pSampleLocations,
9666                       sample_locs_info->sampleLocationsCount);
9667       }
9668 
9669       radv_handle_image_transition(
9670          cmd_buffer, image, dep_info->pImageMemoryBarriers[i].oldLayout,
9671          false, /* Outside of a renderpass we are never in a renderloop */
9672          dep_info->pImageMemoryBarriers[i].newLayout,
9673          false, /* Outside of a renderpass we are never in a renderloop */
9674          dep_info->pImageMemoryBarriers[i].srcQueueFamilyIndex,
9675          dep_info->pImageMemoryBarriers[i].dstQueueFamilyIndex,
9676          &dep_info->pImageMemoryBarriers[i].subresourceRange, sample_locs_info ? &sample_locations : NULL);
9677    }
9678 
9679    radv_ace_internal_barrier(cmd_buffer, 0, dst_stage_mask);
9680    radv_cp_dma_wait_for_stages(cmd_buffer, src_stage_mask);
9681 
9682    cmd_buffer->state.flush_bits |= dst_flush_bits;
9683 
9684    radv_describe_barrier_end(cmd_buffer);
9685 }
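
/* For reference, a typical barrier such as
 *
 *    srcStageMask  = COLOR_ATTACHMENT_OUTPUT, srcAccessMask = COLOR_ATTACHMENT_WRITE
 *    dstStageMask  = FRAGMENT_SHADER,         dstAccessMask = SHADER_READ
 *
 * is lowered here to a stage flush on the source side plus the cache
 * flush/invalidation bits computed by radv_src_access_flush() and
 * radv_dst_access_flush(), with any image layout transitions executed
 * in between.
 */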
9686 
9687 VKAPI_ATTR void VKAPI_CALL
9688 radv_CmdPipelineBarrier2(VkCommandBuffer commandBuffer,
9689                          const VkDependencyInfo *pDependencyInfo)
9690 {
9691    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
9692 
9693    radv_barrier(cmd_buffer, pDependencyInfo, RGP_BARRIER_EXTERNAL_CMD_PIPELINE_BARRIER);
9694 }
9695 
9696 static void
9697 write_event(struct radv_cmd_buffer *cmd_buffer, struct radv_event *event,
9698             VkPipelineStageFlags2 stageMask, unsigned value)
9699 {
9700    struct radeon_cmdbuf *cs = cmd_buffer->cs;
9701    uint64_t va = radv_buffer_get_va(event->bo);
9702 
9703    si_emit_cache_flush(cmd_buffer);
9704 
9705    radv_cs_add_buffer(cmd_buffer->device->ws, cs, event->bo);
9706 
9707    ASSERTED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cs, 28);
9708 
9709    if (stageMask & (VK_PIPELINE_STAGE_2_COPY_BIT |
9710                     VK_PIPELINE_STAGE_2_RESOLVE_BIT |
9711                     VK_PIPELINE_STAGE_2_BLIT_BIT |
9712                     VK_PIPELINE_STAGE_2_CLEAR_BIT)) {
9713       /* Be conservative for now. */
9714       stageMask |= VK_PIPELINE_STAGE_2_ALL_TRANSFER_BIT;
9715    }
9716 
9717    /* Flags that only require a top-of-pipe event. */
9718    VkPipelineStageFlags2 top_of_pipe_flags = VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT;
9719 
9720    /* Flags that only require a post-index-fetch event. */
9721    VkPipelineStageFlags2 post_index_fetch_flags =
9722       top_of_pipe_flags | VK_PIPELINE_STAGE_2_DRAW_INDIRECT_BIT | VK_PIPELINE_STAGE_2_VERTEX_INPUT_BIT;
9723 
9724    /* Flags that only require signaling post PS. */
9725    VkPipelineStageFlags2 post_ps_flags =
9726       post_index_fetch_flags | VK_PIPELINE_STAGE_2_VERTEX_SHADER_BIT |
9727       VK_PIPELINE_STAGE_2_TESSELLATION_CONTROL_SHADER_BIT |
9728       VK_PIPELINE_STAGE_2_TESSELLATION_EVALUATION_SHADER_BIT | VK_PIPELINE_STAGE_2_GEOMETRY_SHADER_BIT |
9729       VK_PIPELINE_STAGE_2_MESH_SHADER_BIT_NV |
9730       VK_PIPELINE_STAGE_2_TRANSFORM_FEEDBACK_BIT_EXT |
9731       VK_PIPELINE_STAGE_2_PRE_RASTERIZATION_SHADERS_BIT |
9732       VK_PIPELINE_STAGE_2_FRAGMENT_SHADING_RATE_ATTACHMENT_BIT_KHR |
9733       VK_PIPELINE_STAGE_2_EARLY_FRAGMENT_TESTS_BIT | VK_PIPELINE_STAGE_2_FRAGMENT_SHADER_BIT;
9734 
9735    /* Flags that only require signaling post CS. */
9736    VkPipelineStageFlags2 post_cs_flags = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT;
9737 
9738    radv_cp_dma_wait_for_stages(cmd_buffer, stageMask);
9739 
9740    if (!(stageMask & ~top_of_pipe_flags)) {
9741       /* Just need to sync the PFP engine. */
9742       radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 3, 0));
9743       radeon_emit(cs, S_370_DST_SEL(V_370_MEM) | S_370_WR_CONFIRM(1) | S_370_ENGINE_SEL(V_370_PFP));
9744       radeon_emit(cs, va);
9745       radeon_emit(cs, va >> 32);
9746       radeon_emit(cs, value);
9747    } else if (!(stageMask & ~post_index_fetch_flags)) {
9748       /* Sync ME because PFP reads index and indirect buffers. */
9749       radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 3, 0));
9750       radeon_emit(cs, S_370_DST_SEL(V_370_MEM) | S_370_WR_CONFIRM(1) | S_370_ENGINE_SEL(V_370_ME));
9751       radeon_emit(cs, va);
9752       radeon_emit(cs, va >> 32);
9753       radeon_emit(cs, value);
9754    } else {
9755       unsigned event_type;
9756 
9757       if (!(stageMask & ~post_ps_flags)) {
9758          /* Sync previous fragment shaders. */
9759          event_type = V_028A90_PS_DONE;
9760       } else if (!(stageMask & ~post_cs_flags)) {
9761          /* Sync previous compute shaders. */
9762          event_type = V_028A90_CS_DONE;
9763       } else {
9764          /* Otherwise, sync all prior GPU work. */
9765          event_type = V_028A90_BOTTOM_OF_PIPE_TS;
9766       }
9767 
9768       si_cs_emit_write_event_eop(cs, cmd_buffer->device->physical_device->rad_info.gfx_level,
9769                                  radv_cmd_buffer_uses_mec(cmd_buffer), event_type, 0,
9770                                  EOP_DST_SEL_MEM, EOP_DATA_SEL_VALUE_32BIT, va, value,
9771                                  cmd_buffer->gfx9_eop_bug_va);
9772    }
9773 
9774    assert(cmd_buffer->cs->cdw <= cdw_max);
9775 }
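
/* Classification example: stageMask = VERTEX_SHADER | FRAGMENT_SHADER
 * is fully contained in post_ps_flags, so a single PS_DONE event
 * suffices; adding COMPUTE_SHADER to the mask would match neither
 * post_ps_flags nor post_cs_flags alone and fall back to the heavier
 * BOTTOM_OF_PIPE_TS event.
 */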
9776 
9777 VKAPI_ATTR void VKAPI_CALL
9778 radv_CmdSetEvent2(VkCommandBuffer commandBuffer, VkEvent _event,
9779                   const VkDependencyInfo* pDependencyInfo)
9780 {
9781    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
9782    RADV_FROM_HANDLE(radv_event, event, _event);
9783    VkPipelineStageFlags2 src_stage_mask = 0;
9784 
9785    for (uint32_t i = 0; i < pDependencyInfo->memoryBarrierCount; i++)
9786       src_stage_mask |= pDependencyInfo->pMemoryBarriers[i].srcStageMask;
9787    for (uint32_t i = 0; i < pDependencyInfo->bufferMemoryBarrierCount; i++)
9788       src_stage_mask |= pDependencyInfo->pBufferMemoryBarriers[i].srcStageMask;
9789    for (uint32_t i = 0; i < pDependencyInfo->imageMemoryBarrierCount; i++)
9790       src_stage_mask |= pDependencyInfo->pImageMemoryBarriers[i].srcStageMask;
9791 
9792    write_event(cmd_buffer, event, src_stage_mask, 1);
9793 }
9794 
9795 VKAPI_ATTR void VKAPI_CALL
9796 radv_CmdResetEvent2(VkCommandBuffer commandBuffer, VkEvent _event,
9797                     VkPipelineStageFlags2 stageMask)
9798 {
9799    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
9800    RADV_FROM_HANDLE(radv_event, event, _event);
9801 
9802    write_event(cmd_buffer, event, stageMask, 0);
9803 }
9804 
9805 VKAPI_ATTR void VKAPI_CALL
9806 radv_CmdWaitEvents2(VkCommandBuffer commandBuffer, uint32_t eventCount, const VkEvent *pEvents,
9807                     const VkDependencyInfo* pDependencyInfos)
9808 {
9809    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
9810    struct radeon_cmdbuf *cs = cmd_buffer->cs;
9811 
9812    for (unsigned i = 0; i < eventCount; ++i) {
9813       RADV_FROM_HANDLE(radv_event, event, pEvents[i]);
9814       uint64_t va = radv_buffer_get_va(event->bo);
9815 
9816       radv_cs_add_buffer(cmd_buffer->device->ws, cs, event->bo);
9817 
9818       ASSERTED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cs, 7);
9819 
9820       radv_cp_wait_mem(cs, WAIT_REG_MEM_EQUAL, va, 1, 0xffffffff);
9821       assert(cmd_buffer->cs->cdw <= cdw_max);
9822    }
9823 
9824    radv_barrier(cmd_buffer, pDependencyInfos, RGP_BARRIER_EXTERNAL_CMD_WAIT_EVENTS);
9825 }
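
/* Each event wait is a WAIT_REG_MEM poll on the event BO until it holds
 * the value 1 written by write_event(), so the CP spins on memory rather
 * than idling the whole pipeline; the radv_barrier() call afterwards
 * applies the cache flushes and layout transitions from the dependency
 * info.
 */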
9826 
9827 VKAPI_ATTR void VKAPI_CALL
9828 radv_CmdSetDeviceMask(VkCommandBuffer commandBuffer, uint32_t deviceMask)
9829 {
9830    /* No-op */
9831 }
9832 
9833 /* VK_EXT_conditional_rendering */
9834 VKAPI_ATTR void VKAPI_CALL
9835 radv_CmdBeginConditionalRenderingEXT(
9836    VkCommandBuffer commandBuffer,
9837    const VkConditionalRenderingBeginInfoEXT *pConditionalRenderingBegin)
9838 {
9839    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
9840    RADV_FROM_HANDLE(radv_buffer, buffer, pConditionalRenderingBegin->buffer);
9841    struct radeon_cmdbuf *cs = cmd_buffer->cs;
9842    unsigned pred_op = PREDICATION_OP_BOOL32;
9843    bool draw_visible = true;
9844    uint64_t va;
9845 
9846    va = radv_buffer_get_va(buffer->bo) + buffer->offset + pConditionalRenderingBegin->offset;
9847 
9848    /* By default, if the 32-bit value at offset in buffer memory is zero,
9849     * then the rendering commands are discarded, otherwise they are
9850     * executed as normal. If the inverted flag is set, all commands are
9851     * discarded if the value is non-zero.
9852     */
9853    if (pConditionalRenderingBegin->flags & VK_CONDITIONAL_RENDERING_INVERTED_BIT_EXT) {
9854       draw_visible = false;
9855    }
9856 
9857    si_emit_cache_flush(cmd_buffer);
9858 
9859    if (cmd_buffer->qf == RADV_QUEUE_GENERAL &&
9860        !cmd_buffer->device->physical_device->rad_info.has_32bit_predication) {
9861       uint64_t pred_value = 0, pred_va;
9862       unsigned pred_offset;
9863 
9864       /* From the Vulkan spec 1.1.107:
9865        *
9866        * "If the 32-bit value at offset in buffer memory is zero,
9867        *  then the rendering commands are discarded, otherwise they
9868        *  are executed as normal. If the value of the predicate in
9869        *  buffer memory changes while conditional rendering is
9870        *  active, the rendering commands may be discarded in an
9871        *  implementation-dependent way. Some implementations may
9872        *  latch the value of the predicate upon beginning conditional
9873        *  rendering while others may read it before every rendering
9874        *  command."
9875        *
9876        * But the AMD hardware treats the predicate as a 64-bit
9877        * value, which means we need a workaround in the driver.
9878        * Luckily, we are not required to honor changes to the
9879        * value while predication is active.
9880        *
9881        * The workaround is as follows:
9882        * 1) allocate a 64-bit value in the upload BO and initialize it
9883        *    to 0
9884        * 2) copy the 32-bit predicate value to the upload BO
9885        * 3) use the new allocated VA address for predication
9886        *
9887        * Based on the conditionalrender demo, it's faster to do the
9888        * COPY_DATA in ME (+ sync PFP) instead of PFP.
9889        */
9890       radv_cmd_buffer_upload_data(cmd_buffer, 8, &pred_value, &pred_offset);
9891 
9892       pred_va = radv_buffer_get_va(cmd_buffer->upload.upload_bo) + pred_offset;
9893 
9894       radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
9895       radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_SRC_MEM) | COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) |
9896                          COPY_DATA_WR_CONFIRM);
9897       radeon_emit(cs, va);
9898       radeon_emit(cs, va >> 32);
9899       radeon_emit(cs, pred_va);
9900       radeon_emit(cs, pred_va >> 32);
9901 
9902       radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0));
9903       radeon_emit(cs, 0);
9904 
9905       va = pred_va;
9906       pred_op = PREDICATION_OP_BOOL64;
9907    }
9908 
9909    /* MEC doesn't support predication, so we emulate it elsewhere. */
9910    if (!radv_cmd_buffer_uses_mec(cmd_buffer)) {
9911       si_emit_set_predication_state(cmd_buffer, draw_visible, pred_op, va);
9912    }
9913 
9914    /* Store conditional rendering user info. */
9915    cmd_buffer->state.predicating = true;
9916    cmd_buffer->state.predication_type = draw_visible;
9917    cmd_buffer->state.predication_op = pred_op;
9918    cmd_buffer->state.predication_va = va;
9919    cmd_buffer->mec_inv_pred_emitted = false;
9920 }
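
/* Resulting memory layout of the predication workaround (the COPY_DATA
 * above moves a single dword, assuming little-endian amdgpu memory):
 *
 *    upload BO: [ 32-bit app predicate | 32 bits of zero ]
 *                 ^ pred_va
 *
 * The upper dword keeps its initial zero, so the 64-bit comparison done
 * by the CP yields the same result as the 32-bit Vulkan semantics.
 */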
9921 
9922 VKAPI_ATTR void VKAPI_CALL
9923 radv_CmdEndConditionalRenderingEXT(VkCommandBuffer commandBuffer)
9924 {
9925    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
9926 
9927    /* MEC doesn't support predication, so there is nothing to emit here. */
9928    if (!radv_cmd_buffer_uses_mec(cmd_buffer)) {
9929       si_emit_set_predication_state(cmd_buffer, false, 0, 0);
9930    }
9931 
9932    /* Reset conditional rendering user info. */
9933    cmd_buffer->state.predicating = false;
9934    cmd_buffer->state.predication_type = -1;
9935    cmd_buffer->state.predication_op = 0;
9936    cmd_buffer->state.predication_va = 0;
9937    cmd_buffer->mec_inv_pred_emitted = false;
9938 }
9939 
9940 /* VK_EXT_transform_feedback */
9941 VKAPI_ATTR void VKAPI_CALL
9942 radv_CmdBindTransformFeedbackBuffersEXT(VkCommandBuffer commandBuffer, uint32_t firstBinding,
9943                                         uint32_t bindingCount, const VkBuffer *pBuffers,
9944                                         const VkDeviceSize *pOffsets, const VkDeviceSize *pSizes)
9945 {
9946    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
9947    struct radv_streamout_binding *sb = cmd_buffer->streamout_bindings;
9948    uint8_t enabled_mask = 0;
9949 
9950    assert(firstBinding + bindingCount <= MAX_SO_BUFFERS);
9951    for (uint32_t i = 0; i < bindingCount; i++) {
9952       uint32_t idx = firstBinding + i;
9953 
9954       sb[idx].buffer = radv_buffer_from_handle(pBuffers[i]);
9955       sb[idx].offset = pOffsets[i];
9956 
9957       if (!pSizes || pSizes[i] == VK_WHOLE_SIZE) {
9958          sb[idx].size = sb[idx].buffer->vk.size - sb[idx].offset;
9959       } else {
9960          sb[idx].size = pSizes[i];
9961       }
9962 
9963       radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, sb[idx].buffer->bo);
9964 
9965       enabled_mask |= 1 << idx;
9966    }
9967 
9968    cmd_buffer->state.streamout.enabled_mask |= enabled_mask;
9969 
9970    cmd_buffer->state.dirty |= RADV_CMD_DIRTY_STREAMOUT_BUFFER;
9971 }
9972 
9973 bool
9974 radv_is_streamout_enabled(struct radv_cmd_buffer *cmd_buffer)
9975 {
9976    struct radv_streamout_state *so = &cmd_buffer->state.streamout;
9977 
9978    /* Streamout must be enabled for the PRIMITIVES_GENERATED query to work. */
9979    return (so->streamout_enabled || cmd_buffer->state.prims_gen_query_enabled) &&
9980           !cmd_buffer->state.suspend_streamout;
9981 }
9982 
9983 void
9984 radv_emit_streamout_enable(struct radv_cmd_buffer *cmd_buffer)
9985 {
9986    struct radv_streamout_state *so = &cmd_buffer->state.streamout;
9987    struct radv_graphics_pipeline *pipeline = cmd_buffer->state.graphics_pipeline;
9988    bool streamout_enabled = radv_is_streamout_enabled(cmd_buffer);
9989    struct radeon_cmdbuf *cs = cmd_buffer->cs;
9990    uint32_t enabled_stream_buffers_mask = 0;
9991 
9992    if (pipeline && pipeline->streamout_shader) {
9993       enabled_stream_buffers_mask = pipeline->streamout_shader->info.so.enabled_stream_buffers_mask;
9994    }
9995 
9996    radeon_set_context_reg_seq(cs, R_028B94_VGT_STRMOUT_CONFIG, 2);
9997    radeon_emit(cs, S_028B94_STREAMOUT_0_EN(streamout_enabled) | S_028B94_RAST_STREAM(0) |
9998                       S_028B94_STREAMOUT_1_EN(streamout_enabled) |
9999                       S_028B94_STREAMOUT_2_EN(streamout_enabled) |
10000                       S_028B94_STREAMOUT_3_EN(streamout_enabled));
10001    radeon_emit(cs, so->hw_enabled_mask & enabled_stream_buffers_mask);
10002 
10003    cmd_buffer->state.context_roll_without_scissor_emitted = true;
10004 }
10005 
10006 static void
10007 radv_set_streamout_enable(struct radv_cmd_buffer *cmd_buffer, bool enable)
10008 {
10009    struct radv_streamout_state *so = &cmd_buffer->state.streamout;
10010    bool old_streamout_enabled = radv_is_streamout_enabled(cmd_buffer);
10011    uint32_t old_hw_enabled_mask = so->hw_enabled_mask;
10012 
10013    so->streamout_enabled = enable;
10014 
10015    so->hw_enabled_mask = so->enabled_mask | (so->enabled_mask << 4) | (so->enabled_mask << 8) |
10016                          (so->enabled_mask << 12);
10017 
10018    if (!cmd_buffer->device->physical_device->use_ngg_streamout &&
10019        ((old_streamout_enabled != radv_is_streamout_enabled(cmd_buffer)) ||
10020         (old_hw_enabled_mask != so->hw_enabled_mask)))
10021       radv_emit_streamout_enable(cmd_buffer);
10022 
10023    if (cmd_buffer->device->physical_device->use_ngg_streamout) {
10024       cmd_buffer->gds_needed = true;
10025       cmd_buffer->gds_oa_needed = true;
10026    }
10027 }
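
/* hw_enabled_mask layout: VGT_STRMOUT_BUFFER_CONFIG has one 4-bit
 * buffer-enable field per vertex stream, so an API-level enabled_mask
 * of 0x3 (buffers 0 and 1) is replicated to 0x3333 to enable those
 * buffers for all four streams.
 */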
10028 
10029 static void
10030 radv_flush_vgt_streamout(struct radv_cmd_buffer *cmd_buffer)
10031 {
10032    struct radeon_cmdbuf *cs = cmd_buffer->cs;
10033    unsigned reg_strmout_cntl;
10034 
10035    /* The register is at different places on different ASICs. */
10036    if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX9) {
10037       reg_strmout_cntl = R_0300FC_CP_STRMOUT_CNTL;
10038       radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 3, 0));
10039       radeon_emit(cs, S_370_DST_SEL(V_370_MEM_MAPPED_REGISTER) | S_370_ENGINE_SEL(V_370_ME));
10040       radeon_emit(cs, R_0300FC_CP_STRMOUT_CNTL >> 2);
10041       radeon_emit(cs, 0);
10042       radeon_emit(cs, 0);
10043    } else if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX7) {
10044       reg_strmout_cntl = R_0300FC_CP_STRMOUT_CNTL;
10045       radeon_set_uconfig_reg(cs, reg_strmout_cntl, 0);
10046    } else {
10047       reg_strmout_cntl = R_0084FC_CP_STRMOUT_CNTL;
10048       radeon_set_config_reg(cs, reg_strmout_cntl, 0);
10049    }
10050 
10051    radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
10052    radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_SO_VGTSTREAMOUT_FLUSH) | EVENT_INDEX(0));
10053 
10054    radeon_emit(cs, PKT3(PKT3_WAIT_REG_MEM, 5, 0));
10055    radeon_emit(cs,
10056                WAIT_REG_MEM_EQUAL); /* wait until the register is equal to the reference value */
10057    radeon_emit(cs, reg_strmout_cntl >> 2); /* register */
10058    radeon_emit(cs, 0);
10059    radeon_emit(cs, S_0084FC_OFFSET_UPDATE_DONE(1)); /* reference value */
10060    radeon_emit(cs, S_0084FC_OFFSET_UPDATE_DONE(1)); /* mask */
10061    radeon_emit(cs, 4);                              /* poll interval */
10062 }
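
/* The handshake above: zero CP_STRMOUT_CNTL, emit the
 * SO_VGTSTREAMOUT_FLUSH event, then poll WAIT_REG_MEM until the CP sets
 * OFFSET_UPDATE_DONE. This guarantees the streamout buffer offsets are
 * up to date before the following STRMOUT_BUFFER_UPDATE packets read or
 * overwrite them.
 */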
10063 
10064 static void
10065 radv_emit_streamout_begin(struct radv_cmd_buffer *cmd_buffer, uint32_t firstCounterBuffer,
10066                           uint32_t counterBufferCount, const VkBuffer *pCounterBuffers,
10067                           const VkDeviceSize *pCounterBufferOffsets)
10068 
10069 {
10070    struct radv_streamout_binding *sb = cmd_buffer->streamout_bindings;
10071    struct radv_streamout_state *so = &cmd_buffer->state.streamout;
10072    struct radv_graphics_pipeline *pipeline = cmd_buffer->state.graphics_pipeline;
10073    struct radv_shader_info *info = &pipeline->streamout_shader->info;
10074    struct radeon_cmdbuf *cs = cmd_buffer->cs;
10075 
10076    radv_flush_vgt_streamout(cmd_buffer);
10077 
10078    assert(firstCounterBuffer + counterBufferCount <= MAX_SO_BUFFERS);
10079    u_foreach_bit(i, so->enabled_mask)
10080    {
10081       int32_t counter_buffer_idx = i - firstCounterBuffer;
10082       if (counter_buffer_idx >= 0 && counter_buffer_idx >= counterBufferCount)
10083          counter_buffer_idx = -1;
10084 
10085       /* AMD GCN binds streamout buffers as shader resources.
10086        * VGT only counts primitives and tells the shader through
10087        * SGPRs what to do.
10088        */
10089       radeon_set_context_reg_seq(cs, R_028AD0_VGT_STRMOUT_BUFFER_SIZE_0 + 16 * i, 2);
10090       radeon_emit(cs, sb[i].size >> 2);     /* BUFFER_SIZE (in DW) */
10091       radeon_emit(cs, info->so.strides[i]); /* VTX_STRIDE (in DW) */
10092 
10093       cmd_buffer->state.context_roll_without_scissor_emitted = true;
10094 
10095       if (counter_buffer_idx >= 0 && pCounterBuffers && pCounterBuffers[counter_buffer_idx]) {
10096          /* The array of counter buffers is optional. */
10097          RADV_FROM_HANDLE(radv_buffer, buffer, pCounterBuffers[counter_buffer_idx]);
10098          uint64_t va = radv_buffer_get_va(buffer->bo);
10099          uint64_t counter_buffer_offset = 0;
10100 
10101          if (pCounterBufferOffsets)
10102             counter_buffer_offset = pCounterBufferOffsets[counter_buffer_idx];
10103 
10104          va += buffer->offset + counter_buffer_offset;
10105 
10106          /* Append */
10107          radeon_emit(cs, PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0));
10108          radeon_emit(cs, STRMOUT_SELECT_BUFFER(i) | STRMOUT_DATA_TYPE(1) |   /* offset in bytes */
10109                             STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_FROM_MEM)); /* control */
10110          radeon_emit(cs, 0);                                                 /* unused */
10111          radeon_emit(cs, 0);                                                 /* unused */
10112          radeon_emit(cs, va);                                                /* src address lo */
10113          radeon_emit(cs, va >> 32);                                          /* src address hi */
10114 
10115          radv_cs_add_buffer(cmd_buffer->device->ws, cs, buffer->bo);
10116       } else {
10117          /* Start from the beginning. */
10118          radeon_emit(cs, PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0));
10119          radeon_emit(cs, STRMOUT_SELECT_BUFFER(i) | STRMOUT_DATA_TYPE(1) | /* offset in bytes */
10120                             STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_FROM_PACKET)); /* control */
10121          radeon_emit(cs, 0);                                                    /* unused */
10122          radeon_emit(cs, 0);                                                    /* unused */
10123          radeon_emit(cs, 0);                                                    /* unused */
10124          radeon_emit(cs, 0);                                                    /* unused */
10125       }
10126    }
10127 
10128    radv_set_streamout_enable(cmd_buffer, true);
10129 }
10130 
10131 static void
10132 gfx10_emit_streamout_begin(struct radv_cmd_buffer *cmd_buffer, uint32_t firstCounterBuffer,
10133                            uint32_t counterBufferCount, const VkBuffer *pCounterBuffers,
10134                            const VkDeviceSize *pCounterBufferOffsets)
10135 {
10136    struct radv_streamout_state *so = &cmd_buffer->state.streamout;
10137    unsigned last_target = util_last_bit(so->enabled_mask) - 1;
10138    struct radeon_cmdbuf *cs = cmd_buffer->cs;
10139 
10140    assert(cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX10);
10141    assert(firstCounterBuffer + counterBufferCount <= MAX_SO_BUFFERS);
10142 
10143    /* Sync because the next streamout operation will overwrite GDS and we
10144     * have to make sure it's idle.
10145     * TODO: Improve by tracking if there is a streamout operation in
10146     * flight.
10147     */
10148    cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_VS_PARTIAL_FLUSH;
10149    si_emit_cache_flush(cmd_buffer);
10150 
10151    u_foreach_bit(i, so->enabled_mask)
10152    {
10153       int32_t counter_buffer_idx = i - firstCounterBuffer;
10154       if (counter_buffer_idx >= 0 && counter_buffer_idx >= counterBufferCount)
10155          counter_buffer_idx = -1;
10156 
10157       bool append =
10158          counter_buffer_idx >= 0 && pCounterBuffers && pCounterBuffers[counter_buffer_idx];
10159       uint64_t va = 0;
10160 
10161       if (append) {
10162          RADV_FROM_HANDLE(radv_buffer, buffer, pCounterBuffers[counter_buffer_idx]);
10163          uint64_t counter_buffer_offset = 0;
10164 
10165          if (pCounterBufferOffsets)
10166             counter_buffer_offset = pCounterBufferOffsets[counter_buffer_idx];
10167 
10168          va += radv_buffer_get_va(buffer->bo);
10169          va += buffer->offset + counter_buffer_offset;
10170 
10171          radv_cs_add_buffer(cmd_buffer->device->ws, cs, buffer->bo);
10172       }
10173 
10174       radeon_emit(cs, PKT3(PKT3_DMA_DATA, 5, 0));
10175       radeon_emit(cs, S_411_SRC_SEL(append ? V_411_SRC_ADDR_TC_L2 : V_411_DATA) |
10176                          S_411_DST_SEL(V_411_GDS) | S_411_CP_SYNC(i == last_target));
10177       radeon_emit(cs, va);
10178       radeon_emit(cs, va >> 32);
10179       radeon_emit(cs, 4 * i); /* destination in GDS */
10180       radeon_emit(cs, 0);
10181       radeon_emit(cs, S_415_BYTE_COUNT_GFX9(4) | S_415_DISABLE_WR_CONFIRM_GFX9(i != last_target));
10182    }
10183 
10184    radv_set_streamout_enable(cmd_buffer, true);
10185 }
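
/* With NGG streamout the buffer-filled-size counters live in GDS rather
 * than in VGT: each DMA_DATA packet above seeds 4 bytes of GDS at offset
 * 4 * i, either from the app's counter buffer (append) or with zero
 * (start from the beginning). CP_SYNC and the write confirmation are
 * only requested for the last target so the copies can overlap.
 */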
10186 
10187 VKAPI_ATTR void VKAPI_CALL
10188 radv_CmdBeginTransformFeedbackEXT(VkCommandBuffer commandBuffer, uint32_t firstCounterBuffer,
10189                                   uint32_t counterBufferCount, const VkBuffer *pCounterBuffers,
10190                                   const VkDeviceSize *pCounterBufferOffsets)
10191 {
10192    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
10193 
10194    if (cmd_buffer->device->physical_device->use_ngg_streamout) {
10195       gfx10_emit_streamout_begin(cmd_buffer, firstCounterBuffer, counterBufferCount,
10196                                  pCounterBuffers, pCounterBufferOffsets);
10197    } else {
10198       radv_emit_streamout_begin(cmd_buffer, firstCounterBuffer, counterBufferCount, pCounterBuffers,
10199                                 pCounterBufferOffsets);
10200    }
10201 }
10202 
10203 static void
10204 radv_emit_streamout_end(struct radv_cmd_buffer *cmd_buffer, uint32_t firstCounterBuffer,
10205                         uint32_t counterBufferCount, const VkBuffer *pCounterBuffers,
10206                         const VkDeviceSize *pCounterBufferOffsets)
10207 {
10208    struct radv_streamout_state *so = &cmd_buffer->state.streamout;
10209    struct radeon_cmdbuf *cs = cmd_buffer->cs;
10210 
10211    radv_flush_vgt_streamout(cmd_buffer);
10212 
10213    assert(firstCounterBuffer + counterBufferCount <= MAX_SO_BUFFERS);
10214    u_foreach_bit(i, so->enabled_mask)
10215    {
10216       int32_t counter_buffer_idx = i - firstCounterBuffer;
10217       if (counter_buffer_idx >= 0 && counter_buffer_idx >= counterBufferCount)
10218          counter_buffer_idx = -1;
10219 
10220       if (counter_buffer_idx >= 0 && pCounterBuffers && pCounterBuffers[counter_buffer_idx]) {
10221          /* The array of counter buffers is optional. */
10222          RADV_FROM_HANDLE(radv_buffer, buffer, pCounterBuffers[counter_buffer_idx]);
10223          uint64_t va = radv_buffer_get_va(buffer->bo);
10224          uint64_t counter_buffer_offset = 0;
10225 
10226          if (pCounterBufferOffsets)
10227             counter_buffer_offset = pCounterBufferOffsets[counter_buffer_idx];
10228 
10229          va += buffer->offset + counter_buffer_offset;
10230 
10231          radeon_emit(cs, PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0));
10232          radeon_emit(cs, STRMOUT_SELECT_BUFFER(i) | STRMOUT_DATA_TYPE(1) | /* offset in bytes */
10233                             STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_NONE) |
10234                             STRMOUT_STORE_BUFFER_FILLED_SIZE); /* control */
10235          radeon_emit(cs, va);                                  /* dst address lo */
10236          radeon_emit(cs, va >> 32);                            /* dst address hi */
10237          radeon_emit(cs, 0);                                   /* unused */
10238          radeon_emit(cs, 0);                                   /* unused */
10239 
10240          radv_cs_add_buffer(cmd_buffer->device->ws, cs, buffer->bo);
10241       }
10242 
10243       /* Deactivate transform feedback by zeroing the buffer size.
10244        * The counters (primitives generated, primitives emitted) may
10245        * be enabled even if no buffer is bound. This ensures
10246        * that the primitives-emitted query won't increment.
10247        */
10248       radeon_set_context_reg(cs, R_028AD0_VGT_STRMOUT_BUFFER_SIZE_0 + 16 * i, 0);
10249 
10250       cmd_buffer->state.context_roll_without_scissor_emitted = true;
10251    }
10252 
10253    radv_set_streamout_enable(cmd_buffer, false);
10254 }
10255 
10256 static void
10257 gfx10_emit_streamout_end(struct radv_cmd_buffer *cmd_buffer, uint32_t firstCounterBuffer,
10258                          uint32_t counterBufferCount, const VkBuffer *pCounterBuffers,
10259                          const VkDeviceSize *pCounterBufferOffsets)
10260 {
10261    struct radv_streamout_state *so = &cmd_buffer->state.streamout;
10262    struct radeon_cmdbuf *cs = cmd_buffer->cs;
10263 
10264    assert(cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX10);
10265    assert(firstCounterBuffer + counterBufferCount <= MAX_SO_BUFFERS);
10266 
10267    u_foreach_bit(i, so->enabled_mask)
10268    {
10269       int32_t counter_buffer_idx = i - firstCounterBuffer;
10270       if (counter_buffer_idx >= 0 && counter_buffer_idx >= counterBufferCount)
10271          counter_buffer_idx = -1;
10272 
10273       if (counter_buffer_idx >= 0 && pCounterBuffers && pCounterBuffers[counter_buffer_idx]) {
10274          /* The array of counter buffers is optional. */
10275          RADV_FROM_HANDLE(radv_buffer, buffer, pCounterBuffers[counter_buffer_idx]);
10276          uint64_t va = radv_buffer_get_va(buffer->bo);
10277          uint64_t counter_buffer_offset = 0;
10278 
10279          if (pCounterBufferOffsets)
10280             counter_buffer_offset = pCounterBufferOffsets[counter_buffer_idx];
10281 
10282          va += buffer->offset + counter_buffer_offset;
10283 
10284          si_cs_emit_write_event_eop(cs, cmd_buffer->device->physical_device->rad_info.gfx_level,
10285                                     radv_cmd_buffer_uses_mec(cmd_buffer), V_028A90_PS_DONE, 0,
10286                                     EOP_DST_SEL_TC_L2, EOP_DATA_SEL_GDS, va, EOP_DATA_GDS(i, 1), 0);
10287 
10288          radv_cs_add_buffer(cmd_buffer->device->ws, cs, buffer->bo);
10289       }
10290    }
10291 
10292    radv_set_streamout_enable(cmd_buffer, false);
10293 }
10294 
10295 VKAPI_ATTR void VKAPI_CALL
10296 radv_CmdEndTransformFeedbackEXT(VkCommandBuffer commandBuffer, uint32_t firstCounterBuffer,
10297                                 uint32_t counterBufferCount, const VkBuffer *pCounterBuffers,
10298                                 const VkDeviceSize *pCounterBufferOffsets)
10299 {
10300    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
10301 
10302    if (cmd_buffer->device->physical_device->use_ngg_streamout) {
10303       gfx10_emit_streamout_end(cmd_buffer, firstCounterBuffer, counterBufferCount, pCounterBuffers,
10304                                pCounterBufferOffsets);
10305    } else {
10306       radv_emit_streamout_end(cmd_buffer, firstCounterBuffer, counterBufferCount, pCounterBuffers,
10307                               pCounterBufferOffsets);
10308    }
10309 }
10310 
10311 VKAPI_ATTR void VKAPI_CALL
10312 radv_CmdDrawIndirectByteCountEXT(VkCommandBuffer commandBuffer, uint32_t instanceCount,
10313                                  uint32_t firstInstance, VkBuffer _counterBuffer,
10314                                  VkDeviceSize counterBufferOffset, uint32_t counterOffset,
10315                                  uint32_t vertexStride)
10316 {
10317    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
10318    RADV_FROM_HANDLE(radv_buffer, counterBuffer, _counterBuffer);
10319    struct radv_draw_info info;
10320 
10321    info.count = 0;
10322    info.instance_count = instanceCount;
10323    info.first_instance = firstInstance;
10324    info.strmout_buffer = counterBuffer;
10325    info.strmout_buffer_offset = counterBufferOffset;
10326    info.stride = vertexStride;
10327    info.indexed = false;
10328    info.indirect = NULL;
10329 
10330    if (!radv_before_draw(cmd_buffer, &info, 1))
10331       return;
10332    struct VkMultiDrawInfoEXT minfo = { 0, 0 };
10333    radv_emit_direct_draw_packets(cmd_buffer, &info, 1, &minfo, S_0287F0_USE_OPAQUE(1), 0);
10334    radv_after_draw(cmd_buffer);
10335 }
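
/* USE_OPAQUE makes the hardware derive the vertex count from the
 * streamout buffer-filled-size: the byte count read from the counter
 * buffer (written by vkCmdEndTransformFeedbackEXT) is divided by
 * vertexStride, so no CPU-visible vertex count or indirect buffer is
 * needed for drawing transform-feedback output.
 */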
10336 
10337 /* VK_AMD_buffer_marker */
10338 VKAPI_ATTR void VKAPI_CALL
10339 radv_CmdWriteBufferMarker2AMD(VkCommandBuffer commandBuffer, VkPipelineStageFlags2 stage,
10340                               VkBuffer dstBuffer, VkDeviceSize dstOffset, uint32_t marker)
10341 {
10342    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
10343    RADV_FROM_HANDLE(radv_buffer, buffer, dstBuffer);
10344    struct radeon_cmdbuf *cs = cmd_buffer->cs;
10345    uint64_t va = radv_buffer_get_va(buffer->bo) + buffer->offset + dstOffset;
10346 
10347    si_emit_cache_flush(cmd_buffer);
10348 
10349    ASSERTED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 12);
10350 
10351    if (!(stage & ~VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT)) {
10352       radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
10353       radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_IMM) | COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) |
10354                          COPY_DATA_WR_CONFIRM);
10355       radeon_emit(cs, marker);
10356       radeon_emit(cs, 0);
10357       radeon_emit(cs, va);
10358       radeon_emit(cs, va >> 32);
10359    } else {
10360       si_cs_emit_write_event_eop(cs, cmd_buffer->device->physical_device->rad_info.gfx_level,
10361                                  radv_cmd_buffer_uses_mec(cmd_buffer), V_028A90_BOTTOM_OF_PIPE_TS,
10362                                  0, EOP_DST_SEL_MEM, EOP_DATA_SEL_VALUE_32BIT, va, marker,
10363                                  cmd_buffer->gfx9_eop_bug_va);
10364    }
10365 
10366    assert(cmd_buffer->cs->cdw <= cdw_max);
10367 }
10368 
10369 void
10370 radv_CmdBindPipelineShaderGroupNV(VkCommandBuffer commandBuffer,
10371                                   VkPipelineBindPoint pipelineBindPoint, VkPipeline pipeline,
10372                                   uint32_t groupIndex)
10373 {
10374    fprintf(stderr, "radv: unimplemented vkCmdBindPipelineShaderGroupNV\n");
10375    abort();
10376 }