1 /*
2 * Copyright © 2019 Raspberry Pi Ltd
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "broadcom/common/v3d_csd.h"
25 #include "v3dv_private.h"
26 #include "util/perf/cpu_trace.h"
27 #include "util/u_pack_color.h"
28 #include "vk_common_entrypoints.h"
29 #include "vk_util.h"
30
31 float
32 v3dv_get_aa_line_width(struct v3dv_pipeline *pipeline,
33 struct v3dv_cmd_buffer *buffer)
34 {
35 float width = buffer->vk.dynamic_graphics_state.rs.line.width;
36
37 /* If line smoothing is enabled then we want to add some extra pixels to
38 * the width in order to have some semi-transparent edges.
39 */
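/* As a rough illustration (hypothetical numbers, not from the spec): a
 * 1.0 px smooth line becomes floorf(M_SQRT2 * 1.0) + 3 = 4.0 px wide,
 * leaving room for the semi-transparent falloff on both edges.
 */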
40 if (pipeline->line_smooth)
41 width = floorf(M_SQRT2 * width) + 3;
42
43 return width;
44 }
45
46 void
47 v3dv_job_add_bo(struct v3dv_job *job, struct v3dv_bo *bo)
48 {
49 if (!bo)
50 return;
51
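/* job->bo_handle_mask acts as a cheap inclusion filter: only when the
 * BO's handle bit is already set do we pay for the hash set lookup.
 * A false positive just costs an extra search; a clear bit means the BO
 * is definitely new and we can fall through to the insertion below.
 */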
52 if (job->bo_handle_mask & bo->handle_bit) {
53 if (_mesa_set_search(job->bos, bo))
54 return;
55 }
56
57 _mesa_set_add(job->bos, bo);
58 job->bo_count++;
59 job->bo_handle_mask |= bo->handle_bit;
60 }
61
62 void
63 v3dv_job_add_bo_unchecked(struct v3dv_job *job, struct v3dv_bo *bo)
64 {
65 assert(bo);
66 _mesa_set_add(job->bos, bo);
67 job->bo_count++;
68 job->bo_handle_mask |= bo->handle_bit;
69 }
70
71 static void
72 cmd_buffer_init(struct v3dv_cmd_buffer *cmd_buffer,
73 struct v3dv_device *device)
74 {
75 /* Do not reset the base object! If we are calling this from a command
76 * buffer reset that would reset the loader's dispatch table for the
77 * command buffer, and any other relevant info from vk_object_base.
78 */
79 const uint32_t base_size = sizeof(struct vk_command_buffer);
80 uint8_t *cmd_buffer_driver_start = ((uint8_t *) cmd_buffer) + base_size;
81 memset(cmd_buffer_driver_start, 0, sizeof(*cmd_buffer) - base_size);
82
83 cmd_buffer->device = device;
84
85 list_inithead(&cmd_buffer->private_objs);
86 list_inithead(&cmd_buffer->jobs);
87
88 cmd_buffer->state.subpass_idx = -1;
89 cmd_buffer->state.meta.subpass_idx = -1;
90
91 cmd_buffer->status = V3DV_CMD_BUFFER_STATUS_INITIALIZED;
92 }
93
94 static VkResult
95 cmd_buffer_create(struct vk_command_pool *pool, VkCommandBufferLevel level,
96 struct vk_command_buffer **cmd_buffer_out)
97 {
98 struct v3dv_device *device =
99 container_of(pool->base.device, struct v3dv_device, vk);
100
101 struct v3dv_cmd_buffer *cmd_buffer;
102 cmd_buffer = vk_zalloc(&pool->alloc,
103 sizeof(*cmd_buffer),
104 8,
105 VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
106 if (cmd_buffer == NULL)
107 return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
108
109 /* The level is provided by the create callback hook, so we can pass it
110 * through here; vk_common_AllocateCommandBuffers takes care of any
111 * remaining setup after creation.
112 */
113 VkResult result;
114 result = vk_command_buffer_init(pool, &cmd_buffer->vk,
115 &v3dv_cmd_buffer_ops, level);
116 if (result != VK_SUCCESS) {
117 vk_free(&pool->alloc, cmd_buffer);
118 return result;
119 }
120
121 cmd_buffer_init(cmd_buffer, device);
122
123 *cmd_buffer_out = &cmd_buffer->vk;
124
125 return VK_SUCCESS;
126 }
127
128 static void
129 job_destroy_gpu_cl_resources(struct v3dv_job *job)
130 {
131 assert(job->type == V3DV_JOB_TYPE_GPU_CL ||
132 job->type == V3DV_JOB_TYPE_GPU_CL_INCOMPLETE);
133
134 v3dv_cl_destroy(&job->bcl);
135 v3dv_cl_destroy(&job->rcl);
136 v3dv_cl_destroy(&job->indirect);
137
138 /* Since we don't ref BOs when we add them to the command buffer, don't
139 * unref them here either. BOs will be freed when their corresponding API
140 * objects are destroyed.
141 */
142 _mesa_set_destroy(job->bos, NULL);
143
144 v3dv_bo_free(job->device, job->tile_alloc);
145 v3dv_bo_free(job->device, job->tile_state);
146 }
147
148 static void
149 job_destroy_cloned_gpu_cl_resources(struct v3dv_job *job)
150 {
151 assert(job->type == V3DV_JOB_TYPE_GPU_CL);
152
153 struct v3dv_cmd_buffer *cmd_buffer = job->cmd_buffer;
154 if (job->clone_owns_bcl) {
155 /* For suspending jobs in command buffers with the simultaneous use flag
156 * we allocate a real copy of the BCL.
157 */
158 assert(job->suspending &&
159 cmd_buffer &&
160 (cmd_buffer->usage_flags &
161 VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT));
162 v3dv_cl_destroy(&job->bcl);
163 } else {
164 list_for_each_entry_safe(struct v3dv_bo, bo, &job->bcl.bo_list, list_link) {
165 list_del(&bo->list_link);
166 vk_free(&job->device->vk.alloc, bo);
167 }
168 }
169
170 list_for_each_entry_safe(struct v3dv_bo, bo, &job->rcl.bo_list, list_link) {
171 list_del(&bo->list_link);
172 vk_free(&job->device->vk.alloc, bo);
173 }
174
175 list_for_each_entry_safe(struct v3dv_bo, bo, &job->indirect.bo_list, list_link) {
176 list_del(&bo->list_link);
177 vk_free(&job->device->vk.alloc, bo);
178 }
179 }
180
181 static void
182 job_destroy_gpu_csd_resources(struct v3dv_job *job)
183 {
184 assert(job->type == V3DV_JOB_TYPE_GPU_CSD);
185 assert(job->cmd_buffer);
186
187 v3dv_cl_destroy(&job->indirect);
188
189 _mesa_set_destroy(job->bos, NULL);
190
191 if (job->csd.shared_memory)
192 v3dv_bo_free(job->device, job->csd.shared_memory);
193 }
194
195 void
196 v3dv_job_destroy(struct v3dv_job *job)
197 {
198 assert(job);
199
200 list_del(&job->list_link);
201
202 /* Cloned jobs don't make deep copies of the original jobs, so they don't
203 * own any of their resources. However, they do allocate clones of BO
204 * structs, so make sure we free those.
205 */
206 if (!job->is_clone) {
207 switch (job->type) {
208 case V3DV_JOB_TYPE_GPU_CL:
209 case V3DV_JOB_TYPE_GPU_CL_INCOMPLETE:
210 job_destroy_gpu_cl_resources(job);
211 break;
212 case V3DV_JOB_TYPE_GPU_CSD:
213 job_destroy_gpu_csd_resources(job);
214 break;
215 default:
216 break;
217 }
218 } else {
219 /* Cloned jobs */
220 if (job->type == V3DV_JOB_TYPE_GPU_CL)
221 job_destroy_cloned_gpu_cl_resources(job);
222 }
223
224 vk_free(&job->device->vk.alloc, job);
225 }
226
227 void
228 v3dv_cmd_buffer_add_private_obj(struct v3dv_cmd_buffer *cmd_buffer,
229 uint64_t obj,
230 v3dv_cmd_buffer_private_obj_destroy_cb destroy_cb)
231 {
232 struct v3dv_cmd_buffer_private_obj *pobj =
233 vk_alloc(&cmd_buffer->device->vk.alloc, sizeof(*pobj), 8,
234 VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
235 if (!pobj) {
236 v3dv_flag_oom(cmd_buffer, NULL);
237 return;
238 }
239
240 pobj->obj = obj;
241 pobj->destroy_cb = destroy_cb;
242
243 list_addtail(&pobj->list_link, &cmd_buffer->private_objs);
244 }
245
246 static void
247 cmd_buffer_destroy_private_obj(struct v3dv_cmd_buffer *cmd_buffer,
248 struct v3dv_cmd_buffer_private_obj *pobj)
249 {
250 assert(pobj && pobj->obj && pobj->destroy_cb);
251 pobj->destroy_cb(v3dv_device_to_handle(cmd_buffer->device),
252 pobj->obj,
253 &cmd_buffer->device->vk.alloc);
254 list_del(&pobj->list_link);
255 vk_free(&cmd_buffer->device->vk.alloc, pobj);
256 }
257
258 static void
259 cmd_buffer_free_resources(struct v3dv_cmd_buffer *cmd_buffer)
260 {
261 list_for_each_entry_safe(struct v3dv_job, job,
262 &cmd_buffer->jobs, list_link) {
263 v3dv_job_destroy(job);
264 }
265
266 if (cmd_buffer->state.job)
267 v3dv_job_destroy(cmd_buffer->state.job);
268
269 if (cmd_buffer->state.attachments)
270 vk_free(&cmd_buffer->vk.pool->alloc, cmd_buffer->state.attachments);
271
272 if (cmd_buffer->state.query.end.alloc_count > 0)
273 vk_free(&cmd_buffer->device->vk.alloc, cmd_buffer->state.query.end.states);
274
275 if (cmd_buffer->push_constants_resource.bo)
276 v3dv_bo_free(cmd_buffer->device, cmd_buffer->push_constants_resource.bo);
277
278 list_for_each_entry_safe(struct v3dv_cmd_buffer_private_obj, pobj,
279 &cmd_buffer->private_objs, list_link) {
280 cmd_buffer_destroy_private_obj(cmd_buffer, pobj);
281 }
282
283 if (cmd_buffer->state.meta.attachments) {
284 assert(cmd_buffer->state.meta.attachment_alloc_count > 0);
285 vk_free(&cmd_buffer->device->vk.alloc, cmd_buffer->state.meta.attachments);
286 }
287
288 v3dv_destroy_dynamic_framebuffer(cmd_buffer);
289 }
290
291 static void
292 cmd_buffer_destroy(struct vk_command_buffer *vk_cmd_buffer)
293 {
294 struct v3dv_cmd_buffer *cmd_buffer =
295 container_of(vk_cmd_buffer, struct v3dv_cmd_buffer, vk);
296
297 cmd_buffer_free_resources(cmd_buffer);
298 vk_command_buffer_finish(&cmd_buffer->vk);
299 vk_free(&cmd_buffer->vk.pool->alloc, cmd_buffer);
300 }
301
302 static bool
303 cmd_buffer_can_merge_subpass(struct v3dv_cmd_buffer *cmd_buffer,
304 uint32_t subpass_idx)
305 {
306 const struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
307 assert(state->pass);
308
309 const struct v3dv_physical_device *physical_device =
310 cmd_buffer->device->pdevice;
311
312 if (cmd_buffer->vk.level != VK_COMMAND_BUFFER_LEVEL_PRIMARY)
313 return false;
314
315 if (!cmd_buffer->state.job)
316 return false;
317
318 if (cmd_buffer->state.job->always_flush)
319 return false;
320
321 if (!physical_device->options.merge_jobs)
322 return false;
323
324 /* Each render pass starts a new job */
325 if (subpass_idx == 0)
326 return false;
327
328 /* Two subpasses can be merged in the same job if we can emit a single RCL
329 * for them (since the RCL includes the END_OF_RENDERING command that
330 * triggers the "render job finished" interrupt). We can do this so long
331 * as both subpasses render against the same attachments.
332 */
333 assert(state->subpass_idx == subpass_idx - 1);
334 struct v3dv_subpass *prev_subpass = &state->pass->subpasses[state->subpass_idx];
335 struct v3dv_subpass *subpass = &state->pass->subpasses[subpass_idx];
336
337 if (subpass->ds_attachment.attachment !=
338 prev_subpass->ds_attachment.attachment)
339 return false;
340
341 if (subpass->color_count != prev_subpass->color_count)
342 return false;
343
344 for (uint32_t i = 0; i < subpass->color_count; i++) {
345 if (subpass->color_attachments[i].attachment !=
346 prev_subpass->color_attachments[i].attachment) {
347 return false;
348 }
349 }
350
351 /* Don't merge if the subpasses have different view masks, since in that
352 * case the framebuffer setup is different and we need to emit different
353 * RCLs.
354 */
355 if (subpass->view_mask != prev_subpass->view_mask)
356 return false;
357
358 /* FIXME: Since some attachment formats can't be resolved using the TLB we
359 * need to emit separate resolve jobs for them and that would not be
360 * compatible with subpass merges. We could fix that by testing if any of
361 * the attachments to resolve doesn't support TLB resolves.
362 */
363 if (prev_subpass->resolve_attachments || subpass->resolve_attachments ||
364 prev_subpass->resolve_depth || prev_subpass->resolve_stencil ||
365 subpass->resolve_depth || subpass->resolve_stencil) {
366 return false;
367 }
368
369 return true;
370 }
371
372 /**
373 * Computes and sets the job frame tiling information required to setup frame
374 * binning and rendering.
375 */
376 static struct v3dv_frame_tiling *
377 job_compute_frame_tiling(struct v3dv_job *job,
378 uint32_t width,
379 uint32_t height,
380 uint32_t layers,
381 uint32_t render_target_count,
382 uint8_t max_internal_bpp,
383 uint8_t total_color_bpp,
384 bool msaa,
385 bool double_buffer)
386 {
387 assert(job);
388 struct v3dv_frame_tiling *tiling = &job->frame_tiling;
389
390 tiling->width = width;
391 tiling->height = height;
392 tiling->layers = layers;
393 tiling->render_target_count = render_target_count;
394 tiling->msaa = msaa;
395 tiling->internal_bpp = max_internal_bpp;
396 tiling->total_color_bpp = total_color_bpp;
397 tiling->double_buffer = double_buffer;
398
399 /* Double-buffer is incompatible with MSAA */
400 assert(!tiling->msaa || !tiling->double_buffer);
401
402 v3d_choose_tile_size(&job->device->devinfo,
403 render_target_count,
404 max_internal_bpp, total_color_bpp, msaa,
405 tiling->double_buffer,
406 &tiling->tile_width, &tiling->tile_height);
407
408 tiling->draw_tiles_x = DIV_ROUND_UP(width, tiling->tile_width);
409 tiling->draw_tiles_y = DIV_ROUND_UP(height, tiling->tile_height);
410
411 /* Size up our supertiles until we get under the limit */
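/* Worked example (hypothetical numbers): with 64x64 tiles, a 1920x1080
 * frame has 30x17 draw tiles, i.e. 510 supertiles at 1x1, which exceeds
 * the limit below. The loop grows the supertile to 1x2 (270 supertiles,
 * still too many) and then 2x2, giving 15x9 = 135, which fits.
 */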
412 const uint32_t max_supertiles = 256;
413 tiling->supertile_width = 1;
414 tiling->supertile_height = 1;
415 for (;;) {
416 tiling->frame_width_in_supertiles =
417 DIV_ROUND_UP(tiling->draw_tiles_x, tiling->supertile_width);
418 tiling->frame_height_in_supertiles =
419 DIV_ROUND_UP(tiling->draw_tiles_y, tiling->supertile_height);
420 const uint32_t num_supertiles = tiling->frame_width_in_supertiles *
421 tiling->frame_height_in_supertiles;
422 if (num_supertiles < max_supertiles)
423 break;
424
425 if (tiling->supertile_width < tiling->supertile_height)
426 tiling->supertile_width++;
427 else
428 tiling->supertile_height++;
429 }
430
431 return tiling;
432 }
433
434 bool
435 v3dv_job_allocate_tile_state(struct v3dv_job *job)
436 {
437 struct v3dv_frame_tiling *tiling = &job->frame_tiling;
438 const uint32_t layers =
439 job->allocate_tile_state_for_all_layers ? tiling->layers : 1;
440
441 /* The PTB will request the tile alloc initial size per tile at start
442 * of tile binning.
443 */
444 uint32_t tile_alloc_size = 64 * layers *
445 tiling->draw_tiles_x *
446 tiling->draw_tiles_y;
447
448 /* The PTB allocates in aligned 4k chunks after the initial setup. */
449 tile_alloc_size = align(tile_alloc_size, 4096);
450
451 /* Include the first two chunk allocations that the PTB does so that
452 * we definitely clear the OOM condition before triggering one (the HW
453 * won't trigger OOM during the first allocations).
454 */
455 tile_alloc_size += 8192;
456
457 /* For performance, allocate some extra initial memory after the PTB's
458 * minimal allocations, so that we hopefully don't have to block the
459 * GPU on the kernel handling an OOM signal.
460 */
461 tile_alloc_size += 512 * 1024;
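/* Continuing the hypothetical 30x17 tile, single-layer example: the
 * initial estimate is 64 * 30 * 17 = 32640 bytes, aligned up to 32768,
 * plus 8192 for the first two PTB chunks and 512 KB of slack, for a
 * total allocation of 565248 bytes.
 */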
462
463 job->tile_alloc = v3dv_bo_alloc(job->device, tile_alloc_size,
464 "tile_alloc", true);
465 if (!job->tile_alloc) {
466 v3dv_flag_oom(NULL, job);
467 return false;
468 }
469
470 v3dv_job_add_bo_unchecked(job, job->tile_alloc);
471
472 const uint32_t tsda_per_tile_size = 256;
473 const uint32_t tile_state_size = layers *
474 tiling->draw_tiles_x *
475 tiling->draw_tiles_y *
476 tsda_per_tile_size;
477 job->tile_state = v3dv_bo_alloc(job->device, tile_state_size, "TSDA", true);
478 if (!job->tile_state) {
479 v3dv_flag_oom(NULL, job);
480 return false;
481 }
482
483 v3dv_job_add_bo_unchecked(job, job->tile_state);
484 return true;
485 }
486
487 void
488 v3dv_job_start_frame(struct v3dv_job *job,
489 uint32_t width,
490 uint32_t height,
491 uint32_t layers,
492 bool allocate_tile_state_for_all_layers,
493 bool allocate_tile_state_now,
494 uint32_t render_target_count,
495 uint8_t max_internal_bpp,
496 uint8_t total_color_bpp,
497 bool msaa)
498 {
499 assert(job);
500
501 /* Start by computing frame tiling spec for this job assuming that
502 * double-buffer mode is disabled.
503 */
504 const struct v3dv_frame_tiling *tiling =
505 job_compute_frame_tiling(job, width, height, layers,
506 render_target_count, max_internal_bpp,
507 total_color_bpp, msaa, false);
508
509 v3dv_cl_ensure_space_with_branch(&job->bcl, 256);
510 v3dv_return_if_oom(NULL, job);
511
512 job->allocate_tile_state_for_all_layers = allocate_tile_state_for_all_layers;
513
514 /* For subpass jobs we postpone tile state allocation until we are finishing
515 * the job and have made a decision about double-buffer.
516 */
517 if (allocate_tile_state_now) {
518 if (!v3dv_job_allocate_tile_state(job))
519 return;
520 }
521
522 v3d_X((&job->device->devinfo), job_emit_binning_prolog)(job, tiling,
523 allocate_tile_state_for_all_layers ? tiling->layers : 1);
524
525 job->ez_state = V3D_EZ_UNDECIDED;
526 job->first_ez_state = V3D_EZ_UNDECIDED;
527 }
528
529 static bool
530 job_should_enable_double_buffer(struct v3dv_job *job)
531 {
532 /* Incompatibility with double-buffer */
533 if (!job->can_use_double_buffer)
534 return false;
535
536 return v3d_double_buffer_score_ok(&job->double_buffer_score);
537 }
538
539 static void
540 cmd_buffer_end_render_pass_frame(struct v3dv_cmd_buffer *cmd_buffer)
541 {
542 struct v3dv_job *job = cmd_buffer->state.job;
543 assert(job);
544
545 /* For subpass jobs we always emit the RCL here */
546 assert(v3dv_cl_offset(&job->rcl) == 0);
547
548 /* Only emit RCL for the first job in a suspend/resume chain */
549 if (!job->resuming) {
550 /* Decide if we want to enable double-buffer for this job. If we do, then
551 * we need to rewrite the TILE_BINNING_MODE_CFG packet in the BCL.
552 */
553 if (job_should_enable_double_buffer(job)) {
554 assert(!job->frame_tiling.double_buffer);
555 job_compute_frame_tiling(job,
556 job->frame_tiling.width,
557 job->frame_tiling.height,
558 job->frame_tiling.layers,
559 job->frame_tiling.render_target_count,
560 job->frame_tiling.internal_bpp,
561 job->frame_tiling.total_color_bpp,
562 job->frame_tiling.msaa,
563 true);
564
565 v3d_X((&job->device->devinfo), job_emit_enable_double_buffer)(job);
566 }
567
568 /* At this point we have decided whether we want to use double-buffer or
569 * not and the job's frame tiling represents that decision so we can
570 * allocate the tile state, which we need to do before we emit the RCL.
571 */
572 v3dv_job_allocate_tile_state(job);
573
574 v3d_X((&cmd_buffer->device->devinfo), cmd_buffer_emit_render_pass_rcl)(cmd_buffer);
575 }
576
577 /* Only emit the binning flush for the last job in resume/suspend chain */
578 if (!job->suspending)
579 v3d_X((&cmd_buffer->device->devinfo), job_emit_binning_flush)(job);
580 }
581
582 struct v3dv_job *
583 v3dv_cmd_buffer_create_cpu_job(struct v3dv_device *device,
584 enum v3dv_job_type type,
585 struct v3dv_cmd_buffer *cmd_buffer,
586 uint32_t subpass_idx)
587 {
588 struct v3dv_job *job = vk_zalloc(&device->vk.alloc,
589 sizeof(struct v3dv_job), 8,
590 VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
591 if (!job) {
592 v3dv_flag_oom(cmd_buffer, NULL);
593 return NULL;
594 }
595
596 v3dv_job_init(job, type, device, cmd_buffer, subpass_idx);
597 return job;
598 }
599
600 static void
601 cmd_buffer_emit_end_query_cpu(struct v3dv_cmd_buffer *cmd_buffer,
602 struct v3dv_query_pool *pool,
603 uint32_t query, uint32_t count)
604 {
605 assert(pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR);
606
607 struct v3dv_job *job =
608 v3dv_cmd_buffer_create_cpu_job(cmd_buffer->device,
609 V3DV_JOB_TYPE_CPU_END_QUERY,
610 cmd_buffer, -1);
611 v3dv_return_if_oom(cmd_buffer, NULL);
612
613 job->cpu.query_end.pool = pool;
614 job->cpu.query_end.query = query;
615 job->cpu.query_end.count = count;
616 list_addtail(&job->list_link, &cmd_buffer->jobs);
617 }
618
619 static inline bool
620 cmd_buffer_has_pending_jobs(struct v3dv_cmd_buffer *cmd_buffer)
621 {
622 return cmd_buffer->state.query.end.used_count > 0;
623 }
624
625 static void
626 cmd_buffer_add_pending_jobs(struct v3dv_cmd_buffer *cmd_buffer)
627 {
628 struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
629 const uint32_t count = state->query.end.used_count;
630 for (uint32_t i = 0; i < count; i++) {
631 assert(i < state->query.end.used_count);
632 struct v3dv_end_query_info *info = &state->query.end.states[i];
633 if (info->pool->query_type == VK_QUERY_TYPE_OCCLUSION) {
634 v3dv_cmd_buffer_emit_set_query_availability(cmd_buffer, info->pool,
635 info->query, info->count, 1);
636 } else {
637 cmd_buffer_emit_end_query_cpu(cmd_buffer, info->pool,
638 info->query, info->count);
639 }
640 }
641 state->query.end.used_count = 0;
642 }
643
644 void
645 v3dv_cmd_buffer_finish_job(struct v3dv_cmd_buffer *cmd_buffer)
646 {
647 struct v3dv_job *job = cmd_buffer->state.job;
648 if (!job)
649 return;
650
651 if (cmd_buffer->state.oom) {
652 v3dv_job_destroy(job);
653 cmd_buffer->state.job = NULL;
654 return;
655 }
656
657 /* If we have created a job for a command buffer then we should have
658 * recorded something into it: if the job was started in a render pass, it
659 * should at least have the start frame commands, otherwise, it should have
660 * a transfer command. The only exception is secondary command buffers
661 * inside a render pass.
662 *
663 * With dynamic rendering there is also the possibility that we resume a
664 * suspended pass with an empty job. In that case, we need to ensure the
665 * empty job is still a valid command list, which we will ensure when we
666 * add the binning flush right below, which only happens if this is the
667 * last job in the resume/suspend chain. If it is not the last then we know
668 * it must at least have the BRANCH instruction to link with a follow-up
669 * resume job.
670 */
671 assert(cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY ||
672 (job->resuming && !job->suspending) ||
673 v3dv_cl_offset(&job->bcl) > 0);
674
675 /* When we merge multiple subpasses into the same job we must only emit one
676 * RCL, so we do that here, when we decided that we need to finish the job.
677 * Any rendering that happens outside a render pass is never merged, so
678 * the RCL should have been emitted by the time we got here.
679 */
680 assert(v3dv_cl_offset(&job->rcl) != 0 || cmd_buffer->state.pass);
681
682 if (!(cmd_buffer->state.barrier.dst_mask & V3DV_BARRIER_GRAPHICS_BIT)) {
683 cmd_buffer->state.barrier.bcl_buffer_access = 0;
684 cmd_buffer->state.barrier.bcl_image_access = 0;
685 }
686
687 /* If we are finishing a job inside a render pass we have two scenarios:
688 *
689 * 1. It is a regular CL, in which case we will submit the job to the GPU,
690 * so we may need to generate an RCL and add a binning flush.
691 *
692 * 2. It is a partial CL recorded in a secondary command buffer, in which
693 * case we are not submitting it directly to the GPU but rather branch to
694 * it from a primary command buffer. In this case we just want to end
695 * the BCL with a RETURN_FROM_SUB_LIST and the RCL and binning flush
696 * will be the primary job that branches to this CL.
697 */
698 if (cmd_buffer->state.pass) {
699 if (job->type == V3DV_JOB_TYPE_GPU_CL) {
700 cmd_buffer_end_render_pass_frame(cmd_buffer);
701 } else {
702 assert(job->type == V3DV_JOB_TYPE_GPU_CL_INCOMPLETE);
703 v3d_X((&cmd_buffer->device->devinfo), cmd_buffer_end_render_pass_secondary)(cmd_buffer);
704 }
705 }
706
707 bool suspending = job->suspending;
708 list_addtail(&job->list_link, &cmd_buffer->jobs);
709 cmd_buffer->state.job = NULL;
710
711 /* If we have recorded any state with this last GPU job that requires us to
712 * emit more jobs after it completes, add them now. The only exception
713 * is secondary command buffers inside a render pass, because in
714 * that case we want to defer this until we finish recording the primary
715 * job into which we execute the secondary.
716 */
717 if (!suspending) {
718 if (cmd_buffer_has_pending_jobs(cmd_buffer) &&
719 (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_PRIMARY ||
720 !cmd_buffer->state.pass)) {
721 cmd_buffer_add_pending_jobs(cmd_buffer);
722 }
723 }
724 }
725
726 bool
727 v3dv_job_type_is_gpu(struct v3dv_job *job)
728 {
729 switch (job->type) {
730 case V3DV_JOB_TYPE_GPU_CL:
731 case V3DV_JOB_TYPE_GPU_CL_INCOMPLETE:
732 case V3DV_JOB_TYPE_GPU_TFU:
733 case V3DV_JOB_TYPE_GPU_CSD:
734 return true;
735 default:
736 return false;
737 }
738 }
739
740 static void
741 cmd_buffer_serialize_job_if_needed(struct v3dv_cmd_buffer *cmd_buffer,
742 struct v3dv_job *job)
743 {
744 assert(cmd_buffer && job);
745
746 /* Serialization only affects GPU jobs, CPU jobs are always automatically
747 * serialized.
748 */
749 if (!v3dv_job_type_is_gpu(job))
750 return;
751
752 uint8_t barrier_mask = cmd_buffer->state.barrier.dst_mask;
753 if (barrier_mask == 0)
754 return;
755
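/* Classify the job into a barrier domain (compute, transfer or graphics)
 * and, if a pending barrier targets that domain, serialize the job
 * against the recorded source mask, then consume both masks so follow-up
 * jobs in the same domain are not serialized again unnecessarily.
 */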
756 uint8_t bit = 0;
757 uint8_t *src_mask;
758 if (job->type == V3DV_JOB_TYPE_GPU_CSD) {
759 assert(!job->is_transfer);
760 bit = V3DV_BARRIER_COMPUTE_BIT;
761 src_mask = &cmd_buffer->state.barrier.src_mask_compute;
762 } else if (job->is_transfer) {
763 assert(job->type == V3DV_JOB_TYPE_GPU_CL ||
764 job->type == V3DV_JOB_TYPE_GPU_CL_INCOMPLETE ||
765 job->type == V3DV_JOB_TYPE_GPU_TFU);
766 bit = V3DV_BARRIER_TRANSFER_BIT;
767 src_mask = &cmd_buffer->state.barrier.src_mask_transfer;
768 } else {
769 assert(job->type == V3DV_JOB_TYPE_GPU_CL ||
770 job->type == V3DV_JOB_TYPE_GPU_CL_INCOMPLETE);
771 bit = V3DV_BARRIER_GRAPHICS_BIT;
772 src_mask = &cmd_buffer->state.barrier.src_mask_graphics;
773 }
774
775 if (barrier_mask & bit) {
776 job->serialize = *src_mask;
777 *src_mask = 0;
778 cmd_buffer->state.barrier.dst_mask &= ~bit;
779 }
780 }
781
782 void
783 v3dv_job_init(struct v3dv_job *job,
784 enum v3dv_job_type type,
785 struct v3dv_device *device,
786 struct v3dv_cmd_buffer *cmd_buffer,
787 int32_t subpass_idx)
788 {
789 MESA_TRACE_FUNC();
790 assert(job);
791
792 /* Make sure we haven't made this new job current before calling here */
793 assert(!cmd_buffer || cmd_buffer->state.job != job);
794
795 job->type = type;
796
797 job->device = device;
798 job->cmd_buffer = cmd_buffer;
799
800 list_inithead(&job->list_link);
801
802 if (type == V3DV_JOB_TYPE_GPU_CL ||
803 type == V3DV_JOB_TYPE_GPU_CL_INCOMPLETE ||
804 type == V3DV_JOB_TYPE_GPU_CSD) {
805 job->bos =
806 _mesa_set_create(NULL, _mesa_hash_pointer, _mesa_key_pointer_equal);
807 job->bo_count = 0;
808
809 v3dv_cl_init(job, &job->indirect);
810
811 if (V3D_DBG(ALWAYS_FLUSH))
812 job->always_flush = true;
813 }
814
815 if (type == V3DV_JOB_TYPE_GPU_CL ||
816 type == V3DV_JOB_TYPE_GPU_CL_INCOMPLETE) {
817 v3dv_cl_init(job, &job->bcl);
818 v3dv_cl_init(job, &job->rcl);
819 }
820
821 if (cmd_buffer) {
822 /* Flag all state as dirty. Generally, we need to re-emit state for each
823 * new job.
824 *
825 * FIXME: there may be some exceptions, in which case we could skip some
826 * bits.
827 */
828 cmd_buffer->state.dirty = ~0;
829 cmd_buffer->state.dirty_descriptor_stages = ~0;
830 vk_dynamic_graphics_state_dirty_all(&cmd_buffer->vk.dynamic_graphics_state);
831
832 /* Honor inheritance of occlusion queries in secondaries if requested */
833 if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY &&
834 cmd_buffer->state.inheritance.occlusion_query_enable) {
835 cmd_buffer->state.dirty &= ~V3DV_CMD_DIRTY_OCCLUSION_QUERY;
836 }
837
838 /* Keep track of the first subpass that we are recording in this new job.
839 * We will use this when we emit the RCL to decide how to emit our loads
840 * and stores.
841 */
842 if (cmd_buffer->state.pass)
843 job->first_subpass = subpass_idx;
844
845 job->is_transfer = cmd_buffer->state.is_transfer;
846
847 cmd_buffer_serialize_job_if_needed(cmd_buffer, job);
848
849 job->perf = cmd_buffer->state.query.active_query.perf;
850 }
851 }
852
853 struct v3dv_job *
854 v3dv_cmd_buffer_start_job(struct v3dv_cmd_buffer *cmd_buffer,
855 int32_t subpass_idx,
856 enum v3dv_job_type type)
857 {
858 /* Don't create a new job if we can merge the current subpass into
859 * the current job.
860 */
861 if (cmd_buffer->state.pass &&
862 subpass_idx != -1 &&
863 cmd_buffer_can_merge_subpass(cmd_buffer, subpass_idx)) {
864 cmd_buffer->state.job->is_subpass_finish = false;
865 return cmd_buffer->state.job;
866 }
867
868 /* Ensure we are not starting a new job without finishing a previous one */
869 if (cmd_buffer->state.job != NULL)
870 v3dv_cmd_buffer_finish_job(cmd_buffer);
871
872 assert(cmd_buffer->state.job == NULL);
873 struct v3dv_job *job = vk_zalloc(&cmd_buffer->device->vk.alloc,
874 sizeof(struct v3dv_job), 8,
875 VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
876
877 if (!job) {
878 mesa_loge("Error: failed to allocate CPU memory for job\n");
879 v3dv_flag_oom(cmd_buffer, NULL);
880 return NULL;
881 }
882
883 v3dv_job_init(job, type, cmd_buffer->device, cmd_buffer, subpass_idx);
884 cmd_buffer->state.job = job;
885
886 return job;
887 }
888
889 static void
890 cmd_buffer_reset(struct vk_command_buffer *vk_cmd_buffer,
891 VkCommandBufferResetFlags flags)
892 {
893 struct v3dv_cmd_buffer *cmd_buffer =
894 container_of(vk_cmd_buffer, struct v3dv_cmd_buffer, vk);
895
896 vk_command_buffer_reset(&cmd_buffer->vk);
897 if (cmd_buffer->status != V3DV_CMD_BUFFER_STATUS_INITIALIZED) {
898 struct v3dv_device *device = cmd_buffer->device;
899
900 /* FIXME: For now we always free all resources as if
901 * VK_COMMAND_BUFFER_RESET_RELEASE_RESOURCES_BIT was set.
902 */
903 if (cmd_buffer->status != V3DV_CMD_BUFFER_STATUS_NEW)
904 cmd_buffer_free_resources(cmd_buffer);
905
906 cmd_buffer_init(cmd_buffer, device);
907 }
908
909 assert(cmd_buffer->status == V3DV_CMD_BUFFER_STATUS_INITIALIZED);
910 }
911
912
913 static void
914 cmd_buffer_emit_resolve(struct v3dv_cmd_buffer *cmd_buffer,
915 uint32_t dst_attachment_idx,
916 uint32_t src_attachment_idx,
917 VkImageAspectFlagBits aspect)
918 {
919 struct v3dv_image_view *src_iview =
920 cmd_buffer->state.attachments[src_attachment_idx].image_view;
921 struct v3dv_image_view *dst_iview =
922 cmd_buffer->state.attachments[dst_attachment_idx].image_view;
923
924 const VkRect2D *ra = &cmd_buffer->state.render_area;
925
926 VkImageResolve2 region = {
927 .sType = VK_STRUCTURE_TYPE_IMAGE_RESOLVE_2,
928 .srcSubresource = {
929 aspect,
930 src_iview->vk.base_mip_level,
931 src_iview->vk.base_array_layer,
932 src_iview->vk.layer_count,
933 },
934 .srcOffset = { ra->offset.x, ra->offset.y, 0 },
935 .dstSubresource = {
936 aspect,
937 dst_iview->vk.base_mip_level,
938 dst_iview->vk.base_array_layer,
939 dst_iview->vk.layer_count,
940 },
941 .dstOffset = { ra->offset.x, ra->offset.y, 0 },
942 .extent = { ra->extent.width, ra->extent.height, 1 },
943 };
944
945 struct v3dv_image *src_image = (struct v3dv_image *) src_iview->vk.image;
946 struct v3dv_image *dst_image = (struct v3dv_image *) dst_iview->vk.image;
947 VkResolveImageInfo2 resolve_info = {
948 .sType = VK_STRUCTURE_TYPE_RESOLVE_IMAGE_INFO_2,
949 .srcImage = v3dv_image_to_handle(src_image),
950 .srcImageLayout = VK_IMAGE_LAYOUT_GENERAL,
951 .dstImage = v3dv_image_to_handle(dst_image),
952 .dstImageLayout = VK_IMAGE_LAYOUT_GENERAL,
953 .regionCount = 1,
954 .pRegions = &region,
955 };
956
957 VkCommandBuffer cmd_buffer_handle = v3dv_cmd_buffer_to_handle(cmd_buffer);
958 v3dv_CmdResolveImage2(cmd_buffer_handle, &resolve_info);
959 }
960
961 static void
962 cmd_buffer_subpass_handle_pending_resolves(struct v3dv_cmd_buffer *cmd_buffer)
963 {
964 assert(cmd_buffer->state.subpass_idx < cmd_buffer->state.pass->subpass_count);
965 const struct v3dv_render_pass *pass = cmd_buffer->state.pass;
966 const struct v3dv_subpass *subpass =
967 &pass->subpasses[cmd_buffer->state.subpass_idx];
968
969 if (!subpass->resolve_attachments)
970 return;
971
972 /* At this point we have already ended the current subpass and now we are
973 * about to emit vkCmdResolveImage calls to get the resolves we can't
974 * handle in the subpass RCL.
975 *
976 * vkCmdResolveImage is not supposed to be called inside a render pass so
977 * before we call that we need to make sure our command buffer state reflects
978 * that we are no longer in a subpass by finishing the current job and
979 * resetting the framebuffer and render pass state temporarily and then
980 * restoring it after we are done with the resolves.
981 */
982 if (cmd_buffer->state.job)
983 v3dv_cmd_buffer_finish_job(cmd_buffer);
984 struct v3dv_framebuffer *restore_fb = cmd_buffer->state.framebuffer;
985 struct v3dv_render_pass *restore_pass = cmd_buffer->state.pass;
986 uint32_t restore_subpass_idx = cmd_buffer->state.subpass_idx;
987 cmd_buffer->state.framebuffer = NULL;
988 cmd_buffer->state.pass = NULL;
989 cmd_buffer->state.subpass_idx = -1;
990
991 for (uint32_t i = 0; i < subpass->color_count; i++) {
992 const uint32_t src_attachment_idx =
993 subpass->color_attachments[i].attachment;
994 if (src_attachment_idx == VK_ATTACHMENT_UNUSED)
995 continue;
996
997 /* Skip if this attachment doesn't have a resolve or if it was already
998 * implemented as a TLB resolve.
999 */
1000 if (!cmd_buffer->state.attachments[src_attachment_idx].has_resolve ||
1001 cmd_buffer->state.attachments[src_attachment_idx].use_tlb_resolve) {
1002 continue;
1003 }
1004
1005 const uint32_t dst_attachment_idx =
1006 subpass->resolve_attachments[i].attachment;
1007 assert(dst_attachment_idx != VK_ATTACHMENT_UNUSED);
1008
1009 cmd_buffer_emit_resolve(cmd_buffer, dst_attachment_idx, src_attachment_idx,
1010 VK_IMAGE_ASPECT_COLOR_BIT);
1011 }
1012
1013 const uint32_t ds_src_attachment_idx =
1014 subpass->ds_attachment.attachment;
1015 if (ds_src_attachment_idx != VK_ATTACHMENT_UNUSED &&
1016 cmd_buffer->state.attachments[ds_src_attachment_idx].has_resolve &&
1017 !cmd_buffer->state.attachments[ds_src_attachment_idx].use_tlb_resolve) {
1018 assert(subpass->resolve_depth || subpass->resolve_stencil);
1019 const VkImageAspectFlags ds_aspects =
1020 (subpass->resolve_depth ? VK_IMAGE_ASPECT_DEPTH_BIT : 0) |
1021 (subpass->resolve_stencil ? VK_IMAGE_ASPECT_STENCIL_BIT : 0);
1022 const uint32_t ds_dst_attachment_idx =
1023 subpass->ds_resolve_attachment.attachment;
1024 assert(ds_dst_attachment_idx != VK_ATTACHMENT_UNUSED);
1025 cmd_buffer_emit_resolve(cmd_buffer, ds_dst_attachment_idx,
1026 ds_src_attachment_idx, ds_aspects);
1027 }
1028
1029 cmd_buffer->state.framebuffer = restore_fb;
1030 cmd_buffer->state.pass = restore_pass;
1031 cmd_buffer->state.subpass_idx = restore_subpass_idx;
1032 }
1033
1034 static VkResult
1035 cmd_buffer_begin_render_pass_secondary(
1036 struct v3dv_cmd_buffer *cmd_buffer,
1037 const VkCommandBufferInheritanceInfo *inheritance_info)
1038 {
1039 assert(cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY);
1040 assert(cmd_buffer->usage_flags & VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT);
1041 assert(inheritance_info);
1042
1043 const VkCommandBufferInheritanceRenderingInfo *rendering_info = NULL;
1044 if (inheritance_info->renderPass == VK_NULL_HANDLE) {
1045 rendering_info = vk_find_struct_const(inheritance_info,
1046 COMMAND_BUFFER_INHERITANCE_RENDERING_INFO);
1047 assert(rendering_info);
1048 v3dv_setup_dynamic_render_pass_inheritance(cmd_buffer, rendering_info);
1049 cmd_buffer->state.pass = &cmd_buffer->state.dynamic_pass;
1050 cmd_buffer->state.subpass_idx = 0;
1051 cmd_buffer->state.framebuffer = NULL;
1052 } else {
1053 cmd_buffer->state.pass =
1054 v3dv_render_pass_from_handle(inheritance_info->renderPass);
1055
1056 assert(inheritance_info->subpass < cmd_buffer->state.pass->subpass_count);
1057 cmd_buffer->state.subpass_idx = inheritance_info->subpass;
1058
1059 cmd_buffer->state.framebuffer =
1060 v3dv_framebuffer_from_handle(inheritance_info->framebuffer);
1061 }
1062 assert(cmd_buffer->state.pass);
1063
1064 cmd_buffer->state.inheritance.occlusion_query_enable =
1065 inheritance_info->occlusionQueryEnable;
1066
1067 /* Secondaries that execute inside a render pass won't start subpasses
1068 * so we want to create a job for them here.
1069 */
1070 struct v3dv_job *job =
1071 v3dv_cmd_buffer_start_job(cmd_buffer, cmd_buffer->state.subpass_idx,
1072 V3DV_JOB_TYPE_GPU_CL_INCOMPLETE);
1073 if (!job) {
1074 v3dv_flag_oom(cmd_buffer, NULL);
1075 return VK_ERROR_OUT_OF_HOST_MEMORY;
1076 }
1077
1078 /* Secondary command buffers don't know about the render area, but our
1079 * scissor setup accounts for it, so let's make sure we make it large
1080 * enough that it doesn't actually constrain any rendering. This should
1081 * be fine, since the Vulkan spec states:
1082 *
1083 * "The application must ensure (using scissor if necessary) that all
1084 * rendering is contained within the render area."
1085 */
1086 const struct v3dv_framebuffer *framebuffer = cmd_buffer->state.framebuffer;
1087 cmd_buffer->state.render_area.offset.x = 0;
1088 cmd_buffer->state.render_area.offset.y = 0;
1089 cmd_buffer->state.render_area.extent.width =
1090 framebuffer ? framebuffer->width : V3D_MAX_IMAGE_DIMENSION;
1091 cmd_buffer->state.render_area.extent.height =
1092 framebuffer ? framebuffer->height : V3D_MAX_IMAGE_DIMENSION;
1093
1094 /* We only really execute double-buffer mode in primary jobs, but we still
1095 * allow it in render pass secondaries so we can keep track of the
1096 * double-buffer score there and update it in the primaries they are
1097 * executed into.
1098 */
1099 job->can_use_double_buffer = true;
1100
1101 return VK_SUCCESS;
1102 }
1103
1104 const struct vk_command_buffer_ops v3dv_cmd_buffer_ops = {
1105 .create = cmd_buffer_create,
1106 .reset = cmd_buffer_reset,
1107 .destroy = cmd_buffer_destroy,
1108 };
1109
1110 VKAPI_ATTR VkResult VKAPI_CALL
1111 v3dv_BeginCommandBuffer(VkCommandBuffer commandBuffer,
1112 const VkCommandBufferBeginInfo *pBeginInfo)
1113 {
1114 MESA_TRACE_FUNC();
1115 V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
1116
1117 /* If this is the first vkBeginCommandBuffer, we must initialize the
1118 * command buffer's state. Otherwise, we must reset its state. In both
1119 * cases we reset it.
1120 */
1121 cmd_buffer_reset(&cmd_buffer->vk, 0);
1122
1123 assert(cmd_buffer->status == V3DV_CMD_BUFFER_STATUS_INITIALIZED);
1124
1125 cmd_buffer->usage_flags = pBeginInfo->flags;
1126
1127 if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY) {
1128 if (pBeginInfo->flags & VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT) {
1129 VkResult result =
1130 cmd_buffer_begin_render_pass_secondary(cmd_buffer,
1131 pBeginInfo->pInheritanceInfo);
1132 if (result != VK_SUCCESS)
1133 return result;
1134 }
1135 }
1136
1137 cmd_buffer->status = V3DV_CMD_BUFFER_STATUS_RECORDING;
1138
1139 return VK_SUCCESS;
1140 }
1141
1142 static void
1143 cmd_buffer_update_tile_alignment(struct v3dv_cmd_buffer *cmd_buffer)
1144 {
1145 /* Render areas and scissor/viewport are only relevant inside render passes,
1146 * otherwise we are dealing with transfer operations where these elements
1147 * don't apply.
1148 */
1149 assert(cmd_buffer->state.pass);
1150 const VkRect2D *rect = &cmd_buffer->state.render_area;
1151
1152 /* We should only call this at the beginning of a subpass so we should
1153 * always have framebuffer information available.
1154 */
1155 assert(cmd_buffer->state.framebuffer);
1156 cmd_buffer->state.tile_aligned_render_area =
1157 v3dv_subpass_area_is_tile_aligned(cmd_buffer->device, rect,
1158 cmd_buffer->state.framebuffer,
1159 cmd_buffer->state.pass,
1160 cmd_buffer->state.subpass_idx);
1161
1162 if (!cmd_buffer->state.tile_aligned_render_area) {
1163 perf_debug("Render area for subpass %d of render pass %p doesn't "
1164 "match render pass granularity.\n",
1165 cmd_buffer->state.subpass_idx, cmd_buffer->state.pass);
1166 }
1167 }
1168
1169 static void
1170 cmd_buffer_update_attachment_resolve_state(struct v3dv_cmd_buffer *cmd_buffer)
1171 {
1172 /* NOTE: This should be called after cmd_buffer_update_tile_alignment()
1173 * since it relies on up-to-date information about subpass tile alignment.
1174 */
1175 const struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
1176 const struct v3dv_render_pass *pass = state->pass;
1177 const struct v3dv_subpass *subpass = &pass->subpasses[state->subpass_idx];
1178
1179 for (uint32_t i = 0; i < subpass->color_count; i++) {
1180 const uint32_t attachment_idx = subpass->color_attachments[i].attachment;
1181 if (attachment_idx == VK_ATTACHMENT_UNUSED)
1182 continue;
1183
1184 state->attachments[attachment_idx].has_resolve =
1185 subpass->resolve_attachments &&
1186 subpass->resolve_attachments[i].attachment != VK_ATTACHMENT_UNUSED;
1187
1188 state->attachments[attachment_idx].use_tlb_resolve =
1189 state->attachments[attachment_idx].has_resolve &&
1190 state->tile_aligned_render_area &&
1191 pass->attachments[attachment_idx].try_tlb_resolve;
1192 }
1193
1194 uint32_t ds_attachment_idx = subpass->ds_attachment.attachment;
1195 if (ds_attachment_idx != VK_ATTACHMENT_UNUSED) {
1196 uint32_t ds_resolve_attachment_idx =
1197 subpass->ds_resolve_attachment.attachment;
1198 state->attachments[ds_attachment_idx].has_resolve =
1199 ds_resolve_attachment_idx != VK_ATTACHMENT_UNUSED;
1200
1201 assert(!state->attachments[ds_attachment_idx].has_resolve ||
1202 (subpass->resolve_depth || subpass->resolve_stencil));
1203
1204 state->attachments[ds_attachment_idx].use_tlb_resolve =
1205 state->attachments[ds_attachment_idx].has_resolve &&
1206 state->tile_aligned_render_area &&
1207 pass->attachments[ds_attachment_idx].try_tlb_resolve;
1208 }
1209 }
1210
1211 static void
1212 cmd_buffer_state_set_attachment_clear_color(struct v3dv_cmd_buffer *cmd_buffer,
1213 uint32_t attachment_idx,
1214 const VkClearColorValue *color)
1215 {
1216 assert(attachment_idx < cmd_buffer->state.pass->attachment_count);
1217 const struct v3dv_render_pass_attachment *attachment =
1218 &cmd_buffer->state.pass->attachments[attachment_idx];
1219
1220 uint32_t internal_type, internal_bpp;
1221 const struct v3dv_format *format =
1222 v3d_X((&cmd_buffer->device->devinfo), get_format)(attachment->desc.format);
1223 /* We don't allow multi-planar formats for render pass attachments */
1224 assert(format->plane_count == 1);
1225
1226 v3d_X((&cmd_buffer->device->devinfo), get_internal_type_bpp_for_output_format)
1227 (format->planes[0].rt_type, &internal_type, &internal_bpp);
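/* internal_bpp is the HW internal bpp enum (assuming the usual V3D
 * encoding where 0/1/2 mean 32/64/128 bpp), so 4 << internal_bpp is the
 * per-pixel size in bytes used to pack the HW clear color below.
 */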
1228
1229 uint32_t internal_size = 4 << internal_bpp;
1230
1231 struct v3dv_cmd_buffer_attachment_state *attachment_state =
1232 &cmd_buffer->state.attachments[attachment_idx];
1233
1234 v3d_X((&cmd_buffer->device->devinfo), get_hw_clear_color)
1235 (color, internal_type, internal_size, &attachment_state->clear_value.color[0]);
1236
1237 attachment_state->vk_clear_value.color = *color;
1238 }
1239
1240 static void
1241 cmd_buffer_state_set_attachment_clear_depth_stencil(
1242 struct v3dv_cmd_buffer *cmd_buffer,
1243 uint32_t attachment_idx,
1244 bool clear_depth, bool clear_stencil,
1245 const VkClearDepthStencilValue *ds)
1246 {
1247 struct v3dv_cmd_buffer_attachment_state *attachment_state =
1248 &cmd_buffer->state.attachments[attachment_idx];
1249
1250 if (clear_depth)
1251 attachment_state->clear_value.z = ds->depth;
1252
1253 if (clear_stencil)
1254 attachment_state->clear_value.s = ds->stencil;
1255
1256 attachment_state->vk_clear_value.depthStencil = *ds;
1257 }
1258
1259 static void
1260 cmd_buffer_state_set_clear_values(struct v3dv_cmd_buffer *cmd_buffer,
1261 uint32_t count, const VkClearValue *values)
1262 {
1263 struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
1264 const struct v3dv_render_pass *pass = state->pass;
1265
1266 /* There could be fewer clear values than attachments in the render pass, in
1267 * which case we only want to process as many as we have, or there could be
1268 * more, in which case we want to ignore those for which we don't have a
1269 * corresponding attachment.
1270 */
1271 count = MIN2(count, pass->attachment_count);
1272 for (uint32_t i = 0; i < count; i++) {
1273 const struct v3dv_render_pass_attachment *attachment =
1274 &pass->attachments[i];
1275
1276 if (attachment->desc.loadOp != VK_ATTACHMENT_LOAD_OP_CLEAR)
1277 continue;
1278
1279 VkImageAspectFlags aspects = vk_format_aspects(attachment->desc.format);
1280 if (aspects & VK_IMAGE_ASPECT_COLOR_BIT) {
1281 cmd_buffer_state_set_attachment_clear_color(cmd_buffer, i,
1282 &values[i].color);
1283 } else if (aspects & (VK_IMAGE_ASPECT_DEPTH_BIT |
1284 VK_IMAGE_ASPECT_STENCIL_BIT)) {
1285 cmd_buffer_state_set_attachment_clear_depth_stencil(
1286 cmd_buffer, i,
1287 aspects & VK_IMAGE_ASPECT_DEPTH_BIT,
1288 aspects & VK_IMAGE_ASPECT_STENCIL_BIT,
1289 &values[i].depthStencil);
1290 }
1291 }
1292 }
1293
1294 static void
1295 cmd_buffer_state_set_attachments(struct v3dv_cmd_buffer *cmd_buffer,
1296 const VkRenderPassBeginInfo *pRenderPassBegin)
1297 {
1298 V3DV_FROM_HANDLE(v3dv_render_pass, pass, pRenderPassBegin->renderPass);
1299 V3DV_FROM_HANDLE(v3dv_framebuffer, framebuffer, pRenderPassBegin->framebuffer);
1300
1301 const VkRenderPassAttachmentBeginInfo *attach_begin =
1302 vk_find_struct_const(pRenderPassBegin, RENDER_PASS_ATTACHMENT_BEGIN_INFO);
1303
1304 struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
1305
1306 for (uint32_t i = 0; i < pass->attachment_count; i++) {
1307 if (attach_begin && attach_begin->attachmentCount != 0) {
1308 state->attachments[i].image_view =
1309 v3dv_image_view_from_handle(attach_begin->pAttachments[i]);
1310 } else if (framebuffer) {
1311 state->attachments[i].image_view = framebuffer->attachments[i];
1312 } else {
1313 assert(cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY);
1314 state->attachments[i].image_view = NULL;
1315 }
1316 }
1317 }
1318
1319 static void
1320 cmd_buffer_init_render_pass_attachment_state(struct v3dv_cmd_buffer *cmd_buffer,
1321 const VkRenderPassBeginInfo *pRenderPassBegin)
1322 {
1323 cmd_buffer_state_set_clear_values(cmd_buffer,
1324 pRenderPassBegin->clearValueCount,
1325 pRenderPassBegin->pClearValues);
1326
1327 cmd_buffer_state_set_attachments(cmd_buffer, pRenderPassBegin);
1328 }
1329
1330 static void
1331 cmd_buffer_ensure_render_pass_attachment_state(struct v3dv_cmd_buffer *cmd_buffer)
1332 {
1333 struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
1334 const struct v3dv_render_pass *pass = state->pass;
1335
1336 if (state->attachment_alloc_count < pass->attachment_count) {
1337 if (state->attachments) {
1338 assert(state->attachment_alloc_count > 0);
1339 vk_free(&cmd_buffer->device->vk.alloc, state->attachments);
1340 }
1341
1342 uint32_t size = sizeof(struct v3dv_cmd_buffer_attachment_state) *
1343 pass->attachment_count;
1344 state->attachments = vk_zalloc(&cmd_buffer->device->vk.alloc, size, 8,
1345 VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
1346 if (!state->attachments) {
1347 v3dv_flag_oom(cmd_buffer, NULL);
1348 return;
1349 }
1350 state->attachment_alloc_count = pass->attachment_count;
1351 }
1352
1353 assert(state->attachment_alloc_count >= pass->attachment_count);
1354 }
1355
1356 /* If our render area is smaller than the current clip window we will have
1357 * to emit a new clip window to constrain it to the render area.
1358 */
1359 static void
1360 constraint_clip_window_to_render_area(struct v3dv_cmd_buffer *cmd_buffer)
1361 {
1362 struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
1363 struct vk_dynamic_graphics_state *dyn = &cmd_buffer->vk.dynamic_graphics_state;
1364
1365 uint32_t min_render_x = state->render_area.offset.x;
1366 uint32_t min_render_y = state->render_area.offset.y;
1367 uint32_t max_render_x = min_render_x + state->render_area.extent.width - 1;
1368 uint32_t max_render_y = min_render_y + state->render_area.extent.height - 1;
1369 uint32_t min_clip_x = state->clip_window.offset.x;
1370 uint32_t min_clip_y = state->clip_window.offset.y;
1371 uint32_t max_clip_x = min_clip_x + state->clip_window.extent.width - 1;
1372 uint32_t max_clip_y = min_clip_y + state->clip_window.extent.height - 1;
1373 if (min_render_x > min_clip_x || min_render_y > min_clip_y ||
1374 max_render_x < max_clip_x || max_render_y < max_clip_y) {
1375 BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_VP_SCISSORS);
1376 }
1377 }
1378
1379 VKAPI_ATTR void VKAPI_CALL
1380 v3dv_CmdBeginRenderPass2(VkCommandBuffer commandBuffer,
1381 const VkRenderPassBeginInfo *pRenderPassBegin,
1382 const VkSubpassBeginInfo *pSubpassBeginInfo)
1383 {
1384 V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
1385 V3DV_FROM_HANDLE(v3dv_render_pass, pass, pRenderPassBegin->renderPass);
1386 V3DV_FROM_HANDLE(v3dv_framebuffer, framebuffer, pRenderPassBegin->framebuffer);
1387
1388 struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
1389 state->pass = pass;
1390 state->framebuffer = framebuffer;
1391
1392 cmd_buffer_ensure_render_pass_attachment_state(cmd_buffer);
1393 v3dv_return_if_oom(cmd_buffer, NULL);
1394
1395 cmd_buffer_init_render_pass_attachment_state(cmd_buffer, pRenderPassBegin);
1396
1397 state->render_area = pRenderPassBegin->renderArea;
1398 constraint_clip_window_to_render_area(cmd_buffer);
1399
1400 /* Setup for first subpass */
1401 v3dv_cmd_buffer_subpass_start(cmd_buffer, 0);
1402 }
1403
1404 VKAPI_ATTR void VKAPI_CALL
1405 v3dv_CmdNextSubpass2(VkCommandBuffer commandBuffer,
1406 const VkSubpassBeginInfo *pSubpassBeginInfo,
1407 const VkSubpassEndInfo *pSubpassEndInfo)
1408 {
1409 V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
1410
1411 struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
1412 assert(state->subpass_idx < state->pass->subpass_count - 1);
1413
1414 /* Finish the previous subpass */
1415 v3dv_cmd_buffer_subpass_finish(cmd_buffer);
1416 cmd_buffer_subpass_handle_pending_resolves(cmd_buffer);
1417
1418 /* Start the next subpass */
1419 v3dv_cmd_buffer_subpass_start(cmd_buffer, state->subpass_idx + 1);
1420 }
1421
1422 static void
1423 cmd_buffer_emit_subpass_clears(struct v3dv_cmd_buffer *cmd_buffer)
1424 {
1425 assert(cmd_buffer->state.pass);
1426 assert(cmd_buffer->state.subpass_idx < cmd_buffer->state.pass->subpass_count);
1427 assert(!cmd_buffer->state.resuming);
1428 const struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
1429 const struct v3dv_render_pass *pass = state->pass;
1430 const struct v3dv_subpass *subpass = &pass->subpasses[state->subpass_idx];
1431
1432 /* We only need to emit subpass clears as draw calls when the render
1433 * area is not aligned to tile boundaries or for GFXH-1461.
1434 */
1435 if (cmd_buffer->state.tile_aligned_render_area &&
1436 !subpass->do_depth_clear_with_draw &&
1437 !subpass->do_stencil_clear_with_draw) {
1438 return;
1439 }
1440
1441 uint32_t att_count = 0;
1442 VkClearAttachment atts[V3D_MAX_DRAW_BUFFERS + 1]; /* +1 for D/S */
1443
1444 /* We only need to emit subpass clears as draw calls for color attachments
1445 * if the render area is not aligned to tile boundaries.
1446 */
1447 if (!cmd_buffer->state.tile_aligned_render_area) {
1448 for (uint32_t i = 0; i < subpass->color_count; i++) {
1449 const uint32_t att_idx = subpass->color_attachments[i].attachment;
1450 if (att_idx == VK_ATTACHMENT_UNUSED)
1451 continue;
1452
1453 struct v3dv_render_pass_attachment *att = &pass->attachments[att_idx];
1454 if (att->desc.loadOp != VK_ATTACHMENT_LOAD_OP_CLEAR)
1455 continue;
1456
1457 if (state->subpass_idx != att->first_subpass)
1458 continue;
1459
1460 atts[att_count].aspectMask = VK_IMAGE_ASPECT_COLOR_BIT;
1461 atts[att_count].colorAttachment = i;
1462 atts[att_count].clearValue = state->attachments[att_idx].vk_clear_value;
1463 att_count++;
1464 }
1465 }
1466
1467 /* For D/S we may also need to emit a subpass clear for GFXH-1461 */
1468 const uint32_t ds_att_idx = subpass->ds_attachment.attachment;
1469 if (ds_att_idx != VK_ATTACHMENT_UNUSED) {
1470 struct v3dv_render_pass_attachment *att = &pass->attachments[ds_att_idx];
1471 if (state->subpass_idx == att->first_subpass) {
1472 VkImageAspectFlags aspects = vk_format_aspects(att->desc.format);
1473 if (att->desc.loadOp != VK_ATTACHMENT_LOAD_OP_CLEAR ||
1474 (cmd_buffer->state.tile_aligned_render_area &&
1475 !subpass->do_depth_clear_with_draw)) {
1476 aspects &= ~VK_IMAGE_ASPECT_DEPTH_BIT;
1477 }
1478 if (att->desc.stencilLoadOp != VK_ATTACHMENT_LOAD_OP_CLEAR ||
1479 (cmd_buffer->state.tile_aligned_render_area &&
1480 !subpass->do_stencil_clear_with_draw)) {
1481 aspects &= ~VK_IMAGE_ASPECT_STENCIL_BIT;
1482 }
1483 if (aspects) {
1484 atts[att_count].aspectMask = aspects;
1485 atts[att_count].colorAttachment = 0; /* Ignored */
1486 atts[att_count].clearValue =
1487 state->attachments[ds_att_idx].vk_clear_value;
1488 att_count++;
1489 }
1490 }
1491 }
1492
1493 if (att_count == 0)
1494 return;
1495
1496 if (!cmd_buffer->state.tile_aligned_render_area) {
1497 perf_debug("Render area doesn't match render pass granularity, falling "
1498 "back to vkCmdClearAttachments for "
1499 "VK_ATTACHMENT_LOAD_OP_CLEAR.\n");
1500 } else if (subpass->do_depth_clear_with_draw ||
1501 subpass->do_stencil_clear_with_draw) {
1502 perf_debug("Subpass clears DEPTH but loads STENCIL (or vice versa), "
1503 "falling back to vkCmdClearAttachments for "
1504 "VK_ATTACHMENT_LOAD_OP_CLEAR.\n");
1505 }
1506
1507 /* From the Vulkan 1.0 spec:
1508 *
1509 * "VK_ATTACHMENT_LOAD_OP_CLEAR specifies that the contents within the
1510 * render area will be cleared to a uniform value, which is specified
1511 * when a render pass instance is begun."
1512 *
1513 * So the clear is only constrained by the render area and not by pipeline
1514 * state such as scissor or viewport; these are also the semantics of
1515 * vkCmdClearAttachments.
1516 *
1517 * Also:
1518 *
1519 * "If the render pass instance this is recorded in uses multiview, then
1520 * baseArrayLayer must be zero and layerCount must be one."
1521 */
1522 assert(state->framebuffer);
1523 uint32_t layer_count = cmd_buffer->state.pass->multiview_enabled ?
1524 1 : state->framebuffer->layers;
1525 VkCommandBuffer _cmd_buffer = v3dv_cmd_buffer_to_handle(cmd_buffer);
1526 VkClearRect rect = {
1527 .rect = state->render_area,
1528 .baseArrayLayer = 0,
1529 .layerCount = layer_count,
1530 };
1531 v3dv_CmdClearAttachments(_cmd_buffer, att_count, atts, 1, &rect);
1532 }
1533
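/* Returns true if the given aspect of an attachment requires a tile load in
 * the current job. This considers not just the attachment's load operation,
 * but also whether the job starts after the attachment's first subpass,
 * whether it continues a subpass started in another job, and whether a store
 * of a non tile-aligned render area forces a load to preserve contents
 * outside the render area.
 */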
1534 bool
1535 v3dv_cmd_buffer_check_needs_load(const struct v3dv_cmd_buffer_state *state,
1536 VkImageAspectFlags aspect,
1537 uint32_t first_subpass_idx,
1538 VkAttachmentLoadOp load_op,
1539 uint32_t last_subpass_idx,
1540 VkAttachmentStoreOp store_op)
1541 {
1542 /* We call this with image->vk.aspects & aspect, so 0 means the aspect we are
1543 * testing does not exist in the image.
1544 */
1545 if (!aspect)
1546 return false;
1547
1548 /* Attachment (or view) load operations apply on the first subpass that
1549 * uses the attachment (or view); otherwise we always need to load.
1550 */
1551 if (state->job->first_subpass > first_subpass_idx)
1552 return true;
1553
1554 /* If the job is continuing a subpass started in another job, we always
1555 * need to load.
1556 */
1557 if (state->job->is_subpass_continue)
1558 return true;
1559
1560 /* If the area is not aligned to tile boundaries and we are going to store,
1561 * then we need to load to preserve contents outside the render area.
1562 */
1563 if (!state->tile_aligned_render_area &&
1564 v3dv_cmd_buffer_check_needs_store(state, aspect, last_subpass_idx,
1565 store_op)) {
1566 return true;
1567 }
1568
1569 /* The attachment load operations must be LOAD */
1570 return load_op == VK_ATTACHMENT_LOAD_OP_LOAD;
1571 }
1572
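/* Returns true if the given aspect of an attachment requires a tile store in
 * the current job: either this is not the last job of the last subpass using
 * the attachment, or it is and the attachment's store operation is STORE.
 */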
1573 bool
1574 v3dv_cmd_buffer_check_needs_store(const struct v3dv_cmd_buffer_state *state,
1575 VkImageAspectFlags aspect,
1576 uint32_t last_subpass_idx,
1577 VkAttachmentStoreOp store_op)
1578 {
1579 /* We call this with image->vk.aspects & aspect, so 0 means the aspect we are
1580 * testing does not exist in the image.
1581 */
1582 if (!aspect)
1583 return false;
1584
1585 /* Attachment (or view) store operations only apply on the last subpass
1586 * where the attachment (or view) is used, in other subpasses we always
1587 * need to store.
1588 */
1589 if (state->subpass_idx < last_subpass_idx)
1590 return true;
1591
1592 /* Attachment store operations only apply on the last job we emit on the
1593 * last subpass where the attachment is used; otherwise we always need to
1594 * store.
1595 */
1596 if (!state->job->is_subpass_finish)
1597 return true;
1598
1599 /* The attachment store operation must be STORE */
1600 return store_op == VK_ATTACHMENT_STORE_OP_STORE;
1601 }
1602
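/* Decides whether the current subpass job may use double-buffer mode, where
 * rendering of the next tile can overlap the store of the previous one at
 * the cost of a smaller tile size. This is only considered when explicitly
 * requested through V3D_DEBUG and when the job has tile stores but no loads.
 */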
1603 static void
1604 cmd_buffer_subpass_check_double_buffer_mode(struct v3dv_cmd_buffer *cmd_buffer,
1605 bool msaa)
1606 {
1607 const struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
1608 struct v3dv_job *job = cmd_buffer->state.job;
1609 assert(job);
1610
1611 job->can_use_double_buffer = false;
1612
1613 /* Double-buffer can only be used if requested via V3D_DEBUG */
1614 if (!V3D_DBG(DOUBLE_BUFFER))
1615 return;
1616
1617 /* Double-buffer cannot be enabled for MSAA jobs */
1618 if (msaa)
1619 return;
1620
1621 const struct v3dv_render_pass *pass = state->pass;
1622 const struct v3dv_subpass *subpass = &pass->subpasses[state->subpass_idx];
1623
1624 /* FIXME: For now we discard multiview jobs (which have an implicit geometry
1625 * shader) for this optimization. If we want to enable this with multiview
1626 * we would need to check if any view (layer) in any attachment used by the
1627 * job has loads and/or stores as we do below for regular attachments. Also,
1628 * we would want to have a heuristic that doesn't automatically disable
1629 * double-buffer in the presence of geometry shaders.
1630 */
1631 if (state->pass->multiview_enabled)
1632 return;
1633
1634 /* Tile loads are serialized against stores, in which case we don't get
1635 * any benefits from enabling double-buffer and would just pay the price
1636 * of a smaller tile size instead. Similarly, we only benefit from
1637 * double-buffer if we have tile stores, as the point of this mode is
1638 * to execute rendering of a new tile while we store the previous one to
1639 * hide latency on the tile store operation.
1640 */
1641 bool has_stores = false;
1642 for (uint32_t i = 0; i < subpass->color_count; i++) {
1643 uint32_t attachment_idx = subpass->color_attachments[i].attachment;
1644 if (attachment_idx == VK_ATTACHMENT_UNUSED)
1645 continue;
1646
1647 const struct v3dv_render_pass_attachment *attachment =
1648 &state->pass->attachments[attachment_idx];
1649
1650 /* FIXME: This will check 'tile_aligned_render_area' but that was
1651 * computed with a tile size without double-buffer. That is okay
1652 * because if the larger tile size is aligned then we know the smaller
1653 * tile size for double-buffer will be as well. However, we might
1654 * still benefit from doing this check with the smaller tile size
1655 * because it can happen that the smaller size is aligned and the
1656 * larger size is not.
1657 */
1658 if (v3dv_cmd_buffer_check_needs_load(state,
1659 VK_IMAGE_ASPECT_COLOR_BIT,
1660 attachment->first_subpass,
1661 attachment->desc.loadOp,
1662 attachment->last_subpass,
1663 attachment->desc.storeOp)) {
1664 return;
1665 }
1666
1667 if (v3dv_cmd_buffer_check_needs_store(state,
1668 VK_IMAGE_ASPECT_COLOR_BIT,
1669 attachment->last_subpass,
1670 attachment->desc.storeOp)) {
1671 has_stores = true;
1672 }
1673 }
1674
1675 if (subpass->ds_attachment.attachment != VK_ATTACHMENT_UNUSED) {
1676 uint32_t ds_attachment_idx = subpass->ds_attachment.attachment;
1677 const struct v3dv_render_pass_attachment *ds_attachment =
1678 &state->pass->attachments[ds_attachment_idx];
1679
1680 const VkImageAspectFlags ds_aspects =
1681 vk_format_aspects(ds_attachment->desc.format);
1682
1683 if (v3dv_cmd_buffer_check_needs_load(state,
1684 ds_aspects & VK_IMAGE_ASPECT_DEPTH_BIT,
1685 ds_attachment->first_subpass,
1686 ds_attachment->desc.loadOp,
1687 ds_attachment->last_subpass,
1688 ds_attachment->desc.storeOp)) {
1689 return;
1690 }
1691
1692 if (v3dv_cmd_buffer_check_needs_load(state,
1693 ds_aspects & VK_IMAGE_ASPECT_STENCIL_BIT,
1694 ds_attachment->first_subpass,
1695 ds_attachment->desc.stencilLoadOp,
1696 ds_attachment->last_subpass,
1697 ds_attachment->desc.stencilStoreOp)) {
1698 return;
1699 }
1700
1701 has_stores |= v3dv_cmd_buffer_check_needs_store(state,
1702 ds_aspects & VK_IMAGE_ASPECT_DEPTH_BIT,
1703 ds_attachment->last_subpass,
1704 ds_attachment->desc.storeOp);
1705 has_stores |= v3dv_cmd_buffer_check_needs_store(state,
1706 ds_aspects & VK_IMAGE_ASPECT_STENCIL_BIT,
1707 ds_attachment->last_subpass,
1708 ds_attachment->desc.stencilStoreOp);
1709 }
1710
1711 job->can_use_double_buffer = has_stores;
1712 }
1713
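/* Creates a new CL job for the given subpass. For complete GPU_CL jobs that
 * start the subpass (and are not resuming a suspended render pass) this also
 * sets up binning for the frame, computing the internal bpp, MSAA and layer
 * count from the subpass attachments.
 */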
1714 static struct v3dv_job *
1715 cmd_buffer_subpass_create_job(struct v3dv_cmd_buffer *cmd_buffer,
1716 uint32_t subpass_idx,
1717 enum v3dv_job_type type,
1718 bool is_subpass_start)
1719 {
1720 assert(type == V3DV_JOB_TYPE_GPU_CL ||
1721 type == V3DV_JOB_TYPE_GPU_CL_INCOMPLETE);
1722
1723 struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
1724 assert(subpass_idx < state->pass->subpass_count);
1725
1726 /* Starting a new job can trigger a finish of the current one, so don't
1727 * change the command buffer state for the new job until we are done creating
1728 * the new job.
1729 */
1730 struct v3dv_job *job =
1731 v3dv_cmd_buffer_start_job(cmd_buffer, subpass_idx, type);
1732 if (!job)
1733 return NULL;
1734
1735 if (is_subpass_start && cmd_buffer->state.resuming) {
1736 assert(subpass_idx == 0);
1737 job->resuming = true;
1738 }
1739
1740 state->subpass_idx = subpass_idx;
1741
1742 /* If we are starting a new job we need to setup binning. We only do this
1743 * for V3DV_JOB_TYPE_GPU_CL jobs because V3DV_JOB_TYPE_GPU_CL_INCOMPLETE
1744 * jobs are not submitted to the GPU directly, and are instead meant to be
1745 * branched to from other V3DV_JOB_TYPE_GPU_CL jobs. With dynamic rendering,
1746 * all resuming jobs work similarly to secondary command buffers, so we
1747 * apply the same treatment.
1748 */
1749 if (type == V3DV_JOB_TYPE_GPU_CL &&
1750 job->first_subpass == state->subpass_idx &&
1751 !job->resuming) {
1752 const struct v3dv_subpass *subpass =
1753 &state->pass->subpasses[state->subpass_idx];
1754
1755 const struct v3dv_framebuffer *framebuffer = state->framebuffer;
1756
1757 uint8_t max_internal_bpp, total_color_bpp;
1758 bool msaa;
1759 v3d_X((&job->device->devinfo), framebuffer_compute_internal_bpp_msaa)
1760 (framebuffer, state->attachments, subpass,
1761 &max_internal_bpp, &total_color_bpp, &msaa);
1762
1763 /* From the Vulkan spec:
1764 *
1765 * "If the render pass uses multiview, then layers must be one and
1766 * each attachment requires a number of layers that is greater than
1767 * the maximum bit index set in the view mask in the subpasses in
1768 * which it is used."
1769 *
1770 * So when multiview is enabled, we take the number of layers from the
1771 * last bit set in the view mask.
1772 */
1773 uint32_t layers = framebuffer->layers;
1774 if (subpass->view_mask != 0) {
1775 assert(framebuffer->layers == 1);
1776 layers = util_last_bit(subpass->view_mask);
1777 }
1778
1779 v3dv_job_start_frame(job,
1780 framebuffer->width,
1781 framebuffer->height,
1782 layers,
1783 true, false,
1784 subpass->color_count,
1785 max_internal_bpp,
1786 total_color_bpp,
1787 msaa);
1788 }
1789
1790 return job;
1791 }
1792
1793 struct v3dv_job *
1794 v3dv_cmd_buffer_subpass_start(struct v3dv_cmd_buffer *cmd_buffer,
1795 uint32_t subpass_idx)
1796 {
1797 assert(cmd_buffer->state.pass);
1798 assert(subpass_idx < cmd_buffer->state.pass->subpass_count);
1799
1800 struct v3dv_job *job =
1801 cmd_buffer_subpass_create_job(cmd_buffer, subpass_idx,
1802 V3DV_JOB_TYPE_GPU_CL, true);
1803 if (!job)
1804 return NULL;
1805
1806 /* FIXME: do we need all this below for resuming jobs? */
1807
1808 /* Check if our render area is aligned to tile boundaries. We have to do
1809 * this in each subpass because the subset of attachments used can change
1810 * and with that the tile size selected by the hardware can change too.
1811 */
1812 cmd_buffer_update_tile_alignment(cmd_buffer);
1813
1814 /* Decide if we can use double-buffer for this subpass job */
1815 cmd_buffer_subpass_check_double_buffer_mode(cmd_buffer, job->frame_tiling.msaa);
1816
1817 cmd_buffer_update_attachment_resolve_state(cmd_buffer);
1818
1819 /* If we can't use TLB clears then we need to emit draw clears for any
1820 * LOAD_OP_CLEAR attachments in this subpass now. We might also need to emit
1821 * Depth/Stencil clears if we hit GFXH-1461. With dynamic render passes this
1822 * should only be called when starting the render pass, not when resuming.
1823 */
1824 if (!cmd_buffer->state.resuming)
1825 cmd_buffer_emit_subpass_clears(cmd_buffer);
1826
1827 return job;
1828 }
1829
1830 struct v3dv_job *
1831 v3dv_cmd_buffer_subpass_resume(struct v3dv_cmd_buffer *cmd_buffer,
1832 uint32_t subpass_idx)
1833 {
1834 assert(cmd_buffer->state.pass);
1835 assert(subpass_idx < cmd_buffer->state.pass->subpass_count);
1836
1837 struct v3dv_job *job;
1838 if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_PRIMARY) {
1839 job = cmd_buffer_subpass_create_job(cmd_buffer, subpass_idx,
1840 V3DV_JOB_TYPE_GPU_CL, false);
1841 } else {
1842 assert(cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY);
1843 job = cmd_buffer_subpass_create_job(cmd_buffer, subpass_idx,
1844 V3DV_JOB_TYPE_GPU_CL_INCOMPLETE, false);
1845 }
1846
1847 if (!job)
1848 return NULL;
1849
1850 job->is_subpass_continue = true;
1851
1852 return job;
1853 }
1854
1855 void
1856 v3dv_cmd_buffer_subpass_finish(struct v3dv_cmd_buffer *cmd_buffer)
1857 {
1858 /* We can end up here without a job if the last command recorded into the
1859 * subpass already finished the job (for example a pipeline barrier). In
1860 * that case we won't set the is_subpass_finish flag, but that is not
1861 * required for proper behavior.
1862 */
1863 struct v3dv_job *job = cmd_buffer->state.job;
1864 if (job)
1865 job->is_subpass_finish = true;
1866 }
1867
1868 VKAPI_ATTR void VKAPI_CALL
1869 v3dv_CmdEndRenderPass2(VkCommandBuffer commandBuffer,
1870 const VkSubpassEndInfo *pSubpassEndInfo)
1871 {
1872 V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
1873
1874 /* Finalize last subpass */
1875 struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
1876 assert(state->subpass_idx == state->pass->subpass_count - 1);
1877 v3dv_cmd_buffer_subpass_finish(cmd_buffer);
1878 v3dv_cmd_buffer_finish_job(cmd_buffer);
1879
1880 cmd_buffer_subpass_handle_pending_resolves(cmd_buffer);
1881
1882 /* We are no longer inside a render pass */
1883 state->framebuffer = NULL;
1884 state->pass = NULL;
1885 state->subpass_idx = -1;
1886 }
1887
1888 VKAPI_ATTR VkResult VKAPI_CALL
1889 v3dv_EndCommandBuffer(VkCommandBuffer commandBuffer)
1890 {
1891 MESA_TRACE_FUNC();
1892 V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
1893
1894 if (cmd_buffer->state.oom)
1895 return VK_ERROR_OUT_OF_HOST_MEMORY;
1896
1897 /* Primaries should have ended any recording jobs by the time they hit
1898 * vkEndRenderPass (if we are inside a render pass). Commands outside
1899 * a render pass instance (for both primaries and secondaries) spawn
1900 * complete jobs too. So the only case where we can get here without
1901 * finishing a recording job is when we are recording a secondary
1902 * inside a render pass.
1903 */
1904 if (cmd_buffer->state.job) {
1905 assert(cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY &&
1906 cmd_buffer->state.pass);
1907 v3dv_cmd_buffer_finish_job(cmd_buffer);
1908 }
1909
1910 cmd_buffer->status = V3DV_CMD_BUFFER_STATUS_EXECUTABLE;
1911
1912 return VK_SUCCESS;
1913 }
1914
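/* Makes a shallow copy of each BO in 'src' into a newly initialized 'dst'
 * list. Returns false on allocation failure. The copies reference the same
 * underlying buffers; only the list links are new.
 */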
1915 static bool
1916 clone_bo_list(struct v3dv_device *device,
1917 struct list_head *dst,
1918 struct list_head *src)
1919 {
1920 assert(device);
1921
1922 list_inithead(dst);
1923 list_for_each_entry(struct v3dv_bo, bo, src, list_link) {
1924 struct v3dv_bo *clone_bo =
1925 vk_alloc(&device->vk.alloc, sizeof(struct v3dv_bo), 8,
1926 VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
1927 if (!clone_bo)
1928 return false;
1929
1930 *clone_bo = *bo;
1931 list_addtail(&clone_bo->list_link, dst);
1932 }
1933
1934 return true;
1935 }
1936
1937 struct v3dv_job *
1938 v3dv_job_clone(struct v3dv_job *job, bool skip_bcl)
1939 {
1940 struct v3dv_job *clone = vk_alloc(&job->device->vk.alloc,
1941 sizeof(struct v3dv_job), 8,
1942 VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
1943 if (!clone)
1944 return NULL;
1945
1946 /* Cloned jobs don't duplicate resources; they share their CLs with the
1947 * original job, since they are typically read-only. The exception to this
1948 * is dynamic rendering suspension paired with
1949 * VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT, since in that case we need
1950 * to patch the BCL with the resume address and for that we need to create a
1951 * copy of the job so we avoid rewriting the resume address for another copy
1952 * of the same job that may be running in the GPU. When we create a job for
1953 * this use case skip_bcl is set to true and the caller will be responsible
1954 * for creating the BCL.
1955 */
1956 *clone = *job;
1957 clone->is_clone = true;
1958 clone->cmd_buffer = NULL;
1959
1960 /* We need to regen the BO lists so that they point to the BO list in the
1961 * cloned job. Otherwise functions like list_length() will loop forever.
1962 */
1963 if (job->type == V3DV_JOB_TYPE_GPU_CL) {
1964 assert(job->cmd_buffer);
1965 struct v3dv_device *device = job->cmd_buffer->device;
1966
1967 clone->bcl.job = clone;
1968 clone->rcl.job = clone;
1969 clone->indirect.job = clone;
1970
1971 if (!skip_bcl &&
1972 !clone_bo_list(device, &clone->bcl.bo_list, &job->bcl.bo_list)) {
1973 return NULL;
1974 }
1975 if (!clone_bo_list(device, &clone->rcl.bo_list, &job->rcl.bo_list))
1976 return NULL;
1977 if (!clone_bo_list(device, &clone->indirect.bo_list, &job->indirect.bo_list))
1978 return NULL;
1979 }
1980
1981 return clone;
1982 }
1983
1984 /* Clones a job for inclusion in the given command buffer. Note that this
1985 * doesn't make a deep copy, so the cloned job doesn't own any resources.
1986 * Useful when we need to have a job in more than one list, which happens
1987 * for jobs recorded in secondary command buffers when we want to execute
1988 * them in primaries.
1989 */
1990 struct v3dv_job *
1991 v3dv_job_clone_in_cmd_buffer(struct v3dv_job *job,
1992 struct v3dv_cmd_buffer *cmd_buffer)
1993 {
1994 struct v3dv_job *clone = v3dv_job_clone(job, false);
1995 if (!clone) {
1996 v3dv_flag_oom(cmd_buffer, NULL);
1997 return NULL;
1998 }
1999
2000 clone->cmd_buffer = cmd_buffer;
2001 list_addtail(&clone->list_link, &cmd_buffer->jobs);
2002 return clone;
2003 }
2004
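/* Accumulates (ORs) the pending barrier state from 'src' into 'dst'. */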
2005 void
2006 v3dv_cmd_buffer_merge_barrier_state(struct v3dv_barrier_state *dst,
2007 struct v3dv_barrier_state *src)
2008 {
2009 dst->dst_mask |= src->dst_mask;
2010
2011 dst->src_mask_graphics |= src->src_mask_graphics;
2012 dst->src_mask_compute |= src->src_mask_compute;
2013 dst->src_mask_transfer |= src->src_mask_transfer;
2014
2015 dst->bcl_buffer_access |= src->bcl_buffer_access;
2016 dst->bcl_image_access |= src->bcl_image_access;
2017 }
2018
2019 static void
2020 cmd_buffer_execute_outside_pass(struct v3dv_cmd_buffer *primary,
2021 uint32_t cmd_buffer_count,
2022 const VkCommandBuffer *cmd_buffers)
2023 {
2024 struct v3dv_barrier_state pending_barrier = { 0 };
2025 for (uint32_t i = 0; i < cmd_buffer_count; i++) {
2026 V3DV_FROM_HANDLE(v3dv_cmd_buffer, secondary, cmd_buffers[i]);
2027
2028 assert(!(secondary->usage_flags &
2029 VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT));
2030
2031 /* Secondary command buffers that execute outside a render pass create
2032 * complete jobs with an RCL and tile setup, so we simply want to merge
2033 * their job list into the primary's. However, because they may be
2034 * executed in multiple primaries at the same time and we only have a
2035 * single list_link in each job, we can't just add them to the primary's
2036 * job list and instead we have to clone them first.
2037 *
2038 * Alternatively, we could create an "execute secondary" CPU job that
2039 * when executed in a queue, would submit all the jobs in the referenced
2040 * secondary command buffer. However, this would raise some challenges
2041 * to make it work with the implementation of wait threads in the queue
2042 * which we use for event waits, for example.
2043 */
2044 list_for_each_entry(struct v3dv_job, secondary_job,
2045 &secondary->jobs, list_link) {
2046 /* These can only happen inside a render pass */
2047 assert(secondary_job->type != V3DV_JOB_TYPE_GPU_CL_INCOMPLETE);
2048 struct v3dv_job *job = v3dv_job_clone_in_cmd_buffer(secondary_job, primary);
2049 if (!job)
2050 return;
2051
2052 if (pending_barrier.dst_mask) {
2053 /* FIXME: do the same we do for primaries and only choose the
2054 * relevant src masks.
2055 */
2056 job->serialize = pending_barrier.src_mask_graphics |
2057 pending_barrier.src_mask_transfer |
2058 pending_barrier.src_mask_compute;
2059 if (pending_barrier.bcl_buffer_access ||
2060 pending_barrier.bcl_image_access) {
2061 job->needs_bcl_sync = true;
2062 }
2063 memset(&pending_barrier, 0, sizeof(pending_barrier));
2064 }
2065 }
2066
2067 /* If this secondary had any pending barrier state we will need that
2068 * barrier state consumed with whatever comes after it (first job in
2069 * the next secondary or the primary, if this was the last secondary).
2070 */
2071 assert(secondary->state.barrier.dst_mask ||
2072 (!secondary->state.barrier.bcl_buffer_access &&
2073 !secondary->state.barrier.bcl_image_access));
2074 pending_barrier = secondary->state.barrier;
2075 }
2076
2077 if (pending_barrier.dst_mask) {
2078 v3dv_cmd_buffer_merge_barrier_state(&primary->state.barrier,
2079 &pending_barrier);
2080 }
2081 }
2082
2083 VKAPI_ATTR void VKAPI_CALL
2084 v3dv_CmdExecuteCommands(VkCommandBuffer commandBuffer,
2085 uint32_t commandBufferCount,
2086 const VkCommandBuffer *pCommandBuffers)
2087 {
2088 V3DV_FROM_HANDLE(v3dv_cmd_buffer, primary, commandBuffer);
2089
2090 if (primary->state.pass != NULL) {
2091 v3d_X((&primary->device->devinfo), cmd_buffer_execute_inside_pass)
2092 (primary, commandBufferCount, pCommandBuffers);
2093 } else {
2094 cmd_buffer_execute_outside_pass(primary,
2095 commandBufferCount, pCommandBuffers);
2096 }
2097 }
2098
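/* Copies the driver-private dynamic state (cached viewport transforms and
 * the expanded color write enable mask) that is not tracked by the common
 * vk_dynamic_graphics_state, for the states that 'src_dyn' marks as set.
 */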
2099 static void
2100 cmd_buffer_copy_private_dynamic_state(struct v3dv_dynamic_state *dst,
2101 struct v3dv_dynamic_state *src,
2102 struct vk_dynamic_graphics_state *src_dyn)
2103 {
2104 if (BITSET_TEST(src_dyn->set, MESA_VK_DYNAMIC_VP_VIEWPORTS)) {
2105 typed_memcpy(dst->viewport.scale, src->viewport.scale,
2106 MAX_VIEWPORTS);
2107 typed_memcpy(dst->viewport.translate, src->viewport.translate,
2108 MAX_VIEWPORTS);
2109 }
2110 if (BITSET_TEST(src_dyn->set, MESA_VK_DYNAMIC_CB_COLOR_WRITE_ENABLES))
2111 dst->color_write_enable = src->color_write_enable;
2112 }
2113
2114 /* This function copies relevant static state from the pipeline to the command
2115 * buffer state.
2116 *
2117 * Notice the Vulkan runtime uses the term 'dynamic' to refer to all state
2118 * that *could* be dynamic, even if it is not dynamic for a particular
2119 * pipeline, so the terminology used in the runtime may be a bit misleading.
2120 */
2121 static void
2122 cmd_buffer_bind_pipeline_static_state(struct v3dv_cmd_buffer *cmd_buffer,
2123 struct v3dv_pipeline *pipeline)
2124 {
2125 vk_cmd_set_dynamic_graphics_state(&cmd_buffer->vk, &pipeline->dynamic_graphics_state);
2126 cmd_buffer_copy_private_dynamic_state(&cmd_buffer->state.dynamic, &pipeline->dynamic,
2127 &pipeline->dynamic_graphics_state);
2128
2129 }
2130
2131 static void
2132 bind_graphics_pipeline(struct v3dv_cmd_buffer *cmd_buffer,
2133 struct v3dv_pipeline *pipeline)
2134 {
2135 assert(pipeline && !(pipeline->active_stages & VK_SHADER_STAGE_COMPUTE_BIT));
2136
2137 /* We need to unconditionally bind the pipeline static state, as the state
2138 * could have changed (through calls to vkCmdSetXXX) between bindings of
2139 * the same pipeline.
2140 */
2141 cmd_buffer_bind_pipeline_static_state(cmd_buffer, pipeline);
2142
2143 if (cmd_buffer->state.gfx.pipeline == pipeline)
2144 return;
2145
2146 cmd_buffer->state.gfx.pipeline = pipeline;
2147 cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_PIPELINE;
2148 }
2149
2150 static void
2151 bind_compute_pipeline(struct v3dv_cmd_buffer *cmd_buffer,
2152 struct v3dv_pipeline *pipeline)
2153 {
2154 assert(pipeline && pipeline->active_stages == VK_SHADER_STAGE_COMPUTE_BIT);
2155
2156 if (cmd_buffer->state.compute.pipeline == pipeline)
2157 return;
2158
2159 cmd_buffer->state.compute.pipeline = pipeline;
2160 cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_COMPUTE_PIPELINE;
2161 }
2162
2163 VKAPI_ATTR void VKAPI_CALL
2164 v3dv_CmdBindPipeline(VkCommandBuffer commandBuffer,
2165 VkPipelineBindPoint pipelineBindPoint,
2166 VkPipeline _pipeline)
2167 {
2168 V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
2169 V3DV_FROM_HANDLE(v3dv_pipeline, pipeline, _pipeline);
2170
2171 switch (pipelineBindPoint) {
2172 case VK_PIPELINE_BIND_POINT_COMPUTE:
2173 bind_compute_pipeline(cmd_buffer, pipeline);
2174 break;
2175
2176 case VK_PIPELINE_BIND_POINT_GRAPHICS:
2177 bind_graphics_pipeline(cmd_buffer, pipeline);
2178 break;
2179
2180 default:
2181 assert(!"invalid bind point");
2182 break;
2183 }
2184 }
2185
2186 /* Considers the pipeline's negative_one_to_one state and applies it to the
2187 * current viewport transform if needed to produce the resulting Z translate
2188 * and scale parameters.
2189 */
2190 void
2191 v3dv_cmd_buffer_state_get_viewport_z_xform(struct v3dv_cmd_buffer *cmd_buffer,
2192 uint32_t vp_idx,
2193 float *translate_z, float *scale_z)
2194 {
2195 const struct v3dv_viewport_state *vp_state = &cmd_buffer->state.dynamic.viewport;
2196 const struct vk_viewport_state *vk_vp_state = &cmd_buffer->vk.dynamic_graphics_state.vp;
2197
2198 float t = vp_state->translate[vp_idx][2];
2199 float s = vp_state->scale[vp_idx][2];
2200
2201 assert(cmd_buffer->state.gfx.pipeline);
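   /* If the pipeline uses a [-1, 1] clip space we remap the cached Z
    * transform, which is assumed here to have been computed for the default
    * [0, 1] range: the translate becomes the midpoint of the depth range and
    * the scale is halved.
    */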
2202 if (cmd_buffer->state.gfx.pipeline->negative_one_to_one) {
2203 t = (t + vk_vp_state->viewports[vp_idx].maxDepth) * 0.5f;
2204 s *= 0.5f;
2205 }
2206
2207 if (translate_z)
2208 *translate_z = t;
2209
2210 if (scale_z)
2211 *scale_z = s;
2212 }
2213
2214 VKAPI_ATTR void VKAPI_CALL
2215 v3dv_CmdSetColorWriteEnableEXT(VkCommandBuffer commandBuffer,
2216 uint32_t attachmentCount,
2217 const VkBool32 *pColorWriteEnables)
2218 {
2219 V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
2220 struct v3dv_dynamic_state *v3dv_dyn = &cmd_buffer->state.dynamic;
2221 struct vk_dynamic_graphics_state *dyn = &cmd_buffer->vk.dynamic_graphics_state;
2222 uint32_t color_write_enable = 0;
2223
2224 /* Vulkan runtime computes color_write_enable as an 8-bit bitset, setting a
2225 * bit per attachment. But when emitting, it is combined with the
2226 * color_write_mask, that is stored as a 32-bit mask (one bit per channel,
2227 * per attachment). So we store the color_write_enable as a 32-bit mask
2228 * ourselves.
2229 */
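   /* For example (illustrative values): with two attachments where only
    * attachment 0 has color writes enabled, the runtime's per-attachment
    * bitset would be 0b01, while the mask we store here is 0xf (channels
    * 0-3 set for attachment 0, channels 4-7 clear for attachment 1).
    */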
2230 for (uint32_t i = 0; i < attachmentCount; i++)
2231 color_write_enable |= pColorWriteEnables[i] ? (0xfu << (i * 4)) : 0;
2232
2233 if (v3dv_dyn->color_write_enable == color_write_enable)
2234 return;
2235
2236 v3dv_dyn->color_write_enable = color_write_enable;
2237 BITSET_SET(dyn->set, MESA_VK_DYNAMIC_CB_COLOR_WRITE_ENABLES);
2238 BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_CB_COLOR_WRITE_ENABLES);
2239 }
2240
2241 /* We keep a custom CmdSetViewport because we want to cache the outcome of
2242 * viewport_compute_xform, and because we need to set the viewport count. This
2243 * is especially relevant to our case because we are pushing/popping the
2244 * dynamic state as part of the meta operations.
2245 */
2246 VKAPI_ATTR void VKAPI_CALL
2247 v3dv_CmdSetViewport(VkCommandBuffer commandBuffer,
2248 uint32_t firstViewport,
2249 uint32_t viewportCount,
2250 const VkViewport *pViewports)
2251 {
2252 V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
2253 struct v3dv_dynamic_state *v3dv_dyn = &cmd_buffer->state.dynamic;
2254 struct vk_dynamic_graphics_state *dyn = &cmd_buffer->vk.dynamic_graphics_state;
2255
2256 const uint32_t total_count = firstViewport + viewportCount;
2257 assert(firstViewport < MAX_VIEWPORTS);
2258 assert(total_count >= 1 && total_count <= MAX_VIEWPORTS);
2259
2260 vk_common_CmdSetViewportWithCount(commandBuffer,
2261 total_count,
2262 pViewports);
2263
2264 for (uint32_t i = firstViewport; i < total_count; i++) {
2265 v3d_X((&cmd_buffer->device->devinfo), viewport_compute_xform)
2266 (&dyn->vp.viewports[i], v3dv_dyn->viewport.scale[i],
2267 v3dv_dyn->viewport.translate[i]);
2268 }
2269 }
2270
2271 VKAPI_ATTR void VKAPI_CALL
2272 v3dv_CmdSetViewportWithCount(VkCommandBuffer commandBuffer,
2273 uint32_t viewportCount,
2274 const VkViewport *pViewports)
2275 {
2276 v3dv_CmdSetViewport(commandBuffer, 0, viewportCount, pViewports);
2277 }
2278
2279 /* We keep a custom CmdSetScissor because we need to set the scissor
2280 * count. This is especially relevant to our case because we are
2281 * pushing/popping the dynamic state as part of the meta operations.
2282 */
2283 VKAPI_ATTR void VKAPI_CALL
2284 v3dv_CmdSetScissor(VkCommandBuffer commandBuffer,
2285 uint32_t firstScissor,
2286 uint32_t scissorCount,
2287 const VkRect2D *pScissors)
2288 {
2289 assert(firstScissor < MAX_SCISSORS);
2290 assert(firstScissor + scissorCount >= 1 &&
2291 firstScissor + scissorCount <= MAX_SCISSORS);
2292
2293 vk_common_CmdSetScissorWithCount(commandBuffer,
2294 firstScissor + scissorCount,
2295 pScissors);
2296 }
2297
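/* Computes the clip window for the current job from the viewport, the render
 * area and (if set) the scissor rectangle, clamping so that an empty or
 * inverted intersection produces a zero-sized window.
 */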
2298 static void
2299 emit_scissor(struct v3dv_cmd_buffer *cmd_buffer)
2300 {
2301 if (cmd_buffer->vk.dynamic_graphics_state.vp.viewport_count == 0)
2302 return;
2303
2304 struct v3dv_dynamic_state *dynamic = &cmd_buffer->state.dynamic;
2305
2306 /* FIXME: right now we only support one viewport. viewports[0] would work
2307 * now, but would need to change if we allow multiple viewports.
2308 */
2309 float *vptranslate = dynamic->viewport.translate[0];
2310 float *vpscale = dynamic->viewport.scale[0];
2311 assert(vpscale[0] >= 0);
2312
2313 float vp_minx = vptranslate[0] - vpscale[0];
2314 float vp_maxx = vptranslate[0] + vpscale[0];
2315
2316 /* With KHR_maintenance1 viewport may have negative Y */
2317 float vp_miny = vptranslate[1] - fabsf(vpscale[1]);
2318 float vp_maxy = vptranslate[1] + fabsf(vpscale[1]);
2319
2320 /* Quoting from v3dx_emit:
2321 * "Clip to the scissor if it's enabled, but still clip to the
2322 * drawable regardless since that controls where the binner
2323 * tries to put things.
2324 *
2325 * Additionally, always clip the rendering to the viewport,
2326 * since the hardware does guardband clipping, meaning
2327 * primitives would rasterize outside of the view volume."
2328 */
2329 uint32_t minx, miny, maxx, maxy;
2330
2331 /* From the Vulkan spec:
2332 *
2333 * "The application must ensure (using scissor if necessary) that all
2334 * rendering is contained within the render area. The render area must be
2335 * contained within the framebuffer dimensions."
2336 *
2337 * So it is the application's responsibility to ensure this. Still, we can
2338 * help by automatically restricting the scissor rect to the render area.
2339 */
2340 minx = MAX2(vp_minx, cmd_buffer->state.render_area.offset.x);
2341 miny = MAX2(vp_miny, cmd_buffer->state.render_area.offset.y);
2342 maxx = MIN2(vp_maxx, cmd_buffer->state.render_area.offset.x +
2343 cmd_buffer->state.render_area.extent.width);
2344 maxy = MIN2(vp_maxy, cmd_buffer->state.render_area.offset.y +
2345 cmd_buffer->state.render_area.extent.height);
2346
2347 /* Clip against user provided scissor if needed.
2348 *
2349 * FIXME: right now we only allow one scissor. The code below would need
2350 * to be updated if we support more.
2351 */
2352 struct vk_dynamic_graphics_state *vk_dyn =
2353 &cmd_buffer->vk.dynamic_graphics_state;
2354 if (vk_dyn->vp.scissor_count > 0) {
2355 VkRect2D *scissor = &vk_dyn->vp.scissors[0];
2356 minx = MAX2(minx, scissor->offset.x);
2357 miny = MAX2(miny, scissor->offset.y);
2358 maxx = MIN2(maxx, scissor->offset.x + scissor->extent.width);
2359 maxy = MIN2(maxy, scissor->offset.y + scissor->extent.height);
2360 }
2361
2362 /* If the scissor is outside the viewport area we end up with
2363 * min{x,y} > max{x,y}.
2364 */
2365 if (minx > maxx)
2366 maxx = minx;
2367 if (miny > maxy)
2368 maxy = miny;
2369
2370 cmd_buffer->state.clip_window.offset.x = minx;
2371 cmd_buffer->state.clip_window.offset.y = miny;
2372 cmd_buffer->state.clip_window.extent.width = maxx - minx;
2373 cmd_buffer->state.clip_window.extent.height = maxy - miny;
2374
2375 v3d_X((&cmd_buffer->device->devinfo), job_emit_clip_window)
2376 (cmd_buffer->state.job, &cmd_buffer->state.clip_window);
2377
2378 BITSET_CLEAR(vk_dyn->dirty, MESA_VK_DYNAMIC_VP_SCISSORS);
2379 }
2380
2381 static bool
2382 update_gfx_uniform_state(struct v3dv_cmd_buffer *cmd_buffer)
2383 {
2384 /* We need to update uniform streams if any piece of state that is passed
2385 * to the shader as a uniform may have changed.
2386 *
2387 * If only descriptor sets are dirty then we can safely ignore updates
2388 * for shader stages that don't access descriptors.
2389 */
2390 struct v3dv_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
2391 assert(pipeline);
2392 uint32_t dirty = cmd_buffer->state.dirty;
2393 struct vk_dynamic_graphics_state *dyn =
2394 &cmd_buffer->vk.dynamic_graphics_state;
2395
2396 const bool dirty_uniform_state =
2397 (dirty & (V3DV_CMD_DIRTY_PIPELINE |
2398 V3DV_CMD_DIRTY_PUSH_CONSTANTS |
2399 V3DV_CMD_DIRTY_DESCRIPTOR_SETS |
2400 V3DV_CMD_DIRTY_VIEW_INDEX |
2401 V3DV_CMD_DIRTY_DRAW_ID)) ||
2402 BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VP_VIEWPORTS);
2403
2404 if (!dirty_uniform_state)
2405 return false;
2406
2407 const bool has_new_pipeline = dirty & V3DV_CMD_DIRTY_PIPELINE;
2408 const bool has_new_viewport = BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VP_VIEWPORTS);
2409 const bool has_new_push_constants = dirty & V3DV_CMD_DIRTY_PUSH_CONSTANTS;
2410 const bool has_new_descriptors = dirty & V3DV_CMD_DIRTY_DESCRIPTOR_SETS;
2411 const bool has_new_view_index = dirty & V3DV_CMD_DIRTY_VIEW_INDEX;
2412 const bool has_new_draw_id = dirty & V3DV_CMD_DIRTY_DRAW_ID;
2413
2414 /* VK_SHADER_STAGE_FRAGMENT_BIT */
2415 const bool has_new_descriptors_fs =
2416 has_new_descriptors &&
2417 (cmd_buffer->state.dirty_descriptor_stages & VK_SHADER_STAGE_FRAGMENT_BIT);
2418
2419 const bool has_new_push_constants_fs =
2420 has_new_push_constants &&
2421 (cmd_buffer->state.dirty_push_constants_stages & VK_SHADER_STAGE_FRAGMENT_BIT);
2422
2423 const bool needs_fs_update = has_new_pipeline ||
2424 has_new_view_index ||
2425 has_new_push_constants_fs ||
2426 has_new_descriptors_fs;
2427
2428 if (needs_fs_update) {
2429 struct v3dv_shader_variant *fs_variant =
2430 pipeline->shared_data->variants[BROADCOM_SHADER_FRAGMENT];
2431
2432 cmd_buffer->state.uniforms.fs =
2433 v3dv_write_uniforms(cmd_buffer, pipeline, fs_variant);
2434 }
2435
2436 /* VK_SHADER_STAGE_GEOMETRY_BIT */
2437 if (pipeline->has_gs) {
2438 const bool has_new_descriptors_gs =
2439 has_new_descriptors &&
2440 (cmd_buffer->state.dirty_descriptor_stages &
2441 VK_SHADER_STAGE_GEOMETRY_BIT);
2442
2443 const bool has_new_push_constants_gs =
2444 has_new_push_constants &&
2445 (cmd_buffer->state.dirty_push_constants_stages &
2446 VK_SHADER_STAGE_GEOMETRY_BIT);
2447
2448 const bool needs_gs_update = has_new_viewport ||
2449 has_new_view_index ||
2450 has_new_pipeline ||
2451 has_new_push_constants_gs ||
2452 has_new_descriptors_gs;
2453
2454 if (needs_gs_update) {
2455 struct v3dv_shader_variant *gs_variant =
2456 pipeline->shared_data->variants[BROADCOM_SHADER_GEOMETRY];
2457
2458 struct v3dv_shader_variant *gs_bin_variant =
2459 pipeline->shared_data->variants[BROADCOM_SHADER_GEOMETRY_BIN];
2460
2461 cmd_buffer->state.uniforms.gs =
2462 v3dv_write_uniforms(cmd_buffer, pipeline, gs_variant);
2463
2464 cmd_buffer->state.uniforms.gs_bin =
2465 v3dv_write_uniforms(cmd_buffer, pipeline, gs_bin_variant);
2466 }
2467 }
2468
2469 /* VK_SHADER_STAGE_VERTEX_BIT */
2470 const bool has_new_descriptors_vs =
2471 has_new_descriptors &&
2472 (cmd_buffer->state.dirty_descriptor_stages & VK_SHADER_STAGE_VERTEX_BIT);
2473
2474 const bool has_new_push_constants_vs =
2475 has_new_push_constants &&
2476 (cmd_buffer->state.dirty_push_constants_stages & VK_SHADER_STAGE_VERTEX_BIT);
2477
2478 const bool needs_vs_update = has_new_viewport ||
2479 has_new_view_index ||
2480 has_new_draw_id ||
2481 has_new_pipeline ||
2482 has_new_push_constants_vs ||
2483 has_new_descriptors_vs;
2484
2485 if (needs_vs_update) {
2486 struct v3dv_shader_variant *vs_variant =
2487 pipeline->shared_data->variants[BROADCOM_SHADER_VERTEX];
2488
2489 struct v3dv_shader_variant *vs_bin_variant =
2490 pipeline->shared_data->variants[BROADCOM_SHADER_VERTEX_BIN];
2491
2492 cmd_buffer->state.uniforms.vs =
2493 v3dv_write_uniforms(cmd_buffer, pipeline, vs_variant);
2494
2495 cmd_buffer->state.uniforms.vs_bin =
2496 v3dv_write_uniforms(cmd_buffer, pipeline, vs_bin_variant);
2497 }
2498
2499 cmd_buffer->state.dirty &= ~V3DV_CMD_DIRTY_VIEW_INDEX;
2500 cmd_buffer->state.dirty &= ~V3DV_CMD_DIRTY_DRAW_ID;
2501
2502 return true;
2503 }
2504
2505 /* This stores command buffer state that we might be about to stomp for
2506 * a meta operation.
2507 */
2508 void
2509 v3dv_cmd_buffer_meta_state_push(struct v3dv_cmd_buffer *cmd_buffer,
2510 bool push_descriptor_state)
2511 {
2512 struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
2513
2514 /* Attachment state.
2515 *
2516 * We store this state even if we are not currently in a subpass
2517 * (i.e. subpass_idx == -1) because we may get here to implement subpass
2518 * resolves via vkCmdResolveImage from
2519 * cmd_buffer_subpass_handle_pending_resolves. In that scenario we pretend
2520 * we are no longer in a subpass because Vulkan disallows image resolves
2521 * via vkCmdResolveImage during subpasses, but we still need to preserve
2522 * attachment state because we may have more subpasses to go through
2523 * after processing resolves in the current subpass.
2524 */
2525 const uint32_t attachment_state_item_size =
2526 sizeof(struct v3dv_cmd_buffer_attachment_state);
2527 const uint32_t attachment_state_total_size =
2528 attachment_state_item_size * state->attachment_alloc_count;
2529 if (state->meta.attachment_alloc_count < state->attachment_alloc_count) {
2530 if (state->meta.attachment_alloc_count > 0)
2531 vk_free(&cmd_buffer->device->vk.alloc, state->meta.attachments);
2532
2533 state->meta.attachments = vk_zalloc(&cmd_buffer->device->vk.alloc,
2534 attachment_state_total_size, 8,
2535 VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
2536 if (!state->meta.attachments) {
2537 v3dv_flag_oom(cmd_buffer, NULL);
2538 return;
2539 }
2540 state->meta.attachment_alloc_count = state->attachment_alloc_count;
2541 }
2542 state->meta.attachment_count = state->attachment_alloc_count;
2543 if (state->meta.attachments) {
2544 memcpy(state->meta.attachments, state->attachments,
2545 attachment_state_total_size);
2546 }
2547
2548 if (state->subpass_idx != -1) {
2549 state->meta.subpass_idx = state->subpass_idx;
2550 state->meta.framebuffer = v3dv_framebuffer_to_handle(state->framebuffer);
2551 state->meta.pass = v3dv_render_pass_to_handle(state->pass);
2552
2553 state->meta.tile_aligned_render_area = state->tile_aligned_render_area;
2554 memcpy(&state->meta.render_area, &state->render_area, sizeof(VkRect2D));
2555 }
2556
2557 /* We expect that meta operations are graphics-only, so we only take into
2558 * account the graphics pipeline and the graphics state.
2559 */
2560 state->meta.gfx.pipeline = state->gfx.pipeline;
2561 vk_dynamic_graphics_state_copy(&state->meta.dynamic_graphics_state,
2562 &cmd_buffer->vk.dynamic_graphics_state);
2563 memcpy(&state->meta.dynamic, &state->dynamic, sizeof(state->dynamic));
2564
2565 struct v3dv_descriptor_state *gfx_descriptor_state =
2566 &cmd_buffer->state.gfx.descriptor_state;
2567
2568 if (push_descriptor_state) {
2569 if (gfx_descriptor_state->valid != 0) {
2570 memcpy(&state->meta.gfx.descriptor_state, gfx_descriptor_state,
2571 sizeof(state->gfx.descriptor_state));
2572 }
2573 state->meta.has_descriptor_state = true;
2574 } else {
2575 state->meta.has_descriptor_state = false;
2576 }
2577
2578 if (cmd_buffer->state.push_constants_size > 0) {
2579 state->meta.push_constants_size = cmd_buffer->state.push_constants_size;
2580 memcpy(state->meta.push_constants, cmd_buffer->state.push_constants_data,
2581 cmd_buffer->state.push_constants_size);
2582 cmd_buffer->state.push_constants_size = 0;
2583 }
2584 }
2585
2586 /* This restores command buffer state after a meta operation
2587 */
2588 void
2589 v3dv_cmd_buffer_meta_state_pop(struct v3dv_cmd_buffer *cmd_buffer,
2590 bool needs_subpass_resume)
2591 {
2592 struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
2593
2594 /* Attachment state */
2595 assert(state->meta.attachment_count <= state->attachment_alloc_count);
2596 const uint32_t attachment_state_item_size =
2597 sizeof(struct v3dv_cmd_buffer_attachment_state);
2598 const uint32_t attachment_state_total_size =
2599 attachment_state_item_size * state->meta.attachment_count;
2600 if (attachment_state_total_size > 0) {
2601 memcpy(state->attachments, state->meta.attachments,
2602 attachment_state_total_size);
2603 }
2604
2605 if (state->meta.subpass_idx != -1) {
2606 state->pass = v3dv_render_pass_from_handle(state->meta.pass);
2607 state->framebuffer = v3dv_framebuffer_from_handle(state->meta.framebuffer);
2608
2609 state->tile_aligned_render_area = state->meta.tile_aligned_render_area;
2610 memcpy(&state->render_area, &state->meta.render_area, sizeof(VkRect2D));
2611
2612 /* If needs_subpass_resume is true it means that we emitted the meta
2613 * operation in its own job (possibly with an RT config that is
2614 * incompatible with the current subpass), so resuming subpass execution
2615 * after it requires that we create a new job with the subpass RT setup.
2616 */
2617 if (needs_subpass_resume)
2618 v3dv_cmd_buffer_subpass_resume(cmd_buffer, state->meta.subpass_idx);
2619 } else {
2620 state->subpass_idx = -1;
2621 }
2622
2623 if (state->meta.gfx.pipeline != NULL) {
2624 struct v3dv_pipeline *pipeline = state->meta.gfx.pipeline;
2625 VkPipelineBindPoint pipeline_binding =
2626 v3dv_pipeline_get_binding_point(pipeline);
2627 v3dv_CmdBindPipeline(v3dv_cmd_buffer_to_handle(cmd_buffer),
2628 pipeline_binding,
2629 v3dv_pipeline_to_handle(state->meta.gfx.pipeline));
2630 } else {
2631 state->gfx.pipeline = NULL;
2632 }
2633
2634 /* Restore dynamic state */
2635 vk_dynamic_graphics_state_copy(&cmd_buffer->vk.dynamic_graphics_state,
2636 &state->meta.dynamic_graphics_state);
2637 memcpy(&state->dynamic, &state->meta.dynamic, sizeof(state->dynamic));
2638 state->dirty = ~0;
2639
2640 if (state->meta.has_descriptor_state) {
2641 if (state->meta.gfx.descriptor_state.valid != 0) {
2642 memcpy(&state->gfx.descriptor_state, &state->meta.gfx.descriptor_state,
2643 sizeof(state->gfx.descriptor_state));
2644 } else {
2645 state->gfx.descriptor_state.valid = 0;
2646 }
2647 }
2648
2649 /* We only need to restore push constant data if we had any data in the
2650 * original command buffer and the meta operation wrote new push constant
2651 * data.
2652 */
2653 if (state->meta.push_constants_size > 0 &&
2654 cmd_buffer->state.push_constants_size > 0) {
2655 memcpy(cmd_buffer->state.push_constants_data, state->meta.push_constants,
2656 state->meta.push_constants_size);
2657 }
2658 cmd_buffer->state.push_constants_size = state->meta.push_constants_size;
2659
2660 state->meta.gfx.pipeline = NULL;
2661 state->meta.framebuffer = VK_NULL_HANDLE;
2662 state->meta.pass = VK_NULL_HANDLE;
2663 state->meta.subpass_idx = -1;
2664 state->meta.has_descriptor_state = false;
2665 state->meta.push_constants_size = 0;
2666 }
2667
2668 static struct v3dv_job *
2669 cmd_buffer_pre_draw_split_job(struct v3dv_cmd_buffer *cmd_buffer)
2670 {
2671 struct v3dv_job *job = cmd_buffer->state.job;
2672 assert(job);
2673
2674 /* If the job has been flagged with 'always_flush' and it has already
2675 * recorded any draw calls then we need to start a new job for it.
2676 */
2677 if (job->always_flush && job->draw_count > 0) {
2678 assert(cmd_buffer->state.pass);
2679 /* First, flag the current job as not being the last in the
2680 * current subpass
2681 */
2682 job->is_subpass_finish = false;
2683
2684 /* Now start a new job in the same subpass and flag it as continuing
2685 * the current subpass.
2686 */
2687 job = v3dv_cmd_buffer_subpass_resume(cmd_buffer,
2688 cmd_buffer->state.subpass_idx);
2689 assert(job->draw_count == 0);
2690
2691 /* Inherit the 'always flush' behavior */
2692 job->always_flush = true;
2693 }
2694
2695 assert(job->draw_count == 0 || !job->always_flush);
2696 return job;
2697 }
2698
2699 /**
2700 * The Vulkan spec states:
2701 *
2702 * "It is legal for a subpass to use no color or depth/stencil
2703 * attachments (...) This kind of subpass can use shader side effects such
2704 * as image stores and atomics to produce an output. In this case, the
2705 * subpass continues to use the width, height, and layers of the framebuffer
2706 * to define the dimensions of the rendering area, and the
2707 * rasterizationSamples from each pipeline’s
2708 * VkPipelineMultisampleStateCreateInfo to define the number of samples used
2709 * in rasterization."
2710 *
2711 * We need to enable MSAA in the TILE_BINNING_MODE_CFG packet, which we
2712 * emit when we start a new frame at the beginning of a subpass. At that point,
2713 * if the framebuffer doesn't have any attachments we won't enable MSAA and
2714 * the job won't be valid in the scenario described by the spec.
2715 *
2716 * This function is intended to be called before a draw call and will test if
2717 * we are in that scenario, in which case, it will restart the current job
2718 * with MSAA enabled.
2719 */
2720 static void
2721 cmd_buffer_restart_job_for_msaa_if_needed(struct v3dv_cmd_buffer *cmd_buffer)
2722 {
2723 assert(cmd_buffer->state.job);
2724
2725 /* We don't support variableMultisampleRate so we know that all pipelines
2726 * bound in the same subpass must have matching number of samples, so we
2727 * can do this check only on the first draw call.
2728 */
2729 if (cmd_buffer->state.job->draw_count > 0)
2730 return;
2731
2732 /* We only need to restart the frame if the pipeline requires MSAA but
2733 * our frame tiling didn't enable it.
2734 */
2735 if (!cmd_buffer->state.gfx.pipeline->msaa ||
2736 cmd_buffer->state.job->frame_tiling.msaa) {
2737 return;
2738 }
2739
2740 /* FIXME: Secondary command buffers don't start frames. Instead, they are
2741 * recorded into primary jobs that start them. For secondaries, we should
2742 * still handle this scenario, but we should do that when we record them
2743 * into primaries by testing if any of the secondaries has multisampled
2744 * draw calls in them, and then using that info to decide if we need to
2745 * restart the primary job into which they are being recorded.
2746 */
2747 if (cmd_buffer->vk.level != VK_COMMAND_BUFFER_LEVEL_PRIMARY)
2748 return;
2749
2750 /* Drop the current job and restart it with MSAA enabled */
2751 struct v3dv_job *old_job = cmd_buffer->state.job;
2752 cmd_buffer->state.job = NULL;
2753
2754 struct v3dv_job *job = vk_zalloc(&cmd_buffer->device->vk.alloc,
2755 sizeof(struct v3dv_job), 8,
2756 VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
2757 if (!job) {
2758 v3dv_flag_oom(cmd_buffer, NULL);
2759 return;
2760 }
2761
2762 v3dv_job_init(job, V3DV_JOB_TYPE_GPU_CL, cmd_buffer->device, cmd_buffer,
2763 cmd_buffer->state.subpass_idx);
2764 cmd_buffer->state.job = job;
2765
2766 v3dv_job_start_frame(job,
2767 old_job->frame_tiling.width,
2768 old_job->frame_tiling.height,
2769 old_job->frame_tiling.layers,
2770 true, false,
2771 old_job->frame_tiling.render_target_count,
2772 old_job->frame_tiling.internal_bpp,
2773 old_job->frame_tiling.total_color_bpp,
2774 true /* msaa */);
2775
2776 v3dv_job_destroy(old_job);
2777 }
2778
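/* Returns true if a draw recorded after a barrier with BCL access needs to
 * be synchronized at the binning stage, i.e. if the binning shaders (vertex
 * bin and, if present, geometry bin) may access any of the resource types
 * covered by the pending buffer/image access masks: index or indirect
 * buffers, vertex attributes, UBOs/SSBOs, texel buffers or images.
 */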
2779 static bool
2780 cmd_buffer_binning_sync_required(struct v3dv_cmd_buffer *cmd_buffer,
2781 struct v3dv_pipeline *pipeline,
2782 bool indexed, bool indirect)
2783 {
2784 const struct v3dv_descriptor_maps *vs_bin_maps =
2785 pipeline->shared_data->maps[BROADCOM_SHADER_VERTEX_BIN];
2786
2787 const struct v3dv_descriptor_maps *gs_bin_maps =
2788 pipeline->shared_data->maps[BROADCOM_SHADER_GEOMETRY_BIN];
2789
2790 VkAccessFlags buffer_access =
2791 cmd_buffer->state.barrier.bcl_buffer_access;
2792 if (buffer_access) {
2793 /* Index buffer read */
2794 if (indexed && (buffer_access & (VK_ACCESS_2_INDEX_READ_BIT |
2795 VK_ACCESS_2_MEMORY_READ_BIT))) {
2796 return true;
2797 }
2798
2799 /* Indirect buffer read */
2800 if (indirect && (buffer_access & (VK_ACCESS_2_INDIRECT_COMMAND_READ_BIT |
2801 VK_ACCESS_2_MEMORY_READ_BIT))) {
2802 return true;
2803 }
2804
2805 /* Attribute read */
2806 if (buffer_access & (VK_ACCESS_2_VERTEX_ATTRIBUTE_READ_BIT |
2807 VK_ACCESS_2_MEMORY_READ_BIT)) {
2808 const struct v3d_vs_prog_data *prog_data =
2809 pipeline->shared_data->variants[BROADCOM_SHADER_VERTEX_BIN]->prog_data.vs;
2810
2811 for (int i = 0; i < ARRAY_SIZE(prog_data->vattr_sizes); i++) {
2812 if (prog_data->vattr_sizes[i] > 0)
2813 return true;
2814 }
2815 }
2816
2817 /* UBO / SSBO read */
2818 if (buffer_access & (VK_ACCESS_2_UNIFORM_READ_BIT |
2819 VK_ACCESS_2_SHADER_READ_BIT |
2820 VK_ACCESS_2_MEMORY_READ_BIT |
2821 VK_ACCESS_2_SHADER_STORAGE_READ_BIT)) {
2822
2823 if (vs_bin_maps->ubo_map.num_desc > 0 ||
2824 vs_bin_maps->ssbo_map.num_desc > 0) {
2825 return true;
2826 }
2827
2828 if (gs_bin_maps && (gs_bin_maps->ubo_map.num_desc > 0 ||
2829 gs_bin_maps->ssbo_map.num_desc > 0)) {
2830 return true;
2831 }
2832 }
2833
2834 /* SSBO write */
2835 if (buffer_access & (VK_ACCESS_2_SHADER_WRITE_BIT |
2836 VK_ACCESS_2_MEMORY_WRITE_BIT |
2837 VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT)) {
2838 if (vs_bin_maps->ssbo_map.num_desc > 0)
2839 return true;
2840
2841 if (gs_bin_maps && gs_bin_maps->ssbo_map.num_desc > 0)
2842 return true;
2843 }
2844
2845 /* Texel Buffer read */
2846 if (buffer_access & (VK_ACCESS_2_SHADER_SAMPLED_READ_BIT |
2847 VK_ACCESS_2_MEMORY_READ_BIT)) {
2848 if (vs_bin_maps->texture_map.num_desc > 0)
2849 return true;
2850
2851 if (gs_bin_maps && gs_bin_maps->texture_map.num_desc > 0)
2852 return true;
2853 }
2854 }
2855
2856 VkAccessFlags image_access =
2857 cmd_buffer->state.barrier.bcl_image_access;
2858 if (image_access) {
2859 /* Image load / store */
2860 if (image_access & (VK_ACCESS_2_SHADER_READ_BIT |
2861 VK_ACCESS_2_SHADER_WRITE_BIT |
2862 VK_ACCESS_2_SHADER_SAMPLED_READ_BIT |
2863 VK_ACCESS_2_SHADER_STORAGE_READ_BIT |
2864 VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT |
2865 VK_ACCESS_2_MEMORY_READ_BIT |
2866 VK_ACCESS_2_MEMORY_WRITE_BIT)) {
2867 if (vs_bin_maps->texture_map.num_desc > 0 ||
2868 vs_bin_maps->sampler_map.num_desc > 0) {
2869 return true;
2870 }
2871
2872 if (gs_bin_maps && (gs_bin_maps->texture_map.num_desc > 0 ||
2873 gs_bin_maps->sampler_map.num_desc > 0)) {
2874 return true;
2875 }
2876 }
2877 }
2878
2879 return false;
2880 }
2881
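/* Marks the job as requiring binning-stage synchronization and clears the
 * pending BCL access masks, since the barrier has now been consumed.
 */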
2882 void
2883 v3dv_cmd_buffer_consume_bcl_sync(struct v3dv_cmd_buffer *cmd_buffer,
2884 struct v3dv_job *job)
2885 {
2886 job->needs_bcl_sync = true;
2887 cmd_buffer->state.barrier.bcl_buffer_access = 0;
2888 cmd_buffer->state.barrier.bcl_image_access = 0;
2889 }
2890
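/* Rough cost score for a shader program: instruction count plus a 4x weight
 * for TMU operations (lookups, spills and fills).
 */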
2891 static inline uint32_t
2892 compute_prog_score(struct v3dv_shader_variant *vs)
2893 {
2894 const uint32_t inst_count = vs->qpu_insts_size / sizeof(uint64_t);
2895 const uint32_t tmu_count = vs->prog_data.base->tmu_count +
2896 vs->prog_data.base->tmu_spills +
2897 vs->prog_data.base->tmu_fills;
2898 return inst_count + 4 * tmu_count;
2899 }
2900
2901 static void
2902 job_update_double_buffer_score(struct v3dv_job *job,
2903 struct v3dv_pipeline *pipeline,
2904 uint32_t vertex_count,
2905 VkExtent2D *render_area)
2906 {
2907 /* FIXME: assume anything with GS workloads is too expensive */
2908 struct v3dv_shader_variant *gs_bin =
2909 pipeline->shared_data->variants[BROADCOM_SHADER_GEOMETRY_BIN];
2910 if (gs_bin) {
2911 job->can_use_double_buffer = false;
2912 return;
2913 }
2914
2915 struct v3dv_shader_variant *vs =
2916 pipeline->shared_data->variants[BROADCOM_SHADER_VERTEX];
2917 assert(vs);
2918
2919 struct v3dv_shader_variant *fs =
2920 pipeline->shared_data->variants[BROADCOM_SHADER_FRAGMENT];
2921 assert(fs);
2922
2923 v3d_update_double_buffer_score(vertex_count,
2924 vs->qpu_insts_size, fs->qpu_insts_size,
2925 vs->prog_data.base, fs->prog_data.base,
2926 &job->double_buffer_score);
2927 }
2928
2929 void
2930 v3dv_cmd_buffer_emit_pre_draw(struct v3dv_cmd_buffer *cmd_buffer,
2931 bool indexed, bool indirect,
2932 uint32_t vertex_count)
2933 {
2934 assert(cmd_buffer->state.gfx.pipeline);
2935 assert(!(cmd_buffer->state.gfx.pipeline->active_stages & VK_SHADER_STAGE_COMPUTE_BIT));
2936
2937 /* If we emitted a pipeline barrier right before this draw we won't have
2938 * an active job. In that case, create a new job continuing the current
2939 * subpass.
2940 */
2941 if (!cmd_buffer->state.job) {
2942 v3dv_cmd_buffer_subpass_resume(cmd_buffer,
2943 cmd_buffer->state.subpass_idx);
2944 }
2945
2946 /* Restart single sample job for MSAA pipeline if needed */
2947 cmd_buffer_restart_job_for_msaa_if_needed(cmd_buffer);
2948
2949 /* If the job is configured to flush on every draw call we need to create
2950 * a new job now.
2951 */
2952 struct v3dv_job *job = cmd_buffer_pre_draw_split_job(cmd_buffer);
2953 job->draw_count++;
2954
2955 /* Track VK_KHR_buffer_device_address usage in the job */
2956 struct v3dv_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
2957 job->uses_buffer_device_address |= pipeline->uses_buffer_device_address;
2958
2959 /* If this job is serialized (has consumed a barrier) then check if we need
2960 * to sync at the binning stage by testing if the binning shaders involved
2961 * with the draw call require access to external resources.
2962 */
2963 if (job->serialize && (cmd_buffer->state.barrier.bcl_buffer_access ||
2964 cmd_buffer->state.barrier.bcl_image_access)) {
2965 assert(!job->needs_bcl_sync);
2966 if (cmd_buffer_binning_sync_required(cmd_buffer, pipeline,
2967 indexed, indirect)) {
2968 v3dv_cmd_buffer_consume_bcl_sync(cmd_buffer, job);
2969 }
2970 }
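/* Editor's note (illustrative example, not part of the original source): a
 * typical case assumed here is a compute dispatch writing a storage buffer
 * that the vertex (binning) shader of this draw then reads. The barrier has
 * recorded the destination access in bcl_buffer_access, and if the binning
 * shaders bound for this draw access external resources, the job is flagged
 * with needs_bcl_sync so binning does not start before the barrier is
 * satisfied.
 */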
2971
2972 /* GL shader state binds shaders, uniform and vertex attribute state. The
2973 * compiler injects uniforms to handle some descriptor types (such as
2974 * textures), so we need to regen that when descriptor state changes.
2975 *
2976 * We also need to emit new shader state if we have a dirty viewport since
2977 * that will require that we emit new uniform state for QUNIFORM_VIEWPORT_*.
2978 */
2979 uint32_t *dirty = &cmd_buffer->state.dirty;
2980 struct vk_dynamic_graphics_state *dyn = &cmd_buffer->vk.dynamic_graphics_state;
2981
2982 const bool dirty_uniform_state =
2983 update_gfx_uniform_state(cmd_buffer);
2984
2985 struct v3dv_device *device = cmd_buffer->device;
2986
2987 if (dirty_uniform_state || (*dirty & V3DV_CMD_DIRTY_VERTEX_BUFFER))
2988 v3d_X((&device->devinfo), cmd_buffer_emit_gl_shader_state)(cmd_buffer);
2989
2990 if (*dirty & (V3DV_CMD_DIRTY_PIPELINE) ||
2991 BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_CULL_MODE) ||
2992 BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_FRONT_FACE) ||
2993 BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_TEST_ENABLE) ||
2994 BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_DEPTH_BOUNDS_TEST_ENABLE) ||
2995 BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_DEPTH_BIAS_ENABLE) ||
2996 BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_RASTERIZER_DISCARD_ENABLE)) {
2997 v3d_X((&device->devinfo), cmd_buffer_emit_configuration_bits)(cmd_buffer);
2998 }
2999
3000 if (*dirty & (V3DV_CMD_DIRTY_PIPELINE)) {
3001 v3d_X((&device->devinfo), cmd_buffer_emit_varyings_state)(cmd_buffer);
3002 }
3003
3004 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VP_SCISSORS) ||
3005 BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VP_VIEWPORTS)) {
3006 emit_scissor(cmd_buffer);
3007 }
3008
3009 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VP_VIEWPORTS))
3010 v3d_X((&device->devinfo), cmd_buffer_emit_viewport)(cmd_buffer);
3011
3012 if (*dirty & V3DV_CMD_DIRTY_INDEX_BUFFER)
3013 v3d_X((&device->devinfo), cmd_buffer_emit_index_buffer)(cmd_buffer);
3014
3015 bool any_dynamic_stencil_dirty =
3016 BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_COMPARE_MASK) ||
3017 BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_WRITE_MASK) ||
3018 BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_REFERENCE) ||
3019 BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_OP);
3020
3021 if (*dirty & V3DV_CMD_DIRTY_PIPELINE || any_dynamic_stencil_dirty)
3022 v3d_X((&device->devinfo), cmd_buffer_emit_stencil)(cmd_buffer);
3023
3024 if (*dirty & V3DV_CMD_DIRTY_PIPELINE ||
3025 BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_DEPTH_BIAS_FACTORS)) {
3026 v3d_X((&device->devinfo), cmd_buffer_emit_depth_bias)(cmd_buffer);
3027 }
3028
3029 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_DEPTH_BOUNDS_TEST_BOUNDS))
3030 v3d_X((&device->devinfo), cmd_buffer_emit_depth_bounds)(cmd_buffer);
3031
3032 if (*dirty & V3DV_CMD_DIRTY_PIPELINE ||
3033 BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_CB_BLEND_CONSTANTS)) {
3034 v3d_X((&device->devinfo), cmd_buffer_emit_blend)(cmd_buffer);
3035 }
3036
3037 if (*dirty & V3DV_CMD_DIRTY_OCCLUSION_QUERY)
3038 v3d_X((&device->devinfo), cmd_buffer_emit_occlusion_query)(cmd_buffer);
3039
3040 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_LINE_WIDTH))
3041 v3d_X((&device->devinfo), cmd_buffer_emit_line_width)(cmd_buffer);
3042
3043 if (dyn->ia.primitive_topology == VK_PRIMITIVE_TOPOLOGY_POINT_LIST &&
3044 !job->emitted_default_point_size) {
3045 v3d_X((&device->devinfo), cmd_buffer_emit_default_point_size)(cmd_buffer);
3046 }
3047
3048 if (*dirty & V3DV_CMD_DIRTY_PIPELINE)
3049 v3d_X((&device->devinfo), cmd_buffer_emit_sample_state)(cmd_buffer);
3050
3051 if (*dirty & V3DV_CMD_DIRTY_PIPELINE ||
3052 BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_CB_COLOR_WRITE_ENABLES)) {
3053 v3d_X((&device->devinfo), cmd_buffer_emit_color_write_mask)(cmd_buffer);
3054 }
3055
3056 /* We disable double-buffer mode if indirect draws are used because in that
3057 * case we don't know the vertex count.
3058 */
3059 if (indirect) {
3060 job->can_use_double_buffer = false;
3061 } else if (job->can_use_double_buffer) {
3062 job_update_double_buffer_score(job, pipeline, vertex_count,
3063 &cmd_buffer->state.render_area.extent);
3064 }
3065
3066 cmd_buffer->state.dirty &= ~V3DV_CMD_DIRTY_PIPELINE;
3067 }
3068
3069 static inline void
3070 cmd_buffer_set_view_index(struct v3dv_cmd_buffer *cmd_buffer,
3071 uint32_t view_index)
3072 {
3073 if (view_index != cmd_buffer->state.view_index) {
3074 cmd_buffer->state.view_index = view_index;
3075 cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_VIEW_INDEX;
3076 }
3077 }
3078
3079 static void
3080 cmd_buffer_draw(struct v3dv_cmd_buffer *cmd_buffer,
3081 struct v3dv_draw_info *info)
3082 {
3083 uint32_t vertex_count =
3084 info->vertex_count * info->instance_count;
3085
3086 struct v3dv_render_pass *pass = cmd_buffer->state.pass;
3087 if (likely(!pass->multiview_enabled)) {
3088 cmd_buffer_set_view_index(cmd_buffer, 0);
3089 v3dv_cmd_buffer_emit_pre_draw(cmd_buffer, false, false, vertex_count);
3090 v3d_X((&cmd_buffer->device->devinfo), cmd_buffer_emit_draw)(cmd_buffer, info);
3091 return;
3092 }
3093
3094 uint32_t view_mask = pass->subpasses[cmd_buffer->state.subpass_idx].view_mask;
3095 while (view_mask) {
3096 cmd_buffer_set_view_index(cmd_buffer, u_bit_scan(&view_mask));
3097 v3dv_cmd_buffer_emit_pre_draw(cmd_buffer, false, false, vertex_count);
3098 v3d_X((&cmd_buffer->device->devinfo), cmd_buffer_emit_draw)(cmd_buffer, info);
3099 }
3100 }
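/* Editor's note (illustrative example, not part of the original source): with
 * multiview enabled the loop above records one draw per view set in the
 * subpass view mask. For a hypothetical view_mask of 0b0101, u_bit_scan
 * returns 0 and then 2 (clearing each bit as it goes), so the draw is emitted
 * twice, once with view index 0 and once with view index 2.
 */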
3101
3102 VKAPI_ATTR void VKAPI_CALL
3103 v3dv_CmdDraw(VkCommandBuffer commandBuffer,
3104 uint32_t vertexCount,
3105 uint32_t instanceCount,
3106 uint32_t firstVertex,
3107 uint32_t firstInstance)
3108 {
3109 if (vertexCount == 0 || instanceCount == 0)
3110 return;
3111
3112 V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
3113 struct v3dv_draw_info info = {};
3114 info.vertex_count = vertexCount;
3115 info.instance_count = instanceCount;
3116 info.first_instance = firstInstance;
3117 info.first_vertex = firstVertex;
3118
3119 cmd_buffer_draw(cmd_buffer, &info);
3120 }
3121
3122 VKAPI_ATTR void VKAPI_CALL
3123 v3dv_CmdDrawMultiEXT(VkCommandBuffer commandBuffer,
3124 uint32_t drawCount,
3125 const VkMultiDrawInfoEXT *pVertexInfo,
3126 uint32_t instanceCount,
3127 uint32_t firstInstance,
3128 uint32_t stride)
3129
3130 {
3131 if (drawCount == 0 || instanceCount == 0)
3132 return;
3133
3134 V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
3135
3136 uint32_t i = 0;
3137 vk_foreach_multi_draw(draw, i, pVertexInfo, drawCount, stride) {
3138 cmd_buffer->state.draw_id = i;
3139 cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_DRAW_ID;
3140
3141 struct v3dv_draw_info info = {};
3142 info.vertex_count = draw->vertexCount;
3143 info.instance_count = instanceCount;
3144 info.first_instance = firstInstance;
3145 info.first_vertex = draw->firstVertex;
3146
3147 cmd_buffer_draw(cmd_buffer, &info);
3148 }
3149 }
3150
3151 VKAPI_ATTR void VKAPI_CALL
3152 v3dv_CmdDrawIndexed(VkCommandBuffer commandBuffer,
3153 uint32_t indexCount,
3154 uint32_t instanceCount,
3155 uint32_t firstIndex,
3156 int32_t vertexOffset,
3157 uint32_t firstInstance)
3158 {
3159 if (indexCount == 0 || instanceCount == 0)
3160 return;
3161
3162 V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
3163
3164 uint32_t vertex_count = indexCount * instanceCount;
3165
3166 struct v3dv_render_pass *pass = cmd_buffer->state.pass;
3167 if (likely(!pass->multiview_enabled)) {
3168 cmd_buffer_set_view_index(cmd_buffer, 0);
3169 v3dv_cmd_buffer_emit_pre_draw(cmd_buffer, true, false, vertex_count);
3170 v3d_X((&cmd_buffer->device->devinfo), cmd_buffer_emit_draw_indexed)
3171 (cmd_buffer, indexCount, instanceCount,
3172 firstIndex, vertexOffset, firstInstance);
3173 return;
3174 }
3175
3176 uint32_t view_mask = pass->subpasses[cmd_buffer->state.subpass_idx].view_mask;
3177 while (view_mask) {
3178 cmd_buffer_set_view_index(cmd_buffer, u_bit_scan(&view_mask));
3179 v3dv_cmd_buffer_emit_pre_draw(cmd_buffer, true, false, vertex_count);
3180 v3d_X((&cmd_buffer->device->devinfo), cmd_buffer_emit_draw_indexed)
3181 (cmd_buffer, indexCount, instanceCount,
3182 firstIndex, vertexOffset, firstInstance);
3183 }
3184 }
3185
3186 VKAPI_ATTR void VKAPI_CALL
3187 v3dv_CmdDrawMultiIndexedEXT(VkCommandBuffer commandBuffer,
3188 uint32_t drawCount,
3189 const VkMultiDrawIndexedInfoEXT *pIndexInfo,
3190 uint32_t instanceCount,
3191 uint32_t firstInstance,
3192 uint32_t stride,
3193 const int32_t *pVertexOffset)
3194 {
3195 if (drawCount == 0 || instanceCount == 0)
3196 return;
3197
3198 V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
3199
3200 uint32_t i = 0;
3201 vk_foreach_multi_draw_indexed(draw, i, pIndexInfo, drawCount, stride) {
3202 uint32_t vertex_count = draw->indexCount * instanceCount;
3203 int32_t vertexOffset = pVertexOffset ? *pVertexOffset : draw->vertexOffset;
3204
3205 cmd_buffer->state.draw_id = i;
3206 cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_DRAW_ID;
3207
3208 struct v3dv_render_pass *pass = cmd_buffer->state.pass;
3209 if (likely(!pass->multiview_enabled)) {
3210 cmd_buffer_set_view_index(cmd_buffer, 0);
3211 v3dv_cmd_buffer_emit_pre_draw(cmd_buffer, true, false, vertex_count);
3212 v3d_X((&cmd_buffer->device->devinfo), cmd_buffer_emit_draw_indexed)
3213 (cmd_buffer, draw->indexCount, instanceCount,
3214 draw->firstIndex, vertexOffset, firstInstance);
3215 continue;
3216 }
3217 uint32_t view_mask = pass->subpasses[cmd_buffer->state.subpass_idx].view_mask;
3218 while (view_mask) {
3219 cmd_buffer_set_view_index(cmd_buffer, u_bit_scan(&view_mask));
3220 v3dv_cmd_buffer_emit_pre_draw(cmd_buffer, true, false, vertex_count);
3221 v3d_X((&cmd_buffer->device->devinfo), cmd_buffer_emit_draw_indexed)
3222 (cmd_buffer, draw->indexCount, instanceCount,
3223 draw->firstIndex, vertexOffset, firstInstance);
3224 }
3225 }
3226 }
3227
3228 VKAPI_ATTR void VKAPI_CALL
3229 v3dv_CmdDrawIndirect(VkCommandBuffer commandBuffer,
3230 VkBuffer _buffer,
3231 VkDeviceSize offset,
3232 uint32_t drawCount,
3233 uint32_t stride)
3234 {
3235 /* drawCount is the number of draws to execute, and can be zero. */
3236 if (drawCount == 0)
3237 return;
3238
3239 V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
3240 V3DV_FROM_HANDLE(v3dv_buffer, buffer, _buffer);
3241
3242 struct v3dv_render_pass *pass = cmd_buffer->state.pass;
3243 if (likely(!pass->multiview_enabled)) {
3244 cmd_buffer_set_view_index(cmd_buffer, 0);
3245 v3dv_cmd_buffer_emit_pre_draw(cmd_buffer, false, true, 0);
3246 v3d_X((&cmd_buffer->device->devinfo), cmd_buffer_emit_draw_indirect)
3247 (cmd_buffer, buffer, offset, drawCount, stride);
3248 return;
3249 }
3250
3251 uint32_t view_mask = pass->subpasses[cmd_buffer->state.subpass_idx].view_mask;
3252 while (view_mask) {
3253 cmd_buffer_set_view_index(cmd_buffer, u_bit_scan(&view_mask));
3254 v3dv_cmd_buffer_emit_pre_draw(cmd_buffer, false, true, 0);
3255 v3d_X((&cmd_buffer->device->devinfo), cmd_buffer_emit_draw_indirect)
3256 (cmd_buffer, buffer, offset, drawCount, stride);
3257 }
3258 }
3259
3260 VKAPI_ATTR void VKAPI_CALL
3261 v3dv_CmdDrawIndexedIndirect(VkCommandBuffer commandBuffer,
3262 VkBuffer _buffer,
3263 VkDeviceSize offset,
3264 uint32_t drawCount,
3265 uint32_t stride)
3266 {
3267 /* drawCount is the number of draws to execute, and can be zero. */
3268 if (drawCount == 0)
3269 return;
3270
3271 V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
3272 V3DV_FROM_HANDLE(v3dv_buffer, buffer, _buffer);
3273
3274 struct v3dv_render_pass *pass = cmd_buffer->state.pass;
3275 if (likely(!pass->multiview_enabled)) {
3276 cmd_buffer_set_view_index(cmd_buffer, 0);
3277 v3dv_cmd_buffer_emit_pre_draw(cmd_buffer, true, true, 0);
3278 v3d_X((&cmd_buffer->device->devinfo), cmd_buffer_emit_indexed_indirect)
3279 (cmd_buffer, buffer, offset, drawCount, stride);
3280 return;
3281 }
3282
3283 uint32_t view_mask = pass->subpasses[cmd_buffer->state.subpass_idx].view_mask;
3284 while (view_mask) {
3285 cmd_buffer_set_view_index(cmd_buffer, u_bit_scan(&view_mask));
3286 v3dv_cmd_buffer_emit_pre_draw(cmd_buffer, true, true, 0);
3287 v3d_X((&cmd_buffer->device->devinfo), cmd_buffer_emit_indexed_indirect)
3288 (cmd_buffer, buffer, offset, drawCount, stride);
3289 }
3290 }
3291
3292 static void
3293 handle_barrier(VkPipelineStageFlags2 srcStageMask, VkAccessFlags2 srcAccessMask,
3294 VkPipelineStageFlags2 dstStageMask, VkAccessFlags2 dstAccessMask,
3295 bool is_image_barrier, bool is_buffer_barrier,
3296 struct v3dv_barrier_state *state)
3297 {
3298 /* We only care about barriers between GPU jobs */
3299 if (srcStageMask == VK_PIPELINE_STAGE_2_HOST_BIT ||
3300 dstStageMask == VK_PIPELINE_STAGE_2_HOST_BIT) {
3301 return;
3302 }
3303
3304 /* Track source of the barrier */
3305 uint8_t src_mask = 0;
3306
3307 const VkPipelineStageFlags2 compute_mask =
3308 VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT;
3309 if (srcStageMask & (compute_mask | VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT))
3310 src_mask |= V3DV_BARRIER_COMPUTE_BIT;
3311
3312 const VkPipelineStageFlags2 transfer_mask =
3313 VK_PIPELINE_STAGE_2_ALL_TRANSFER_BIT |
3314 VK_PIPELINE_STAGE_2_COPY_BIT |
3315 VK_PIPELINE_STAGE_2_BLIT_BIT |
3316 VK_PIPELINE_STAGE_2_CLEAR_BIT;
3317 if (srcStageMask & (transfer_mask | VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT))
3318 src_mask |= V3DV_BARRIER_TRANSFER_BIT;
3319
3320 const VkPipelineStageFlags2 graphics_mask = ~(compute_mask | transfer_mask);
3321 if (srcStageMask & (graphics_mask | VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT))
3322 src_mask |= V3DV_BARRIER_GRAPHICS_BIT;
3323
3324 /* Track consumer of the barrier */
3325 if (dstStageMask & (compute_mask | VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT)) {
3326 state->dst_mask |= V3DV_BARRIER_COMPUTE_BIT;
3327 state->src_mask_compute |= src_mask;
3328 }
3329
3330 if (dstStageMask & (transfer_mask | VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT)) {
3331 state->dst_mask |= V3DV_BARRIER_TRANSFER_BIT;
3332 state->src_mask_transfer |= src_mask;
3333 }
3334
3335 if (dstStageMask & (graphics_mask | VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT)) {
3336 state->dst_mask |= V3DV_BARRIER_GRAPHICS_BIT;
3337 state->src_mask_graphics |= src_mask;
3338
3339 if (dstStageMask & (VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT |
3340 VK_PIPELINE_STAGE_2_INDEX_INPUT_BIT |
3341 VK_PIPELINE_STAGE_2_VERTEX_INPUT_BIT |
3342 VK_PIPELINE_STAGE_2_VERTEX_ATTRIBUTE_INPUT_BIT |
3343 VK_PIPELINE_STAGE_2_VERTEX_SHADER_BIT |
3344 VK_PIPELINE_STAGE_2_GEOMETRY_SHADER_BIT |
3345 VK_PIPELINE_STAGE_2_TESSELLATION_CONTROL_SHADER_BIT |
3346 VK_PIPELINE_STAGE_2_TESSELLATION_EVALUATION_SHADER_BIT |
3347 VK_PIPELINE_STAGE_2_DRAW_INDIRECT_BIT |
3348 VK_PIPELINE_STAGE_2_PRE_RASTERIZATION_SHADERS_BIT |
3349 VK_PIPELINE_STAGE_2_ALL_GRAPHICS_BIT |
3350 VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT)) {
3351 if (is_image_barrier)
3352 state->bcl_image_access |= dstAccessMask;
3353
3354 if (is_buffer_barrier)
3355 state->bcl_buffer_access |= dstAccessMask;
3356 }
3357 }
3358 }
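/* Editor's note (illustrative example, not part of the original source): as
 * an example of the classification above, a buffer barrier with
 * srcStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT and
 * dstStageMask = VK_PIPELINE_STAGE_2_VERTEX_SHADER_BIT would compute
 * src_mask = V3DV_BARRIER_COMPUTE_BIT, add V3DV_BARRIER_GRAPHICS_BIT to
 * state->dst_mask, merge the compute bit into state->src_mask_graphics and,
 * because the vertex shader is one of the listed binning stages, accumulate
 * dstAccessMask into state->bcl_buffer_access.
 */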
3359
3360 void
3361 v3dv_cmd_buffer_emit_pipeline_barrier(struct v3dv_cmd_buffer *cmd_buffer,
3362 const VkDependencyInfo *info)
3363 {
3364 uint32_t imageBarrierCount = info->imageMemoryBarrierCount;
3365 const VkImageMemoryBarrier2 *pImageBarriers = info->pImageMemoryBarriers;
3366
3367 uint32_t bufferBarrierCount = info->bufferMemoryBarrierCount;
3368 const VkBufferMemoryBarrier2 *pBufferBarriers = info->pBufferMemoryBarriers;
3369
3370 uint32_t memoryBarrierCount = info->memoryBarrierCount;
3371 const VkMemoryBarrier2 *pMemoryBarriers = info->pMemoryBarriers;
3372
3373 struct v3dv_barrier_state state = { 0 };
3374 for (uint32_t i = 0; i < imageBarrierCount; i++) {
3375 /* We can safely skip barriers for image layout transitions from UNDEFINED
3376 * layout.
3377 *
3378 * Notice that KHR_synchronization2 allows specifying barriers that don't
3379 * involve a layout transition by making oldLayout and newLayout the same,
3380 * including UNDEFINED.
3381 */
3382 if (pImageBarriers[i].oldLayout == VK_IMAGE_LAYOUT_UNDEFINED &&
3383 pImageBarriers[i].oldLayout != pImageBarriers[i].newLayout) {
3384 continue;
3385 }
3386
3387 handle_barrier(pImageBarriers[i].srcStageMask,
3388 pImageBarriers[i].srcAccessMask,
3389 pImageBarriers[i].dstStageMask,
3390 pImageBarriers[i].dstAccessMask,
3391 true, false, &state);
3392 }
3393
3394 for (uint32_t i = 0; i < bufferBarrierCount; i++) {
3395 handle_barrier(pBufferBarriers[i].srcStageMask,
3396 pBufferBarriers[i].srcAccessMask,
3397 pBufferBarriers[i].dstStageMask,
3398 pBufferBarriers[i].dstAccessMask,
3399 false, true, &state);
3400 }
3401
3402 for (uint32_t i = 0; i < memoryBarrierCount; i++) {
3403 handle_barrier(pMemoryBarriers[i].srcStageMask,
3404 pMemoryBarriers[i].srcAccessMask,
3405 pMemoryBarriers[i].dstStageMask,
3406 pMemoryBarriers[i].dstAccessMask,
3407 true, true, &state);
3408 }
3409
3410 /* Bail if we don't have any relevant barriers */
3411 if (!state.dst_mask)
3412 return;
3413
3414 /* If we have a recording job, finish it here */
3415 if (cmd_buffer->state.job)
3416 v3dv_cmd_buffer_finish_job(cmd_buffer);
3417
3418 /* Update barrier state in the command buffer */
3419 v3dv_cmd_buffer_merge_barrier_state(&cmd_buffer->state.barrier, &state);
3420 }
3421
3422 VKAPI_ATTR void VKAPI_CALL
3423 v3dv_CmdPipelineBarrier2(VkCommandBuffer commandBuffer,
3424 const VkDependencyInfo *pDependencyInfo)
3425 {
3426 V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
3427 v3dv_cmd_buffer_emit_pipeline_barrier(cmd_buffer, pDependencyInfo);
3428 }
3429
3430 VKAPI_ATTR void VKAPI_CALL
3431 v3dv_CmdBindVertexBuffers2(VkCommandBuffer commandBuffer,
3432 uint32_t firstBinding,
3433 uint32_t bindingCount,
3434 const VkBuffer *pBuffers,
3435 const VkDeviceSize *pOffsets,
3436 const VkDeviceSize *pSizes,
3437 const VkDeviceSize *pStrides)
3438 {
3439 V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
3440 struct v3dv_vertex_binding *vb = cmd_buffer->state.vertex_bindings;
3441
3442 assert(firstBinding + bindingCount <= MAX_VBS);
3443 bool vb_state_changed = false;
3444 if (pStrides) {
3445 vk_cmd_set_vertex_binding_strides(&cmd_buffer->vk,
3446 firstBinding, bindingCount,
3447 pStrides);
3448 struct vk_dynamic_graphics_state *dyn = &cmd_buffer->vk.dynamic_graphics_state;
3449 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VI_BINDING_STRIDES))
3450 vb_state_changed = true;
3451 }
3452
3453 for (uint32_t i = 0; i < bindingCount; i++) {
3454 struct v3dv_buffer *buffer = v3dv_buffer_from_handle(pBuffers[i]);
3455 if (vb[firstBinding + i].buffer != buffer) {
3456 vb[firstBinding + i].buffer = v3dv_buffer_from_handle(pBuffers[i]);
3457 vb_state_changed = true;
3458 }
3459
3460 if (vb[firstBinding + i].offset != pOffsets[i]) {
3461 vb[firstBinding + i].offset = pOffsets[i];
3462 vb_state_changed = true;
3463 }
3464 assert(pOffsets[i] <= buffer->size);
3465
3466 VkDeviceSize size;
3467 if (!pSizes || pSizes[i] == VK_WHOLE_SIZE)
3468 size = buffer->size - pOffsets[i];
3469 else
3470 size = pSizes[i];
3471 assert(pOffsets[i] + size <= buffer->size);
3472
3473 if (vb[firstBinding + i].size != size) {
3474 vb[firstBinding + i].size = size;
3475 vb_state_changed = true;
3476 }
3477 }
3478
3479 if (vb_state_changed)
3480 cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_VERTEX_BUFFER;
3481 }
3482
3483 VKAPI_ATTR void VKAPI_CALL
3484 v3dv_CmdBindIndexBuffer2KHR(VkCommandBuffer commandBuffer,
3485 VkBuffer buffer,
3486 VkDeviceSize offset,
3487 VkDeviceSize size,
3488 VkIndexType indexType)
3489 {
3490 V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
3491
3492 assert(buffer != VK_NULL_HANDLE);
3493
3494 if (size == VK_WHOLE_SIZE) {
3495 assert(v3dv_buffer_from_handle(buffer)->size >= offset);
3496 size = v3dv_buffer_from_handle(buffer)->size - offset;
3497 }
3498
3499 const uint32_t index_size = vk_index_type_to_bytes(indexType);
3500 if (buffer == cmd_buffer->state.index_buffer.buffer &&
3501 offset == cmd_buffer->state.index_buffer.offset &&
3502 size == cmd_buffer->state.index_buffer.size &&
3503 index_size == cmd_buffer->state.index_buffer.index_size) {
3504 return;
3505 }
3506
3507 cmd_buffer->state.index_buffer.buffer = buffer;
3508 cmd_buffer->state.index_buffer.offset = offset;
3509 cmd_buffer->state.index_buffer.size = size;
3510 cmd_buffer->state.index_buffer.index_size = index_size;
3511 cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_INDEX_BUFFER;
3512 }
3513
3514 VKAPI_ATTR void VKAPI_CALL
3515 v3dv_CmdSetLineStippleEXT(VkCommandBuffer commandBuffer,
3516 uint32_t lineStippleFactor,
3517 uint16_t lineStipplePattern)
3518 {
3519 /* We do not support stippled line rasterization so we just ignore this. */
3520 }
3521
3522 /**
3523 * This checks a descriptor set to see if we are binding any descriptors that would
3524 * involve sampling from a linear image (the hardware only supports this for
3525 * 1D images), and if so, attempts to create a tiled copy of the linear image
3526 * and rewrite the descriptor set to use that instead.
3527 *
3528 * This was added to support a scenario with Android where some part of the UI
3529 * wanted to show previews of linear swapchain images. For more details:
3530 * https://gitlab.freedesktop.org/mesa/mesa/-/issues/9712
3531 *
3532 * Currently this only supports linear sampling from a simple 2D image, but
3533 * it could be extended to support more cases if necessary.
3534 */
3535 static void
3536 handle_sample_from_linear_image(struct v3dv_cmd_buffer *cmd_buffer,
3537 struct v3dv_descriptor_set *set,
3538 bool is_compute)
3539 {
3540 for (int32_t i = 0; i < set->layout->binding_count; i++) {
3541 const struct v3dv_descriptor_set_binding_layout *blayout =
3542 &set->layout->binding[i];
3543 if (blayout->type != VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE &&
3544 blayout->type != VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER)
3545 continue;
3546
3547 struct v3dv_descriptor *desc = &set->descriptors[blayout->descriptor_index];
3548 if (!desc->image_view)
3549 continue;
3550
3551 struct v3dv_image *image = (struct v3dv_image *) desc->image_view->vk.image;
3552 struct v3dv_image_view *view = (struct v3dv_image_view *) desc->image_view;
3553 if (image->tiled || view->vk.view_type == VK_IMAGE_VIEW_TYPE_1D ||
3554 view->vk.view_type == VK_IMAGE_VIEW_TYPE_1D_ARRAY) {
3555 continue;
3556 }
3557
3558 /* FIXME: we can probably handle most of these restrictions too with
3559 * a bit of extra effort.
3560 */
3561 if (view->vk.view_type != VK_IMAGE_VIEW_TYPE_2D ||
3562 view->vk.level_count != 1 || view->vk.layer_count != 1 ||
3563 blayout->array_size != 1) {
3564 mesa_loge("Sampling from linear image is not supported. "
3565 "Expect corruption.\n");
3566 continue;
3567 }
3568
3569 /* We are sampling from a linear image. V3D doesn't support this
3570 * so we create a tiled copy of the image and rewrite the descriptor
3571 * to read from it instead.
3572 */
3573 perf_debug("Sampling from linear image is not supported natively and "
3574 "requires a copy.\n");
3575
3576 struct v3dv_device *device = cmd_buffer->device;
3577 VkDevice vk_device = v3dv_device_to_handle(device);
3578
3579 /* Allocate shadow tiled image if needed, we only do this once for
3580 * each image, on the first sampling attempt. We need to take a lock
3581 * since we may be trying to do the same in another command buffer in
3582 * a separate thread.
3583 */
3584 mtx_lock(&device->meta.mtx);
3585 VkResult result;
3586 VkImage tiled_image;
3587 if (image->shadow) {
3588 tiled_image = v3dv_image_to_handle(image->shadow);
3589 } else {
3590 VkImageCreateInfo image_info = {
3591 .sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO,
3592 .flags = image->vk.create_flags,
3593 .imageType = image->vk.image_type,
3594 .format = image->vk.format,
3595 .extent = {
3596 image->vk.extent.width,
3597 image->vk.extent.height,
3598 image->vk.extent.depth,
3599 },
3600 .mipLevels = image->vk.mip_levels,
3601 .arrayLayers = image->vk.array_layers,
3602 .samples = image->vk.samples,
3603 .tiling = VK_IMAGE_TILING_OPTIMAL,
3604 .usage = image->vk.usage,
3605 .sharingMode = VK_SHARING_MODE_EXCLUSIVE,
3606 .queueFamilyIndexCount = 0,
3607 .initialLayout = VK_IMAGE_LAYOUT_GENERAL,
3608 };
3609 result = v3dv_CreateImage(vk_device, &image_info,
3610 &device->vk.alloc, &tiled_image);
3611 if (result != VK_SUCCESS) {
3612 mesa_loge("Failed to copy linear 2D image for sampling."
3613 "Expect corruption.\n");
3614 mtx_unlock(&device->meta.mtx);
3615 continue;
3616 }
3617
3618 bool disjoint = image->vk.create_flags & VK_IMAGE_CREATE_DISJOINT_BIT;
3619 VkImageMemoryRequirementsInfo2 reqs_info = {
3620 .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_REQUIREMENTS_INFO_2,
3621 .image = tiled_image,
3622 };
3623
3624 assert(image->plane_count <= V3DV_MAX_PLANE_COUNT);
3625 for (int p = 0; p < (disjoint ? image->plane_count : 1); p++) {
3626 VkImageAspectFlagBits plane_aspect = VK_IMAGE_ASPECT_PLANE_0_BIT << p;
3627 VkImagePlaneMemoryRequirementsInfo plane_info = {
3628 .sType = VK_STRUCTURE_TYPE_IMAGE_PLANE_MEMORY_REQUIREMENTS_INFO,
3629 .planeAspect = plane_aspect,
3630 };
3631 if (disjoint)
3632 reqs_info.pNext = &plane_info;
3633
3634 VkMemoryRequirements2 reqs = {
3635 .sType = VK_STRUCTURE_TYPE_MEMORY_REQUIREMENTS_2,
3636 };
3637 v3dv_GetImageMemoryRequirements2(vk_device, &reqs_info, &reqs);
3638
3639 VkDeviceMemory mem;
3640 VkMemoryAllocateInfo alloc_info = {
3641 .sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO,
3642 .allocationSize = reqs.memoryRequirements.size,
3643 .memoryTypeIndex = 0,
3644 };
3645 result = v3dv_AllocateMemory(vk_device, &alloc_info,
3646 &device->vk.alloc, &mem);
3647 if (result != VK_SUCCESS) {
3648 mesa_loge("Failed to copy linear 2D image for sampling."
3649 "Expect corruption.\n");
3650 v3dv_DestroyImage(vk_device, tiled_image, &device->vk.alloc);
3651 mtx_unlock(&device->meta.mtx);
3652 continue;
3653 }
3654
3655 VkBindImageMemoryInfo bind_info = {
3656 .sType = VK_STRUCTURE_TYPE_BIND_IMAGE_MEMORY_INFO,
3657 .image = tiled_image,
3658 .memory = mem,
3659 .memoryOffset = 0,
3660 };
3661 VkBindImagePlaneMemoryInfo plane_bind_info = {
3662 .sType = VK_STRUCTURE_TYPE_BIND_IMAGE_PLANE_MEMORY_INFO,
3663 .planeAspect = plane_aspect,
3664 };
3665 if (disjoint)
3666 bind_info.pNext = &plane_bind_info;
3667 result = v3dv_BindImageMemory2(vk_device, 1, &bind_info);
3668 if (result != VK_SUCCESS) {
3669 mesa_loge("Failed to copy linear 2D image for sampling."
3670 "Expect corruption.\n");
3671 v3dv_DestroyImage(vk_device, tiled_image, &device->vk.alloc);
3672 v3dv_FreeMemory(vk_device, mem, &device->vk.alloc);
3673 mtx_unlock(&device->meta.mtx);
3674 continue;
3675 }
3676 }
3677
3678 image->shadow = v3dv_image_from_handle(tiled_image);
3679 }
3680
3681 /* Create a shadow view that refers to the tiled image if needed */
3682 VkImageView tiled_view;
3683 if (view->shadow) {
3684 tiled_view = v3dv_image_view_to_handle(view->shadow);
3685 } else {
3686 VkImageViewCreateInfo view_info = {
3687 .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO,
3688 .flags = view->vk.create_flags,
3689 .image = tiled_image,
3690 .viewType = view->vk.view_type,
3691 .format = view->vk.format,
3692 .components = view->vk.swizzle,
3693 .subresourceRange = {
3694 .aspectMask = view->vk.aspects,
3695 .baseMipLevel = view->vk.base_mip_level,
3696 .levelCount = view->vk.level_count,
3697 .baseArrayLayer = view->vk.base_array_layer,
3698 .layerCount = view->vk.layer_count,
3699 },
3700 };
3701 result = v3dv_create_image_view(device, &view_info, &tiled_view);
3702 if (result != VK_SUCCESS) {
3703 mesa_loge("Failed to copy linear 2D image for sampling."
3704 "Expect corruption.\n");
3705 mtx_unlock(&device->meta.mtx);
3706 continue;
3707 }
3708 }
3709
3710 view->shadow = v3dv_image_view_from_handle(tiled_view);
3711
3712 mtx_unlock(&device->meta.mtx);
3713
3714 /* Rewrite the descriptor to use the shadow view */
3715 VkDescriptorImageInfo desc_image_info = {
3716 .sampler = v3dv_sampler_to_handle(desc->sampler),
3717 .imageView = tiled_view,
3718 .imageLayout = VK_IMAGE_LAYOUT_GENERAL,
3719 };
3720 VkWriteDescriptorSet write = {
3721 .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET,
3722 .dstSet = v3dv_descriptor_set_to_handle(set),
3723 .dstBinding = i,
3724 .dstArrayElement = 0, /* Assumes array_size is 1 */
3725 .descriptorCount = 1,
3726 .descriptorType = desc->type,
3727 .pImageInfo = &desc_image_info,
3728 };
3729 v3dv_UpdateDescriptorSets(vk_device, 1, &write, 0, NULL);
3730
3731 /* Now we need to actually copy the pixel data from the linear image
3732 * into the tiled image storage to ensure it is up-to-date.
3733 *
3734 * FIXME: ideally we would track if the linear image is dirty and skip
3735 * this step otherwise, but that would be a bit of a pain.
3736 *
3737 * Note that we need to place the copy job *before* the current job in
3738 * the command buffer state so we have the tiled image ready to process
3739 * an upcoming draw call in the current job that samples from it.
3740 *
3741 * Also, we need to use the TFU path for this copy, as any other path
3742 * will use the tile buffer and would require a new framebuffer setup,
3743 * thus requiring extra work to stop and resume any in-flight render
3744 * pass. Since we are converting a full 2D texture here the TFU should
3745 * be able to handle this.
3746 */
3747 for (int p = 0; p < image->plane_count; p++) {
3748 VkImageAspectFlagBits plane_aspect = VK_IMAGE_ASPECT_PLANE_0_BIT << p;
3749 struct VkImageCopy2 copy_region = {
3750 .sType = VK_STRUCTURE_TYPE_IMAGE_COPY_2,
3751 .srcSubresource = {
3752 .aspectMask = image->plane_count == 1 ?
3753 view->vk.aspects : (view->vk.aspects & plane_aspect),
3754 .mipLevel = view->vk.base_mip_level,
3755 .baseArrayLayer = view->vk.base_array_layer,
3756 .layerCount = view->vk.layer_count,
3757 },
3758 .srcOffset = {0, 0, 0 },
3759 .dstSubresource = {
3760 .aspectMask = image->plane_count == 1 ?
3761 view->vk.aspects : (view->vk.aspects & plane_aspect),
3762 .mipLevel = view->vk.base_mip_level,
3763 .baseArrayLayer = view->vk.base_array_layer,
3764 .layerCount = view->vk.layer_count,
3765 },
3766 .dstOffset = { 0, 0, 0},
3767 .extent = {
3768 image->planes[p].width,
3769 image->planes[p].height,
3770 1,
3771 },
3772 };
3773 struct v3dv_image *copy_src = image;
3774 struct v3dv_image *copy_dst = v3dv_image_from_handle(tiled_image);
3775 bool ok = v3dv_cmd_buffer_copy_image_tfu(cmd_buffer, copy_dst, copy_src,
3776 &copy_region);
3777 if (ok) {
3778 /* This will emit the TFU job right before the current in-flight
3779 * job (if any), since in-flight jobs are only added to the list
3780 * when finished.
3781 */
3782 struct v3dv_job *tfu_job =
3783 list_last_entry(&cmd_buffer->jobs, struct v3dv_job, list_link);
3784 assert(tfu_job->type == V3DV_JOB_TYPE_GPU_TFU);
3785 /* Serialize the copy since we don't know who is producing the linear
3786 * image and we need the image to be ready by the time the copy
3787 * executes.
3788 */
3789 tfu_job->serialize = V3DV_BARRIER_ALL;
3790
3791 /* Also, we need to ensure the TFU copy job completes before anything
3792 * else coming after that may be using the tiled shadow copy.
3793 */
3794 if (cmd_buffer->state.job) {
3795 /* If we already had an in-flight job (i.e. we are in a render
3796 * pass) make sure the job waits for the TFU copy.
3797 */
3798 cmd_buffer->state.job->serialize |= V3DV_BARRIER_TRANSFER_BIT;
3799 } else {
3800 /* Otherwise, make sure the follow-up job syncs with the TFU
3801 * job we just added when it is created by adding the
3802 * corresponding barrier state.
3803 */
3804 if (!is_compute) {
3805 cmd_buffer->state.barrier.dst_mask |= V3DV_BARRIER_GRAPHICS_BIT;
3806 cmd_buffer->state.barrier.src_mask_graphics |= V3DV_BARRIER_TRANSFER_BIT;
3807 } else {
3808 cmd_buffer->state.barrier.dst_mask |= V3DV_BARRIER_COMPUTE_BIT;
3809 cmd_buffer->state.barrier.src_mask_compute |= V3DV_BARRIER_TRANSFER_BIT;
3810 }
3811 }
3812 } else {
3813 mesa_loge("Failed to copy linear 2D image for sampling."
3814 "TFU doesn't support copy. Expect corruption.\n");
3815 }
3816 }
3817 }
3818 }
3819
3820 VKAPI_ATTR void VKAPI_CALL
3821 v3dv_CmdBindDescriptorSets(VkCommandBuffer commandBuffer,
3822 VkPipelineBindPoint pipelineBindPoint,
3823 VkPipelineLayout _layout,
3824 uint32_t firstSet,
3825 uint32_t descriptorSetCount,
3826 const VkDescriptorSet *pDescriptorSets,
3827 uint32_t dynamicOffsetCount,
3828 const uint32_t *pDynamicOffsets)
3829 {
3830 V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
3831 V3DV_FROM_HANDLE(v3dv_pipeline_layout, layout, _layout);
3832
3833 uint32_t dyn_index = 0;
3834
3835 assert(firstSet + descriptorSetCount <= MAX_SETS);
3836
3837 struct v3dv_descriptor_state *descriptor_state =
3838 pipelineBindPoint == VK_PIPELINE_BIND_POINT_COMPUTE ?
3839 &cmd_buffer->state.compute.descriptor_state :
3840 &cmd_buffer->state.gfx.descriptor_state;
3841
3842 VkShaderStageFlags dirty_stages = 0;
3843 bool descriptor_state_changed = false;
3844 for (uint32_t i = 0; i < descriptorSetCount; i++) {
3845 V3DV_FROM_HANDLE(v3dv_descriptor_set, set, pDescriptorSets[i]);
3846 uint32_t index = firstSet + i;
3847
3848 descriptor_state->valid |= (1u << index);
3849 if (descriptor_state->descriptor_sets[index] != set) {
3850 descriptor_state->descriptor_sets[index] = set;
3851 dirty_stages |= set->layout->shader_stages;
3852 descriptor_state_changed = true;
3853
3854 /* Check if we are sampling from a linear 2D image. This is not
3855 * supported in hardware, but may be required for some applications
3856 * so we will transparently convert to tiled at the expense of
3857 * performance.
3858 */
3859 handle_sample_from_linear_image(cmd_buffer, set,
3860 pipelineBindPoint ==
3861 VK_PIPELINE_BIND_POINT_COMPUTE);
3862 }
3863
3864 for (uint32_t j = 0; j < set->layout->dynamic_offset_count; j++, dyn_index++) {
3865 uint32_t idx = j + layout->set[i + firstSet].dynamic_offset_start;
3866
3867 if (descriptor_state->dynamic_offsets[idx] != pDynamicOffsets[dyn_index]) {
3868 descriptor_state->dynamic_offsets[idx] = pDynamicOffsets[dyn_index];
3869 dirty_stages |= set->layout->shader_stages;
3870 descriptor_state_changed = true;
3871 }
3872 }
3873 }
3874
3875 if (descriptor_state_changed) {
3876 if (pipelineBindPoint == VK_PIPELINE_BIND_POINT_GRAPHICS) {
3877 cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_DESCRIPTOR_SETS;
3878 cmd_buffer->state.dirty_descriptor_stages |= dirty_stages & VK_SHADER_STAGE_ALL_GRAPHICS;
3879 } else {
3880 cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_COMPUTE_DESCRIPTOR_SETS;
3881 cmd_buffer->state.dirty_descriptor_stages |= VK_SHADER_STAGE_COMPUTE_BIT;
3882 }
3883 }
3884 }
3885
3886 VKAPI_ATTR void VKAPI_CALL
3887 v3dv_CmdPushConstants(VkCommandBuffer commandBuffer,
3888 VkPipelineLayout layout,
3889 VkShaderStageFlags stageFlags,
3890 uint32_t offset,
3891 uint32_t size,
3892 const void *pValues)
3893 {
3894 V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
3895
3896 if (!memcmp((uint8_t *) cmd_buffer->state.push_constants_data + offset,
3897 pValues, size)) {
3898 return;
3899 }
3900
3901 memcpy((uint8_t *) cmd_buffer->state.push_constants_data + offset,
3902 pValues, size);
3903 cmd_buffer->state.push_constants_size =
3904 MAX2(offset + size, cmd_buffer->state.push_constants_size);
3905
3906 cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_PUSH_CONSTANTS |
3907 V3DV_CMD_DIRTY_PUSH_CONSTANTS_UBO;
3908 cmd_buffer->state.dirty_push_constants_stages |= stageFlags;
3909 }
3910
3911 void
3912 v3dv_cmd_buffer_ensure_array_state(struct v3dv_cmd_buffer *cmd_buffer,
3913 uint32_t slot_size,
3914 uint32_t used_count,
3915 uint32_t *alloc_count,
3916 void **ptr)
3917 {
3918 if (used_count >= *alloc_count) {
3919 const uint32_t prev_slot_count = *alloc_count;
3920 void *old_buffer = *ptr;
3921
3922 const uint32_t new_slot_count = MAX2(*alloc_count * 2, 4);
3923 const uint32_t bytes = new_slot_count * slot_size;
3924 *ptr = vk_alloc(&cmd_buffer->device->vk.alloc, bytes, 8,
3925 VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
3926 if (*ptr == NULL) {
3927 mesa_loge("Error: failed to allocate CPU buffer for query.\n");
3928 v3dv_flag_oom(cmd_buffer, NULL);
3929 return;
3930 }
3931
3932 if (old_buffer)
3933 memcpy(*ptr, old_buffer, prev_slot_count * slot_size);
3934 *alloc_count = new_slot_count;
3935 }
3936 assert(used_count < *alloc_count);
3937 }
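/* Editor's note (illustrative example, not part of the original source): the
 * array above grows geometrically, new_slot_count = MAX2(alloc_count * 2, 4),
 * so a pool of end-query states would grow 0 -> 4 -> 8 -> 16 slots as entries
 * are appended, copying the previously used slots on each reallocation.
 */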
3938
3939 void
3940 v3dv_cmd_buffer_begin_query(struct v3dv_cmd_buffer *cmd_buffer,
3941 struct v3dv_query_pool *pool,
3942 uint32_t query,
3943 VkQueryControlFlags flags)
3944 {
3945 assert(query < pool->query_count);
3946 switch (pool->query_type) {
3947 case VK_QUERY_TYPE_OCCLUSION:
3948 /* FIXME: we only support one active occlusion query for now */
3949 assert(cmd_buffer->state.query.active_query.bo == NULL);
3950
3951 cmd_buffer->state.query.active_query.bo = pool->occlusion.bo;
3952 cmd_buffer->state.query.active_query.offset =
3953 pool->queries[query].occlusion.offset;
3954 cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_OCCLUSION_QUERY;
3955 break;
3956 case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: {
3957 assert(cmd_buffer->state.query.active_query.perf == NULL);
3958 if (cmd_buffer->state.pass)
3959 v3dv_cmd_buffer_subpass_finish(cmd_buffer);
3960
3961 cmd_buffer->state.query.active_query.perf =
3962 &pool->queries[query].perf;
3963
3964 if (cmd_buffer->state.pass) {
3965 v3dv_cmd_buffer_subpass_resume(cmd_buffer,
3966 cmd_buffer->state.subpass_idx);
3967 }
3968 break;
3969 }
3970 default:
3971 unreachable("Unsupported query type");
3972 }
3973 }
3974
3975 void
3976 v3dv_cmd_buffer_pause_occlusion_query(struct v3dv_cmd_buffer *cmd_buffer)
3977 {
3978 struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
3979 struct v3dv_bo *occlusion_query_bo = state->query.active_query.bo;
3980 if (occlusion_query_bo) {
3981 assert(!state->query.active_query.paused_bo);
3982 state->query.active_query.paused_bo = occlusion_query_bo;
3983 state->query.active_query.bo = NULL;
3984 state->dirty |= V3DV_CMD_DIRTY_OCCLUSION_QUERY;
3985 }
3986 }
3987
3988 void
3989 v3dv_cmd_buffer_resume_occlusion_query(struct v3dv_cmd_buffer *cmd_buffer)
3990 {
3991 struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
3992 struct v3dv_bo *occlusion_query_bo = state->query.active_query.paused_bo;
3993 if (occlusion_query_bo) {
3994 assert(!state->query.active_query.bo);
3995 state->query.active_query.bo = occlusion_query_bo;
3996 state->query.active_query.paused_bo = NULL;
3997 state->dirty |= V3DV_CMD_DIRTY_OCCLUSION_QUERY;
3998 }
3999 }
4000
4001 static void
4002 v3dv_cmd_buffer_schedule_end_query(struct v3dv_cmd_buffer *cmd_buffer,
4003 struct v3dv_query_pool *pool,
4004 uint32_t query)
4005 {
4006 assert(query < pool->query_count);
4007 assert(pool->query_type == VK_QUERY_TYPE_OCCLUSION ||
4008 pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR);
4009
4010 /* For occlusion queries in the middle of a render pass we don't want to
4011 * split the current job at the EndQuery just to emit query availability,
4012 * instead we queue this state in the command buffer and we emit it when
4013 * we finish the current job.
4014 */
4015 if (cmd_buffer->state.pass &&
4016 pool->query_type == VK_QUERY_TYPE_OCCLUSION) {
4017 struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
4018 v3dv_cmd_buffer_ensure_array_state(cmd_buffer,
4019 sizeof(struct v3dv_end_query_info),
4020 state->query.end.used_count,
4021 &state->query.end.alloc_count,
4022 (void **) &state->query.end.states);
4023 v3dv_return_if_oom(cmd_buffer, NULL);
4024
4025 struct v3dv_end_query_info *info =
4026 &state->query.end.states[state->query.end.used_count++];
4027
4028 info->pool = pool;
4029 info->query = query;
4030
4031 /* From the Vulkan spec:
4032 *
4033 * "If queries are used while executing a render pass instance that has
4034 * multiview enabled, the query uses N consecutive query indices in
4035 * the query pool (starting at query) where N is the number of bits set
4036 * in the view mask in the subpass the query is used in. How the
4037 * numerical results of the query are distributed among the queries is
4038 * implementation-dependent."
4039 *
4040 * In our case, only the first query is used but this means we still need
4041 * to flag the other queries as available so we don't emit errors when
4042 * the applications attempt to retrieve values from them.
4043 */
4044 struct v3dv_render_pass *pass = cmd_buffer->state.pass;
4045 if (!pass->multiview_enabled) {
4046 info->count = 1;
4047 } else {
4048 struct v3dv_subpass *subpass = &pass->subpasses[state->subpass_idx];
4049 info->count = util_bitcount(subpass->view_mask);
4050 }
4051 } else {
4052 /* Otherwise, schedule the end query job immediately.
4053 *
4054 * Multiview queries cannot cross subpass boundaries, so query count is
4055 * always 1.
4056 */
4057 if (pool->query_type == VK_QUERY_TYPE_OCCLUSION)
4058 v3dv_cmd_buffer_emit_set_query_availability(cmd_buffer, pool, query, 1, 1);
4059 else
4060 cmd_buffer_emit_end_query_cpu(cmd_buffer, pool, query, 1);
4061 }
4062 }
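/* Editor's note (illustrative example, not part of the original source): for
 * the multiview case handled above, a hypothetical subpass view_mask of
 * 0b1011 gives info->count = util_bitcount(0b1011) = 3, so queries 'query'
 * through 'query + 2' are later flagged as available even though only the
 * first one receives the actual result.
 */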
4063
4064 static void
4065 v3dv_cmd_buffer_end_occlusion_query(struct v3dv_cmd_buffer *cmd_buffer,
4066 struct v3dv_query_pool *pool,
4067 uint32_t query)
4068 {
4069 assert(query < pool->query_count);
4070 assert(cmd_buffer->state.query.active_query.bo != NULL);
4071
4072 v3dv_cmd_buffer_schedule_end_query(cmd_buffer, pool, query);
4073
4074 cmd_buffer->state.query.active_query.bo = NULL;
4075 cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_OCCLUSION_QUERY;
4076 }
4077
4078 static void
4079 v3dv_cmd_buffer_end_performance_query(struct v3dv_cmd_buffer *cmd_buffer,
4080 struct v3dv_query_pool *pool,
4081 uint32_t query)
4082 {
4083 assert(query < pool->query_count);
4084 assert(cmd_buffer->state.query.active_query.perf != NULL);
4085
4086 if (cmd_buffer->state.pass)
4087 v3dv_cmd_buffer_subpass_finish(cmd_buffer);
4088
4089 v3dv_cmd_buffer_schedule_end_query(cmd_buffer, pool, query);
4090
4091 cmd_buffer->state.query.active_query.perf = NULL;
4092
4093 if (cmd_buffer->state.pass)
4094 v3dv_cmd_buffer_subpass_resume(cmd_buffer, cmd_buffer->state.subpass_idx);
4095 }
4096
4097 void v3dv_cmd_buffer_end_query(struct v3dv_cmd_buffer *cmd_buffer,
4098 struct v3dv_query_pool *pool,
4099 uint32_t query)
4100 {
4101 switch (pool->query_type) {
4102 case VK_QUERY_TYPE_OCCLUSION:
4103 v3dv_cmd_buffer_end_occlusion_query(cmd_buffer, pool, query);
4104 break;
4105 case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR:
4106 v3dv_cmd_buffer_end_performance_query(cmd_buffer, pool, query);
4107 break;
4108 default:
4109 unreachable("Unsupported query type");
4110 }
4111 }
4112
4113 void
4114 v3dv_cmd_buffer_add_tfu_job(struct v3dv_cmd_buffer *cmd_buffer,
4115 struct drm_v3d_submit_tfu *tfu)
4116 {
4117 struct v3dv_device *device = cmd_buffer->device;
4118 struct v3dv_job *job = vk_zalloc(&device->vk.alloc,
4119 sizeof(struct v3dv_job), 8,
4120 VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
4121 if (!job) {
4122 v3dv_flag_oom(cmd_buffer, NULL);
4123 return;
4124 }
4125
4126 v3dv_job_init(job, V3DV_JOB_TYPE_GPU_TFU, device, cmd_buffer, -1);
4127 job->tfu = *tfu;
4128 list_addtail(&job->list_link, &cmd_buffer->jobs);
4129 }
4130
4131 VKAPI_ATTR void VKAPI_CALL
4132 v3dv_CmdWriteTimestamp2(VkCommandBuffer commandBuffer,
4133 VkPipelineStageFlags2 stage,
4134 VkQueryPool queryPool,
4135 uint32_t query)
4136 {
4137 V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
4138 V3DV_FROM_HANDLE(v3dv_query_pool, query_pool, queryPool);
4139
4140 /* If this is called inside a render pass we need to finish the current
4141 * job here...
4142 */
4143 struct v3dv_render_pass *pass = cmd_buffer->state.pass;
4144 if (pass)
4145 v3dv_cmd_buffer_finish_job(cmd_buffer);
4146
4147 struct v3dv_job *job =
4148 v3dv_cmd_buffer_create_cpu_job(cmd_buffer->device,
4149 V3DV_JOB_TYPE_CPU_TIMESTAMP_QUERY,
4150 cmd_buffer, -1);
4151 v3dv_return_if_oom(cmd_buffer, NULL);
4152
4153 job->cpu.query_timestamp.pool = query_pool;
4154 job->cpu.query_timestamp.query = query;
4155
4156 if (!pass || !pass->multiview_enabled) {
4157 job->cpu.query_timestamp.count = 1;
4158 } else {
4159 struct v3dv_subpass *subpass =
4160 &pass->subpasses[cmd_buffer->state.subpass_idx];
4161 job->cpu.query_timestamp.count = util_bitcount(subpass->view_mask);
4162 }
4163
4164 list_addtail(&job->list_link, &cmd_buffer->jobs);
4165 cmd_buffer->state.job = NULL;
4166
4167 /* ...and resume the subpass after the timestamp */
4168 if (cmd_buffer->state.pass)
4169 v3dv_cmd_buffer_subpass_resume(cmd_buffer, cmd_buffer->state.subpass_idx);
4170 }
4171
4172 static void
4173 cmd_buffer_emit_pre_dispatch(struct v3dv_cmd_buffer *cmd_buffer)
4174 {
4175 assert(cmd_buffer->state.compute.pipeline);
4176 assert(cmd_buffer->state.compute.pipeline->active_stages ==
4177 VK_SHADER_STAGE_COMPUTE_BIT);
4178
4179 cmd_buffer->state.dirty &= ~(V3DV_CMD_DIRTY_COMPUTE_PIPELINE |
4180 V3DV_CMD_DIRTY_COMPUTE_DESCRIPTOR_SETS);
4181 cmd_buffer->state.dirty_descriptor_stages &= ~VK_SHADER_STAGE_COMPUTE_BIT;
4182 cmd_buffer->state.dirty_push_constants_stages &= ~VK_SHADER_STAGE_COMPUTE_BIT;
4183 }
4184
4185 void
4186 v3dv_cmd_buffer_rewrite_indirect_csd_job(
4187 struct v3dv_device *device,
4188 struct v3dv_csd_indirect_cpu_job_info *info,
4189 const uint32_t *wg_counts)
4190 {
4191 assert(info->csd_job);
4192 struct v3dv_job *job = info->csd_job;
4193
4194 assert(job->type == V3DV_JOB_TYPE_GPU_CSD);
4195 assert(wg_counts[0] > 0 && wg_counts[1] > 0 && wg_counts[2] > 0);
4196
4197 struct drm_v3d_submit_csd *submit = &job->csd.submit;
4198
4199 job->csd.wg_count[0] = wg_counts[0];
4200 job->csd.wg_count[1] = wg_counts[1];
4201 job->csd.wg_count[2] = wg_counts[2];
4202
4203 submit->cfg[0] = wg_counts[0] << V3D_CSD_CFG012_WG_COUNT_SHIFT;
4204 submit->cfg[1] = wg_counts[1] << V3D_CSD_CFG012_WG_COUNT_SHIFT;
4205 submit->cfg[2] = wg_counts[2] << V3D_CSD_CFG012_WG_COUNT_SHIFT;
4206
4207 uint32_t num_batches = DIV_ROUND_UP(info->wg_size, 16) *
4208 (wg_counts[0] * wg_counts[1] * wg_counts[2]);
4209 /* V3D 7.1.6 and later don't subtract 1 from the number of batches */
4210 if (device->devinfo.ver < 71 ||
4211 (device->devinfo.ver == 71 && device->devinfo.rev < 6)) {
4212 submit->cfg[4] = num_batches - 1;
4213 } else {
4214 submit->cfg[4] = num_batches;
4215 }
4216 assert(submit->cfg[4] != ~0);
4217
4218 if (info->needs_wg_uniform_rewrite) {
4219 /* Make sure the GPU is not currently accessing the indirect CL for this
4220 * job, since we are about to overwrite some of the uniform data.
4221 */
4222 v3dv_bo_wait(job->device, job->indirect.bo, OS_TIMEOUT_INFINITE);
4223
4224 for (uint32_t i = 0; i < 3; i++) {
4225 if (info->wg_uniform_offsets[i]) {
4226 /* Sanity check that our uniform pointers are within the allocated
4227 * BO space for our indirect CL.
4228 */
4229 assert(info->wg_uniform_offsets[i] >= (uint32_t *) job->indirect.base);
4230 assert(info->wg_uniform_offsets[i] < (uint32_t *) job->indirect.next);
4231 *(info->wg_uniform_offsets[i]) = wg_counts[i];
4232 }
4233 }
4234 }
4235 }
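/* Editor's note (illustrative example, not part of the original source): as a
 * worked example of the batch count computed above, assuming a hypothetical
 * info->wg_size of 64 invocations per workgroup and indirect counts
 * (2, 2, 1), num_batches = DIV_ROUND_UP(64, 16) * (2 * 2 * 1) = 4 * 4 = 16;
 * cfg[4] is then 15 on hardware older than V3D 7.1.6 and 16 otherwise.
 */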
4236
4237 static struct v3dv_job *
4238 cmd_buffer_create_csd_job(struct v3dv_cmd_buffer *cmd_buffer,
4239 uint32_t base_offset_x,
4240 uint32_t base_offset_y,
4241 uint32_t base_offset_z,
4242 uint32_t group_count_x,
4243 uint32_t group_count_y,
4244 uint32_t group_count_z,
4245 uint32_t **wg_uniform_offsets_out,
4246 uint32_t *wg_size_out)
4247 {
4248 struct v3dv_device *device = cmd_buffer->device;
4249 struct v3dv_pipeline *pipeline = cmd_buffer->state.compute.pipeline;
4250 assert(pipeline && pipeline->shared_data->variants[BROADCOM_SHADER_COMPUTE]);
4251 struct v3dv_shader_variant *cs_variant =
4252 pipeline->shared_data->variants[BROADCOM_SHADER_COMPUTE];
4253
4254 struct v3dv_job *job = vk_zalloc(&cmd_buffer->device->vk.alloc,
4255 sizeof(struct v3dv_job), 8,
4256 VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
4257 if (!job) {
4258 v3dv_flag_oom(cmd_buffer, NULL);
4259 return NULL;
4260 }
4261
4262 v3dv_job_init(job, V3DV_JOB_TYPE_GPU_CSD, cmd_buffer->device, cmd_buffer, -1);
4263 cmd_buffer->state.job = job;
4264
4265 struct drm_v3d_submit_csd *submit = &job->csd.submit;
4266
4267 job->csd.wg_count[0] = group_count_x;
4268 job->csd.wg_count[1] = group_count_y;
4269 job->csd.wg_count[2] = group_count_z;
4270
4271 job->csd.wg_base[0] = base_offset_x;
4272 job->csd.wg_base[1] = base_offset_y;
4273 job->csd.wg_base[2] = base_offset_z;
4274
4275 submit->cfg[0] |= group_count_x << V3D_CSD_CFG012_WG_COUNT_SHIFT;
4276 submit->cfg[1] |= group_count_y << V3D_CSD_CFG012_WG_COUNT_SHIFT;
4277 submit->cfg[2] |= group_count_z << V3D_CSD_CFG012_WG_COUNT_SHIFT;
4278
4279 const struct v3d_compute_prog_data *cpd =
4280 cs_variant->prog_data.cs;
4281
4282 const uint32_t num_wgs = group_count_x * group_count_y * group_count_z;
4283 const uint32_t wg_size = cpd->local_size[0] *
4284 cpd->local_size[1] *
4285 cpd->local_size[2];
4286
4287 uint32_t wgs_per_sg =
4288 v3d_csd_choose_workgroups_per_supergroup(
4289 &cmd_buffer->device->devinfo,
4290 cs_variant->prog_data.cs->has_subgroups,
4291 cs_variant->prog_data.cs->base.has_control_barrier,
4292 cs_variant->prog_data.cs->base.threads,
4293 num_wgs, wg_size);
4294
4295 uint32_t batches_per_sg = DIV_ROUND_UP(wgs_per_sg * wg_size, 16);
4296 uint32_t whole_sgs = num_wgs / wgs_per_sg;
4297 uint32_t rem_wgs = num_wgs - whole_sgs * wgs_per_sg;
4298 uint32_t num_batches = batches_per_sg * whole_sgs +
4299 DIV_ROUND_UP(rem_wgs * wg_size, 16);
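/* Editor's note (illustrative example, not part of the original source):
 * assuming a hypothetical 8x8x1 local size (wg_size = 64), num_wgs = 10 and
 * a helper choice of wgs_per_sg = 4: batches_per_sg = DIV_ROUND_UP(4 * 64, 16)
 * = 16, whole_sgs = 2, rem_wgs = 2, and num_batches = 16 * 2 +
 * DIV_ROUND_UP(2 * 64, 16) = 40.
 */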
4300
4301 submit->cfg[3] |= (wgs_per_sg & 0xf) << V3D_CSD_CFG3_WGS_PER_SG_SHIFT;
4302 submit->cfg[3] |= (batches_per_sg - 1) << V3D_CSD_CFG3_BATCHES_PER_SG_M1_SHIFT;
4303 submit->cfg[3] |= (wg_size & 0xff) << V3D_CSD_CFG3_WG_SIZE_SHIFT;
4304 if (wg_size_out)
4305 *wg_size_out = wg_size;
4306
4307 /* V3D 7.1.6 and later don't subtract 1 from the number of batches */
4308 if (device->devinfo.ver < 71 ||
4309 (device->devinfo.ver == 71 && device->devinfo.rev < 6)) {
4310 submit->cfg[4] = num_batches - 1;
4311 } else {
4312 submit->cfg[4] = num_batches;
4313 }
4314 assert(submit->cfg[4] != ~0);
4315
4316 assert(pipeline->shared_data->assembly_bo);
4317 struct v3dv_bo *cs_assembly_bo = pipeline->shared_data->assembly_bo;
4318
4319 submit->cfg[5] = cs_assembly_bo->offset + cs_variant->assembly_offset;
4320 if (cs_variant->prog_data.base->single_seg)
4321 submit->cfg[5] |= V3D_CSD_CFG5_SINGLE_SEG;
4322 if (cs_variant->prog_data.base->threads == 4)
4323 submit->cfg[5] |= V3D_CSD_CFG5_THREADING;
4324 /* V3D 7.x has made the PROPAGATE_NANS bit in CFG5 reserved */
4325 if (device->devinfo.ver < 71)
4326 submit->cfg[5] |= V3D_CSD_CFG5_PROPAGATE_NANS;
4327
4328 if (cs_variant->prog_data.cs->shared_size > 0) {
4329 job->csd.shared_memory =
4330 v3dv_bo_alloc(cmd_buffer->device,
4331 cs_variant->prog_data.cs->shared_size * num_wgs,
4332 "shared_vars", true);
4333 if (!job->csd.shared_memory) {
4334 v3dv_flag_oom(cmd_buffer, NULL);
4335 return job;
4336 }
4337 }
4338
4339 v3dv_job_add_bo_unchecked(job, cs_assembly_bo);
4340 struct v3dv_cl_reloc uniforms =
4341 v3dv_write_uniforms_wg_offsets(cmd_buffer, pipeline,
4342 cs_variant,
4343 wg_uniform_offsets_out);
4344 submit->cfg[6] = uniforms.bo->offset + uniforms.offset;
4345
4346
4347 /* Track VK_KHR_buffer_device_address usage in the job */
4348 job->uses_buffer_device_address |= pipeline->uses_buffer_device_address;
4349
4350 v3dv_job_add_bo(job, uniforms.bo);
4351
4352 return job;
4353 }
4354
4355 static void
4356 cmd_buffer_dispatch(struct v3dv_cmd_buffer *cmd_buffer,
4357 uint32_t base_offset_x,
4358 uint32_t base_offset_y,
4359 uint32_t base_offset_z,
4360 uint32_t group_count_x,
4361 uint32_t group_count_y,
4362 uint32_t group_count_z)
4363 {
4364 if (group_count_x == 0 || group_count_y == 0 || group_count_z == 0)
4365 return;
4366
4367 struct v3dv_job *job =
4368 cmd_buffer_create_csd_job(cmd_buffer,
4369 base_offset_x,
4370 base_offset_y,
4371 base_offset_z,
4372 group_count_x,
4373 group_count_y,
4374 group_count_z,
4375 NULL, NULL);
4376
4377 list_addtail(&job->list_link, &cmd_buffer->jobs);
4378 cmd_buffer->state.job = NULL;
4379 }
4380
4381 VKAPI_ATTR void VKAPI_CALL
4382 v3dv_CmdDispatchBase(VkCommandBuffer commandBuffer,
4383 uint32_t baseGroupX,
4384 uint32_t baseGroupY,
4385 uint32_t baseGroupZ,
4386 uint32_t groupCountX,
4387 uint32_t groupCountY,
4388 uint32_t groupCountZ)
4389 {
4390 V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
4391
4392 cmd_buffer_emit_pre_dispatch(cmd_buffer);
4393 cmd_buffer_dispatch(cmd_buffer,
4394 baseGroupX, baseGroupY, baseGroupZ,
4395 groupCountX, groupCountY, groupCountZ);
4396 }
4397
4398
4399 static void
cmd_buffer_dispatch_indirect(struct v3dv_cmd_buffer *cmd_buffer,
                             struct v3dv_buffer *buffer,
                             uint32_t offset)
{
   /* We can't do indirect dispatches, so instead we record a CPU job that,
    * when executed in the queue, will map the indirect buffer, read the
    * dispatch parameters, and submit a regular dispatch.
    */
   struct v3dv_job *job =
      v3dv_cmd_buffer_create_cpu_job(cmd_buffer->device,
                                     V3DV_JOB_TYPE_CPU_CSD_INDIRECT,
                                     cmd_buffer, -1);
   v3dv_return_if_oom(cmd_buffer, NULL);

   /* We need to create a CSD job now, even if we still don't know the actual
    * dispatch parameters, because the job setup needs to be done using the
    * current command buffer state (i.e. pipeline, descriptor sets, push
    * constants, etc.). So we create the job with default dispatch parameters
    * and we will rewrite the parts we need at submit time if the indirect
    * parameters don't match the ones we used to set up the job.
    */
   struct v3dv_job *csd_job =
      cmd_buffer_create_csd_job(cmd_buffer,
                                0, 0, 0,
                                1, 1, 1,
                                &job->cpu.csd_indirect.wg_uniform_offsets[0],
                                &job->cpu.csd_indirect.wg_size);
   v3dv_return_if_oom(cmd_buffer, NULL);
   assert(csd_job);

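   /* Stash the indirect buffer and the pre-built CSD job in the CPU job so
    * the queue can patch and submit the CSD job once the real parameters
    * have been read.
    */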
   job->cpu.csd_indirect.buffer = buffer;
   job->cpu.csd_indirect.offset = offset;
   job->cpu.csd_indirect.csd_job = csd_job;

   /* If the compute shader reads the workgroup sizes we will also need to
    * rewrite the corresponding uniforms.
    */
   job->cpu.csd_indirect.needs_wg_uniform_rewrite =
      job->cpu.csd_indirect.wg_uniform_offsets[0] ||
      job->cpu.csd_indirect.wg_uniform_offsets[1] ||
      job->cpu.csd_indirect.wg_uniform_offsets[2];

   list_addtail(&job->list_link, &cmd_buffer->jobs);

   /* If we have a CPU queue we submit the CPU job directly to the queue and
    * the CSD job will be dispatched from within the kernel queue. Otherwise,
    * we have to dispatch the CSD job manually right after the CPU job by
    * adding it to the command buffer's list of jobs.
    */
   if (!cmd_buffer->device->pdevice->caps.cpu_queue)
      list_addtail(&csd_job->list_link, &cmd_buffer->jobs);

   cmd_buffer->state.job = NULL;
}

VKAPI_ATTR void VKAPI_CALL
v3dv_CmdDispatchIndirect(VkCommandBuffer commandBuffer,
                         VkBuffer _buffer,
                         VkDeviceSize offset)
{
   V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
   V3DV_FROM_HANDLE(v3dv_buffer, buffer, _buffer);

   assert(offset <= UINT32_MAX);

   cmd_buffer_emit_pre_dispatch(cmd_buffer);
   cmd_buffer_dispatch_indirect(cmd_buffer, buffer, offset);
}

VKAPI_ATTR void VKAPI_CALL
v3dv_CmdBeginRenderingKHR(VkCommandBuffer commandBuffer,
                          const VkRenderingInfoKHR *info)
{
   V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);

   cmd_buffer->state.suspending = info->flags & VK_RENDERING_SUSPENDING_BIT;
   cmd_buffer->state.resuming = info->flags & VK_RENDERING_RESUMING_BIT;

   /* FIXME: for resuming passes we might not need all of the setup below,
    * since we are mostly just recording draw calls, as in secondary command
    * buffers.
    */

   v3dv_setup_dynamic_render_pass(cmd_buffer, info);
   v3dv_return_if_oom(cmd_buffer, NULL);

   v3dv_setup_dynamic_framebuffer(cmd_buffer, info);
   v3dv_return_if_oom(cmd_buffer, NULL);

   struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
   state->pass = &state->dynamic_pass;
   state->framebuffer = state->dynamic_framebuffer;

   VkRenderPassBeginInfo begin_info = {
      .sType = VK_STRUCTURE_TYPE_RENDER_PASS_BEGIN_INFO,
      .pNext = NULL,
      .renderPass = v3dv_render_pass_to_handle(state->pass),
      .framebuffer = v3dv_framebuffer_to_handle(state->framebuffer),
      .renderArea = info->renderArea,
   };

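   /* Dynamic rendering provides clear values per attachment struct rather
    * than as a packed array, so gather them into the layout the render pass
    * begin path expects, indexed by attachment slot in the dynamic pass.
    */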
   VkClearValue *clear_values = NULL;
   if (state->pass->attachment_count > 0) {
      clear_values =
         vk_alloc(&cmd_buffer->device->vk.alloc,
                  state->pass->attachment_count * sizeof(VkClearValue), 8,
                  VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
      if (!clear_values) {
         v3dv_flag_oom(cmd_buffer, NULL);
         return;
      }
   }

   for (int i = 0; i < info->colorAttachmentCount; i++) {
      if (!info->pColorAttachments[i].imageView)
         continue;

      uint32_t a = cmd_buffer->state.dynamic_subpass.color_attachments[i].attachment;
      assert(a < state->pass->attachment_count);
      clear_values[a] = info->pColorAttachments[i].clearValue;
   }

   if (info->pDepthAttachment &&
       info->pDepthAttachment->imageView != VK_NULL_HANDLE) {
      uint32_t a = cmd_buffer->state.dynamic_subpass.ds_attachment.attachment;
      assert(a < state->pass->attachment_count);
      clear_values[a].depthStencil.depth =
         info->pDepthAttachment->clearValue.depthStencil.depth;
   }

   if (info->pStencilAttachment &&
       info->pStencilAttachment->imageView != VK_NULL_HANDLE) {
      uint32_t a = cmd_buffer->state.dynamic_subpass.ds_attachment.attachment;
      assert(a < state->pass->attachment_count);
      clear_values[a].depthStencil.stencil =
         info->pStencilAttachment->clearValue.depthStencil.stencil;
   }

   begin_info.clearValueCount = state->pass->attachment_count;
   begin_info.pClearValues = clear_values;

   cmd_buffer_ensure_render_pass_attachment_state(cmd_buffer);
   v3dv_return_if_oom(cmd_buffer, NULL);
   cmd_buffer_init_render_pass_attachment_state(cmd_buffer, &begin_info);

   /* Free with the same allocator used for the allocation above */
   if (clear_values)
      vk_free(&cmd_buffer->device->vk.alloc, clear_values);

   state->render_area = info->renderArea;
   constraint_clip_window_to_render_area(cmd_buffer);
   v3dv_cmd_buffer_subpass_start(cmd_buffer, 0);
}

VKAPI_ATTR void VKAPI_CALL
v3dv_CmdEndRenderingKHR(VkCommandBuffer commandBuffer)
{
   V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);

   v3dv_return_if_oom(cmd_buffer, NULL);

   struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
   assert(state->subpass_idx == state->pass->subpass_count - 1);

   /* If we have any pending jobs that were waiting for the current job
    * to finish and we are suspending the pass here, we need to finish the
    * job completely and ensure we emit the pending jobs immediately.
    *
    * FIXME: this is not optimal, but since the resuming command buffer won't
    * have the pending state we can't do it after the resuming chain completes
    * without some extra work: we would have to generate the pending jobs
    * now but not add them to this command buffer's job list. Instead, they
    * would go on a separate list of "pending jobs" that we accumulate across
    * the suspend/resume chain at submit time and emit after the last job in
    * the chain.
    */
   if (state->suspending && cmd_buffer_has_pending_jobs(cmd_buffer))
      v3dv_cmd_buffer_finish_job(cmd_buffer);

   /* If we don't have a job and we are suspending, we will need to create one
    * so we can link to a follow-up resume job. Because we would be starting a
    * new job, we should ensure the command buffer state is not flagged as
    * resuming from a previous suspend. The new job will consume any pending
    * barrier state if necessary.
    */
   struct v3dv_job *job = cmd_buffer->state.job;
   if (!job && state->suspending) {
      state->resuming = false;
      job = v3dv_cmd_buffer_subpass_resume(cmd_buffer, state->subpass_idx);
      if (!job)
         return;
   }

   /* If this job is suspending it means it will continue execution in another
    * job (with the same RCL spec). We implement this by branching the BCL and
    * patching the branch address once we know the resuming job.
    */
   if (state->suspending)
      v3d_X((&cmd_buffer->device->devinfo), cmd_buffer_suspend)(cmd_buffer);

   v3dv_cmd_buffer_subpass_finish(cmd_buffer);
   v3dv_cmd_buffer_finish_job(cmd_buffer);

   /* This must be done after the resume/suspend chain has completed. */
   if (!state->suspending)
      cmd_buffer_subpass_handle_pending_resolves(cmd_buffer);

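   /* Reset the dynamic rendering state so a later render pass instance never
    * reuses a stale pass, framebuffer or suspend/resume flags.
    */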
   state->framebuffer = NULL;
   state->pass = NULL;
   state->subpass_idx = -1;
   state->suspending = false;
   state->resuming = false;
}