1 /*
2 * Copyright © 2019 Raspberry Pi Ltd
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "broadcom/common/v3d_csd.h"
25 #include "v3dv_private.h"
26 #include "util/u_pack_color.h"
27 #include "vk_util.h"
28 #include "vulkan/runtime/vk_common_entrypoints.h"
29
30 void
31 v3dv_job_add_bo(struct v3dv_job *job, struct v3dv_bo *bo)
32 {
33 if (!bo)
34 return;
35
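   /* bo->handle_bit is a one-bit summary of the BO handle, so bo_handle_mask
    * acts as a quick filter: if the bit is not set the BO cannot already be
    * in the set and we can skip the hash lookup; if it is set we may still
    * have a false positive (two handles mapping to the same bit), so we
    * confirm with a proper set search before bailing out.
    */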
36 if (job->bo_handle_mask & bo->handle_bit) {
37 if (_mesa_set_search(job->bos, bo))
38 return;
39 }
40
41 _mesa_set_add(job->bos, bo);
42 job->bo_count++;
43 job->bo_handle_mask |= bo->handle_bit;
44 }
45
46 void
47 v3dv_job_add_bo_unchecked(struct v3dv_job *job, struct v3dv_bo *bo)
48 {
49 assert(bo);
50 _mesa_set_add(job->bos, bo);
51 job->bo_count++;
52 job->bo_handle_mask |= bo->handle_bit;
53 }
54
55 static void
56 cmd_buffer_init(struct v3dv_cmd_buffer *cmd_buffer,
57 struct v3dv_device *device)
58 {
59 /* Do not reset the base object! If we are calling this from a command
60 * buffer reset that would reset the loader's dispatch table for the
61 * command buffer, and any other relevant info from vk_object_base.
62 */
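   /* Note: this zeroing trick assumes the vk_command_buffer base object ('vk')
    * is the first field of v3dv_cmd_buffer, so everything after the first
    * base_size bytes is driver-private state that is safe to clear.
    */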
63 const uint32_t base_size = sizeof(struct vk_command_buffer);
64 uint8_t *cmd_buffer_driver_start = ((uint8_t *) cmd_buffer) + base_size;
65 memset(cmd_buffer_driver_start, 0, sizeof(*cmd_buffer) - base_size);
66
67 cmd_buffer->device = device;
68
69 list_inithead(&cmd_buffer->private_objs);
70 list_inithead(&cmd_buffer->jobs);
71
72 cmd_buffer->state.subpass_idx = -1;
73 cmd_buffer->state.meta.subpass_idx = -1;
74
75 cmd_buffer->status = V3DV_CMD_BUFFER_STATUS_INITIALIZED;
76 }
77
78 static VkResult
79 cmd_buffer_create(struct vk_command_pool *pool,
80 struct vk_command_buffer **cmd_buffer_out)
81 {
82 struct v3dv_device *device =
83 container_of(pool->base.device, struct v3dv_device, vk);
84
85 struct v3dv_cmd_buffer *cmd_buffer;
86 cmd_buffer = vk_zalloc(&pool->alloc,
87 sizeof(*cmd_buffer),
88 8,
89 VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
90 if (cmd_buffer == NULL)
91 return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
92
93 /* Here we pass 0 as level because this callback hook doesn't have the level
94 * info, but that's fine: vk_common_AllocateCommandBuffers will fix it up
95 * after creation.
96 */
97 VkResult result;
98 result = vk_command_buffer_init(pool, &cmd_buffer->vk,
99 &v3dv_cmd_buffer_ops, 0 /* level */);
100 if (result != VK_SUCCESS) {
101 vk_free(&pool->alloc, cmd_buffer);
102 return result;
103 }
104
105 cmd_buffer_init(cmd_buffer, device);
106
107 *cmd_buffer_out = &cmd_buffer->vk;
108
109 return VK_SUCCESS;
110 }
111
112 static void
113 job_destroy_gpu_cl_resources(struct v3dv_job *job)
114 {
115 assert(job->type == V3DV_JOB_TYPE_GPU_CL ||
116 job->type == V3DV_JOB_TYPE_GPU_CL_SECONDARY);
117
118 v3dv_cl_destroy(&job->bcl);
119 v3dv_cl_destroy(&job->rcl);
120 v3dv_cl_destroy(&job->indirect);
121
122 /* Since we don't ref BOs when we add them to the command buffer, don't
123 * unref them here either. BOs will be freed when their corresponding API
124 * objects are destroyed.
125 */
126 _mesa_set_destroy(job->bos, NULL);
127
128 v3dv_bo_free(job->device, job->tile_alloc);
129 v3dv_bo_free(job->device, job->tile_state);
130 }
131
132 static void
133 job_destroy_cloned_gpu_cl_resources(struct v3dv_job *job)
134 {
135 assert(job->type == V3DV_JOB_TYPE_GPU_CL);
136
137 list_for_each_entry_safe(struct v3dv_bo, bo, &job->bcl.bo_list, list_link) {
138 list_del(&bo->list_link);
139 vk_free(&job->device->vk.alloc, bo);
140 }
141
142 list_for_each_entry_safe(struct v3dv_bo, bo, &job->rcl.bo_list, list_link) {
143 list_del(&bo->list_link);
144 vk_free(&job->device->vk.alloc, bo);
145 }
146
147 list_for_each_entry_safe(struct v3dv_bo, bo, &job->indirect.bo_list, list_link) {
148 list_del(&bo->list_link);
149 vk_free(&job->device->vk.alloc, bo);
150 }
151 }
152
153 static void
154 job_destroy_gpu_csd_resources(struct v3dv_job *job)
155 {
156 assert(job->type == V3DV_JOB_TYPE_GPU_CSD);
157 assert(job->cmd_buffer);
158
159 v3dv_cl_destroy(&job->indirect);
160
161 _mesa_set_destroy(job->bos, NULL);
162
163 if (job->csd.shared_memory)
164 v3dv_bo_free(job->device, job->csd.shared_memory);
165 }
166
167 void
168 v3dv_job_destroy(struct v3dv_job *job)
169 {
170 assert(job);
171
172 list_del(&job->list_link);
173
174 /* Cloned jobs don't make deep copies of the original jobs, so they don't
175 * own any of their resources. However, they do allocate clones of BO
176 * structs, so make sure we free those.
177 */
178 if (!job->is_clone) {
179 switch (job->type) {
180 case V3DV_JOB_TYPE_GPU_CL:
181 case V3DV_JOB_TYPE_GPU_CL_SECONDARY:
182 job_destroy_gpu_cl_resources(job);
183 break;
184 case V3DV_JOB_TYPE_GPU_CSD:
185 job_destroy_gpu_csd_resources(job);
186 break;
187 default:
188 break;
189 }
190 } else {
191 /* Cloned jobs */
192 if (job->type == V3DV_JOB_TYPE_GPU_CL)
193 job_destroy_cloned_gpu_cl_resources(job);
194 }
195
196 vk_free(&job->device->vk.alloc, job);
197 }
198
199 void
200 v3dv_cmd_buffer_add_private_obj(struct v3dv_cmd_buffer *cmd_buffer,
201 uint64_t obj,
202 v3dv_cmd_buffer_private_obj_destroy_cb destroy_cb)
203 {
204 struct v3dv_cmd_buffer_private_obj *pobj =
205 vk_alloc(&cmd_buffer->device->vk.alloc, sizeof(*pobj), 8,
206 VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
207 if (!pobj) {
208 v3dv_flag_oom(cmd_buffer, NULL);
209 return;
210 }
211
212 pobj->obj = obj;
213 pobj->destroy_cb = destroy_cb;
214
215 list_addtail(&pobj->list_link, &cmd_buffer->private_objs);
216 }
217
218 static void
219 cmd_buffer_destroy_private_obj(struct v3dv_cmd_buffer *cmd_buffer,
220 struct v3dv_cmd_buffer_private_obj *pobj)
221 {
222 assert(pobj && pobj->obj && pobj->destroy_cb);
223 pobj->destroy_cb(v3dv_device_to_handle(cmd_buffer->device),
224 pobj->obj,
225 &cmd_buffer->device->vk.alloc);
226 list_del(&pobj->list_link);
227 vk_free(&cmd_buffer->device->vk.alloc, pobj);
228 }
229
230 static void
231 cmd_buffer_free_resources(struct v3dv_cmd_buffer *cmd_buffer)
232 {
233 list_for_each_entry_safe(struct v3dv_job, job,
234 &cmd_buffer->jobs, list_link) {
235 v3dv_job_destroy(job);
236 }
237
238 if (cmd_buffer->state.job)
239 v3dv_job_destroy(cmd_buffer->state.job);
240
241 if (cmd_buffer->state.attachments)
242 vk_free(&cmd_buffer->vk.pool->alloc, cmd_buffer->state.attachments);
243
244 if (cmd_buffer->state.query.end.alloc_count > 0)
245 vk_free(&cmd_buffer->device->vk.alloc, cmd_buffer->state.query.end.states);
246
247 if (cmd_buffer->push_constants_resource.bo)
248 v3dv_bo_free(cmd_buffer->device, cmd_buffer->push_constants_resource.bo);
249
250 list_for_each_entry_safe(struct v3dv_cmd_buffer_private_obj, pobj,
251 &cmd_buffer->private_objs, list_link) {
252 cmd_buffer_destroy_private_obj(cmd_buffer, pobj);
253 }
254
255 if (cmd_buffer->state.meta.attachments) {
256 assert(cmd_buffer->state.meta.attachment_alloc_count > 0);
257 vk_free(&cmd_buffer->device->vk.alloc, cmd_buffer->state.meta.attachments);
258 }
259 }
260
261 static void
262 cmd_buffer_destroy(struct vk_command_buffer *vk_cmd_buffer)
263 {
264 struct v3dv_cmd_buffer *cmd_buffer =
265 container_of(vk_cmd_buffer, struct v3dv_cmd_buffer, vk);
266
267 cmd_buffer_free_resources(cmd_buffer);
268 vk_command_buffer_finish(&cmd_buffer->vk);
269 vk_free(&cmd_buffer->vk.pool->alloc, cmd_buffer);
270 }
271
272 static bool
273 cmd_buffer_can_merge_subpass(struct v3dv_cmd_buffer *cmd_buffer,
274 uint32_t subpass_idx)
275 {
276 const struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
277 assert(state->pass);
278
279 const struct v3dv_physical_device *physical_device =
280 cmd_buffer->device->pdevice;
281
282 if (cmd_buffer->vk.level != VK_COMMAND_BUFFER_LEVEL_PRIMARY)
283 return false;
284
285 if (!cmd_buffer->state.job)
286 return false;
287
288 if (cmd_buffer->state.job->always_flush)
289 return false;
290
291 if (!physical_device->options.merge_jobs)
292 return false;
293
294 /* Each render pass starts a new job */
295 if (subpass_idx == 0)
296 return false;
297
298 /* Two subpasses can be merged in the same job if we can emit a single RCL
299 * for them (since the RCL includes the END_OF_RENDERING command that
300 * triggers the "render job finished" interrupt). We can do this so long
301 * as both subpasses render against the same attachments.
302 */
303 assert(state->subpass_idx == subpass_idx - 1);
304 struct v3dv_subpass *prev_subpass = &state->pass->subpasses[state->subpass_idx];
305 struct v3dv_subpass *subpass = &state->pass->subpasses[subpass_idx];
306
307 if (subpass->ds_attachment.attachment !=
308 prev_subpass->ds_attachment.attachment)
309 return false;
310
311 if (subpass->color_count != prev_subpass->color_count)
312 return false;
313
314 for (uint32_t i = 0; i < subpass->color_count; i++) {
315 if (subpass->color_attachments[i].attachment !=
316 prev_subpass->color_attachments[i].attachment) {
317 return false;
318 }
319 }
320
321 /* Don't merge if the subpasses have different view masks, since in that
322 * case the framebuffer setup is different and we need to emit different
323 * RCLs.
324 */
325 if (subpass->view_mask != prev_subpass->view_mask)
326 return false;
327
328 /* FIXME: Since some attachment formats can't be resolved using the TLB we
329 * need to emit separate resolve jobs for them and that would not be
330 * compatible with subpass merges. We could fix that by testing if any of
331 * the attachments to resolve doesn't support TLB resolves.
332 */
333 if (prev_subpass->resolve_attachments || subpass->resolve_attachments ||
334 prev_subpass->resolve_depth || prev_subpass->resolve_stencil ||
335 subpass->resolve_depth || subpass->resolve_stencil) {
336 return false;
337 }
338
339 return true;
340 }
341
342 /**
343 * Computes and sets the job frame tiling information required to setup frame
344 * binning and rendering.
345 */
346 static struct v3dv_frame_tiling *
347 job_compute_frame_tiling(struct v3dv_job *job,
348 uint32_t width,
349 uint32_t height,
350 uint32_t layers,
351 uint32_t render_target_count,
352 uint8_t max_internal_bpp,
353 uint8_t total_color_bpp,
354 bool msaa,
355 bool double_buffer)
356 {
357 assert(job);
358 struct v3dv_frame_tiling *tiling = &job->frame_tiling;
359
360 tiling->width = width;
361 tiling->height = height;
362 tiling->layers = layers;
363 tiling->render_target_count = render_target_count;
364 tiling->msaa = msaa;
365 tiling->internal_bpp = max_internal_bpp;
366 tiling->total_color_bpp = total_color_bpp;
367 tiling->double_buffer = double_buffer;
368
369 /* Double-buffer is incompatible with MSAA */
370 assert(!tiling->msaa || !tiling->double_buffer);
371
372 v3d_choose_tile_size(&job->device->devinfo,
373 render_target_count,
374 max_internal_bpp, total_color_bpp, msaa,
375 tiling->double_buffer,
376 &tiling->tile_width, &tiling->tile_height);
377
378 tiling->draw_tiles_x = DIV_ROUND_UP(width, tiling->tile_width);
379 tiling->draw_tiles_y = DIV_ROUND_UP(height, tiling->tile_height);
380
381 /* Size up our supertiles until we get under the limit */
382 const uint32_t max_supertiles = 256;
383 tiling->supertile_width = 1;
384 tiling->supertile_height = 1;
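   /* Grow whichever supertile dimension is currently smaller so supertiles
    * stay roughly square while we converge below max_supertiles.
    */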
385 for (;;) {
386 tiling->frame_width_in_supertiles =
387 DIV_ROUND_UP(tiling->draw_tiles_x, tiling->supertile_width);
388 tiling->frame_height_in_supertiles =
389 DIV_ROUND_UP(tiling->draw_tiles_y, tiling->supertile_height);
390 const uint32_t num_supertiles = tiling->frame_width_in_supertiles *
391 tiling->frame_height_in_supertiles;
392 if (num_supertiles < max_supertiles)
393 break;
394
395 if (tiling->supertile_width < tiling->supertile_height)
396 tiling->supertile_width++;
397 else
398 tiling->supertile_height++;
399 }
400
401 return tiling;
402 }
403
404 bool
405 v3dv_job_allocate_tile_state(struct v3dv_job *job)
406 {
407 struct v3dv_frame_tiling *tiling = &job->frame_tiling;
408 const uint32_t layers =
409 job->allocate_tile_state_for_all_layers ? tiling->layers : 1;
410
411 /* The PTB will request the tile alloc initial size per tile at start
412 * of tile binning.
413 */
414 uint32_t tile_alloc_size = 64 * layers *
415 tiling->draw_tiles_x *
416 tiling->draw_tiles_y;
417
418 /* The PTB allocates in aligned 4k chunks after the initial setup. */
419 tile_alloc_size = align(tile_alloc_size, 4096);
420
421 /* Include the first two chunk allocations that the PTB does so that
422 * we definitely clear the OOM condition before triggering one (the HW
423 * won't trigger OOM during the first allocations).
424 */
425 tile_alloc_size += 8192;
426
427 /* For performance, allocate some extra initial memory after the PTB's
428 * minimal allocations, so that we hopefully don't have to block the
429 * GPU on the kernel handling an OOM signal.
430 */
431 tile_alloc_size += 512 * 1024;
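   /* Rough example (an illustrative assumption, not a fixed size): a
    * single-layer 1920x1080 frame with 64x64 tiles needs 30x17 = 510 tiles,
    * so 510 * 64 = 32640 bytes of initial allocation, aligned up to 32768,
    * plus 8192 bytes for the first two PTB chunks and 512 KB of extra
    * headroom, for 565248 bytes in total.
    */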
432
433 job->tile_alloc = v3dv_bo_alloc(job->device, tile_alloc_size,
434 "tile_alloc", true);
435 if (!job->tile_alloc) {
436 v3dv_flag_oom(NULL, job);
437 return false;
438 }
439
440 v3dv_job_add_bo_unchecked(job, job->tile_alloc);
441
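   /* TSDA = Tile State Data Array: per-tile state maintained by the hardware
    * during binning/rendering; we allocate one 256-byte record per tile per
    * layer.
    */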
442 const uint32_t tsda_per_tile_size = 256;
443 const uint32_t tile_state_size = layers *
444 tiling->draw_tiles_x *
445 tiling->draw_tiles_y *
446 tsda_per_tile_size;
447 job->tile_state = v3dv_bo_alloc(job->device, tile_state_size, "TSDA", true);
448 if (!job->tile_state) {
449 v3dv_flag_oom(NULL, job);
450 return false;
451 }
452
453 v3dv_job_add_bo_unchecked(job, job->tile_state);
454 return true;
455 }
456
457 void
458 v3dv_job_start_frame(struct v3dv_job *job,
459 uint32_t width,
460 uint32_t height,
461 uint32_t layers,
462 bool allocate_tile_state_for_all_layers,
463 bool allocate_tile_state_now,
464 uint32_t render_target_count,
465 uint8_t max_internal_bpp,
466 uint8_t total_color_bpp,
467 bool msaa)
468 {
469 assert(job);
470
471 /* Start by computing frame tiling spec for this job assuming that
472 * double-buffer mode is disabled.
473 */
474 const struct v3dv_frame_tiling *tiling =
475 job_compute_frame_tiling(job, width, height, layers,
476 render_target_count, max_internal_bpp,
477 total_color_bpp, msaa, false);
478
479 v3dv_cl_ensure_space_with_branch(&job->bcl, 256);
480 v3dv_return_if_oom(NULL, job);
481
482 job->allocate_tile_state_for_all_layers = allocate_tile_state_for_all_layers;
483
484 /* For subpass jobs we postpone tile state allocation until we are finishing
485 * the job and have made a decision about double-buffer.
486 */
487 if (allocate_tile_state_now) {
488 if (!v3dv_job_allocate_tile_state(job))
489 return;
490 }
491
492 v3dv_X(job->device, job_emit_binning_prolog)(job, tiling,
493 allocate_tile_state_for_all_layers ? tiling->layers : 1);
494
495 job->ez_state = V3D_EZ_UNDECIDED;
496 job->first_ez_state = V3D_EZ_UNDECIDED;
497 }
498
499 static bool
500 job_should_enable_double_buffer(struct v3dv_job *job)
501 {
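   /* The geom/render scores below are rough cost estimates accumulated while
    * recording draws into the job; the thresholds are heuristics: too much
    * binning work or too little rendering and the smaller double-buffer tiles
    * are not worth it.
    */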
502 /* Incompatibility with double-buffer */
503 if (!job->can_use_double_buffer)
504 return false;
505
506 /* Too much geometry processing */
507 if (job->double_buffer_score.geom > 2000000)
508 return false;
509
510 /* Too little rendering to make up for tile store latency */
511 if (job->double_buffer_score.render < 100000)
512 return false;
513
514 return true;
515 }
516
517 static void
518 cmd_buffer_end_render_pass_frame(struct v3dv_cmd_buffer *cmd_buffer)
519 {
520 struct v3dv_job *job = cmd_buffer->state.job;
521 assert(job);
522
523
524 /* For subpass jobs we always emit the RCL here */
525 assert(v3dv_cl_offset(&job->rcl) == 0);
526
527 /* Decide if we want to enable double-buffer for this job. If we do, then
528 * we need to rewrite the TILE_BINNING_MODE_CFG packet in the BCL.
529 */
530 if (job_should_enable_double_buffer(job)) {
531 assert(!job->frame_tiling.double_buffer);
532 job_compute_frame_tiling(job,
533 job->frame_tiling.width,
534 job->frame_tiling.height,
535 job->frame_tiling.layers,
536 job->frame_tiling.render_target_count,
537 job->frame_tiling.internal_bpp,
538 job->frame_tiling.total_color_bpp,
539 job->frame_tiling.msaa,
540 true);
541
542 v3dv_X(job->device, job_emit_enable_double_buffer)(job);
543 }
544
545 /* At this point we have decided whether we want to use double-buffer or
546 * not and the job's frame tiling represents that decision so we can
547 * allocate the tile state, which we need to do before we emit the RCL.
548 */
549 v3dv_job_allocate_tile_state(job);
550
551 v3dv_X(cmd_buffer->device, cmd_buffer_emit_render_pass_rcl)(cmd_buffer);
552
553 v3dv_X(cmd_buffer->device, job_emit_binning_flush)(job);
554 }
555
556 struct v3dv_job *
557 v3dv_cmd_buffer_create_cpu_job(struct v3dv_device *device,
558 enum v3dv_job_type type,
559 struct v3dv_cmd_buffer *cmd_buffer,
560 uint32_t subpass_idx)
561 {
562 struct v3dv_job *job = vk_zalloc(&device->vk.alloc,
563 sizeof(struct v3dv_job), 8,
564 VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
565 if (!job) {
566 v3dv_flag_oom(cmd_buffer, NULL);
567 return NULL;
568 }
569
570 v3dv_job_init(job, type, device, cmd_buffer, subpass_idx);
571 return job;
572 }
573
574 static void
575 cmd_buffer_emit_end_query_cpu(struct v3dv_cmd_buffer *cmd_buffer,
576 struct v3dv_query_pool *pool,
577 uint32_t query, uint32_t count)
578 {
579 assert(pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR);
580
581 struct v3dv_job *job =
582 v3dv_cmd_buffer_create_cpu_job(cmd_buffer->device,
583 V3DV_JOB_TYPE_CPU_END_QUERY,
584 cmd_buffer, -1);
585 v3dv_return_if_oom(cmd_buffer, NULL);
586
587 job->cpu.query_end.pool = pool;
588 job->cpu.query_end.query = query;
589 job->cpu.query_end.count = count;
590 list_addtail(&job->list_link, &cmd_buffer->jobs);
591 }
592
593 static void
594 cmd_buffer_add_jobs_for_pending_state(struct v3dv_cmd_buffer *cmd_buffer)
595 {
596 struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
597
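   /* Occlusion query results are written by the GPU as part of the render
    * job, so ending them only requires flagging availability; other query
    * types (performance queries) are finished with a CPU job instead.
    */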
598 if (state->query.end.used_count > 0) {
599 const uint32_t count = state->query.end.used_count;
600 for (uint32_t i = 0; i < count; i++) {
601 assert(i < state->query.end.used_count);
602 struct v3dv_end_query_info *info = &state->query.end.states[i];
603 if (info->pool->query_type == VK_QUERY_TYPE_OCCLUSION) {
604 v3dv_cmd_buffer_emit_set_query_availability(cmd_buffer, info->pool,
605 info->query, info->count, 1);
606 } else {
607 cmd_buffer_emit_end_query_cpu(cmd_buffer, info->pool,
608 info->query, info->count);
609 }
610 }
611 state->query.end.used_count = 0;
612 }
613 }
614
615 void
616 v3dv_cmd_buffer_finish_job(struct v3dv_cmd_buffer *cmd_buffer)
617 {
618 struct v3dv_job *job = cmd_buffer->state.job;
619 if (!job)
620 return;
621
622 /* Always clear BCL state after a job has been finished if we don't have
623 * a pending graphics barrier that could consume it (BCL barriers only
624 * apply to graphics jobs). This can happen if the application recorded
625 * a barrier involving geometry stages but none of the draw calls in the
626 * job actually required a binning sync.
627 */
628 if (!(cmd_buffer->state.barrier.dst_mask & V3DV_BARRIER_GRAPHICS_BIT)) {
629 cmd_buffer->state.barrier.bcl_buffer_access = 0;
630 cmd_buffer->state.barrier.bcl_image_access = 0;
631 }
632
633 if (cmd_buffer->state.oom) {
634 v3dv_job_destroy(job);
635 cmd_buffer->state.job = NULL;
636 return;
637 }
638
639 /* If we have created a job for a command buffer then we should have
640 * recorded something into it: if the job was started in a render pass, it
641 * should at least have the start frame commands, otherwise, it should have
642 * a transfer command. The only exception are secondary command buffers
643 * inside a render pass.
644 */
645 assert(cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY ||
646 v3dv_cl_offset(&job->bcl) > 0);
647
648 /* When we merge multiple subpasses into the same job we must only emit one
649 * RCL, so we do that here, once we have decided that we need to finish the job.
650 * Any rendering that happens outside a render pass is never merged, so
651 * the RCL should have been emitted by the time we got here.
652 */
653 assert(v3dv_cl_offset(&job->rcl) != 0 || cmd_buffer->state.pass);
654
655 /* If we are finishing a job inside a render pass we have two scenarios:
656 *
657 * 1. It is a regular CL, in which case we will submit the job to the GPU,
658 * so we may need to generate an RCL and add a binning flush.
659 *
660 * 2. It is a partial CL recorded in a secondary command buffer, in which
661 * case we are not submitting it directly to the GPU but rather branch to
662 * it from a primary command buffer. In this case we just want to end
663 * the BCL with a RETURN_FROM_SUB_LIST and the RCL and binning flush
664 * will be the primary job that branches to this CL.
665 */
666 if (cmd_buffer->state.pass) {
667 if (job->type == V3DV_JOB_TYPE_GPU_CL) {
668 cmd_buffer_end_render_pass_frame(cmd_buffer);
669 } else {
670 assert(job->type == V3DV_JOB_TYPE_GPU_CL_SECONDARY);
671 v3dv_X(cmd_buffer->device, cmd_buffer_end_render_pass_secondary)(cmd_buffer);
672 }
673 }
674
675 list_addtail(&job->list_link, &cmd_buffer->jobs);
676 cmd_buffer->state.job = NULL;
677
678 /* If we have recorded any state with this last GPU job that requires to
679 * emit jobs after the job is completed, add them now. The only exception
680 * is secondary command buffers inside a render pass, because in
681 * that case we want to defer this until we finish recording the primary
682 * job into which we execute the secondary.
683 */
684 if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_PRIMARY ||
685 !cmd_buffer->state.pass) {
686 cmd_buffer_add_jobs_for_pending_state(cmd_buffer);
687 }
688 }
689
690 bool
691 v3dv_job_type_is_gpu(struct v3dv_job *job)
692 {
693 switch (job->type) {
694 case V3DV_JOB_TYPE_GPU_CL:
695 case V3DV_JOB_TYPE_GPU_CL_SECONDARY:
696 case V3DV_JOB_TYPE_GPU_TFU:
697 case V3DV_JOB_TYPE_GPU_CSD:
698 return true;
699 default:
700 return false;
701 }
702 }
703
704 static void
705 cmd_buffer_serialize_job_if_needed(struct v3dv_cmd_buffer *cmd_buffer,
706 struct v3dv_job *job)
707 {
708 assert(cmd_buffer && job);
709
710 /* Serialization only affects GPU jobs; CPU jobs are always automatically
711 * serialized.
712 */
713 if (!v3dv_job_type_is_gpu(job))
714 return;
715
716 uint8_t barrier_mask = cmd_buffer->state.barrier.dst_mask;
717 if (barrier_mask == 0)
718 return;
719
720 uint8_t bit = 0;
721 uint8_t *src_mask;
722 if (job->type == V3DV_JOB_TYPE_GPU_CSD) {
723 assert(!job->is_transfer);
724 bit = V3DV_BARRIER_COMPUTE_BIT;
725 src_mask = &cmd_buffer->state.barrier.src_mask_compute;
726 } else if (job->is_transfer) {
727 assert(job->type == V3DV_JOB_TYPE_GPU_CL ||
728 job->type == V3DV_JOB_TYPE_GPU_CL_SECONDARY ||
729 job->type == V3DV_JOB_TYPE_GPU_TFU);
730 bit = V3DV_BARRIER_TRANSFER_BIT;
731 src_mask = &cmd_buffer->state.barrier.src_mask_transfer;
732 } else {
733 assert(job->type == V3DV_JOB_TYPE_GPU_CL ||
734 job->type == V3DV_JOB_TYPE_GPU_CL_SECONDARY);
735 bit = V3DV_BARRIER_GRAPHICS_BIT;
736 src_mask = &cmd_buffer->state.barrier.src_mask_graphics;
737 }
738
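   /* Consume the pending barrier for this job type: record the source mask in
    * job->serialize so this job is serialized against those job types at
    * submit time, then clear the state so later jobs of the same type are not
    * serialized again until a new barrier is recorded.
    */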
739 if (barrier_mask & bit) {
740 job->serialize = *src_mask;
741 *src_mask = 0;
742 cmd_buffer->state.barrier.dst_mask &= ~bit;
743 }
744 }
745
746 void
747 v3dv_job_init(struct v3dv_job *job,
748 enum v3dv_job_type type,
749 struct v3dv_device *device,
750 struct v3dv_cmd_buffer *cmd_buffer,
751 int32_t subpass_idx)
752 {
753 assert(job);
754
755 /* Make sure we haven't made this new job current before calling here */
756 assert(!cmd_buffer || cmd_buffer->state.job != job);
757
758 job->type = type;
759
760 job->device = device;
761 job->cmd_buffer = cmd_buffer;
762
763 list_inithead(&job->list_link);
764
765 if (type == V3DV_JOB_TYPE_GPU_CL ||
766 type == V3DV_JOB_TYPE_GPU_CL_SECONDARY ||
767 type == V3DV_JOB_TYPE_GPU_CSD) {
768 job->bos =
769 _mesa_set_create(NULL, _mesa_hash_pointer, _mesa_key_pointer_equal);
770 job->bo_count = 0;
771
772 v3dv_cl_init(job, &job->indirect);
773
774 if (V3D_DBG(ALWAYS_FLUSH))
775 job->always_flush = true;
776 }
777
778 if (type == V3DV_JOB_TYPE_GPU_CL ||
779 type == V3DV_JOB_TYPE_GPU_CL_SECONDARY) {
780 v3dv_cl_init(job, &job->bcl);
781 v3dv_cl_init(job, &job->rcl);
782 }
783
784 if (cmd_buffer) {
785 /* Flag all state as dirty. Generally, we need to re-emit state for each
786 * new job.
787 *
788 * FIXME: there may be some exceptions, in which case we could skip some
789 * bits.
790 */
791 cmd_buffer->state.dirty = ~0;
792 cmd_buffer->state.dirty_descriptor_stages = ~0;
793
794 /* Honor inheritance of occlusion queries in secondaries if requested */
795 if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY &&
796 cmd_buffer->state.inheritance.occlusion_query_enable) {
797 cmd_buffer->state.dirty &= ~V3DV_CMD_DIRTY_OCCLUSION_QUERY;
798 }
799
800 /* Keep track of the first subpass that we are recording in this new job.
801 * We will use this when we emit the RCL to decide how to emit our loads
802 * and stores.
803 */
804 if (cmd_buffer->state.pass)
805 job->first_subpass = subpass_idx;
806
807 job->is_transfer = cmd_buffer->state.is_transfer;
808
809 cmd_buffer_serialize_job_if_needed(cmd_buffer, job);
810
811 job->perf = cmd_buffer->state.query.active_query.perf;
812 }
813 }
814
815 struct v3dv_job *
816 v3dv_cmd_buffer_start_job(struct v3dv_cmd_buffer *cmd_buffer,
817 int32_t subpass_idx,
818 enum v3dv_job_type type)
819 {
820 /* Don't create a new job if we can merge the current subpass into
821 * the current job.
822 */
823 if (cmd_buffer->state.pass &&
824 subpass_idx != -1 &&
825 cmd_buffer_can_merge_subpass(cmd_buffer, subpass_idx)) {
826 cmd_buffer->state.job->is_subpass_finish = false;
827 return cmd_buffer->state.job;
828 }
829
830 /* Ensure we are not starting a new job without finishing a previous one */
831 if (cmd_buffer->state.job != NULL)
832 v3dv_cmd_buffer_finish_job(cmd_buffer);
833
834 assert(cmd_buffer->state.job == NULL);
835 struct v3dv_job *job = vk_zalloc(&cmd_buffer->device->vk.alloc,
836 sizeof(struct v3dv_job), 8,
837 VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
838
839 if (!job) {
840 fprintf(stderr, "Error: failed to allocate CPU memory for job\n");
841 v3dv_flag_oom(cmd_buffer, NULL);
842 return NULL;
843 }
844
845 v3dv_job_init(job, type, cmd_buffer->device, cmd_buffer, subpass_idx);
846 cmd_buffer->state.job = job;
847
848 return job;
849 }
850
851 static void
852 cmd_buffer_reset(struct vk_command_buffer *vk_cmd_buffer,
853 VkCommandBufferResetFlags flags)
854 {
855 struct v3dv_cmd_buffer *cmd_buffer =
856 container_of(vk_cmd_buffer, struct v3dv_cmd_buffer, vk);
857
858 vk_command_buffer_reset(&cmd_buffer->vk);
859 if (cmd_buffer->status != V3DV_CMD_BUFFER_STATUS_INITIALIZED) {
860 struct v3dv_device *device = cmd_buffer->device;
861
862 /* FIXME: For now we always free all resources as if
863 * VK_COMMAND_BUFFER_RESET_RELEASE_RESOURCES_BIT was set.
864 */
865 if (cmd_buffer->status != V3DV_CMD_BUFFER_STATUS_NEW)
866 cmd_buffer_free_resources(cmd_buffer);
867
868 cmd_buffer_init(cmd_buffer, device);
869 }
870
871 assert(cmd_buffer->status == V3DV_CMD_BUFFER_STATUS_INITIALIZED);
872 }
873
874
875 static void
876 cmd_buffer_emit_resolve(struct v3dv_cmd_buffer *cmd_buffer,
877 uint32_t dst_attachment_idx,
878 uint32_t src_attachment_idx,
879 VkImageAspectFlagBits aspect)
880 {
881 struct v3dv_image_view *src_iview =
882 cmd_buffer->state.attachments[src_attachment_idx].image_view;
883 struct v3dv_image_view *dst_iview =
884 cmd_buffer->state.attachments[dst_attachment_idx].image_view;
885
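   /* The resolve only needs to cover the render area: anything outside it
    * cannot have been written by the render pass instance.
    */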
886 const VkRect2D *ra = &cmd_buffer->state.render_area;
887
888 VkImageResolve2 region = {
889 .sType = VK_STRUCTURE_TYPE_IMAGE_RESOLVE_2,
890 .srcSubresource = {
891 aspect,
892 src_iview->vk.base_mip_level,
893 src_iview->vk.base_array_layer,
894 src_iview->vk.layer_count,
895 },
896 .srcOffset = { ra->offset.x, ra->offset.y, 0 },
897 .dstSubresource = {
898 aspect,
899 dst_iview->vk.base_mip_level,
900 dst_iview->vk.base_array_layer,
901 dst_iview->vk.layer_count,
902 },
903 .dstOffset = { ra->offset.x, ra->offset.y, 0 },
904 .extent = { ra->extent.width, ra->extent.height, 1 },
905 };
906
907 struct v3dv_image *src_image = (struct v3dv_image *) src_iview->vk.image;
908 struct v3dv_image *dst_image = (struct v3dv_image *) dst_iview->vk.image;
909 VkResolveImageInfo2 resolve_info = {
910 .sType = VK_STRUCTURE_TYPE_RESOLVE_IMAGE_INFO_2,
911 .srcImage = v3dv_image_to_handle(src_image),
912 .srcImageLayout = VK_IMAGE_LAYOUT_GENERAL,
913 .dstImage = v3dv_image_to_handle(dst_image),
914 .dstImageLayout = VK_IMAGE_LAYOUT_GENERAL,
915 .regionCount = 1,
916 .pRegions = &region,
917 };
918
919 VkCommandBuffer cmd_buffer_handle = v3dv_cmd_buffer_to_handle(cmd_buffer);
920 v3dv_CmdResolveImage2(cmd_buffer_handle, &resolve_info);
921 }
922
923 static void
924 cmd_buffer_subpass_handle_pending_resolves(struct v3dv_cmd_buffer *cmd_buffer)
925 {
926 assert(cmd_buffer->state.subpass_idx < cmd_buffer->state.pass->subpass_count);
927 const struct v3dv_render_pass *pass = cmd_buffer->state.pass;
928 const struct v3dv_subpass *subpass =
929 &pass->subpasses[cmd_buffer->state.subpass_idx];
930
931 if (!subpass->resolve_attachments)
932 return;
933
934 /* At this point we have already ended the current subpass and now we are
935 * about to emit vkCmdResolveImage calls to get the resolves we can't handle
936 * in the subpass RCL.
937 *
938 * vkCmdResolveImage is not supposed to be called inside a render pass so
939 * before we call that we need to make sure our command buffer state reflects
940 * that we are no longer in a subpass by finishing the current job and
941 * resetting the framebuffer and render pass state temporarily and then
942 * restoring it after we are done with the resolves.
943 */
944 if (cmd_buffer->state.job)
945 v3dv_cmd_buffer_finish_job(cmd_buffer);
946 struct v3dv_framebuffer *restore_fb = cmd_buffer->state.framebuffer;
947 struct v3dv_render_pass *restore_pass = cmd_buffer->state.pass;
948 uint32_t restore_subpass_idx = cmd_buffer->state.subpass_idx;
949 cmd_buffer->state.framebuffer = NULL;
950 cmd_buffer->state.pass = NULL;
951 cmd_buffer->state.subpass_idx = -1;
952
953 for (uint32_t i = 0; i < subpass->color_count; i++) {
954 const uint32_t src_attachment_idx =
955 subpass->color_attachments[i].attachment;
956 if (src_attachment_idx == VK_ATTACHMENT_UNUSED)
957 continue;
958
959 /* Skip if this attachment doesn't have a resolve or if it was already
960 * implemented as a TLB resolve.
961 */
962 if (!cmd_buffer->state.attachments[src_attachment_idx].has_resolve ||
963 cmd_buffer->state.attachments[src_attachment_idx].use_tlb_resolve) {
964 continue;
965 }
966
967 const uint32_t dst_attachment_idx =
968 subpass->resolve_attachments[i].attachment;
969 assert(dst_attachment_idx != VK_ATTACHMENT_UNUSED);
970
971 cmd_buffer_emit_resolve(cmd_buffer, dst_attachment_idx, src_attachment_idx,
972 VK_IMAGE_ASPECT_COLOR_BIT);
973 }
974
975 const uint32_t ds_src_attachment_idx =
976 subpass->ds_attachment.attachment;
977 if (ds_src_attachment_idx != VK_ATTACHMENT_UNUSED &&
978 cmd_buffer->state.attachments[ds_src_attachment_idx].has_resolve &&
979 !cmd_buffer->state.attachments[ds_src_attachment_idx].use_tlb_resolve) {
980 assert(subpass->resolve_depth || subpass->resolve_stencil);
981 const VkImageAspectFlags ds_aspects =
982 (subpass->resolve_depth ? VK_IMAGE_ASPECT_DEPTH_BIT : 0) |
983 (subpass->resolve_stencil ? VK_IMAGE_ASPECT_STENCIL_BIT : 0);
984 const uint32_t ds_dst_attachment_idx =
985 subpass->ds_resolve_attachment.attachment;
986 assert(ds_dst_attachment_idx != VK_ATTACHMENT_UNUSED);
987 cmd_buffer_emit_resolve(cmd_buffer, ds_dst_attachment_idx,
988 ds_src_attachment_idx, ds_aspects);
989 }
990
991 cmd_buffer->state.framebuffer = restore_fb;
992 cmd_buffer->state.pass = restore_pass;
993 cmd_buffer->state.subpass_idx = restore_subpass_idx;
994 }
995
996 static VkResult
997 cmd_buffer_begin_render_pass_secondary(
998 struct v3dv_cmd_buffer *cmd_buffer,
999 const VkCommandBufferInheritanceInfo *inheritance_info)
1000 {
1001 assert(cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY);
1002 assert(cmd_buffer->usage_flags & VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT);
1003 assert(inheritance_info);
1004
1005 cmd_buffer->state.pass =
1006 v3dv_render_pass_from_handle(inheritance_info->renderPass);
1007 assert(cmd_buffer->state.pass);
1008
1009 cmd_buffer->state.framebuffer =
1010 v3dv_framebuffer_from_handle(inheritance_info->framebuffer);
1011
1012 assert(inheritance_info->subpass < cmd_buffer->state.pass->subpass_count);
1013 cmd_buffer->state.subpass_idx = inheritance_info->subpass;
1014
1015 cmd_buffer->state.inheritance.occlusion_query_enable =
1016 inheritance_info->occlusionQueryEnable;
1017
1018 /* Secondaries that execute inside a render pass won't start subpasses
1019 * so we want to create a job for them here.
1020 */
1021 struct v3dv_job *job =
1022 v3dv_cmd_buffer_start_job(cmd_buffer, inheritance_info->subpass,
1023 V3DV_JOB_TYPE_GPU_CL_SECONDARY);
1024 if (!job) {
1025 v3dv_flag_oom(cmd_buffer, NULL);
1026 return VK_ERROR_OUT_OF_HOST_MEMORY;
1027 }
1028
1029 /* Secondary command buffers don't know about the render area, but our
1030 * scissor setup accounts for it, so let's make sure we make it large
1031 * enough that it doesn't actually constrain any rendering. This should
1032 * be fine, since the Vulkan spec states:
1033 *
1034 * "The application must ensure (using scissor if necessary) that all
1035 * rendering is contained within the render area."
1036 */
1037 const struct v3dv_framebuffer *framebuffer = cmd_buffer->state.framebuffer;
1038 cmd_buffer->state.render_area.offset.x = 0;
1039 cmd_buffer->state.render_area.offset.y = 0;
1040 cmd_buffer->state.render_area.extent.width =
1041 framebuffer ? framebuffer->width : V3D_MAX_IMAGE_DIMENSION;
1042 cmd_buffer->state.render_area.extent.height =
1043 framebuffer ? framebuffer->height : V3D_MAX_IMAGE_DIMENSION;
1044
1045 /* We only really enable double-buffer mode in primary jobs, so we allow it
1046 * in render pass secondaries only to keep track of the double-buffer score
1047 * while they are recorded; the primaries are then updated accordingly when
1048 * the secondaries are executed into them.
1049 */
1050 job->can_use_double_buffer = true;
1051
1052 return VK_SUCCESS;
1053 }
1054
1055 const struct vk_command_buffer_ops v3dv_cmd_buffer_ops = {
1056 .create = cmd_buffer_create,
1057 .reset = cmd_buffer_reset,
1058 .destroy = cmd_buffer_destroy,
1059 };
1060
1061 VKAPI_ATTR VkResult VKAPI_CALL
1062 v3dv_BeginCommandBuffer(VkCommandBuffer commandBuffer,
1063 const VkCommandBufferBeginInfo *pBeginInfo)
1064 {
1065 V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
1066
1067 /* If this is the first vkBeginCommandBuffer, we must initialize the
1068 * command buffer's state. Otherwise, we must reset its state. In both
1069 * cases we simply reset it via cmd_buffer_reset().
1070 */
1071 cmd_buffer_reset(&cmd_buffer->vk, 0);
1072
1073 assert(cmd_buffer->status == V3DV_CMD_BUFFER_STATUS_INITIALIZED);
1074
1075 cmd_buffer->usage_flags = pBeginInfo->flags;
1076
1077 if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY) {
1078 if (pBeginInfo->flags & VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT) {
1079 VkResult result =
1080 cmd_buffer_begin_render_pass_secondary(cmd_buffer,
1081 pBeginInfo->pInheritanceInfo);
1082 if (result != VK_SUCCESS)
1083 return result;
1084 }
1085 }
1086
1087 cmd_buffer->status = V3DV_CMD_BUFFER_STATUS_RECORDING;
1088
1089 return VK_SUCCESS;
1090 }
1091
1092 static void
1093 cmd_buffer_update_tile_alignment(struct v3dv_cmd_buffer *cmd_buffer)
1094 {
1095 /* Render areas and scissor/viewport are only relevant inside render passes,
1096 * otherwise we are dealing with transfer operations where these elements
1097 * don't apply.
1098 */
1099 assert(cmd_buffer->state.pass);
1100 const VkRect2D *rect = &cmd_buffer->state.render_area;
1101
1102 /* We should only call this at the beginning of a subpass so we should
1103 * always have framebuffer information available.
1104 */
1105 assert(cmd_buffer->state.framebuffer);
1106 cmd_buffer->state.tile_aligned_render_area =
1107 v3dv_subpass_area_is_tile_aligned(cmd_buffer->device, rect,
1108 cmd_buffer->state.framebuffer,
1109 cmd_buffer->state.pass,
1110 cmd_buffer->state.subpass_idx);
1111
1112 if (!cmd_buffer->state.tile_aligned_render_area) {
1113 perf_debug("Render area for subpass %d of render pass %p doesn't "
1114 "match render pass granularity.\n",
1115 cmd_buffer->state.subpass_idx, cmd_buffer->state.pass);
1116 }
1117 }
1118
1119 static void
1120 cmd_buffer_update_attachment_resolve_state(struct v3dv_cmd_buffer *cmd_buffer)
1121 {
1122 /* NOTE: This should be called after cmd_buffer_update_tile_alignment()
1123 * since it relies on up-to-date information about subpass tile alignment.
1124 */
1125 const struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
1126 const struct v3dv_render_pass *pass = state->pass;
1127 const struct v3dv_subpass *subpass = &pass->subpasses[state->subpass_idx];
1128
1129 for (uint32_t i = 0; i < subpass->color_count; i++) {
1130 const uint32_t attachment_idx = subpass->color_attachments[i].attachment;
1131 if (attachment_idx == VK_ATTACHMENT_UNUSED)
1132 continue;
1133
1134 state->attachments[attachment_idx].has_resolve =
1135 subpass->resolve_attachments &&
1136 subpass->resolve_attachments[i].attachment != VK_ATTACHMENT_UNUSED;
1137
1138 state->attachments[attachment_idx].use_tlb_resolve =
1139 state->attachments[attachment_idx].has_resolve &&
1140 state->tile_aligned_render_area &&
1141 pass->attachments[attachment_idx].try_tlb_resolve;
1142 }
1143
1144 uint32_t ds_attachment_idx = subpass->ds_attachment.attachment;
1145 if (ds_attachment_idx != VK_ATTACHMENT_UNUSED) {
1146 uint32_t ds_resolve_attachment_idx =
1147 subpass->ds_resolve_attachment.attachment;
1148 state->attachments[ds_attachment_idx].has_resolve =
1149 ds_resolve_attachment_idx != VK_ATTACHMENT_UNUSED;
1150
1151 assert(!state->attachments[ds_attachment_idx].has_resolve ||
1152 (subpass->resolve_depth || subpass->resolve_stencil));
1153
1154 state->attachments[ds_attachment_idx].use_tlb_resolve =
1155 state->attachments[ds_attachment_idx].has_resolve &&
1156 state->tile_aligned_render_area &&
1157 pass->attachments[ds_attachment_idx].try_tlb_resolve;
1158 }
1159 }
1160
1161 static void
1162 cmd_buffer_state_set_attachment_clear_color(struct v3dv_cmd_buffer *cmd_buffer,
1163 uint32_t attachment_idx,
1164 const VkClearColorValue *color)
1165 {
1166 assert(attachment_idx < cmd_buffer->state.pass->attachment_count);
1167 const struct v3dv_render_pass_attachment *attachment =
1168 &cmd_buffer->state.pass->attachments[attachment_idx];
1169
1170 uint32_t internal_type, internal_bpp;
1171 const struct v3dv_format *format =
1172 v3dv_X(cmd_buffer->device, get_format)(attachment->desc.format);
1173 /* We don't allow multi-planar formats for render pass attachments */
1174 assert(format->plane_count == 1);
1175
1176 v3dv_X(cmd_buffer->device, get_internal_type_bpp_for_output_format)
1177 (format->planes[0].rt_type, &internal_type, &internal_bpp);
1178
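   /* internal_bpp is an enum (0 = 32bpp, 1 = 64bpp, 2 = 128bpp), so this
    * gives the per-pixel size of the TLB internal format in bytes.
    */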
1179 uint32_t internal_size = 4 << internal_bpp;
1180
1181 struct v3dv_cmd_buffer_attachment_state *attachment_state =
1182 &cmd_buffer->state.attachments[attachment_idx];
1183
1184 v3dv_X(cmd_buffer->device, get_hw_clear_color)
1185 (color, internal_type, internal_size, &attachment_state->clear_value.color[0]);
1186
1187 attachment_state->vk_clear_value.color = *color;
1188 }
1189
1190 static void
1191 cmd_buffer_state_set_attachment_clear_depth_stencil(
1192 struct v3dv_cmd_buffer *cmd_buffer,
1193 uint32_t attachment_idx,
1194 bool clear_depth, bool clear_stencil,
1195 const VkClearDepthStencilValue *ds)
1196 {
1197 struct v3dv_cmd_buffer_attachment_state *attachment_state =
1198 &cmd_buffer->state.attachments[attachment_idx];
1199
1200 if (clear_depth)
1201 attachment_state->clear_value.z = ds->depth;
1202
1203 if (clear_stencil)
1204 attachment_state->clear_value.s = ds->stencil;
1205
1206 attachment_state->vk_clear_value.depthStencil = *ds;
1207 }
1208
1209 static void
1210 cmd_buffer_state_set_clear_values(struct v3dv_cmd_buffer *cmd_buffer,
1211 uint32_t count, const VkClearValue *values)
1212 {
1213 struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
1214 const struct v3dv_render_pass *pass = state->pass;
1215
1216 /* There could be fewer clear values than attachments in the render pass, in
1217 * which case we only want to process as many as we have, or there could be
1218 * more, in which case we want to ignore those for which we don't have a
1219 * corresponding attachment.
1220 */
1221 count = MIN2(count, pass->attachment_count);
1222 for (uint32_t i = 0; i < count; i++) {
1223 const struct v3dv_render_pass_attachment *attachment =
1224 &pass->attachments[i];
1225
1226 if (attachment->desc.loadOp != VK_ATTACHMENT_LOAD_OP_CLEAR)
1227 continue;
1228
1229 VkImageAspectFlags aspects = vk_format_aspects(attachment->desc.format);
1230 if (aspects & VK_IMAGE_ASPECT_COLOR_BIT) {
1231 cmd_buffer_state_set_attachment_clear_color(cmd_buffer, i,
1232 &values[i].color);
1233 } else if (aspects & (VK_IMAGE_ASPECT_DEPTH_BIT |
1234 VK_IMAGE_ASPECT_STENCIL_BIT)) {
1235 cmd_buffer_state_set_attachment_clear_depth_stencil(
1236 cmd_buffer, i,
1237 aspects & VK_IMAGE_ASPECT_DEPTH_BIT,
1238 aspects & VK_IMAGE_ASPECT_STENCIL_BIT,
1239 &values[i].depthStencil);
1240 }
1241 }
1242 }
1243
1244 static void
1245 cmd_buffer_state_set_attachments(struct v3dv_cmd_buffer *cmd_buffer,
1246 const VkRenderPassBeginInfo *pRenderPassBegin)
1247 {
1248 V3DV_FROM_HANDLE(v3dv_render_pass, pass, pRenderPassBegin->renderPass);
1249 V3DV_FROM_HANDLE(v3dv_framebuffer, framebuffer, pRenderPassBegin->framebuffer);
1250
1251 const VkRenderPassAttachmentBeginInfo *attach_begin =
1252 vk_find_struct_const(pRenderPassBegin, RENDER_PASS_ATTACHMENT_BEGIN_INFO);
1253
1254 struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
1255
1256 for (uint32_t i = 0; i < pass->attachment_count; i++) {
1257 if (attach_begin && attach_begin->attachmentCount != 0) {
1258 state->attachments[i].image_view =
1259 v3dv_image_view_from_handle(attach_begin->pAttachments[i]);
1260 } else if (framebuffer) {
1261 state->attachments[i].image_view = framebuffer->attachments[i];
1262 } else {
1263 assert(cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY);
1264 state->attachments[i].image_view = NULL;
1265 }
1266 }
1267 }
1268
1269 static void
1270 cmd_buffer_init_render_pass_attachment_state(struct v3dv_cmd_buffer *cmd_buffer,
1271 const VkRenderPassBeginInfo *pRenderPassBegin)
1272 {
1273 cmd_buffer_state_set_clear_values(cmd_buffer,
1274 pRenderPassBegin->clearValueCount,
1275 pRenderPassBegin->pClearValues);
1276
1277 cmd_buffer_state_set_attachments(cmd_buffer, pRenderPassBegin);
1278 }
1279
1280 static void
1281 cmd_buffer_ensure_render_pass_attachment_state(struct v3dv_cmd_buffer *cmd_buffer)
1282 {
1283 struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
1284 const struct v3dv_render_pass *pass = state->pass;
1285
1286 if (state->attachment_alloc_count < pass->attachment_count) {
1287 if (state->attachments) {
1288 assert(state->attachment_alloc_count > 0);
1289 vk_free(&cmd_buffer->device->vk.alloc, state->attachments);
1290 }
1291
1292 uint32_t size = sizeof(struct v3dv_cmd_buffer_attachment_state) *
1293 pass->attachment_count;
1294 state->attachments = vk_zalloc(&cmd_buffer->device->vk.alloc, size, 8,
1295 VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
1296 if (!state->attachments) {
1297 v3dv_flag_oom(cmd_buffer, NULL);
1298 return;
1299 }
1300 state->attachment_alloc_count = pass->attachment_count;
1301 }
1302
1303 assert(state->attachment_alloc_count >= pass->attachment_count);
1304 }
1305
1306 VKAPI_ATTR void VKAPI_CALL
1307 v3dv_CmdBeginRenderPass2(VkCommandBuffer commandBuffer,
1308 const VkRenderPassBeginInfo *pRenderPassBegin,
1309 const VkSubpassBeginInfo *pSubpassBeginInfo)
1310 {
1311 V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
1312 V3DV_FROM_HANDLE(v3dv_render_pass, pass, pRenderPassBegin->renderPass);
1313 V3DV_FROM_HANDLE(v3dv_framebuffer, framebuffer, pRenderPassBegin->framebuffer);
1314
1315 struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
1316 state->pass = pass;
1317 state->framebuffer = framebuffer;
1318
1319 cmd_buffer_ensure_render_pass_attachment_state(cmd_buffer);
1320 v3dv_return_if_oom(cmd_buffer, NULL);
1321
1322 cmd_buffer_init_render_pass_attachment_state(cmd_buffer, pRenderPassBegin);
1323
1324 state->render_area = pRenderPassBegin->renderArea;
1325
1326 /* If our render area is smaller than the current clip window we will have
1327 * to emit a new clip window to constrain it to the render area.
1328 */
1329 uint32_t min_render_x = state->render_area.offset.x;
1330 uint32_t min_render_y = state->render_area.offset.y;
1331 uint32_t max_render_x = min_render_x + state->render_area.extent.width - 1;
1332 uint32_t max_render_y = min_render_y + state->render_area.extent.height - 1;
1333 uint32_t min_clip_x = state->clip_window.offset.x;
1334 uint32_t min_clip_y = state->clip_window.offset.y;
1335 uint32_t max_clip_x = min_clip_x + state->clip_window.extent.width - 1;
1336 uint32_t max_clip_y = min_clip_y + state->clip_window.extent.height - 1;
1337 if (min_render_x > min_clip_x || min_render_y > min_clip_y ||
1338 max_render_x < max_clip_x || max_render_y < max_clip_y) {
1339 state->dirty |= V3DV_CMD_DIRTY_SCISSOR;
1340 }
1341
1342 /* Setup for first subpass */
1343 v3dv_cmd_buffer_subpass_start(cmd_buffer, 0);
1344 }
1345
1346 VKAPI_ATTR void VKAPI_CALL
1347 v3dv_CmdNextSubpass2(VkCommandBuffer commandBuffer,
1348 const VkSubpassBeginInfo *pSubpassBeginInfo,
1349 const VkSubpassEndInfo *pSubpassEndInfo)
1350 {
1351 V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
1352
1353 struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
1354 assert(state->subpass_idx < state->pass->subpass_count - 1);
1355
1356 /* Finish the previous subpass */
1357 v3dv_cmd_buffer_subpass_finish(cmd_buffer);
1358 cmd_buffer_subpass_handle_pending_resolves(cmd_buffer);
1359
1360 /* Start the next subpass */
1361 v3dv_cmd_buffer_subpass_start(cmd_buffer, state->subpass_idx + 1);
1362 }
1363
1364 static void
1365 cmd_buffer_emit_subpass_clears(struct v3dv_cmd_buffer *cmd_buffer)
1366 {
1367 assert(cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_PRIMARY);
1368
1369 assert(cmd_buffer->state.pass);
1370 assert(cmd_buffer->state.subpass_idx < cmd_buffer->state.pass->subpass_count);
1371 const struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
1372 const struct v3dv_render_pass *pass = state->pass;
1373 const struct v3dv_subpass *subpass = &pass->subpasses[state->subpass_idx];
1374
1375 /* We only need to emit subpass clears as draw calls when the render
1376 * area is not aligned to tile boundaries or for GFXH-1461.
1377 */
1378 if (cmd_buffer->state.tile_aligned_render_area &&
1379 !subpass->do_depth_clear_with_draw &&
1380 !subpass->do_stencil_clear_with_draw) {
1381 return;
1382 }
1383
1384 uint32_t att_count = 0;
1385 VkClearAttachment atts[V3D_MAX_DRAW_BUFFERS + 1]; /* +1 for D/S */
1386
1387 /* We only need to emit subpass clears as draw calls for color attachments
1388 * if the render area is not aligned to tile boundaries.
1389 */
1390 if (!cmd_buffer->state.tile_aligned_render_area) {
1391 for (uint32_t i = 0; i < subpass->color_count; i++) {
1392 const uint32_t att_idx = subpass->color_attachments[i].attachment;
1393 if (att_idx == VK_ATTACHMENT_UNUSED)
1394 continue;
1395
1396 struct v3dv_render_pass_attachment *att = &pass->attachments[att_idx];
1397 if (att->desc.loadOp != VK_ATTACHMENT_LOAD_OP_CLEAR)
1398 continue;
1399
1400 if (state->subpass_idx != att->first_subpass)
1401 continue;
1402
1403 atts[att_count].aspectMask = VK_IMAGE_ASPECT_COLOR_BIT;
1404 atts[att_count].colorAttachment = i;
1405 atts[att_count].clearValue = state->attachments[att_idx].vk_clear_value;
1406 att_count++;
1407 }
1408 }
1409
1410 /* For D/S we may also need to emit a subpass clear for GFXH-1461 */
1411 const uint32_t ds_att_idx = subpass->ds_attachment.attachment;
1412 if (ds_att_idx != VK_ATTACHMENT_UNUSED) {
1413 struct v3dv_render_pass_attachment *att = &pass->attachments[ds_att_idx];
1414 if (state->subpass_idx == att->first_subpass) {
1415 VkImageAspectFlags aspects = vk_format_aspects(att->desc.format);
1416 if (att->desc.loadOp != VK_ATTACHMENT_LOAD_OP_CLEAR ||
1417 (cmd_buffer->state.tile_aligned_render_area &&
1418 !subpass->do_depth_clear_with_draw)) {
1419 aspects &= ~VK_IMAGE_ASPECT_DEPTH_BIT;
1420 }
1421 if (att->desc.stencilLoadOp != VK_ATTACHMENT_LOAD_OP_CLEAR ||
1422 (cmd_buffer->state.tile_aligned_render_area &&
1423 !subpass->do_stencil_clear_with_draw)) {
1424 aspects &= ~VK_IMAGE_ASPECT_STENCIL_BIT;
1425 }
1426 if (aspects) {
1427 atts[att_count].aspectMask = aspects;
1428 atts[att_count].colorAttachment = 0; /* Ignored */
1429 atts[att_count].clearValue =
1430 state->attachments[ds_att_idx].vk_clear_value;
1431 att_count++;
1432 }
1433 }
1434 }
1435
1436 if (att_count == 0)
1437 return;
1438
1439 if (!cmd_buffer->state.tile_aligned_render_area) {
1440 perf_debug("Render area doesn't match render pass granularity, falling "
1441 "back to vkCmdClearAttachments for "
1442 "VK_ATTACHMENT_LOAD_OP_CLEAR.\n");
1443 } else if (subpass->do_depth_clear_with_draw ||
1444 subpass->do_stencil_clear_with_draw) {
1445 perf_debug("Subpass clears DEPTH but loads STENCIL (or vice versa), "
1446 "falling back to vkCmdClearAttachments for "
1447 "VK_ATTACHMENT_LOAD_OP_CLEAR.\n");
1448 }
1449
1450 /* From the Vulkan 1.0 spec:
1451 *
1452 * "VK_ATTACHMENT_LOAD_OP_CLEAR specifies that the contents within the
1453 * render area will be cleared to a uniform value, which is specified
1454 * when a render pass instance is begun."
1455 *
1456 * So the clear is only constrained by the render area and not by pipeline
1457 * state such as scissor or viewport, these are the semantics of
1458 * vkCmdClearAttachments as well.
1459 */
1460 VkCommandBuffer _cmd_buffer = v3dv_cmd_buffer_to_handle(cmd_buffer);
1461 VkClearRect rect = {
1462 .rect = state->render_area,
1463 .baseArrayLayer = 0,
1464 .layerCount = 1,
1465 };
1466 v3dv_CmdClearAttachments(_cmd_buffer, att_count, atts, 1, &rect);
1467 }
1468
1469 bool
1470 v3dv_cmd_buffer_check_needs_load(const struct v3dv_cmd_buffer_state *state,
1471 VkImageAspectFlags aspect,
1472 uint32_t first_subpass_idx,
1473 VkAttachmentLoadOp load_op,
1474 uint32_t last_subpass_idx,
1475 VkAttachmentStoreOp store_op)
1476 {
1477 /* We call this with image->vk.aspects & aspect, so 0 means the aspect we are
1478 * testing does not exist in the image.
1479 */
1480 if (!aspect)
1481 return false;
1482
1483 /* Attachment (or view) load operations apply on the first subpass that
1484 * uses the attachment (or view); otherwise we always need to load.
1485 */
1486 if (state->job->first_subpass > first_subpass_idx)
1487 return true;
1488
1489 /* If the job is continuing a subpass started in another job, we always
1490 * need to load.
1491 */
1492 if (state->job->is_subpass_continue)
1493 return true;
1494
1495 /* If the area is not aligned to tile boundaries and we are going to store,
1496 * then we need to load to preserve contents outside the render area.
1497 */
1498 if (!state->tile_aligned_render_area &&
1499 v3dv_cmd_buffer_check_needs_store(state, aspect, last_subpass_idx,
1500 store_op)) {
1501 return true;
1502 }
1503
1504 /* The attachment load operations must be LOAD */
1505 return load_op == VK_ATTACHMENT_LOAD_OP_LOAD;
1506 }
1507
1508 bool
1509 v3dv_cmd_buffer_check_needs_store(const struct v3dv_cmd_buffer_state *state,
1510 VkImageAspectFlags aspect,
1511 uint32_t last_subpass_idx,
1512 VkAttachmentStoreOp store_op)
1513 {
1514 /* We call this with image->vk.aspects & aspect, so 0 means the aspect we are
1515 * testing does not exist in the image.
1516 */
1517 if (!aspect)
1518 return false;
1519
1520 /* Attachment (or view) store operations only apply on the last subpass
1521     * where the attachment (or view) is used; in other subpasses we always
1522 * need to store.
1523 */
1524 if (state->subpass_idx < last_subpass_idx)
1525 return true;
1526
1527    /* Attachment store operations only apply on the last job we emit on the
1528 * last subpass where the attachment is used, otherwise we always need to
1529 * store.
1530 */
1531 if (!state->job->is_subpass_finish)
1532 return true;
1533
1534 /* The attachment store operation must be STORE */
1535 return store_op == VK_ATTACHMENT_STORE_OP_STORE;
1536 }
1537
1538 static void
1539 cmd_buffer_subpass_check_double_buffer_mode(struct v3dv_cmd_buffer *cmd_buffer,
1540 bool msaa)
1541 {
1542 const struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
1543 struct v3dv_job *job = cmd_buffer->state.job;
1544 assert(job);
1545
1546 job->can_use_double_buffer = false;
1547
1548 /* Double-buffer can only be used if requested via V3D_DEBUG */
1549 if (!V3D_DBG(DOUBLE_BUFFER))
1550 return;
1551
1552 /* Double-buffer cannot be enabled for MSAA jobs */
1553 if (msaa)
1554 return;
1555
1556 const struct v3dv_render_pass *pass = state->pass;
1557 const struct v3dv_subpass *subpass = &pass->subpasses[state->subpass_idx];
1558
1559 /* FIXME: For now we discard multiview jobs (which have an implicit geometry
1560 * shader) for this optimization. If we want to enable this with multiview
1561 * we would need to check if any view (layer) in any attachment used by the
1562 * job has loads and/or stores as we do below for regular attachments. Also,
1563 * we would want to have a heuristic that doesn't automatically disable
1564 * double-buffer in the presence of geometry shaders.
1565 */
1566 if (state->pass->multiview_enabled)
1567 return;
1568
1569    /* Tile loads are serialized against stores, so if the job has loads we
1570     * don't get any benefit from enabling double-buffer and would just pay the
1571     * price of a smaller tile size instead. Similarly, we only benefit from
1572 * double-buffer if we have tile stores, as the point of this mode is
1573 * to execute rendering of a new tile while we store the previous one to
1574 * hide latency on the tile store operation.
1575 */
1576 bool has_stores = false;
1577 for (uint32_t i = 0; i < subpass->color_count; i++) {
1578 uint32_t attachment_idx = subpass->color_attachments[i].attachment;
1579 if (attachment_idx == VK_ATTACHMENT_UNUSED)
1580 continue;
1581
1582 const struct v3dv_render_pass_attachment *attachment =
1583 &state->pass->attachments[attachment_idx];
1584
1585 /* FIXME: This will check 'tile_aligned_render_area' but that was
1586 * computed with a tile size without double-buffer. That is okay
1587 * because if the larger tile size is aligned then we know the smaller
1588 * tile size for double-buffer will be as well. However, we might
1589 * still benefit from doing this check with the smaller tile size
1590 * because it can happen that the smaller size is aligned and the
1591 * larger size is not.
1592 */
1593 if (v3dv_cmd_buffer_check_needs_load(state,
1594 VK_IMAGE_ASPECT_COLOR_BIT,
1595 attachment->first_subpass,
1596 attachment->desc.loadOp,
1597 attachment->last_subpass,
1598 attachment->desc.storeOp)) {
1599 return;
1600 }
1601
1602 if (v3dv_cmd_buffer_check_needs_store(state,
1603 VK_IMAGE_ASPECT_COLOR_BIT,
1604 attachment->last_subpass,
1605 attachment->desc.storeOp)) {
1606 has_stores = true;
1607 }
1608 }
1609
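   /* Apply the same load/store checks to the depth/stencil attachment, using
    * the depth and stencil aspects separately since they have independent
    * load/store ops.
    */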
1610 if (subpass->ds_attachment.attachment != VK_ATTACHMENT_UNUSED) {
1611 uint32_t ds_attachment_idx = subpass->ds_attachment.attachment;
1612 const struct v3dv_render_pass_attachment *ds_attachment =
1613 &state->pass->attachments[ds_attachment_idx];
1614
1615 const VkImageAspectFlags ds_aspects =
1616 vk_format_aspects(ds_attachment->desc.format);
1617
1618 if (v3dv_cmd_buffer_check_needs_load(state,
1619 ds_aspects & VK_IMAGE_ASPECT_DEPTH_BIT,
1620 ds_attachment->first_subpass,
1621 ds_attachment->desc.loadOp,
1622 ds_attachment->last_subpass,
1623 ds_attachment->desc.storeOp)) {
1624 return;
1625 }
1626
1627 if (v3dv_cmd_buffer_check_needs_load(state,
1628 ds_aspects & VK_IMAGE_ASPECT_STENCIL_BIT,
1629 ds_attachment->first_subpass,
1630 ds_attachment->desc.stencilLoadOp,
1631 ds_attachment->last_subpass,
1632 ds_attachment->desc.stencilStoreOp)) {
1633 return;
1634 }
1635
1636 has_stores |= v3dv_cmd_buffer_check_needs_store(state,
1637 ds_aspects & VK_IMAGE_ASPECT_DEPTH_BIT,
1638 ds_attachment->last_subpass,
1639 ds_attachment->desc.storeOp);
1640 has_stores |= v3dv_cmd_buffer_check_needs_store(state,
1641 ds_aspects & VK_IMAGE_ASPECT_STENCIL_BIT,
1642 ds_attachment->last_subpass,
1643 ds_attachment->desc.stencilStoreOp);
1644 }
1645
1646 job->can_use_double_buffer = has_stores;
1647 }
1648
1649 static struct v3dv_job *
1650 cmd_buffer_subpass_create_job(struct v3dv_cmd_buffer *cmd_buffer,
1651 uint32_t subpass_idx,
1652 enum v3dv_job_type type)
1653 {
1654 assert(type == V3DV_JOB_TYPE_GPU_CL ||
1655 type == V3DV_JOB_TYPE_GPU_CL_SECONDARY);
1656
1657 struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
1658 assert(subpass_idx < state->pass->subpass_count);
1659
1660 /* Starting a new job can trigger a finish of the current one, so don't
1661 * change the command buffer state for the new job until we are done creating
1662 * the new job.
1663 */
1664 struct v3dv_job *job =
1665 v3dv_cmd_buffer_start_job(cmd_buffer, subpass_idx, type);
1666 if (!job)
1667 return NULL;
1668
1669 state->subpass_idx = subpass_idx;
1670
1671 /* If we are starting a new job we need to setup binning. We only do this
1672 * for V3DV_JOB_TYPE_GPU_CL jobs because V3DV_JOB_TYPE_GPU_CL_SECONDARY
1673 * jobs are not submitted to the GPU directly, and are instead meant to be
1674 * branched to from other V3DV_JOB_TYPE_GPU_CL jobs.
1675 */
1676 if (type == V3DV_JOB_TYPE_GPU_CL &&
1677 job->first_subpass == state->subpass_idx) {
1678 const struct v3dv_subpass *subpass =
1679 &state->pass->subpasses[state->subpass_idx];
1680
1681 const struct v3dv_framebuffer *framebuffer = state->framebuffer;
1682
1683 uint8_t max_internal_bpp, total_color_bpp;
1684 bool msaa;
1685 v3dv_X(job->device, framebuffer_compute_internal_bpp_msaa)
1686 (framebuffer, state->attachments, subpass,
1687 &max_internal_bpp, &total_color_bpp, &msaa);
1688
1689 /* From the Vulkan spec:
1690 *
1691 * "If the render pass uses multiview, then layers must be one and
1692 * each attachment requires a number of layers that is greater than
1693 * the maximum bit index set in the view mask in the subpasses in
1694 * which it is used."
1695 *
1696 * So when multiview is enabled, we take the number of layers from the
1697 * last bit set in the view mask.
1698 */
1699 uint32_t layers = framebuffer->layers;
1700 if (subpass->view_mask != 0) {
1701 assert(framebuffer->layers == 1);
1702 layers = util_last_bit(subpass->view_mask);
1703 }
1704
1705 v3dv_job_start_frame(job,
1706 framebuffer->width,
1707 framebuffer->height,
1708 layers,
1709 true, false,
1710 subpass->color_count,
1711 max_internal_bpp,
1712 total_color_bpp,
1713 msaa);
1714 }
1715
1716 return job;
1717 }
1718
1719 struct v3dv_job *
1720 v3dv_cmd_buffer_subpass_start(struct v3dv_cmd_buffer *cmd_buffer,
1721 uint32_t subpass_idx)
1722 {
1723 assert(cmd_buffer->state.pass);
1724 assert(subpass_idx < cmd_buffer->state.pass->subpass_count);
1725
1726 struct v3dv_job *job =
1727 cmd_buffer_subpass_create_job(cmd_buffer, subpass_idx,
1728 V3DV_JOB_TYPE_GPU_CL);
1729 if (!job)
1730 return NULL;
1731
1732 /* Check if our render area is aligned to tile boundaries. We have to do
1733 * this in each subpass because the subset of attachments used can change
1734 * and with that the tile size selected by the hardware can change too.
1735 */
1736 cmd_buffer_update_tile_alignment(cmd_buffer);
1737
1738 /* Decide if we can use double-buffer for this subpass job */
1739 cmd_buffer_subpass_check_double_buffer_mode(cmd_buffer, job->frame_tiling.msaa);
1740
1741 cmd_buffer_update_attachment_resolve_state(cmd_buffer);
1742
1743 /* If we can't use TLB clears then we need to emit draw clears for any
1744 * LOAD_OP_CLEAR attachments in this subpass now. We might also need to emit
1745 * Depth/Stencil clears if we hit GFXH-1461.
1746 *
1747 * Secondary command buffers don't start subpasses (and may not even have
1748 * framebuffer state), so we only care about this in primaries. The only
1749 * exception could be a secondary running inside a subpass that needs to
1750 * record a meta operation (with its own render pass) that relies on
1751 * attachment load clears, but we don't have any instances of that right
1752 * now.
1753 */
1754 if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_PRIMARY)
1755 cmd_buffer_emit_subpass_clears(cmd_buffer);
1756
1757 return job;
1758 }
1759
1760 struct v3dv_job *
1761 v3dv_cmd_buffer_subpass_resume(struct v3dv_cmd_buffer *cmd_buffer,
1762 uint32_t subpass_idx)
1763 {
1764 assert(cmd_buffer->state.pass);
1765 assert(subpass_idx < cmd_buffer->state.pass->subpass_count);
1766
1767 struct v3dv_job *job;
1768 if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_PRIMARY) {
1769 job = cmd_buffer_subpass_create_job(cmd_buffer, subpass_idx,
1770 V3DV_JOB_TYPE_GPU_CL);
1771 } else {
1772 assert(cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY);
1773 job = cmd_buffer_subpass_create_job(cmd_buffer, subpass_idx,
1774 V3DV_JOB_TYPE_GPU_CL_SECONDARY);
1775 }
1776
1777 if (!job)
1778 return NULL;
1779
1780 job->is_subpass_continue = true;
1781
1782 return job;
1783 }
1784
1785 void
1786 v3dv_cmd_buffer_subpass_finish(struct v3dv_cmd_buffer *cmd_buffer)
1787 {
1788 /* We can end up here without a job if the last command recorded into the
1789 * subpass already finished the job (for example a pipeline barrier). In
1790     * that case we don't get to set the is_subpass_finish flag, but that is not
1791 * required for proper behavior.
1792 */
1793 struct v3dv_job *job = cmd_buffer->state.job;
1794 if (job)
1795 job->is_subpass_finish = true;
1796 }
1797
1798 VKAPI_ATTR void VKAPI_CALL
1799 v3dv_CmdEndRenderPass2(VkCommandBuffer commandBuffer,
1800 const VkSubpassEndInfo *pSubpassEndInfo)
1801 {
1802 V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
1803
1804 /* Finalize last subpass */
1805 struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
1806 assert(state->subpass_idx == state->pass->subpass_count - 1);
1807 v3dv_cmd_buffer_subpass_finish(cmd_buffer);
1808 v3dv_cmd_buffer_finish_job(cmd_buffer);
1809
1810 cmd_buffer_subpass_handle_pending_resolves(cmd_buffer);
1811
1812 /* We are no longer inside a render pass */
1813 state->framebuffer = NULL;
1814 state->pass = NULL;
1815 state->subpass_idx = -1;
1816 }
1817
1818 VKAPI_ATTR VkResult VKAPI_CALL
1819 v3dv_EndCommandBuffer(VkCommandBuffer commandBuffer)
1820 {
1821 V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
1822
1823 if (cmd_buffer->state.oom)
1824 return VK_ERROR_OUT_OF_HOST_MEMORY;
1825
1826 /* Primaries should have ended any recording jobs by the time they hit
1827 * vkEndRenderPass (if we are inside a render pass). Commands outside
1828 * a render pass instance (for both primaries and secondaries) spawn
1829 * complete jobs too. So the only case where we can get here without
1830 * finishing a recording job is when we are recording a secondary
1831 * inside a render pass.
1832 */
1833 if (cmd_buffer->state.job) {
1834 assert(cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY &&
1835 cmd_buffer->state.pass);
1836 v3dv_cmd_buffer_finish_job(cmd_buffer);
1837 }
1838
1839 cmd_buffer->status = V3DV_CMD_BUFFER_STATUS_EXECUTABLE;
1840
1841 return VK_SUCCESS;
1842 }
1843
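/* Shallow-copies every BO in 'src' into a newly allocated entry linked into
 * 'dst', so a cloned job can walk a BO list of its own instead of reusing the
 * original job's list links.
 */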
1844 static void
1845 clone_bo_list(struct v3dv_cmd_buffer *cmd_buffer,
1846 struct list_head *dst,
1847 struct list_head *src)
1848 {
1849 assert(cmd_buffer);
1850
1851 list_inithead(dst);
1852 list_for_each_entry(struct v3dv_bo, bo, src, list_link) {
1853 struct v3dv_bo *clone_bo =
1854 vk_alloc(&cmd_buffer->device->vk.alloc, sizeof(struct v3dv_bo), 8,
1855 VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
1856 if (!clone_bo) {
1857 v3dv_flag_oom(cmd_buffer, NULL);
1858 return;
1859 }
1860
1861 *clone_bo = *bo;
1862 list_addtail(&clone_bo->list_link, dst);
1863 }
1864 }
1865
1866 /* Clones a job for inclusion in the given command buffer. Note that this
1867  * doesn't make a deep copy, so the cloned job doesn't own any resources.
1868 * Useful when we need to have a job in more than one list, which happens
1869 * for jobs recorded in secondary command buffers when we want to execute
1870 * them in primaries.
1871 */
1872 struct v3dv_job *
1873 v3dv_job_clone_in_cmd_buffer(struct v3dv_job *job,
1874 struct v3dv_cmd_buffer *cmd_buffer)
1875 {
1876 struct v3dv_job *clone_job = vk_alloc(&job->device->vk.alloc,
1877 sizeof(struct v3dv_job), 8,
1878 VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
1879 if (!clone_job) {
1880 v3dv_flag_oom(cmd_buffer, NULL);
1881 return NULL;
1882 }
1883
1884 /* Cloned jobs don't duplicate resources! */
1885 *clone_job = *job;
1886 clone_job->is_clone = true;
1887 clone_job->cmd_buffer = cmd_buffer;
1888 list_addtail(&clone_job->list_link, &cmd_buffer->jobs);
1889
1890 /* We need to regen the BO lists so that they point to the BO list in the
1891 * cloned job. Otherwise functions like list_length() will loop forever.
1892 */
1893 if (job->type == V3DV_JOB_TYPE_GPU_CL) {
1894 clone_bo_list(cmd_buffer, &clone_job->bcl.bo_list, &job->bcl.bo_list);
1895 clone_bo_list(cmd_buffer, &clone_job->rcl.bo_list, &job->rcl.bo_list);
1896 clone_bo_list(cmd_buffer, &clone_job->indirect.bo_list,
1897 &job->indirect.bo_list);
1898 }
1899
1900 return clone_job;
1901 }
1902
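/* Merges the pending barrier state in 'src' into 'dst' by OR-ing together the
 * destination stage mask, the per-queue source stage masks and the BCL access
 * masks.
 */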
1903 void
1904 v3dv_cmd_buffer_merge_barrier_state(struct v3dv_barrier_state *dst,
1905 struct v3dv_barrier_state *src)
1906 {
1907 dst->dst_mask |= src->dst_mask;
1908
1909 dst->src_mask_graphics |= src->src_mask_graphics;
1910 dst->src_mask_compute |= src->src_mask_compute;
1911 dst->src_mask_transfer |= src->src_mask_transfer;
1912
1913 dst->bcl_buffer_access |= src->bcl_buffer_access;
1914 dst->bcl_image_access |= src->bcl_image_access;
1915 }
1916
1917 static void
1918 cmd_buffer_execute_outside_pass(struct v3dv_cmd_buffer *primary,
1919 uint32_t cmd_buffer_count,
1920 const VkCommandBuffer *cmd_buffers)
1921 {
1922 struct v3dv_barrier_state pending_barrier = { 0 };
1923 for (uint32_t i = 0; i < cmd_buffer_count; i++) {
1924 V3DV_FROM_HANDLE(v3dv_cmd_buffer, secondary, cmd_buffers[i]);
1925
1926 assert(!(secondary->usage_flags &
1927 VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT));
1928
1929 /* Secondary command buffers that execute outside a render pass create
1930 * complete jobs with an RCL and tile setup, so we simply want to merge
1931 * their job list into the primary's. However, because they may be
1932 * executed into multiple primaries at the same time and we only have a
1933     * single list_link in each job, we can't just add them to the primary's
1934 * job list and we instead have to clone them first.
1935 *
1936     * Alternatively, we could create an "execute secondary" CPU job that,
1937 * when executed in a queue, would submit all the jobs in the referenced
1938 * secondary command buffer. However, this would raise some challenges
1939 * to make it work with the implementation of wait threads in the queue
1940 * which we use for event waits, for example.
1941 */
1942 list_for_each_entry(struct v3dv_job, secondary_job,
1943 &secondary->jobs, list_link) {
1944 /* These can only happen inside a render pass */
1945 assert(secondary_job->type != V3DV_JOB_TYPE_GPU_CL_SECONDARY);
1946 struct v3dv_job *job = v3dv_job_clone_in_cmd_buffer(secondary_job, primary);
1947 if (!job)
1948 return;
1949
1950 if (pending_barrier.dst_mask) {
1951 /* FIXME: do the same we do for primaries and only choose the
1952 * relevant src masks.
1953 */
1954 job->serialize = pending_barrier.src_mask_graphics |
1955 pending_barrier.src_mask_transfer |
1956 pending_barrier.src_mask_compute;
1957 if (pending_barrier.bcl_buffer_access ||
1958 pending_barrier.bcl_image_access) {
1959 job->needs_bcl_sync = true;
1960 }
1961 memset(&pending_barrier, 0, sizeof(pending_barrier));
1962 }
1963 }
1964
1965 /* If this secondary had any pending barrier state we will need that
1966 * barrier state consumed with whatever comes after it (first job in
1967 * the next secondary or the primary, if this was the last secondary).
1968 */
1969 assert(secondary->state.barrier.dst_mask ||
1970 (!secondary->state.barrier.bcl_buffer_access &&
1971 !secondary->state.barrier.bcl_image_access));
1972 pending_barrier = secondary->state.barrier;
1973 }
1974
1975 if (pending_barrier.dst_mask) {
1976 v3dv_cmd_buffer_merge_barrier_state(&primary->state.barrier,
1977 &pending_barrier);
1978 }
1979 }
1980
1981 VKAPI_ATTR void VKAPI_CALL
1982 v3dv_CmdExecuteCommands(VkCommandBuffer commandBuffer,
1983 uint32_t commandBufferCount,
1984 const VkCommandBuffer *pCommandBuffers)
1985 {
1986 V3DV_FROM_HANDLE(v3dv_cmd_buffer, primary, commandBuffer);
1987
1988 if (primary->state.pass != NULL) {
1989 v3dv_X(primary->device, cmd_buffer_execute_inside_pass)
1990 (primary, commandBufferCount, pCommandBuffers);
1991 } else {
1992 cmd_buffer_execute_outside_pass(primary,
1993 commandBufferCount, pCommandBuffers);
1994 }
1995 }
1996
1997 /* This goes through the list of possible dynamic states in the pipeline and,
1998 * for those that are not configured as dynamic, copies relevant state into
1999 * the command buffer.
2000 */
2001 static void
2002 cmd_buffer_bind_pipeline_static_state(struct v3dv_cmd_buffer *cmd_buffer,
2003 const struct v3dv_dynamic_state *src)
2004 {
2005 struct v3dv_dynamic_state *dest = &cmd_buffer->state.dynamic;
2006 uint32_t dynamic_mask = src->mask;
2007 uint32_t dirty = 0;
2008
2009 if (!(dynamic_mask & V3DV_DYNAMIC_VIEWPORT)) {
2010 dest->viewport.count = src->viewport.count;
2011 if (memcmp(&dest->viewport.viewports, &src->viewport.viewports,
2012 src->viewport.count * sizeof(VkViewport))) {
2013 typed_memcpy(dest->viewport.viewports,
2014 src->viewport.viewports,
2015 src->viewport.count);
2016 typed_memcpy(dest->viewport.scale, src->viewport.scale,
2017 src->viewport.count);
2018 typed_memcpy(dest->viewport.translate, src->viewport.translate,
2019 src->viewport.count);
2020 dirty |= V3DV_CMD_DIRTY_VIEWPORT;
2021 }
2022 }
2023
2024 if (!(dynamic_mask & V3DV_DYNAMIC_SCISSOR)) {
2025 dest->scissor.count = src->scissor.count;
2026 if (memcmp(&dest->scissor.scissors, &src->scissor.scissors,
2027 src->scissor.count * sizeof(VkRect2D))) {
2028 typed_memcpy(dest->scissor.scissors,
2029 src->scissor.scissors, src->scissor.count);
2030 dirty |= V3DV_CMD_DIRTY_SCISSOR;
2031 }
2032 }
2033
2034 if (!(dynamic_mask & V3DV_DYNAMIC_STENCIL_COMPARE_MASK)) {
2035 if (memcmp(&dest->stencil_compare_mask, &src->stencil_compare_mask,
2036 sizeof(src->stencil_compare_mask))) {
2037 dest->stencil_compare_mask = src->stencil_compare_mask;
2038 dirty |= V3DV_CMD_DIRTY_STENCIL_COMPARE_MASK;
2039 }
2040 }
2041
2042 if (!(dynamic_mask & V3DV_DYNAMIC_STENCIL_WRITE_MASK)) {
2043 if (memcmp(&dest->stencil_write_mask, &src->stencil_write_mask,
2044 sizeof(src->stencil_write_mask))) {
2045 dest->stencil_write_mask = src->stencil_write_mask;
2046 dirty |= V3DV_CMD_DIRTY_STENCIL_WRITE_MASK;
2047 }
2048 }
2049
2050 if (!(dynamic_mask & V3DV_DYNAMIC_STENCIL_REFERENCE)) {
2051 if (memcmp(&dest->stencil_reference, &src->stencil_reference,
2052 sizeof(src->stencil_reference))) {
2053 dest->stencil_reference = src->stencil_reference;
2054 dirty |= V3DV_CMD_DIRTY_STENCIL_REFERENCE;
2055 }
2056 }
2057
2058 if (!(dynamic_mask & V3DV_DYNAMIC_BLEND_CONSTANTS)) {
2059 if (memcmp(dest->blend_constants, src->blend_constants,
2060 sizeof(src->blend_constants))) {
2061 memcpy(dest->blend_constants, src->blend_constants,
2062 sizeof(src->blend_constants));
2063 dirty |= V3DV_CMD_DIRTY_BLEND_CONSTANTS;
2064 }
2065 }
2066
2067 if (!(dynamic_mask & V3DV_DYNAMIC_DEPTH_BIAS)) {
2068 if (memcmp(&dest->depth_bias, &src->depth_bias,
2069 sizeof(src->depth_bias))) {
2070 memcpy(&dest->depth_bias, &src->depth_bias, sizeof(src->depth_bias));
2071 dirty |= V3DV_CMD_DIRTY_DEPTH_BIAS;
2072 }
2073 }
2074
2075 if (!(dynamic_mask & V3DV_DYNAMIC_DEPTH_BOUNDS)) {
2076 if (memcmp(&dest->depth_bounds, &src->depth_bounds,
2077 sizeof(src->depth_bounds))) {
2078 memcpy(&dest->depth_bounds, &src->depth_bounds, sizeof(src->depth_bounds));
2079 dirty |= V3DV_CMD_DIRTY_DEPTH_BOUNDS;
2080 }
2081 }
2082
2083 if (!(dynamic_mask & V3DV_DYNAMIC_LINE_WIDTH)) {
2084 if (dest->line_width != src->line_width) {
2085 dest->line_width = src->line_width;
2086 dirty |= V3DV_CMD_DIRTY_LINE_WIDTH;
2087 }
2088 }
2089
2090 if (!(dynamic_mask & V3DV_DYNAMIC_COLOR_WRITE_ENABLE)) {
2091 if (dest->color_write_enable != src->color_write_enable) {
2092 dest->color_write_enable = src->color_write_enable;
2093 dirty |= V3DV_CMD_DIRTY_COLOR_WRITE_ENABLE;
2094 }
2095 }
2096
2097 cmd_buffer->state.dynamic.mask = dynamic_mask;
2098 cmd_buffer->state.dirty |= dirty;
2099 }
2100
2101 static void
2102 bind_graphics_pipeline(struct v3dv_cmd_buffer *cmd_buffer,
2103 struct v3dv_pipeline *pipeline)
2104 {
2105 assert(pipeline && !(pipeline->active_stages & VK_SHADER_STAGE_COMPUTE_BIT));
2106 if (cmd_buffer->state.gfx.pipeline == pipeline)
2107 return;
2108
2109 cmd_buffer->state.gfx.pipeline = pipeline;
2110
2111 cmd_buffer_bind_pipeline_static_state(cmd_buffer, &pipeline->dynamic_state);
2112
2113 cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_PIPELINE;
2114 }
2115
2116 static void
2117 bind_compute_pipeline(struct v3dv_cmd_buffer *cmd_buffer,
2118 struct v3dv_pipeline *pipeline)
2119 {
2120 assert(pipeline && pipeline->active_stages == VK_SHADER_STAGE_COMPUTE_BIT);
2121
2122 if (cmd_buffer->state.compute.pipeline == pipeline)
2123 return;
2124
2125 cmd_buffer->state.compute.pipeline = pipeline;
2126 cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_COMPUTE_PIPELINE;
2127 }
2128
2129 VKAPI_ATTR void VKAPI_CALL
2130 v3dv_CmdBindPipeline(VkCommandBuffer commandBuffer,
2131 VkPipelineBindPoint pipelineBindPoint,
2132 VkPipeline _pipeline)
2133 {
2134 V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
2135 V3DV_FROM_HANDLE(v3dv_pipeline, pipeline, _pipeline);
2136
2137 switch (pipelineBindPoint) {
2138 case VK_PIPELINE_BIND_POINT_COMPUTE:
2139 bind_compute_pipeline(cmd_buffer, pipeline);
2140 break;
2141
2142 case VK_PIPELINE_BIND_POINT_GRAPHICS:
2143 bind_graphics_pipeline(cmd_buffer, pipeline);
2144 break;
2145
2146 default:
2147 assert(!"invalid bind point");
2148 break;
2149 }
2150 }
2151
2152 /* Considers the pipeline's negative_one_to_one state and applies it to the
2153 * current viewport transform if needed to produce the resulting Z translate
2154 * and scale parameters.
2155 */
2156 void
2157 v3dv_cmd_buffer_state_get_viewport_z_xform(struct v3dv_cmd_buffer_state *state,
2158 uint32_t vp_idx,
2159 float *translate_z, float *scale_z)
2160 {
2161 const struct v3dv_viewport_state *vp_state = &state->dynamic.viewport;
2162
2163 float t = vp_state->translate[vp_idx][2];
2164 float s = vp_state->scale[vp_idx][2];
2165
2166 assert(state->gfx.pipeline);
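   /* A pipeline with negative_one_to_one uses a [-1, 1] Z clip range, so we
    * halve the Z scale and move the Z translate to the midpoint between its
    * [0, 1] value and the viewport's maxDepth.
    */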
2167 if (state->gfx.pipeline->negative_one_to_one) {
2168 t = (t + vp_state->viewports[vp_idx].maxDepth) * 0.5f;
2169 s *= 0.5f;
2170 }
2171
2172 if (translate_z)
2173 *translate_z = t;
2174
2175 if (scale_z)
2176 *scale_z = s;
2177 }
2178
2179 VKAPI_ATTR void VKAPI_CALL
2180 v3dv_CmdSetViewport(VkCommandBuffer commandBuffer,
2181 uint32_t firstViewport,
2182 uint32_t viewportCount,
2183 const VkViewport *pViewports)
2184 {
2185 V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
2186 struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
2187 const uint32_t total_count = firstViewport + viewportCount;
2188
2189 assert(firstViewport < MAX_VIEWPORTS);
2190 assert(total_count >= 1 && total_count <= MAX_VIEWPORTS);
2191
2192 if (state->dynamic.viewport.count < total_count)
2193 state->dynamic.viewport.count = total_count;
2194
2195 if (!memcmp(state->dynamic.viewport.viewports + firstViewport,
2196 pViewports, viewportCount * sizeof(*pViewports))) {
2197 return;
2198 }
2199
2200 memcpy(state->dynamic.viewport.viewports + firstViewport, pViewports,
2201 viewportCount * sizeof(*pViewports));
2202
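   /* Recompute the hardware viewport transform (scale/translate) for every
    * viewport we just updated.
    */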
2203 for (uint32_t i = firstViewport; i < total_count; i++) {
2204 v3dv_X(cmd_buffer->device, viewport_compute_xform)
2205 (&state->dynamic.viewport.viewports[i],
2206 state->dynamic.viewport.scale[i],
2207 state->dynamic.viewport.translate[i]);
2208 }
2209
2210 cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_VIEWPORT;
2211 }
2212
2213 VKAPI_ATTR void VKAPI_CALL
2214 v3dv_CmdSetScissor(VkCommandBuffer commandBuffer,
2215 uint32_t firstScissor,
2216 uint32_t scissorCount,
2217 const VkRect2D *pScissors)
2218 {
2219 V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
2220 struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
2221
2222 assert(firstScissor < MAX_SCISSORS);
2223 assert(firstScissor + scissorCount >= 1 &&
2224 firstScissor + scissorCount <= MAX_SCISSORS);
2225
2226 if (state->dynamic.scissor.count < firstScissor + scissorCount)
2227 state->dynamic.scissor.count = firstScissor + scissorCount;
2228
2229 if (!memcmp(state->dynamic.scissor.scissors + firstScissor,
2230 pScissors, scissorCount * sizeof(*pScissors))) {
2231 return;
2232 }
2233
2234 memcpy(state->dynamic.scissor.scissors + firstScissor, pScissors,
2235 scissorCount * sizeof(*pScissors));
2236
2237 cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_SCISSOR;
2238 }
2239
2240 static void
2241 emit_scissor(struct v3dv_cmd_buffer *cmd_buffer)
2242 {
2243 if (cmd_buffer->state.dynamic.viewport.count == 0)
2244 return;
2245
2246 struct v3dv_dynamic_state *dynamic = &cmd_buffer->state.dynamic;
2247
2248    /* FIXME: right now we only support one viewport. viewports[0] would work
2249 * now, but would need to change if we allow multiple viewports.
2250 */
2251 float *vptranslate = dynamic->viewport.translate[0];
2252 float *vpscale = dynamic->viewport.scale[0];
2253 assert(vpscale[0] >= 0);
2254
2255 float vp_minx = vptranslate[0] - vpscale[0];
2256 float vp_maxx = vptranslate[0] + vpscale[0];
2257
2258 /* With KHR_maintenance1 viewport may have negative Y */
2259 float vp_miny = vptranslate[1] - fabsf(vpscale[1]);
2260 float vp_maxy = vptranslate[1] + fabsf(vpscale[1]);
2261
2262 /* Quoting from v3dx_emit:
2263 * "Clip to the scissor if it's enabled, but still clip to the
2264 * drawable regardless since that controls where the binner
2265 * tries to put things.
2266 *
2267 * Additionally, always clip the rendering to the viewport,
2268 * since the hardware does guardband clipping, meaning
2269 * primitives would rasterize outside of the view volume."
2270 */
2271 uint32_t minx, miny, maxx, maxy;
2272
2273 /* From the Vulkan spec:
2274 *
2275 * "The application must ensure (using scissor if necessary) that all
2276 * rendering is contained within the render area. The render area must be
2277 * contained within the framebuffer dimensions."
2278 *
2279 * So it is the application's responsibility to ensure this. Still, we can
2280 * help by automatically restricting the scissor rect to the render area.
2281 */
2282 minx = MAX2(vp_minx, cmd_buffer->state.render_area.offset.x);
2283 miny = MAX2(vp_miny, cmd_buffer->state.render_area.offset.y);
2284 maxx = MIN2(vp_maxx, cmd_buffer->state.render_area.offset.x +
2285 cmd_buffer->state.render_area.extent.width);
2286 maxy = MIN2(vp_maxy, cmd_buffer->state.render_area.offset.y +
2287 cmd_buffer->state.render_area.extent.height);
2288
2289 /* Clip against user provided scissor if needed.
2290 *
2291 * FIXME: right now we only allow one scissor. Below would need to be
2292 * updated if we support more
2293 */
2294 if (dynamic->scissor.count > 0) {
2295 VkRect2D *scissor = &dynamic->scissor.scissors[0];
2296 minx = MAX2(minx, scissor->offset.x);
2297 miny = MAX2(miny, scissor->offset.y);
2298 maxx = MIN2(maxx, scissor->offset.x + scissor->extent.width);
2299 maxy = MIN2(maxy, scissor->offset.y + scissor->extent.height);
2300 }
2301
2302 /* If the scissor is outside the viewport area we end up with
2303 * min{x,y} > max{x,y}.
2304 */
2305 if (minx > maxx)
2306 maxx = minx;
2307 if (miny > maxy)
2308 maxy = miny;
2309
2310 cmd_buffer->state.clip_window.offset.x = minx;
2311 cmd_buffer->state.clip_window.offset.y = miny;
2312 cmd_buffer->state.clip_window.extent.width = maxx - minx;
2313 cmd_buffer->state.clip_window.extent.height = maxy - miny;
2314
2315 v3dv_X(cmd_buffer->device, job_emit_clip_window)
2316 (cmd_buffer->state.job, &cmd_buffer->state.clip_window);
2317
2318 cmd_buffer->state.dirty &= ~V3DV_CMD_DIRTY_SCISSOR;
2319 }
2320
2321 static void
2322 update_gfx_uniform_state(struct v3dv_cmd_buffer *cmd_buffer,
2323 uint32_t dirty_uniform_state)
2324 {
2325 /* We need to update uniform streams if any piece of state that is passed
2326 * to the shader as a uniform may have changed.
2327 *
2328 * If only descriptor sets are dirty then we can safely ignore updates
2329 * for shader stages that don't access descriptors.
2330 */
2331
2332 struct v3dv_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
2333 assert(pipeline);
2334
2335 const bool has_new_pipeline = dirty_uniform_state & V3DV_CMD_DIRTY_PIPELINE;
2336 const bool has_new_viewport = dirty_uniform_state & V3DV_CMD_DIRTY_VIEWPORT;
2337 const bool has_new_push_constants = dirty_uniform_state & V3DV_CMD_DIRTY_PUSH_CONSTANTS;
2338 const bool has_new_descriptors = dirty_uniform_state & V3DV_CMD_DIRTY_DESCRIPTOR_SETS;
2339 const bool has_new_view_index = dirty_uniform_state & V3DV_CMD_DIRTY_VIEW_INDEX;
2340 const bool has_new_draw_id = dirty_uniform_state & V3DV_CMD_DIRTY_DRAW_ID;
2341
2342 /* VK_SHADER_STAGE_FRAGMENT_BIT */
2343 const bool has_new_descriptors_fs =
2344 has_new_descriptors &&
2345 (cmd_buffer->state.dirty_descriptor_stages & VK_SHADER_STAGE_FRAGMENT_BIT);
2346
2347 const bool has_new_push_constants_fs =
2348 has_new_push_constants &&
2349 (cmd_buffer->state.dirty_push_constants_stages & VK_SHADER_STAGE_FRAGMENT_BIT);
2350
2351 const bool needs_fs_update = has_new_pipeline ||
2352 has_new_view_index ||
2353 has_new_push_constants_fs ||
2354 has_new_descriptors_fs;
2355
2356 if (needs_fs_update) {
2357 struct v3dv_shader_variant *fs_variant =
2358 pipeline->shared_data->variants[BROADCOM_SHADER_FRAGMENT];
2359
2360 cmd_buffer->state.uniforms.fs =
2361 v3dv_write_uniforms(cmd_buffer, pipeline, fs_variant);
2362 }
2363
2364 /* VK_SHADER_STAGE_GEOMETRY_BIT */
2365 if (pipeline->has_gs) {
2366 const bool has_new_descriptors_gs =
2367 has_new_descriptors &&
2368 (cmd_buffer->state.dirty_descriptor_stages &
2369 VK_SHADER_STAGE_GEOMETRY_BIT);
2370
2371 const bool has_new_push_constants_gs =
2372 has_new_push_constants &&
2373 (cmd_buffer->state.dirty_push_constants_stages &
2374 VK_SHADER_STAGE_GEOMETRY_BIT);
2375
2376 const bool needs_gs_update = has_new_viewport ||
2377 has_new_view_index ||
2378 has_new_pipeline ||
2379 has_new_push_constants_gs ||
2380 has_new_descriptors_gs;
2381
2382 if (needs_gs_update) {
2383 struct v3dv_shader_variant *gs_variant =
2384 pipeline->shared_data->variants[BROADCOM_SHADER_GEOMETRY];
2385
2386 struct v3dv_shader_variant *gs_bin_variant =
2387 pipeline->shared_data->variants[BROADCOM_SHADER_GEOMETRY_BIN];
2388
2389 cmd_buffer->state.uniforms.gs =
2390 v3dv_write_uniforms(cmd_buffer, pipeline, gs_variant);
2391
2392 cmd_buffer->state.uniforms.gs_bin =
2393 v3dv_write_uniforms(cmd_buffer, pipeline, gs_bin_variant);
2394 }
2395 }
2396
2397 /* VK_SHADER_STAGE_VERTEX_BIT */
2398 const bool has_new_descriptors_vs =
2399 has_new_descriptors &&
2400 (cmd_buffer->state.dirty_descriptor_stages & VK_SHADER_STAGE_VERTEX_BIT);
2401
2402 const bool has_new_push_constants_vs =
2403 has_new_push_constants &&
2404 (cmd_buffer->state.dirty_push_constants_stages & VK_SHADER_STAGE_VERTEX_BIT);
2405
2406 const bool needs_vs_update = has_new_viewport ||
2407 has_new_view_index ||
2408 has_new_draw_id ||
2409 has_new_pipeline ||
2410 has_new_push_constants_vs ||
2411 has_new_descriptors_vs;
2412
2413 if (needs_vs_update) {
2414 struct v3dv_shader_variant *vs_variant =
2415 pipeline->shared_data->variants[BROADCOM_SHADER_VERTEX];
2416
2417 struct v3dv_shader_variant *vs_bin_variant =
2418 pipeline->shared_data->variants[BROADCOM_SHADER_VERTEX_BIN];
2419
2420 cmd_buffer->state.uniforms.vs =
2421 v3dv_write_uniforms(cmd_buffer, pipeline, vs_variant);
2422
2423 cmd_buffer->state.uniforms.vs_bin =
2424 v3dv_write_uniforms(cmd_buffer, pipeline, vs_bin_variant);
2425 }
2426
2427 cmd_buffer->state.dirty &= ~V3DV_CMD_DIRTY_VIEW_INDEX;
2428 cmd_buffer->state.dirty &= ~V3DV_CMD_DIRTY_DRAW_ID;
2429 }
2430
2431 /* This stores command buffer state that we might be about to stomp for
2432 * a meta operation.
2433 */
2434 void
2435 v3dv_cmd_buffer_meta_state_push(struct v3dv_cmd_buffer *cmd_buffer,
2436 bool push_descriptor_state)
2437 {
2438 struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
2439
2440 /* Attachment state.
2441 *
2442     * We store this state even if we are not currently in a subpass
2443     * (subpass_idx == -1) because we may get here to implement subpass
2444 * resolves via vkCmdResolveImage from
2445 * cmd_buffer_subpass_handle_pending_resolves. In that scenario we pretend
2446 * we are no longer in a subpass because Vulkan disallows image resolves
2447 * via vkCmdResolveImage during subpasses, but we still need to preserve
2448 * attachment state because we may have more subpasses to go through
2449     * after processing resolves in the current subpass.
2450 */
2451 const uint32_t attachment_state_item_size =
2452 sizeof(struct v3dv_cmd_buffer_attachment_state);
2453 const uint32_t attachment_state_total_size =
2454 attachment_state_item_size * state->attachment_alloc_count;
2455 if (state->meta.attachment_alloc_count < state->attachment_alloc_count) {
2456 if (state->meta.attachment_alloc_count > 0)
2457 vk_free(&cmd_buffer->device->vk.alloc, state->meta.attachments);
2458
2459 state->meta.attachments = vk_zalloc(&cmd_buffer->device->vk.alloc,
2460 attachment_state_total_size, 8,
2461 VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
2462 if (!state->meta.attachments) {
2463 v3dv_flag_oom(cmd_buffer, NULL);
2464 return;
2465 }
2466 state->meta.attachment_alloc_count = state->attachment_alloc_count;
2467 }
2468 state->meta.attachment_count = state->attachment_alloc_count;
2469 memcpy(state->meta.attachments, state->attachments,
2470 attachment_state_total_size);
2471
2472 if (state->subpass_idx != -1) {
2473 state->meta.subpass_idx = state->subpass_idx;
2474 state->meta.framebuffer = v3dv_framebuffer_to_handle(state->framebuffer);
2475 state->meta.pass = v3dv_render_pass_to_handle(state->pass);
2476
2477 state->meta.tile_aligned_render_area = state->tile_aligned_render_area;
2478 memcpy(&state->meta.render_area, &state->render_area, sizeof(VkRect2D));
2479 }
2480
2481 /* We expect that meta operations are graphics-only, so we only take into
2482    * account the graphics pipeline and the graphics state.
2483 */
2484 state->meta.gfx.pipeline = state->gfx.pipeline;
2485 memcpy(&state->meta.dynamic, &state->dynamic, sizeof(state->dynamic));
2486
2487 struct v3dv_descriptor_state *gfx_descriptor_state =
2488 &cmd_buffer->state.gfx.descriptor_state;
2489
2490 if (push_descriptor_state) {
2491 if (gfx_descriptor_state->valid != 0) {
2492 memcpy(&state->meta.gfx.descriptor_state, gfx_descriptor_state,
2493 sizeof(state->gfx.descriptor_state));
2494 }
2495 state->meta.has_descriptor_state = true;
2496 } else {
2497 state->meta.has_descriptor_state = false;
2498 }
2499
2500 if (cmd_buffer->state.push_constants_size > 0) {
2501 state->meta.push_constants_size = cmd_buffer->state.push_constants_size;
2502 memcpy(state->meta.push_constants, cmd_buffer->state.push_constants_data,
2503 cmd_buffer->state.push_constants_size);
2504 cmd_buffer->state.push_constants_size = 0;
2505 }
2506 }
2507
2508 /* This restores command buffer state after a meta operation
2509 */
2510 void
2511 v3dv_cmd_buffer_meta_state_pop(struct v3dv_cmd_buffer *cmd_buffer,
2512 bool needs_subpass_resume)
2513 {
2514 struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
2515
2516 /* Attachment state */
2517 assert(state->meta.attachment_count <= state->attachment_alloc_count);
2518 const uint32_t attachment_state_item_size =
2519 sizeof(struct v3dv_cmd_buffer_attachment_state);
2520 const uint32_t attachment_state_total_size =
2521 attachment_state_item_size * state->meta.attachment_count;
2522 memcpy(state->attachments, state->meta.attachments,
2523 attachment_state_total_size);
2524
2525 if (state->meta.subpass_idx != -1) {
2526 state->pass = v3dv_render_pass_from_handle(state->meta.pass);
2527 state->framebuffer = v3dv_framebuffer_from_handle(state->meta.framebuffer);
2528
2529 state->tile_aligned_render_area = state->meta.tile_aligned_render_area;
2530 memcpy(&state->render_area, &state->meta.render_area, sizeof(VkRect2D));
2531
2532      /* If needs_subpass_resume is true it means that we emitted the meta
2533 * operation in its own job (possibly with an RT config that is
2534 * incompatible with the current subpass), so resuming subpass execution
2535 * after it requires that we create a new job with the subpass RT setup.
2536 */
2537 if (needs_subpass_resume)
2538 v3dv_cmd_buffer_subpass_resume(cmd_buffer, state->meta.subpass_idx);
2539 } else {
2540 state->subpass_idx = -1;
2541 }
2542
2543 if (state->meta.gfx.pipeline != NULL) {
2544 struct v3dv_pipeline *pipeline = state->meta.gfx.pipeline;
2545 VkPipelineBindPoint pipeline_binding =
2546 v3dv_pipeline_get_binding_point(pipeline);
2547 v3dv_CmdBindPipeline(v3dv_cmd_buffer_to_handle(cmd_buffer),
2548 pipeline_binding,
2549 v3dv_pipeline_to_handle(state->meta.gfx.pipeline));
2550 } else {
2551 state->gfx.pipeline = NULL;
2552 }
2553
2554 /* Restore dynamic state */
2555 memcpy(&state->dynamic, &state->meta.dynamic, sizeof(state->dynamic));
2556 state->dirty = ~0;
2557
2558 if (state->meta.has_descriptor_state) {
2559 if (state->meta.gfx.descriptor_state.valid != 0) {
2560 memcpy(&state->gfx.descriptor_state, &state->meta.gfx.descriptor_state,
2561 sizeof(state->gfx.descriptor_state));
2562 } else {
2563 state->gfx.descriptor_state.valid = 0;
2564 }
2565 }
2566
2567 /* We only need to restore push constant data if we had any data in the
2568 * original command buffer and the meta operation wrote new push constant
2569 * data.
2570 */
2571 if (state->meta.push_constants_size > 0 &&
2572 cmd_buffer->state.push_constants_size > 0) {
2573 memcpy(cmd_buffer->state.push_constants_data, state->meta.push_constants,
2574 state->meta.push_constants_size);
2575 }
2576 cmd_buffer->state.push_constants_size = state->meta.push_constants_size;
2577
2578 state->meta.gfx.pipeline = NULL;
2579 state->meta.framebuffer = VK_NULL_HANDLE;
2580 state->meta.pass = VK_NULL_HANDLE;
2581 state->meta.subpass_idx = -1;
2582 state->meta.has_descriptor_state = false;
2583 state->meta.push_constants_size = 0;
2584 }
2585
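/* If the current job is flagged 'always_flush' and already contains draw
 * calls, starts a new job that continues the current subpass; otherwise
 * returns the current job.
 */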
2586 static struct v3dv_job *
2587 cmd_buffer_pre_draw_split_job(struct v3dv_cmd_buffer *cmd_buffer)
2588 {
2589 struct v3dv_job *job = cmd_buffer->state.job;
2590 assert(job);
2591
2592 /* If the job has been flagged with 'always_flush' and it has already
2593 * recorded any draw calls then we need to start a new job for it.
2594 */
2595 if (job->always_flush && job->draw_count > 0) {
2596 assert(cmd_buffer->state.pass);
2597 /* First, flag the current job as not being the last in the
2598 * current subpass
2599 */
2600 job->is_subpass_finish = false;
2601
2602 /* Now start a new job in the same subpass and flag it as continuing
2603 * the current subpass.
2604 */
2605 job = v3dv_cmd_buffer_subpass_resume(cmd_buffer,
2606 cmd_buffer->state.subpass_idx);
2607 assert(job->draw_count == 0);
2608
2609 /* Inherit the 'always flush' behavior */
2610 job->always_flush = true;
2611 }
2612
2613 assert(job->draw_count == 0 || !job->always_flush);
2614 return job;
2615 }
2616
2617 /**
2618 * The Vulkan spec states:
2619 *
2620 * "It is legal for a subpass to use no color or depth/stencil
2621 * attachments (...) This kind of subpass can use shader side effects such
2622 * as image stores and atomics to produce an output. In this case, the
2623 * subpass continues to use the width, height, and layers of the framebuffer
2624 * to define the dimensions of the rendering area, and the
2625 * rasterizationSamples from each pipeline’s
2626 * VkPipelineMultisampleStateCreateInfo to define the number of samples used
2627 * in rasterization."
2628 *
2629 * We need to enable MSAA in the TILE_BINNING_MODE_CFG packet, which we
2630 * emit when we start a new frame at the beginning of a subpass. At that point,
2631 * if the framebuffer doesn't have any attachments we won't enable MSAA and
2632 * the job won't be valid in the scenario described by the spec.
2633 *
2634 * This function is intended to be called before a draw call and will test if
2635 * we are in that scenario, in which case, it will restart the current job
2636 * with MSAA enabled.
2637 */
2638 static void
2639 cmd_buffer_restart_job_for_msaa_if_needed(struct v3dv_cmd_buffer *cmd_buffer)
2640 {
2641 assert(cmd_buffer->state.job);
2642
2643 /* We don't support variableMultisampleRate so we know that all pipelines
2644 * bound in the same subpass must have matching number of samples, so we
2645 * can do this check only on the first draw call.
2646 */
2647 if (cmd_buffer->state.job->draw_count > 0)
2648 return;
2649
2650 /* We only need to restart the frame if the pipeline requires MSAA but
2651 * our frame tiling didn't enable it.
2652 */
2653 if (!cmd_buffer->state.gfx.pipeline->msaa ||
2654 cmd_buffer->state.job->frame_tiling.msaa) {
2655 return;
2656 }
2657
2658 /* FIXME: Secondary command buffers don't start frames. Instead, they are
2659 * recorded into primary jobs that start them. For secondaries, we should
2660 * still handle this scenario, but we should do that when we record them
2661 * into primaries by testing if any of the secondaries has multisampled
2662 * draw calls in them, and then using that info to decide if we need to
2663 * restart the primary job into which they are being recorded.
2664 */
2665 if (cmd_buffer->vk.level != VK_COMMAND_BUFFER_LEVEL_PRIMARY)
2666 return;
2667
2668 /* Drop the current job and restart it with MSAA enabled */
2669 struct v3dv_job *old_job = cmd_buffer->state.job;
2670 cmd_buffer->state.job = NULL;
2671
2672 struct v3dv_job *job = vk_zalloc(&cmd_buffer->device->vk.alloc,
2673 sizeof(struct v3dv_job), 8,
2674 VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
2675 if (!job) {
2676 v3dv_flag_oom(cmd_buffer, NULL);
2677 return;
2678 }
2679
2680 v3dv_job_init(job, V3DV_JOB_TYPE_GPU_CL, cmd_buffer->device, cmd_buffer,
2681 cmd_buffer->state.subpass_idx);
2682 cmd_buffer->state.job = job;
2683
2684 v3dv_job_start_frame(job,
2685 old_job->frame_tiling.width,
2686 old_job->frame_tiling.height,
2687 old_job->frame_tiling.layers,
2688 true, false,
2689 old_job->frame_tiling.render_target_count,
2690 old_job->frame_tiling.internal_bpp,
2691 old_job->frame_tiling.total_color_bpp,
2692 true /* msaa */);
2693
2694 v3dv_job_destroy(old_job);
2695 }
2696
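/* Checks whether the binning stage of the next draw may access (through
 * vertex attributes, index/indirect buffers, descriptors or image load/store)
 * any resource covered by a pending BCL barrier, in which case the job needs
 * to sync at the binning stage.
 */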
2697 static bool
2698 cmd_buffer_binning_sync_required(struct v3dv_cmd_buffer *cmd_buffer,
2699 struct v3dv_pipeline *pipeline,
2700 bool indexed, bool indirect)
2701 {
2702 const struct v3dv_descriptor_maps *vs_bin_maps =
2703 pipeline->shared_data->maps[BROADCOM_SHADER_VERTEX_BIN];
2704
2705 const struct v3dv_descriptor_maps *gs_bin_maps =
2706 pipeline->shared_data->maps[BROADCOM_SHADER_GEOMETRY_BIN];
2707
2708 VkAccessFlags buffer_access =
2709 cmd_buffer->state.barrier.bcl_buffer_access;
2710 if (buffer_access) {
2711 /* Index buffer read */
2712 if (indexed && (buffer_access & (VK_ACCESS_2_INDEX_READ_BIT |
2713 VK_ACCESS_2_MEMORY_READ_BIT))) {
2714 return true;
2715 }
2716
2717 /* Indirect buffer read */
2718 if (indirect && (buffer_access & (VK_ACCESS_2_INDIRECT_COMMAND_READ_BIT |
2719 VK_ACCESS_2_MEMORY_READ_BIT))) {
2720 return true;
2721 }
2722
2723 /* Attribute read */
2724 if (buffer_access & (VK_ACCESS_2_VERTEX_ATTRIBUTE_READ_BIT |
2725 VK_ACCESS_2_MEMORY_READ_BIT)) {
2726 const struct v3d_vs_prog_data *prog_data =
2727 pipeline->shared_data->variants[BROADCOM_SHADER_VERTEX_BIN]->prog_data.vs;
2728
2729 for (int i = 0; i < ARRAY_SIZE(prog_data->vattr_sizes); i++) {
2730 if (prog_data->vattr_sizes[i] > 0)
2731 return true;
2732 }
2733 }
2734
2735 /* UBO / SSBO read */
2736 if (buffer_access & (VK_ACCESS_2_UNIFORM_READ_BIT |
2737 VK_ACCESS_2_SHADER_READ_BIT |
2738 VK_ACCESS_2_MEMORY_READ_BIT |
2739 VK_ACCESS_2_SHADER_STORAGE_READ_BIT)) {
2740
2741 if (vs_bin_maps->ubo_map.num_desc > 0 ||
2742 vs_bin_maps->ssbo_map.num_desc > 0) {
2743 return true;
2744 }
2745
2746 if (gs_bin_maps && (gs_bin_maps->ubo_map.num_desc > 0 ||
2747 gs_bin_maps->ssbo_map.num_desc > 0)) {
2748 return true;
2749 }
2750 }
2751
2752 /* SSBO write */
2753 if (buffer_access & (VK_ACCESS_2_SHADER_WRITE_BIT |
2754 VK_ACCESS_2_MEMORY_WRITE_BIT |
2755 VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT)) {
2756 if (vs_bin_maps->ssbo_map.num_desc > 0)
2757 return true;
2758
2759 if (gs_bin_maps && gs_bin_maps->ssbo_map.num_desc > 0)
2760 return true;
2761 }
2762
2763 /* Texel Buffer read */
2764 if (buffer_access & (VK_ACCESS_2_SHADER_SAMPLED_READ_BIT |
2765 VK_ACCESS_2_MEMORY_READ_BIT)) {
2766 if (vs_bin_maps->texture_map.num_desc > 0)
2767 return true;
2768
2769 if (gs_bin_maps && gs_bin_maps->texture_map.num_desc > 0)
2770 return true;
2771 }
2772 }
2773
2774 VkAccessFlags image_access =
2775 cmd_buffer->state.barrier.bcl_image_access;
2776 if (image_access) {
2777 /* Image load / store */
2778 if (image_access & (VK_ACCESS_2_SHADER_READ_BIT |
2779 VK_ACCESS_2_SHADER_WRITE_BIT |
2780 VK_ACCESS_2_SHADER_SAMPLED_READ_BIT |
2781 VK_ACCESS_2_SHADER_STORAGE_READ_BIT |
2782 VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT |
2783 VK_ACCESS_2_MEMORY_READ_BIT |
2784 VK_ACCESS_2_MEMORY_WRITE_BIT)) {
2785 if (vs_bin_maps->texture_map.num_desc > 0 ||
2786 vs_bin_maps->sampler_map.num_desc > 0) {
2787 return true;
2788 }
2789
2790 if (gs_bin_maps && (gs_bin_maps->texture_map.num_desc > 0 ||
2791 gs_bin_maps->sampler_map.num_desc > 0)) {
2792 return true;
2793 }
2794 }
2795 }
2796
2797 return false;
2798 }
2799
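/* Marks the job as requiring binning-stage sync and clears the pending BCL
 * access masks, which are now considered consumed by this job.
 */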
2800 void
2801 v3dv_cmd_buffer_consume_bcl_sync(struct v3dv_cmd_buffer *cmd_buffer,
2802 struct v3dv_job *job)
2803 {
2804 job->needs_bcl_sync = true;
2805 cmd_buffer->state.barrier.bcl_buffer_access = 0;
2806 cmd_buffer->state.barrier.bcl_image_access = 0;
2807 }
2808
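/* Rough cost estimate for a shader variant: its QPU instruction count plus a
 * 4x weight for each TMU operation (general TMU accesses, spills and fills).
 */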
2809 static inline uint32_t
2810 compute_prog_score(struct v3dv_shader_variant *vs)
2811 {
2812 const uint32_t inst_count = vs->qpu_insts_size / sizeof(uint64_t);
2813 const uint32_t tmu_count = vs->prog_data.base->tmu_count +
2814 vs->prog_data.base->tmu_spills +
2815 vs->prog_data.base->tmu_fills;
2816 return inst_count + 4 * tmu_count;
2817 }
2818
2819 static void
2820 job_update_double_buffer_score(struct v3dv_job *job,
2821 struct v3dv_pipeline *pipeline,
2822 uint32_t vertex_count,
2823 VkExtent2D *render_area)
2824 {
2825 /* FIXME: assume anything with GS workloads is too expensive */
2826 struct v3dv_shader_variant *gs_bin =
2827 pipeline->shared_data->variants[BROADCOM_SHADER_GEOMETRY_BIN];
2828 if (gs_bin) {
2829 job->can_use_double_buffer = false;
2830 return;
2831 }
2832
2833 /* Keep track of vertex processing: too much geometry processing would not
2834 * be good for double-buffer.
2835 */
2836 struct v3dv_shader_variant *vs_bin =
2837 pipeline->shared_data->variants[BROADCOM_SHADER_VERTEX_BIN];
2838 assert(vs_bin);
2839 uint32_t geom_score = vertex_count * compute_prog_score(vs_bin);
2840
2841 struct v3dv_shader_variant *vs =
2842 pipeline->shared_data->variants[BROADCOM_SHADER_VERTEX];
2843 assert(vs);
2844 uint32_t vs_score = vertex_count * compute_prog_score(vs);
2845 geom_score += vs_score;
2846
2847 job->double_buffer_score.geom += geom_score;
2848
2849 /* Compute pixel rendering cost.
2850 *
2851 * We estimate that on average a draw would render 0.2% of the pixels in
2852 * the render area. That would be a 64x64 region in a 1920x1080 area.
2853 */
2854 struct v3dv_shader_variant *fs =
2855 pipeline->shared_data->variants[BROADCOM_SHADER_FRAGMENT];
2856 assert(fs);
2857 uint32_t pixel_count = 0.002f * render_area->width * render_area->height;
2858 uint32_t render_score = vs_score + pixel_count * compute_prog_score(fs);
2859
2860 job->double_buffer_score.render += render_score;
2861 }
2862
2863 void
2864 v3dv_cmd_buffer_emit_pre_draw(struct v3dv_cmd_buffer *cmd_buffer,
2865 bool indexed, bool indirect,
2866 uint32_t vertex_count)
2867 {
2868 assert(cmd_buffer->state.gfx.pipeline);
2869 assert(!(cmd_buffer->state.gfx.pipeline->active_stages & VK_SHADER_STAGE_COMPUTE_BIT));
2870
2871 /* If we emitted a pipeline barrier right before this draw we won't have
2872 * an active job. In that case, create a new job continuing the current
2873 * subpass.
2874 */
2875 if (!cmd_buffer->state.job) {
2876 v3dv_cmd_buffer_subpass_resume(cmd_buffer,
2877 cmd_buffer->state.subpass_idx);
2878 }
2879
2880 /* Restart single sample job for MSAA pipeline if needed */
2881 cmd_buffer_restart_job_for_msaa_if_needed(cmd_buffer);
2882
2883 /* If the job is configured to flush on every draw call we need to create
2884 * a new job now.
2885 */
2886 struct v3dv_job *job = cmd_buffer_pre_draw_split_job(cmd_buffer);
2887 job->draw_count++;
2888
2889 /* Track VK_KHR_buffer_device_address usage in the job */
2890 struct v3dv_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
2891 job->uses_buffer_device_address |= pipeline->uses_buffer_device_address;
2892
2893 /* If this job is serialized (has consumed a barrier) then check if we need
2894 * to sync at the binning stage by testing if the binning shaders involved
2895 * with the draw call require access to external resources.
2896 */
2897 if (job->serialize && (cmd_buffer->state.barrier.bcl_buffer_access ||
2898 cmd_buffer->state.barrier.bcl_image_access)) {
2899 assert(!job->needs_bcl_sync);
2900 if (cmd_buffer_binning_sync_required(cmd_buffer, pipeline,
2901 indexed, indirect)) {
2902 v3dv_cmd_buffer_consume_bcl_sync(cmd_buffer, job);
2903 }
2904 }
2905
2906 /* GL shader state binds shaders, uniform and vertex attribute state. The
2907 * compiler injects uniforms to handle some descriptor types (such as
2908 * textures), so we need to regen that when descriptor state changes.
2909 *
2910 * We also need to emit new shader state if we have a dirty viewport since
2911    * that will require that we emit new uniform state for QUNIFORM_VIEWPORT_*.
2912 */
2913 uint32_t *dirty = &cmd_buffer->state.dirty;
2914
2915 const uint32_t dirty_uniform_state =
2916 *dirty & (V3DV_CMD_DIRTY_PIPELINE |
2917 V3DV_CMD_DIRTY_PUSH_CONSTANTS |
2918 V3DV_CMD_DIRTY_DESCRIPTOR_SETS |
2919 V3DV_CMD_DIRTY_VIEWPORT |
2920 V3DV_CMD_DIRTY_VIEW_INDEX |
2921 V3DV_CMD_DIRTY_DRAW_ID);
2922
2923 if (dirty_uniform_state)
2924 update_gfx_uniform_state(cmd_buffer, dirty_uniform_state);
2925
2926 struct v3dv_device *device = cmd_buffer->device;
2927
2928 if (dirty_uniform_state || (*dirty & V3DV_CMD_DIRTY_VERTEX_BUFFER))
2929 v3dv_X(device, cmd_buffer_emit_gl_shader_state)(cmd_buffer);
2930
2931 if (*dirty & (V3DV_CMD_DIRTY_PIPELINE)) {
2932 v3dv_X(device, cmd_buffer_emit_configuration_bits)(cmd_buffer);
2933 v3dv_X(device, cmd_buffer_emit_varyings_state)(cmd_buffer);
2934 }
2935
2936 if (*dirty & (V3DV_CMD_DIRTY_VIEWPORT | V3DV_CMD_DIRTY_SCISSOR)) {
2937 emit_scissor(cmd_buffer);
2938 }
2939
2940 if (*dirty & V3DV_CMD_DIRTY_VIEWPORT) {
2941 v3dv_X(device, cmd_buffer_emit_viewport)(cmd_buffer);
2942 }
2943
2944 if (*dirty & V3DV_CMD_DIRTY_INDEX_BUFFER)
2945 v3dv_X(device, cmd_buffer_emit_index_buffer)(cmd_buffer);
2946
2947 const uint32_t dynamic_stencil_dirty_flags =
2948 V3DV_CMD_DIRTY_STENCIL_COMPARE_MASK |
2949 V3DV_CMD_DIRTY_STENCIL_WRITE_MASK |
2950 V3DV_CMD_DIRTY_STENCIL_REFERENCE;
2951 if (*dirty & (V3DV_CMD_DIRTY_PIPELINE | dynamic_stencil_dirty_flags))
2952 v3dv_X(device, cmd_buffer_emit_stencil)(cmd_buffer);
2953
2954 if (*dirty & (V3DV_CMD_DIRTY_PIPELINE | V3DV_CMD_DIRTY_DEPTH_BIAS))
2955 v3dv_X(device, cmd_buffer_emit_depth_bias)(cmd_buffer);
2956
2957 if (*dirty & V3DV_CMD_DIRTY_DEPTH_BOUNDS)
2958 v3dv_X(device, cmd_buffer_emit_depth_bounds)(cmd_buffer);
2959
2960 if (*dirty & (V3DV_CMD_DIRTY_PIPELINE | V3DV_CMD_DIRTY_BLEND_CONSTANTS))
2961 v3dv_X(device, cmd_buffer_emit_blend)(cmd_buffer);
2962
2963 if (*dirty & V3DV_CMD_DIRTY_OCCLUSION_QUERY)
2964 v3dv_X(device, cmd_buffer_emit_occlusion_query)(cmd_buffer);
2965
2966 if (*dirty & V3DV_CMD_DIRTY_LINE_WIDTH)
2967 v3dv_X(device, cmd_buffer_emit_line_width)(cmd_buffer);
2968
2969 if (*dirty & V3DV_CMD_DIRTY_PIPELINE)
2970 v3dv_X(device, cmd_buffer_emit_sample_state)(cmd_buffer);
2971
2972 if (*dirty & (V3DV_CMD_DIRTY_PIPELINE | V3DV_CMD_DIRTY_COLOR_WRITE_ENABLE))
2973 v3dv_X(device, cmd_buffer_emit_color_write_mask)(cmd_buffer);
2974
2975 /* We disable double-buffer mode if indirect draws are used because in that
2976 * case we don't know the vertex count.
2977 */
2978 if (indirect) {
2979 job->can_use_double_buffer = false;
2980 } else if (job->can_use_double_buffer) {
2981 job_update_double_buffer_score(job, pipeline, vertex_count,
2982 &cmd_buffer->state.render_area.extent);
2983 }
2984
2985 cmd_buffer->state.dirty &= ~V3DV_CMD_DIRTY_PIPELINE;
2986 }
2987
2988 static inline void
2989 cmd_buffer_set_view_index(struct v3dv_cmd_buffer *cmd_buffer,
2990 uint32_t view_index)
2991 {
2992 cmd_buffer->state.view_index = view_index;
2993 cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_VIEW_INDEX;
2994 }
2995
2996 static void
2997 cmd_buffer_draw(struct v3dv_cmd_buffer *cmd_buffer,
2998 struct v3dv_draw_info *info)
2999 {
3000 uint32_t vertex_count =
3001 info->vertex_count * info->instance_count;
3002
3003 struct v3dv_render_pass *pass = cmd_buffer->state.pass;
3004 if (likely(!pass->multiview_enabled)) {
3005 v3dv_cmd_buffer_emit_pre_draw(cmd_buffer, false, false, vertex_count);
3006 v3dv_X(cmd_buffer->device, cmd_buffer_emit_draw)(cmd_buffer, info);
3007 return;
3008 }
3009
3010 uint32_t view_mask = pass->subpasses[cmd_buffer->state.subpass_idx].view_mask;
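   /* Illustrative example: with a subpass view_mask of 0b101, u_bit_scan()
    * pops the lowest set bit on each iteration, so the loop below records the
    * same draw twice, first with view index 0 and then with view index 2.
    */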
3011 while (view_mask) {
3012 cmd_buffer_set_view_index(cmd_buffer, u_bit_scan(&view_mask));
3013 v3dv_cmd_buffer_emit_pre_draw(cmd_buffer, false, false, vertex_count);
3014 v3dv_X(cmd_buffer->device, cmd_buffer_emit_draw)(cmd_buffer, info);
3015 }
3016 }
3017
3018 VKAPI_ATTR void VKAPI_CALL
3019 v3dv_CmdDraw(VkCommandBuffer commandBuffer,
3020 uint32_t vertexCount,
3021 uint32_t instanceCount,
3022 uint32_t firstVertex,
3023 uint32_t firstInstance)
3024 {
3025 if (vertexCount == 0 || instanceCount == 0)
3026 return;
3027
3028 V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
3029 struct v3dv_draw_info info = {};
3030 info.vertex_count = vertexCount;
3031 info.instance_count = instanceCount;
3032 info.first_instance = firstInstance;
3033 info.first_vertex = firstVertex;
3034
3035 cmd_buffer_draw(cmd_buffer, &info);
3036 }
3037
3038 VKAPI_ATTR void VKAPI_CALL
3039 v3dv_CmdDrawMultiEXT(VkCommandBuffer commandBuffer,
3040 uint32_t drawCount,
3041 const VkMultiDrawInfoEXT *pVertexInfo,
3042 uint32_t instanceCount,
3043 uint32_t firstInstance,
3044 uint32_t stride)
3045
3046 {
3047 if (drawCount == 0 || instanceCount == 0)
3048 return;
3049
3050 V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
3051
3052 uint32_t i = 0;
3053 vk_foreach_multi_draw(draw, i, pVertexInfo, drawCount, stride) {
3054 cmd_buffer->state.draw_id = i;
3055 cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_DRAW_ID;
3056
3057 struct v3dv_draw_info info = {};
3058 info.vertex_count = draw->vertexCount;
3059 info.instance_count = instanceCount;
3060 info.first_instance = firstInstance;
3061 info.first_vertex = draw->firstVertex;
3062
3063 cmd_buffer_draw(cmd_buffer, &info);
3064 }
3065 }
3066
3067 VKAPI_ATTR void VKAPI_CALL
3068 v3dv_CmdDrawIndexed(VkCommandBuffer commandBuffer,
3069 uint32_t indexCount,
3070 uint32_t instanceCount,
3071 uint32_t firstIndex,
3072 int32_t vertexOffset,
3073 uint32_t firstInstance)
3074 {
3075 if (indexCount == 0 || instanceCount == 0)
3076 return;
3077
3078 V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
3079
3080 uint32_t vertex_count = indexCount * instanceCount;
3081
3082 struct v3dv_render_pass *pass = cmd_buffer->state.pass;
3083 if (likely(!pass->multiview_enabled)) {
3084 v3dv_cmd_buffer_emit_pre_draw(cmd_buffer, true, false, vertex_count);
3085 v3dv_X(cmd_buffer->device, cmd_buffer_emit_draw_indexed)
3086 (cmd_buffer, indexCount, instanceCount,
3087 firstIndex, vertexOffset, firstInstance);
3088 return;
3089 }
3090
3091 uint32_t view_mask = pass->subpasses[cmd_buffer->state.subpass_idx].view_mask;
3092 while (view_mask) {
3093 cmd_buffer_set_view_index(cmd_buffer, u_bit_scan(&view_mask));
3094 v3dv_cmd_buffer_emit_pre_draw(cmd_buffer, true, false, vertex_count);
3095 v3dv_X(cmd_buffer->device, cmd_buffer_emit_draw_indexed)
3096 (cmd_buffer, indexCount, instanceCount,
3097 firstIndex, vertexOffset, firstInstance);
3098 }
3099 }
3100
3101 VKAPI_ATTR void VKAPI_CALL
3102 v3dv_CmdDrawMultiIndexedEXT(VkCommandBuffer commandBuffer,
3103 uint32_t drawCount,
3104 const VkMultiDrawIndexedInfoEXT *pIndexInfo,
3105 uint32_t instanceCount,
3106 uint32_t firstInstance,
3107 uint32_t stride,
3108 const int32_t *pVertexOffset)
3109 {
3110 if (drawCount == 0 || instanceCount == 0)
3111 return;
3112
3113 V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
3114
3115 uint32_t i = 0;
3116 vk_foreach_multi_draw_indexed(draw, i, pIndexInfo, drawCount, stride) {
3117 uint32_t vertex_count = draw->indexCount * instanceCount;
3118 int32_t vertexOffset = pVertexOffset ? *pVertexOffset : draw->vertexOffset;
3119
3120 cmd_buffer->state.draw_id = i;
3121 cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_DRAW_ID;
3122
3123 struct v3dv_render_pass *pass = cmd_buffer->state.pass;
3124 if (likely(!pass->multiview_enabled)) {
3125 v3dv_cmd_buffer_emit_pre_draw(cmd_buffer, true, false, vertex_count);
3126 v3dv_X(cmd_buffer->device, cmd_buffer_emit_draw_indexed)
3127 (cmd_buffer, draw->indexCount, instanceCount,
3128 draw->firstIndex, vertexOffset, firstInstance);
3129 continue;
3130 }
3131
3132 uint32_t view_mask = pass->subpasses[cmd_buffer->state.subpass_idx].view_mask;
3133 while (view_mask) {
3134 cmd_buffer_set_view_index(cmd_buffer, u_bit_scan(&view_mask));
3135 v3dv_cmd_buffer_emit_pre_draw(cmd_buffer, true, false, vertex_count);
3136 v3dv_X(cmd_buffer->device, cmd_buffer_emit_draw_indexed)
3137 (cmd_buffer, draw->indexCount, instanceCount,
3138 draw->firstIndex, vertexOffset, firstInstance);
3139 }
3140 }
3141 }
3142
3143 VKAPI_ATTR void VKAPI_CALL
3144 v3dv_CmdDrawIndirect(VkCommandBuffer commandBuffer,
3145 VkBuffer _buffer,
3146 VkDeviceSize offset,
3147 uint32_t drawCount,
3148 uint32_t stride)
3149 {
3150 /* drawCount is the number of draws to execute, and can be zero. */
3151 if (drawCount == 0)
3152 return;
3153
3154 V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
3155 V3DV_FROM_HANDLE(v3dv_buffer, buffer, _buffer);
3156
3157 struct v3dv_render_pass *pass = cmd_buffer->state.pass;
3158 if (likely(!pass->multiview_enabled)) {
3159 v3dv_cmd_buffer_emit_pre_draw(cmd_buffer, false, true, 0);
3160 v3dv_X(cmd_buffer->device, cmd_buffer_emit_draw_indirect)
3161 (cmd_buffer, buffer, offset, drawCount, stride);
3162 return;
3163 }
3164
3165 uint32_t view_mask = pass->subpasses[cmd_buffer->state.subpass_idx].view_mask;
3166 while (view_mask) {
3167 cmd_buffer_set_view_index(cmd_buffer, u_bit_scan(&view_mask));
3168 v3dv_cmd_buffer_emit_pre_draw(cmd_buffer, false, true, 0);
3169 v3dv_X(cmd_buffer->device, cmd_buffer_emit_draw_indirect)
3170 (cmd_buffer, buffer, offset, drawCount, stride);
3171 }
3172 }
3173
3174 VKAPI_ATTR void VKAPI_CALL
3175 v3dv_CmdDrawIndexedIndirect(VkCommandBuffer commandBuffer,
3176 VkBuffer _buffer,
3177 VkDeviceSize offset,
3178 uint32_t drawCount,
3179 uint32_t stride)
3180 {
3181 /* drawCount is the number of draws to execute, and can be zero. */
3182 if (drawCount == 0)
3183 return;
3184
3185 V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
3186 V3DV_FROM_HANDLE(v3dv_buffer, buffer, _buffer);
3187
3188 struct v3dv_render_pass *pass = cmd_buffer->state.pass;
3189 if (likely(!pass->multiview_enabled)) {
3190 v3dv_cmd_buffer_emit_pre_draw(cmd_buffer, true, true, 0);
3191 v3dv_X(cmd_buffer->device, cmd_buffer_emit_indexed_indirect)
3192 (cmd_buffer, buffer, offset, drawCount, stride);
3193 return;
3194 }
3195
3196 uint32_t view_mask = pass->subpasses[cmd_buffer->state.subpass_idx].view_mask;
3197 while (view_mask) {
3198 cmd_buffer_set_view_index(cmd_buffer, u_bit_scan(&view_mask));
3199 v3dv_cmd_buffer_emit_pre_draw(cmd_buffer, true, true, 0);
3200 v3dv_X(cmd_buffer->device, cmd_buffer_emit_indexed_indirect)
3201 (cmd_buffer, buffer, offset, drawCount, stride);
3202 }
3203 }
3204
3205 static void
3206 handle_barrier(VkPipelineStageFlags2 srcStageMask, VkAccessFlags2 srcAccessMask,
3207 VkPipelineStageFlags2 dstStageMask, VkAccessFlags2 dstAccessMask,
3208 bool is_image_barrier, bool is_buffer_barrier,
3209 struct v3dv_barrier_state *state)
3210 {
3211 /* We only care about barriers between GPU jobs */
3212 if (srcStageMask == VK_PIPELINE_STAGE_2_HOST_BIT ||
3213 dstStageMask == VK_PIPELINE_STAGE_2_HOST_BIT) {
3214 return;
3215 }
3216
3217 /* Track source of the barrier */
3218 uint8_t src_mask = 0;
3219
3220 const VkPipelineStageFlags2 compute_mask =
3221 VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT;
3222 if (srcStageMask & (compute_mask | VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT))
3223 src_mask |= V3DV_BARRIER_COMPUTE_BIT;
3224
3225 const VkPipelineStageFlags2 transfer_mask =
3226 VK_PIPELINE_STAGE_2_ALL_TRANSFER_BIT |
3227 VK_PIPELINE_STAGE_2_COPY_BIT |
3228 VK_PIPELINE_STAGE_2_BLIT_BIT |
3229 VK_PIPELINE_STAGE_2_CLEAR_BIT;
3230 if (srcStageMask & (transfer_mask | VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT))
3231 src_mask |= V3DV_BARRIER_TRANSFER_BIT;
3232
3233 const VkPipelineStageFlags2 graphics_mask = ~(compute_mask | transfer_mask);
3234 if (srcStageMask & (graphics_mask | VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT))
3235 src_mask |= V3DV_BARRIER_GRAPHICS_BIT;
3236
3237 /* Track consumer of the barrier */
3238 if (dstStageMask & (compute_mask | VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT)) {
3239 state->dst_mask |= V3DV_BARRIER_COMPUTE_BIT;
3240 state->src_mask_compute |= src_mask;
3241 }
3242
3243 if (dstStageMask & (transfer_mask | VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT)) {
3244 state->dst_mask |= V3DV_BARRIER_TRANSFER_BIT;
3245 state->src_mask_transfer |= src_mask;
3246 }
3247
3248 if (dstStageMask & (graphics_mask | VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT)) {
3249 state->dst_mask |= V3DV_BARRIER_GRAPHICS_BIT;
3250 state->src_mask_graphics |= src_mask;
3251
3252 if (dstStageMask & (VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT |
3253 VK_PIPELINE_STAGE_2_INDEX_INPUT_BIT |
3254 VK_PIPELINE_STAGE_2_VERTEX_INPUT_BIT |
3255 VK_PIPELINE_STAGE_2_VERTEX_ATTRIBUTE_INPUT_BIT |
3256 VK_PIPELINE_STAGE_2_VERTEX_SHADER_BIT |
3257 VK_PIPELINE_STAGE_2_GEOMETRY_SHADER_BIT |
3258 VK_PIPELINE_STAGE_2_TESSELLATION_CONTROL_SHADER_BIT |
3259 VK_PIPELINE_STAGE_2_TESSELLATION_EVALUATION_SHADER_BIT |
3260 VK_PIPELINE_STAGE_2_DRAW_INDIRECT_BIT |
3261 VK_PIPELINE_STAGE_2_PRE_RASTERIZATION_SHADERS_BIT |
3262 VK_PIPELINE_STAGE_2_ALL_GRAPHICS_BIT |
3263 VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT)) {
3264 if (is_image_barrier)
3265 state->bcl_image_access |= dstAccessMask;
3266
3267 if (is_buffer_barrier)
3268 state->bcl_buffer_access |= dstAccessMask;
3269 }
3270 }
3271 }
3272
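/* Worked example of the classification done by handle_barrier() above
 * (illustrative only): a dependency with
 * srcStageMask = VK_PIPELINE_STAGE_2_COLOR_ATTACHMENT_OUTPUT_BIT and
 * dstStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT matches neither the
 * compute nor the transfer source masks, so src_mask ends up as
 * V3DV_BARRIER_GRAPHICS_BIT; on the consumer side only the compute check
 * matches, so we set state->dst_mask |= V3DV_BARRIER_COMPUTE_BIT and
 * state->src_mask_compute |= V3DV_BARRIER_GRAPHICS_BIT, i.e. "compute jobs
 * must wait on previous graphics jobs".
 */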
3273 void
3274 v3dv_cmd_buffer_emit_pipeline_barrier(struct v3dv_cmd_buffer *cmd_buffer,
3275 const VkDependencyInfo *info)
3276 {
3277 uint32_t imageBarrierCount = info->imageMemoryBarrierCount;
3278 const VkImageMemoryBarrier2 *pImageBarriers = info->pImageMemoryBarriers;
3279
3280 uint32_t bufferBarrierCount = info->bufferMemoryBarrierCount;
3281 const VkBufferMemoryBarrier2 *pBufferBarriers = info->pBufferMemoryBarriers;
3282
3283 uint32_t memoryBarrierCount = info->memoryBarrierCount;
3284 const VkMemoryBarrier2 *pMemoryBarriers = info->pMemoryBarriers;
3285
3286 struct v3dv_barrier_state state = { 0 };
3287 for (uint32_t i = 0; i < imageBarrierCount; i++) {
3288 /* We can safely skip barriers for image layout transitions from UNDEFINED
3289 * layout.
3290 *
3291 * Notice that KHR_synchronization2 allows specifying barriers that don't
3292 * involve a layout transition by making oldLayout and newLayout the same,
3293 * including UNDEFINED.
3294 */
3295 if (pImageBarriers[i].oldLayout == VK_IMAGE_LAYOUT_UNDEFINED &&
3296 pImageBarriers[i].oldLayout != pImageBarriers[i].newLayout) {
3297 continue;
3298 }
3299
3300 handle_barrier(pImageBarriers[i].srcStageMask,
3301 pImageBarriers[i].srcAccessMask,
3302 pImageBarriers[i].dstStageMask,
3303 pImageBarriers[i].dstAccessMask,
3304 true, false, &state);
3305 }
3306
3307 for (uint32_t i = 0; i < bufferBarrierCount; i++) {
3308 handle_barrier(pBufferBarriers[i].srcStageMask,
3309 pBufferBarriers[i].srcAccessMask,
3310 pBufferBarriers[i].dstStageMask,
3311 pBufferBarriers[i].dstAccessMask,
3312 false, true, &state);
3313 }
3314
3315 for (uint32_t i = 0; i < memoryBarrierCount; i++) {
3316 handle_barrier(pMemoryBarriers[i].srcStageMask,
3317 pMemoryBarriers[i].srcAccessMask,
3318 pMemoryBarriers[i].dstStageMask,
3319 pMemoryBarriers[i].dstAccessMask,
3320 true, true, &state);
3321 }
3322
3323 /* Bail if we don't have any relevant barriers */
3324 if (!state.dst_mask)
3325 return;
3326
3327 /* If we have a recording job, finish it here */
3328 if (cmd_buffer->state.job)
3329 v3dv_cmd_buffer_finish_job(cmd_buffer);
3330
3331 /* Update barrier state in the command buffer */
3332 v3dv_cmd_buffer_merge_barrier_state(&cmd_buffer->state.barrier, &state);
3333 }
3334
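/* For reference, a minimal app-side call that exercises this path might look
 * like the following (hypothetical application code, not driver code):
 *
 *    VkMemoryBarrier2 barrier = {
 *       .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER_2,
 *       .srcStageMask = VK_PIPELINE_STAGE_2_ALL_TRANSFER_BIT,
 *       .srcAccessMask = VK_ACCESS_2_TRANSFER_WRITE_BIT,
 *       .dstStageMask = VK_PIPELINE_STAGE_2_VERTEX_SHADER_BIT,
 *       .dstAccessMask = VK_ACCESS_2_SHADER_READ_BIT,
 *    };
 *    VkDependencyInfo dep = {
 *       .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
 *       .memoryBarrierCount = 1,
 *       .pMemoryBarriers = &barrier,
 *    };
 *    vkCmdPipelineBarrier2(cmd_buf, &dep);
 *
 * Memory barriers are handled as both image and buffer barriers here, and
 * since the destination includes a pre-rasterization stage the dstAccessMask
 * is also accumulated into bcl_image_access/bcl_buffer_access.
 */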
3335 VKAPI_ATTR void VKAPI_CALL
3336 v3dv_CmdPipelineBarrier2(VkCommandBuffer commandBuffer,
3337 const VkDependencyInfo *pDependencyInfo)
3338 {
3339 V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
3340 v3dv_cmd_buffer_emit_pipeline_barrier(cmd_buffer, pDependencyInfo);
3341 }
3342
3343 VKAPI_ATTR void VKAPI_CALL
3344 v3dv_CmdBindVertexBuffers(VkCommandBuffer commandBuffer,
3345 uint32_t firstBinding,
3346 uint32_t bindingCount,
3347 const VkBuffer *pBuffers,
3348 const VkDeviceSize *pOffsets)
3349 {
3350 V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
3351 struct v3dv_vertex_binding *vb = cmd_buffer->state.vertex_bindings;
3352
3353 /* We have to defer setting up the vertex buffer state since we need the
3354 * buffer stride from the pipeline.
3355 */
3356
3357 assert(firstBinding + bindingCount <= MAX_VBS);
3358 bool vb_state_changed = false;
3359 for (uint32_t i = 0; i < bindingCount; i++) {
3360 if (vb[firstBinding + i].buffer != v3dv_buffer_from_handle(pBuffers[i])) {
3361 vb[firstBinding + i].buffer = v3dv_buffer_from_handle(pBuffers[i]);
3362 vb_state_changed = true;
3363 }
3364 if (vb[firstBinding + i].offset != pOffsets[i]) {
3365 vb[firstBinding + i].offset = pOffsets[i];
3366 vb_state_changed = true;
3367 }
3368 }
3369
3370 if (vb_state_changed)
3371 cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_VERTEX_BUFFER;
3372 }
3373
3374 VKAPI_ATTR void VKAPI_CALL
3375 v3dv_CmdBindIndexBuffer(VkCommandBuffer commandBuffer,
3376 VkBuffer buffer,
3377 VkDeviceSize offset,
3378 VkIndexType indexType)
3379 {
3380 V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
3381
3382 const uint32_t index_size = vk_index_type_to_bytes(indexType);
3383 if (buffer == cmd_buffer->state.index_buffer.buffer &&
3384 offset == cmd_buffer->state.index_buffer.offset &&
3385 index_size == cmd_buffer->state.index_buffer.index_size) {
3386 return;
3387 }
3388
3389 cmd_buffer->state.index_buffer.buffer = buffer;
3390 cmd_buffer->state.index_buffer.offset = offset;
3391 cmd_buffer->state.index_buffer.index_size = index_size;
3392 cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_INDEX_BUFFER;
3393 }
3394
3395 VKAPI_ATTR void VKAPI_CALL
3396 v3dv_CmdSetStencilCompareMask(VkCommandBuffer commandBuffer,
3397 VkStencilFaceFlags faceMask,
3398 uint32_t compareMask)
3399 {
3400 V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
3401
3402 if (faceMask & VK_STENCIL_FACE_FRONT_BIT)
3403 cmd_buffer->state.dynamic.stencil_compare_mask.front = compareMask & 0xff;
3404 if (faceMask & VK_STENCIL_FACE_BACK_BIT)
3405 cmd_buffer->state.dynamic.stencil_compare_mask.back = compareMask & 0xff;
3406
3407 cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_STENCIL_COMPARE_MASK;
3408 }
3409
3410 VKAPI_ATTR void VKAPI_CALL
3411 v3dv_CmdSetStencilWriteMask(VkCommandBuffer commandBuffer,
3412 VkStencilFaceFlags faceMask,
3413 uint32_t writeMask)
3414 {
3415 V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
3416
3417 if (faceMask & VK_STENCIL_FACE_FRONT_BIT)
3418 cmd_buffer->state.dynamic.stencil_write_mask.front = writeMask & 0xff;
3419 if (faceMask & VK_STENCIL_FACE_BACK_BIT)
3420 cmd_buffer->state.dynamic.stencil_write_mask.back = writeMask & 0xff;
3421
3422 cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_STENCIL_WRITE_MASK;
3423 }
3424
3425 VKAPI_ATTR void VKAPI_CALL
3426 v3dv_CmdSetStencilReference(VkCommandBuffer commandBuffer,
3427 VkStencilFaceFlags faceMask,
3428 uint32_t reference)
3429 {
3430 V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
3431
3432 if (faceMask & VK_STENCIL_FACE_FRONT_BIT)
3433 cmd_buffer->state.dynamic.stencil_reference.front = reference & 0xff;
3434 if (faceMask & VK_STENCIL_FACE_BACK_BIT)
3435 cmd_buffer->state.dynamic.stencil_reference.back = reference & 0xff;
3436
3437 cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_STENCIL_REFERENCE;
3438 }
3439
3440 VKAPI_ATTR void VKAPI_CALL
3441 v3dv_CmdSetDepthBias(VkCommandBuffer commandBuffer,
3442 float depthBiasConstantFactor,
3443 float depthBiasClamp,
3444 float depthBiasSlopeFactor)
3445 {
3446 V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
3447
3448 cmd_buffer->state.dynamic.depth_bias.constant_factor = depthBiasConstantFactor;
3449 cmd_buffer->state.dynamic.depth_bias.depth_bias_clamp = depthBiasClamp;
3450 cmd_buffer->state.dynamic.depth_bias.slope_factor = depthBiasSlopeFactor;
3451 cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_DEPTH_BIAS;
3452 }
3453
3454 VKAPI_ATTR void VKAPI_CALL
3455 v3dv_CmdSetDepthBounds(VkCommandBuffer commandBuffer,
3456 float minDepthBounds,
3457 float maxDepthBounds)
3458 {
3459 V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
3460
3461 cmd_buffer->state.dynamic.depth_bounds.min = minDepthBounds;
3462 cmd_buffer->state.dynamic.depth_bounds.max = maxDepthBounds;
3463 cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_DEPTH_BOUNDS;
3464 }
3465
3466 VKAPI_ATTR void VKAPI_CALL
3467 v3dv_CmdSetLineStippleEXT(VkCommandBuffer commandBuffer,
3468 uint32_t lineStippleFactor,
3469 uint16_t lineStipplePattern)
3470 {
3471 /* We do not support stippled line rasterization so we just ignore this. */
3472 }
3473
3474 VKAPI_ATTR void VKAPI_CALL
3475 v3dv_CmdSetLineWidth(VkCommandBuffer commandBuffer,
3476 float lineWidth)
3477 {
3478 V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
3479
3480 cmd_buffer->state.dynamic.line_width = lineWidth;
3481 cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_LINE_WIDTH;
3482 }
3483
3484 /**
3485 * This checks a descriptor set to see if we are binding any descriptors that would
3486 * involve sampling from a linear image (the hardware only supports this for
3487 * 1D images), and if so, attempts to create a tiled copy of the linear image
3488 * and rewrite the descriptor set to use that instead.
3489 *
3490 * This was added to support a scenario with Android where some part of the UI
3491 * wanted to show previews of linear swapchain images. For more details:
3492 * https://gitlab.freedesktop.org/mesa/mesa/-/issues/9712
3493 *
3494 * Currently this only supports linear sampling from a simple 2D image, but
3495 * it could be extended to support more cases if necessary.
3496 */
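/* A typical trigger (hypothetical application scenario): the app binds a
 * VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER whose image view points at a 2D
 * image created with VK_IMAGE_TILING_LINEAR (e.g. a linear swapchain or
 * dma-buf import). The code below then, roughly: (1) lazily creates a tiled
 * "shadow" image and view, (2) rewrites the descriptor to point at the shadow
 * view, and (3) records a TFU copy from the linear image into the shadow
 * image so its contents are up to date before the draw that samples from it.
 */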
3497 static void
3498 handle_sample_from_linear_image(struct v3dv_cmd_buffer *cmd_buffer,
3499 struct v3dv_descriptor_set *set,
3500 bool is_compute)
3501 {
3502 for (int32_t i = 0; i < set->layout->binding_count; i++) {
3503 const struct v3dv_descriptor_set_binding_layout *blayout =
3504 &set->layout->binding[i];
3505 if (blayout->type != VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE &&
3506 blayout->type != VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER)
3507 continue;
3508
3509 struct v3dv_descriptor *desc = &set->descriptors[blayout->descriptor_index];
3510 if (!desc->image_view)
3511 continue;
3512
3513 struct v3dv_image *image = (struct v3dv_image *) desc->image_view->vk.image;
3514 struct v3dv_image_view *view = (struct v3dv_image_view *) desc->image_view;
3515 if (image->tiled || view->vk.view_type == VK_IMAGE_VIEW_TYPE_1D ||
3516 view->vk.view_type == VK_IMAGE_VIEW_TYPE_1D_ARRAY) {
3517 continue;
3518 }
3519
3520 /* FIXME: we can probably handle most of these restrictions too with
3521 * a bit of extra effort.
3522 */
3523 if (view->vk.view_type != VK_IMAGE_VIEW_TYPE_2D ||
3524 view->vk.level_count != 1 || view->vk.layer_count != 1 ||
3525 blayout->array_size != 1) {
3526 fprintf(stderr, "Sampling from linear image is not supported. "
3527 "Expect corruption.\n");
3528 continue;
3529 }
3530
3531 /* We are sampling from a linear image. V3D doesn't support this
3532 * so we create a tiled copy of the image and rewrite the descriptor
3533 * to read from it instead.
3534 */
3535 perf_debug("Sampling from linear image is not supported natively and "
3536 "requires a copy.\n");
3537
3538 struct v3dv_device *device = cmd_buffer->device;
3539 VkDevice vk_device = v3dv_device_to_handle(device);
3540
3541 /* Allocate the shadow tiled image if needed; we only do this once for
3542 * each image, on the first sampling attempt. We need to take a lock
3543 * since we may be trying to do the same in another command buffer in
3544 * a separate thread.
3545 */
3546 mtx_lock(&device->meta.mtx);
3547 VkResult result;
3548 VkImage tiled_image;
3549 if (image->shadow) {
3550 tiled_image = v3dv_image_to_handle(image->shadow);
3551 } else {
3552 VkImageCreateInfo image_info = {
3553 .sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO,
3554 .flags = image->vk.create_flags,
3555 .imageType = image->vk.image_type,
3556 .format = image->vk.format,
3557 .extent = {
3558 image->vk.extent.width,
3559 image->vk.extent.height,
3560 image->vk.extent.depth,
3561 },
3562 .mipLevels = image->vk.mip_levels,
3563 .arrayLayers = image->vk.array_layers,
3564 .samples = image->vk.samples,
3565 .tiling = VK_IMAGE_TILING_OPTIMAL,
3566 .usage = image->vk.usage,
3567 .sharingMode = VK_SHARING_MODE_EXCLUSIVE,
3568 .queueFamilyIndexCount = 0,
3569 .initialLayout = VK_IMAGE_LAYOUT_GENERAL,
3570 };
3571 result = v3dv_CreateImage(vk_device, &image_info,
3572 &device->vk.alloc, &tiled_image);
3573 if (result != VK_SUCCESS) {
3574 fprintf(stderr, "Failed to copy linear 2D image for sampling. "
3575 "Expect corruption.\n");
3576 mtx_unlock(&device->meta.mtx);
3577 continue;
3578 }
3579
3580 bool disjoint = image->vk.create_flags & VK_IMAGE_CREATE_DISJOINT_BIT;
3581 VkImageMemoryRequirementsInfo2 reqs_info = {
3582 .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_REQUIREMENTS_INFO_2,
3583 .image = tiled_image,
3584 };
3585
3586 assert(image->plane_count <= V3DV_MAX_PLANE_COUNT);
3587 for (int p = 0; p < (disjoint ? image->plane_count : 1); p++) {
3588 VkImageAspectFlagBits plane_aspect = VK_IMAGE_ASPECT_PLANE_0_BIT << p;
3589 VkImagePlaneMemoryRequirementsInfo plane_info = {
3590 .sType = VK_STRUCTURE_TYPE_IMAGE_PLANE_MEMORY_REQUIREMENTS_INFO,
3591 .planeAspect = plane_aspect,
3592 };
3593 if (disjoint)
3594 reqs_info.pNext = &plane_info;
3595
3596 VkMemoryRequirements2 reqs = {
3597 .sType = VK_STRUCTURE_TYPE_MEMORY_REQUIREMENTS_2,
3598 };
3599 v3dv_GetImageMemoryRequirements2(vk_device, &reqs_info, &reqs);
3600
3601 VkDeviceMemory mem;
3602 VkMemoryAllocateInfo alloc_info = {
3603 .sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO,
3604 .allocationSize = reqs.memoryRequirements.size,
3605 .memoryTypeIndex = 0,
3606 };
3607 result = v3dv_AllocateMemory(vk_device, &alloc_info,
3608 &device->vk.alloc, &mem);
3609 if (result != VK_SUCCESS) {
3610 fprintf(stderr, "Failed to copy linear 2D image for sampling. "
3611 "Expect corruption.\n");
3612 v3dv_DestroyImage(vk_device, tiled_image, &device->vk.alloc);
3613 mtx_unlock(&device->meta.mtx);
3614 continue;
3615 }
3616
3617 VkBindImageMemoryInfo bind_info = {
3618 .sType = VK_STRUCTURE_TYPE_BIND_IMAGE_MEMORY_INFO,
3619 .image = tiled_image,
3620 .memory = mem,
3621 .memoryOffset = 0,
3622 };
3623 VkBindImagePlaneMemoryInfo plane_bind_info = {
3624 .sType = VK_STRUCTURE_TYPE_BIND_IMAGE_PLANE_MEMORY_INFO,
3625 .planeAspect = plane_aspect,
3626 };
3627 if (disjoint)
3628 bind_info.pNext = &plane_bind_info;
3629 result = v3dv_BindImageMemory2(vk_device, 1, &bind_info);
3630 if (result != VK_SUCCESS) {
3631 fprintf(stderr, "Failed to copy linear 2D image for sampling. "
3632 "Expect corruption.\n");
3633 v3dv_DestroyImage(vk_device, tiled_image, &device->vk.alloc);
3634 v3dv_FreeMemory(vk_device, mem, &device->vk.alloc);
3635 mtx_unlock(&device->meta.mtx);
3636 continue;
3637 }
3638 }
3639
3640 image->shadow = v3dv_image_from_handle(tiled_image);
3641 }
3642
3643 /* Create a shadow view that refers to the tiled image if needed */
3644 VkImageView tiled_view;
3645 if (view->shadow) {
3646 tiled_view = v3dv_image_view_to_handle(view->shadow);
3647 } else {
3648 VkImageViewCreateInfo view_info = {
3649 .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO,
3650 .flags = view->vk.create_flags,
3651 .image = tiled_image,
3652 .viewType = view->vk.view_type,
3653 .format = view->vk.format,
3654 .components = view->vk.swizzle,
3655 .subresourceRange = {
3656 .aspectMask = view->vk.aspects,
3657 .baseMipLevel = view->vk.base_mip_level,
3658 .levelCount = view->vk.level_count,
3659 .baseArrayLayer = view->vk.base_array_layer,
3660 .layerCount = view->vk.layer_count,
3661 },
3662 };
3663 result = v3dv_create_image_view(device, &view_info, &tiled_view);
3664 if (result != VK_SUCCESS) {
3665 fprintf(stderr, "Failed to copy linear 2D image for sampling. "
3666 "Expect corruption.\n");
3667 mtx_unlock(&device->meta.mtx);
3668 continue;
3669 }
3670 }
3671
3672 view->shadow = v3dv_image_view_from_handle(tiled_view);
3673
3674 mtx_unlock(&device->meta.mtx);
3675
3676 /* Rewrite the descriptor to use the shadow view */
3677 VkDescriptorImageInfo desc_image_info = {
3678 .sampler = v3dv_sampler_to_handle(desc->sampler),
3679 .imageView = tiled_view,
3680 .imageLayout = VK_IMAGE_LAYOUT_GENERAL,
3681 };
3682 VkWriteDescriptorSet write = {
3683 .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET,
3684 .dstSet = v3dv_descriptor_set_to_handle(set),
3685 .dstBinding = i,
3686 .dstArrayElement = 0, /* Assumes array_size is 1 */
3687 .descriptorCount = 1,
3688 .descriptorType = desc->type,
3689 .pImageInfo = &desc_image_info,
3690 };
3691 v3dv_UpdateDescriptorSets(vk_device, 1, &write, 0, NULL);
3692
3693 /* Now we need to actually copy the pixel data from the linear image
3694 * into the tiled image storage to ensure it is up-to-date.
3695 *
3696 * FIXME: ideally we would track if the linear image is dirty and skip
3697 * this step otherwise, but that would be a bit of a pain.
3698 *
3699 * Note that we need to place the copy job *before* the current job in
3700 * the command buffer state so we have the tiled image ready to process
3701 * an upcoming draw call in the current job that samples from it.
3702 *
3703 * Also, we need to use the TFU path for this copy, as any other path
3704 * will use the tile buffer and would require a new framebuffer setup,
3705 * thus requiring extra work to stop and resume any in-flight render
3706 * pass. Since we are converting a full 2D texture here the TFU should
3707 * be able to handle this.
3708 */
3709 for (int p = 0; p < image->plane_count; p++) {
3710 VkImageAspectFlagBits plane_aspect = VK_IMAGE_ASPECT_PLANE_0_BIT << p;
3711 struct VkImageCopy2 copy_region = {
3712 .sType = VK_STRUCTURE_TYPE_IMAGE_COPY_2,
3713 .srcSubresource = {
3714 .aspectMask = image->plane_count == 1 ?
3715 view->vk.aspects : (view->vk.aspects & plane_aspect),
3716 .mipLevel = view->vk.base_mip_level,
3717 .baseArrayLayer = view->vk.base_array_layer,
3718 .layerCount = view->vk.layer_count,
3719 },
3720 .srcOffset = {0, 0, 0 },
3721 .dstSubresource = {
3722 .aspectMask = image->plane_count == 1 ?
3723 view->vk.aspects : (view->vk.aspects & plane_aspect),
3724 .mipLevel = view->vk.base_mip_level,
3725 .baseArrayLayer = view->vk.base_array_layer,
3726 .layerCount = view->vk.layer_count,
3727 },
3728 .dstOffset = { 0, 0, 0},
3729 .extent = {
3730 image->planes[p].width,
3731 image->planes[p].height,
3732 1,
3733 },
3734 };
3735 struct v3dv_image *copy_src = image;
3736 struct v3dv_image *copy_dst = v3dv_image_from_handle(tiled_image);
3737 bool ok = v3dv_cmd_buffer_copy_image_tfu(cmd_buffer, copy_dst, copy_src,
3738 &copy_region);
3739 if (ok) {
3740 /* This will emit the TFU job right before the current in-flight
3741 * job (if any), since in-flight jobs are only added to the list
3742 * when finished.
3743 */
3744 struct v3dv_job *tfu_job =
3745 list_last_entry(&cmd_buffer->jobs, struct v3dv_job, list_link);
3746 assert(tfu_job->type == V3DV_JOB_TYPE_GPU_TFU);
3747 /* Serialize the copy since we don't know who is producing the linear
3748 * image and we need the image to be ready by the time the copy
3749 * executes.
3750 */
3751 tfu_job->serialize = V3DV_BARRIER_ALL;
3752
3753 /* Also, we need to ensure the TFU copy job completes before anything
3754 * else coming after that may be using the tiled shadow copy.
3755 */
3756 if (cmd_buffer->state.job) {
3757 /* If we already had an in-flight job (i.e. we are in a render
3758 * pass) make sure the job waits for the TFU copy.
3759 */
3760 cmd_buffer->state.job->serialize |= V3DV_BARRIER_TRANSFER_BIT;
3761 } else {
3762 /* Otherwise, make sure the follow-up job syncs with the TFU
3763 * job we just added when it is created by adding the
3764 * corresponding barrier state.
3765 */
3766 if (!is_compute) {
3767 cmd_buffer->state.barrier.dst_mask |= V3DV_BARRIER_GRAPHICS_BIT;
3768 cmd_buffer->state.barrier.src_mask_graphics |= V3DV_BARRIER_TRANSFER_BIT;
3769 } else {
3770 cmd_buffer->state.barrier.dst_mask |= V3DV_BARRIER_COMPUTE_BIT;
3771 cmd_buffer->state.barrier.src_mask_compute |= V3DV_BARRIER_TRANSFER_BIT;
3772 }
3773 }
3774 } else {
3775 fprintf(stderr, "Failed to copy linear 2D image for sampling. "
3776 "TFU doesn't support copy. Expect corruption.\n");
3777 }
3778 }
3779 }
3780 }
3781
3782 VKAPI_ATTR void VKAPI_CALL
3783 v3dv_CmdBindDescriptorSets(VkCommandBuffer commandBuffer,
3784 VkPipelineBindPoint pipelineBindPoint,
3785 VkPipelineLayout _layout,
3786 uint32_t firstSet,
3787 uint32_t descriptorSetCount,
3788 const VkDescriptorSet *pDescriptorSets,
3789 uint32_t dynamicOffsetCount,
3790 const uint32_t *pDynamicOffsets)
3791 {
3792 V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
3793 V3DV_FROM_HANDLE(v3dv_pipeline_layout, layout, _layout);
3794
3795 uint32_t dyn_index = 0;
3796
3797 assert(firstSet + descriptorSetCount <= MAX_SETS);
3798
3799 struct v3dv_descriptor_state *descriptor_state =
3800 pipelineBindPoint == VK_PIPELINE_BIND_POINT_COMPUTE ?
3801 &cmd_buffer->state.compute.descriptor_state :
3802 &cmd_buffer->state.gfx.descriptor_state;
3803
3804 VkShaderStageFlags dirty_stages = 0;
3805 bool descriptor_state_changed = false;
3806 for (uint32_t i = 0; i < descriptorSetCount; i++) {
3807 V3DV_FROM_HANDLE(v3dv_descriptor_set, set, pDescriptorSets[i]);
3808 uint32_t index = firstSet + i;
3809
3810 descriptor_state->valid |= (1u << index);
3811 if (descriptor_state->descriptor_sets[index] != set) {
3812 descriptor_state->descriptor_sets[index] = set;
3813 dirty_stages |= set->layout->shader_stages;
3814 descriptor_state_changed = true;
3815
3816 /* Check if we are sampling from a linear 2D image. This is not
3817 * supported in hardware, but may be required for some applications
3818 * so we will transparently convert to tiled at the expense of
3819 * performance.
3820 */
3821 handle_sample_from_linear_image(cmd_buffer, set,
3822 pipelineBindPoint ==
3823 VK_PIPELINE_BIND_POINT_COMPUTE);
3824 }
3825
3826 for (uint32_t j = 0; j < set->layout->dynamic_offset_count; j++, dyn_index++) {
3827 uint32_t idx = j + layout->set[i + firstSet].dynamic_offset_start;
3828
3829 if (descriptor_state->dynamic_offsets[idx] != pDynamicOffsets[dyn_index]) {
3830 descriptor_state->dynamic_offsets[idx] = pDynamicOffsets[dyn_index];
3831 dirty_stages |= set->layout->shader_stages;
3832 descriptor_state_changed = true;
3833 }
3834 }
3835 }
3836
3837 if (descriptor_state_changed) {
3838 if (pipelineBindPoint == VK_PIPELINE_BIND_POINT_GRAPHICS) {
3839 cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_DESCRIPTOR_SETS;
3840 cmd_buffer->state.dirty_descriptor_stages |= dirty_stages & VK_SHADER_STAGE_ALL_GRAPHICS;
3841 } else {
3842 cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_COMPUTE_DESCRIPTOR_SETS;
3843 cmd_buffer->state.dirty_descriptor_stages |= VK_SHADER_STAGE_COMPUTE_BIT;
3844 }
3845 }
3846 }
3847
3848 VKAPI_ATTR void VKAPI_CALL
3849 v3dv_CmdPushConstants(VkCommandBuffer commandBuffer,
3850 VkPipelineLayout layout,
3851 VkShaderStageFlags stageFlags,
3852 uint32_t offset,
3853 uint32_t size,
3854 const void *pValues)
3855 {
3856 V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
3857
3858 if (!memcmp((uint8_t *) cmd_buffer->state.push_constants_data + offset,
3859 pValues, size)) {
3860 return;
3861 }
3862
3863 memcpy((uint8_t *) cmd_buffer->state.push_constants_data + offset,
3864 pValues, size);
3865 cmd_buffer->state.push_constants_size =
3866 MAX2(offset + size, cmd_buffer->state.push_constants_size);
3867
3868 cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_PUSH_CONSTANTS |
3869 V3DV_CMD_DIRTY_PUSH_CONSTANTS_UBO;
3870 cmd_buffer->state.dirty_push_constants_stages |= stageFlags;
3871 }
3872
3873 VKAPI_ATTR void VKAPI_CALL
3874 v3dv_CmdSetBlendConstants(VkCommandBuffer commandBuffer,
3875 const float blendConstants[4])
3876 {
3877 V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
3878 struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
3879
3880 if (!memcmp(state->dynamic.blend_constants, blendConstants,
3881 sizeof(state->dynamic.blend_constants))) {
3882 return;
3883 }
3884
3885 memcpy(state->dynamic.blend_constants, blendConstants,
3886 sizeof(state->dynamic.blend_constants));
3887
3888 cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_BLEND_CONSTANTS;
3889 }
3890
3891 VKAPI_ATTR void VKAPI_CALL
3892 v3dv_CmdSetColorWriteEnableEXT(VkCommandBuffer commandBuffer,
3893 uint32_t attachmentCount,
3894 const VkBool32 *pColorWriteEnables)
3895 {
3896 V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
3897 struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
3898 uint32_t color_write_enable = 0;
3899
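   /* Each attachment contributes a 4-bit group (presumably one bit per RGBA
    * channel) to the packed mask. Illustrative example: with
    * attachmentCount = 3 and pColorWriteEnables = { VK_TRUE, VK_FALSE, VK_TRUE }
    * the loop below produces color_write_enable = 0xF0F, i.e. attachment 1 is
    * fully masked off.
    */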
3900 for (uint32_t i = 0; i < attachmentCount; i++)
3901 color_write_enable |= pColorWriteEnables[i] ? (0xfu << (i * 4)) : 0;
3902
3903 if (state->dynamic.color_write_enable == color_write_enable)
3904 return;
3905
3906 state->dynamic.color_write_enable = color_write_enable;
3907
3908 state->dirty |= V3DV_CMD_DIRTY_COLOR_WRITE_ENABLE;
3909 }
3910
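/* Grows a dynamically sized array of fixed-size slots so it can hold at least
 * used_count + 1 entries. Illustrative growth sequence (per the MAX2 below):
 * alloc_count goes 0 -> 4 -> 8 -> 16 -> ..., doubling each time the array is
 * exhausted, with existing entries copied into the new allocation.
 */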
3911 void
3912 v3dv_cmd_buffer_ensure_array_state(struct v3dv_cmd_buffer *cmd_buffer,
3913 uint32_t slot_size,
3914 uint32_t used_count,
3915 uint32_t *alloc_count,
3916 void **ptr)
3917 {
3918 if (used_count >= *alloc_count) {
3919 const uint32_t prev_slot_count = *alloc_count;
3920 void *old_buffer = *ptr;
3921
3922 const uint32_t new_slot_count = MAX2(*alloc_count * 2, 4);
3923 const uint32_t bytes = new_slot_count * slot_size;
3924 *ptr = vk_alloc(&cmd_buffer->device->vk.alloc, bytes, 8,
3925 VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
3926 if (*ptr == NULL) {
3927 fprintf(stderr, "Error: failed to allocate CPU buffer for query.\n");
3928 v3dv_flag_oom(cmd_buffer, NULL);
3929 return;
3930 }
3931
3932 memcpy(*ptr, old_buffer, prev_slot_count * slot_size);
3933 *alloc_count = new_slot_count;
3934 }
3935 assert(used_count < *alloc_count);
3936 }
3937
3938 void
3939 v3dv_cmd_buffer_begin_query(struct v3dv_cmd_buffer *cmd_buffer,
3940 struct v3dv_query_pool *pool,
3941 uint32_t query,
3942 VkQueryControlFlags flags)
3943 {
3944 assert(query < pool->query_count);
3945 switch (pool->query_type) {
3946 case VK_QUERY_TYPE_OCCLUSION:
3947 /* FIXME: we only support one active occlusion query for now */
3948 assert(cmd_buffer->state.query.active_query.bo == NULL);
3949
3950 cmd_buffer->state.query.active_query.bo = pool->occlusion.bo;
3951 cmd_buffer->state.query.active_query.offset =
3952 pool->queries[query].occlusion.offset;
3953 cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_OCCLUSION_QUERY;
3954 break;
3955 case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: {
3956 assert(cmd_buffer->state.query.active_query.perf == NULL);
3957 if (cmd_buffer->state.pass)
3958 v3dv_cmd_buffer_subpass_finish(cmd_buffer);
3959
3960 cmd_buffer->state.query.active_query.perf =
3961 &pool->queries[query].perf;
3962
3963 if (cmd_buffer->state.pass) {
3964 v3dv_cmd_buffer_subpass_resume(cmd_buffer,
3965 cmd_buffer->state.subpass_idx);
3966 }
3967 break;
3968 }
3969 default:
3970 unreachable("Unsupported query type");
3971 }
3972 }
3973
3974 void
3975 v3dv_cmd_buffer_pause_occlusion_query(struct v3dv_cmd_buffer *cmd_buffer)
3976 {
3977 struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
3978 struct v3dv_bo *occlusion_query_bo = state->query.active_query.bo;
3979 if (occlusion_query_bo) {
3980 assert(!state->query.active_query.paused_bo);
3981 state->query.active_query.paused_bo = occlusion_query_bo;
3982 state->query.active_query.bo = NULL;
3983 state->dirty |= V3DV_CMD_DIRTY_OCCLUSION_QUERY;
3984 }
3985 }
3986
3987 void
3988 v3dv_cmd_buffer_resume_occlusion_query(struct v3dv_cmd_buffer *cmd_buffer)
3989 {
3990 struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
3991 struct v3dv_bo *occlusion_query_bo = state->query.active_query.paused_bo;
3992 if (occlusion_query_bo) {
3993 assert(!state->query.active_query.bo);
3994 state->query.active_query.bo = occlusion_query_bo;
3995 state->query.active_query.paused_bo = NULL;
3996 state->dirty |= V3DV_CMD_DIRTY_OCCLUSION_QUERY;
3997 }
3998 }
3999
4000 static void
4001 v3dv_cmd_buffer_schedule_end_query(struct v3dv_cmd_buffer *cmd_buffer,
4002 struct v3dv_query_pool *pool,
4003 uint32_t query)
4004 {
4005 assert(query < pool->query_count);
4006 assert(pool->query_type == VK_QUERY_TYPE_OCCLUSION ||
4007 pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR);
4008
4009 /* For occlusion queries in the middle of a render pass we don't want to
4010 * split the current job at the EndQuery just to emit query availability;
4011 * instead we queue this state in the command buffer and we emit it when
4012 * we finish the current job.
4013 */
4014 if (cmd_buffer->state.pass &&
4015 pool->query_type == VK_QUERY_TYPE_OCCLUSION) {
4016 struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
4017 v3dv_cmd_buffer_ensure_array_state(cmd_buffer,
4018 sizeof(struct v3dv_end_query_info),
4019 state->query.end.used_count,
4020 &state->query.end.alloc_count,
4021 (void **) &state->query.end.states);
4022 v3dv_return_if_oom(cmd_buffer, NULL);
4023
4024 struct v3dv_end_query_info *info =
4025 &state->query.end.states[state->query.end.used_count++];
4026
4027 info->pool = pool;
4028 info->query = query;
4029
4030 /* From the Vulkan spec:
4031 *
4032 * "If queries are used while executing a render pass instance that has
4033 * multiview enabled, the query uses N consecutive query indices in
4034 * the query pool (starting at query) where N is the number of bits set
4035 * in the view mask in the subpass the query is used in. How the
4036 * numerical results of the query are distributed among the queries is
4037 * implementation-dependent."
4038 *
4039 * In our case, only the first query is used but this means we still need
4040 * to flag the other queries as available so we don't emit errors when
4041 * the applications attempt to retrieve values from them.
4042 */
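      /* Illustrative example: if the subpass view_mask is 0b011 (2 views),
       * info->count below ends up as 2 and both `query` and `query + 1` are
       * flagged as available when the job finishes, even though only the
       * first query accumulates actual results.
       */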
4043 struct v3dv_render_pass *pass = cmd_buffer->state.pass;
4044 if (!pass->multiview_enabled) {
4045 info->count = 1;
4046 } else {
4047 struct v3dv_subpass *subpass = &pass->subpasses[state->subpass_idx];
4048 info->count = util_bitcount(subpass->view_mask);
4049 }
4050 } else {
4051 /* Otherwise, schedule the end query job immediately.
4052 *
4053 * Multiview queries cannot cross subpass boundaries, so query count is
4054 * always 1.
4055 */
4056 if (pool->query_type == VK_QUERY_TYPE_OCCLUSION)
4057 v3dv_cmd_buffer_emit_set_query_availability(cmd_buffer, pool, query, 1, 1);
4058 else
4059 cmd_buffer_emit_end_query_cpu(cmd_buffer, pool, query, 1);
4060 }
4061 }
4062
4063 static void
4064 v3dv_cmd_buffer_end_occlusion_query(struct v3dv_cmd_buffer *cmd_buffer,
4065 struct v3dv_query_pool *pool,
4066 uint32_t query)
4067 {
4068 assert(query < pool->query_count);
4069 assert(cmd_buffer->state.query.active_query.bo != NULL);
4070
4071 v3dv_cmd_buffer_schedule_end_query(cmd_buffer, pool, query);
4072
4073 cmd_buffer->state.query.active_query.bo = NULL;
4074 cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_OCCLUSION_QUERY;
4075 }
4076
4077 static void
4078 v3dv_cmd_buffer_end_performance_query(struct v3dv_cmd_buffer *cmd_buffer,
4079 struct v3dv_query_pool *pool,
4080 uint32_t query)
4081 {
4082 assert(query < pool->query_count);
4083 assert(cmd_buffer->state.query.active_query.perf != NULL);
4084
4085 if (cmd_buffer->state.pass)
4086 v3dv_cmd_buffer_subpass_finish(cmd_buffer);
4087
4088 v3dv_cmd_buffer_schedule_end_query(cmd_buffer, pool, query);
4089
4090 cmd_buffer->state.query.active_query.perf = NULL;
4091
4092 if (cmd_buffer->state.pass)
4093 v3dv_cmd_buffer_subpass_resume(cmd_buffer, cmd_buffer->state.subpass_idx);
4094 }
4095
4096 void v3dv_cmd_buffer_end_query(struct v3dv_cmd_buffer *cmd_buffer,
4097 struct v3dv_query_pool *pool,
4098 uint32_t query)
4099 {
4100 switch (pool->query_type) {
4101 case VK_QUERY_TYPE_OCCLUSION:
4102 v3dv_cmd_buffer_end_occlusion_query(cmd_buffer, pool, query);
4103 break;
4104 case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR:
4105 v3dv_cmd_buffer_end_performance_query(cmd_buffer, pool, query);
4106 break;
4107 default:
4108 unreachable("Unsupported query type");
4109 }
4110 }
4111
4112 void
4113 v3dv_cmd_buffer_add_tfu_job(struct v3dv_cmd_buffer *cmd_buffer,
4114 struct drm_v3d_submit_tfu *tfu)
4115 {
4116 struct v3dv_device *device = cmd_buffer->device;
4117 struct v3dv_job *job = vk_zalloc(&device->vk.alloc,
4118 sizeof(struct v3dv_job), 8,
4119 VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
4120 if (!job) {
4121 v3dv_flag_oom(cmd_buffer, NULL);
4122 return;
4123 }
4124
4125 v3dv_job_init(job, V3DV_JOB_TYPE_GPU_TFU, device, cmd_buffer, -1);
4126 job->tfu = *tfu;
4127 list_addtail(&job->list_link, &cmd_buffer->jobs);
4128 }
4129
4130 VKAPI_ATTR void VKAPI_CALL
4131 v3dv_CmdWriteTimestamp2(VkCommandBuffer commandBuffer,
4132 VkPipelineStageFlags2 stage,
4133 VkQueryPool queryPool,
4134 uint32_t query)
4135 {
4136 V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
4137 V3DV_FROM_HANDLE(v3dv_query_pool, query_pool, queryPool);
4138
4139 /* If this is called inside a render pass we need to finish the current
4140 * job here...
4141 */
4142 struct v3dv_render_pass *pass = cmd_buffer->state.pass;
4143 if (pass)
4144 v3dv_cmd_buffer_finish_job(cmd_buffer);
4145
4146 struct v3dv_job *job =
4147 v3dv_cmd_buffer_create_cpu_job(cmd_buffer->device,
4148 V3DV_JOB_TYPE_CPU_TIMESTAMP_QUERY,
4149 cmd_buffer, -1);
4150 v3dv_return_if_oom(cmd_buffer, NULL);
4151
4152 job->cpu.query_timestamp.pool = query_pool;
4153 job->cpu.query_timestamp.query = query;
4154
4155 if (!pass || !pass->multiview_enabled) {
4156 job->cpu.query_timestamp.count = 1;
4157 } else {
4158 struct v3dv_subpass *subpass =
4159 &pass->subpasses[cmd_buffer->state.subpass_idx];
4160 job->cpu.query_timestamp.count = util_bitcount(subpass->view_mask);
4161 }
4162
4163 list_addtail(&job->list_link, &cmd_buffer->jobs);
4164 cmd_buffer->state.job = NULL;
4165
4166 /* ...and resume the subpass after the timestamp */
4167 if (cmd_buffer->state.pass)
4168 v3dv_cmd_buffer_subpass_resume(cmd_buffer, cmd_buffer->state.subpass_idx);
4169 }
4170
4171 static void
4172 cmd_buffer_emit_pre_dispatch(struct v3dv_cmd_buffer *cmd_buffer)
4173 {
4174 assert(cmd_buffer->state.compute.pipeline);
4175 assert(cmd_buffer->state.compute.pipeline->active_stages ==
4176 VK_SHADER_STAGE_COMPUTE_BIT);
4177
4178 cmd_buffer->state.dirty &= ~(V3DV_CMD_DIRTY_COMPUTE_PIPELINE |
4179 V3DV_CMD_DIRTY_COMPUTE_DESCRIPTOR_SETS);
4180 cmd_buffer->state.dirty_descriptor_stages &= ~VK_SHADER_STAGE_COMPUTE_BIT;
4181 cmd_buffer->state.dirty_push_constants_stages &= ~VK_SHADER_STAGE_COMPUTE_BIT;
4182 }
4183
4184 void
4185 v3dv_cmd_buffer_rewrite_indirect_csd_job(
4186 struct v3dv_device *device,
4187 struct v3dv_csd_indirect_cpu_job_info *info,
4188 const uint32_t *wg_counts)
4189 {
4190 assert(info->csd_job);
4191 struct v3dv_job *job = info->csd_job;
4192
4193 assert(job->type == V3DV_JOB_TYPE_GPU_CSD);
4194 assert(wg_counts[0] > 0 && wg_counts[1] > 0 && wg_counts[2] > 0);
4195
4196 struct drm_v3d_submit_csd *submit = &job->csd.submit;
4197
4198 job->csd.wg_count[0] = wg_counts[0];
4199 job->csd.wg_count[1] = wg_counts[1];
4200 job->csd.wg_count[2] = wg_counts[2];
4201
4202 submit->cfg[0] = wg_counts[0] << V3D_CSD_CFG012_WG_COUNT_SHIFT;
4203 submit->cfg[1] = wg_counts[1] << V3D_CSD_CFG012_WG_COUNT_SHIFT;
4204 submit->cfg[2] = wg_counts[2] << V3D_CSD_CFG012_WG_COUNT_SHIFT;
4205
4206 uint32_t num_batches = DIV_ROUND_UP(info->wg_size, 16) *
4207 (wg_counts[0] * wg_counts[1] * wg_counts[2]);
4208 /* V3D 7.1.6 and later don't subtract 1 from the number of batches */
4209 if (device->devinfo.ver < 71 ||
4210 (device->devinfo.ver == 71 && device->devinfo.rev < 6)) {
4211 submit->cfg[4] = num_batches - 1;
4212 } else {
4213 submit->cfg[4] = num_batches;
4214 }
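   /* Worked example with hypothetical numbers: for a 64-invocation workgroup
    * (info->wg_size == 64) and wg_counts = { 2, 2, 1 }, num_batches is
    * DIV_ROUND_UP(64, 16) * 4 = 16, so cfg[4] is programmed as 15 on hardware
    * older than V3D 7.1.6 and as 16 otherwise.
    */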
4215 assert(submit->cfg[4] != ~0);
4216
4217 if (info->needs_wg_uniform_rewrite) {
4218 /* Make sure the GPU is not currently accessing the indirect CL for this
4219 * job, since we are about to overwrite some of the uniform data.
4220 */
4221 v3dv_bo_wait(job->device, job->indirect.bo, OS_TIMEOUT_INFINITE);
4222
4223 for (uint32_t i = 0; i < 3; i++) {
4224 if (info->wg_uniform_offsets[i]) {
4225 /* Sanity check that our uniform pointers are within the allocated
4226 * BO space for our indirect CL.
4227 */
4228 assert(info->wg_uniform_offsets[i] >= (uint32_t *) job->indirect.base);
4229 assert(info->wg_uniform_offsets[i] < (uint32_t *) job->indirect.next);
4230 *(info->wg_uniform_offsets[i]) = wg_counts[i];
4231 }
4232 }
4233 }
4234 }
4235
4236 static struct v3dv_job *
4237 cmd_buffer_create_csd_job(struct v3dv_cmd_buffer *cmd_buffer,
4238 uint32_t base_offset_x,
4239 uint32_t base_offset_y,
4240 uint32_t base_offset_z,
4241 uint32_t group_count_x,
4242 uint32_t group_count_y,
4243 uint32_t group_count_z,
4244 uint32_t **wg_uniform_offsets_out,
4245 uint32_t *wg_size_out)
4246 {
4247 struct v3dv_device *device = cmd_buffer->device;
4248 struct v3dv_pipeline *pipeline = cmd_buffer->state.compute.pipeline;
4249 assert(pipeline && pipeline->shared_data->variants[BROADCOM_SHADER_COMPUTE]);
4250 struct v3dv_shader_variant *cs_variant =
4251 pipeline->shared_data->variants[BROADCOM_SHADER_COMPUTE];
4252
4253 struct v3dv_job *job = vk_zalloc(&cmd_buffer->device->vk.alloc,
4254 sizeof(struct v3dv_job), 8,
4255 VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
4256 if (!job) {
4257 v3dv_flag_oom(cmd_buffer, NULL);
4258 return NULL;
4259 }
4260
4261 v3dv_job_init(job, V3DV_JOB_TYPE_GPU_CSD, cmd_buffer->device, cmd_buffer, -1);
4262 cmd_buffer->state.job = job;
4263
4264 struct drm_v3d_submit_csd *submit = &job->csd.submit;
4265
4266 job->csd.wg_count[0] = group_count_x;
4267 job->csd.wg_count[1] = group_count_y;
4268 job->csd.wg_count[2] = group_count_z;
4269
4270 job->csd.wg_base[0] = base_offset_x;
4271 job->csd.wg_base[1] = base_offset_y;
4272 job->csd.wg_base[2] = base_offset_z;
4273
4274 submit->cfg[0] |= group_count_x << V3D_CSD_CFG012_WG_COUNT_SHIFT;
4275 submit->cfg[1] |= group_count_y << V3D_CSD_CFG012_WG_COUNT_SHIFT;
4276 submit->cfg[2] |= group_count_z << V3D_CSD_CFG012_WG_COUNT_SHIFT;
4277
4278 const struct v3d_compute_prog_data *cpd =
4279 cs_variant->prog_data.cs;
4280
4281 const uint32_t num_wgs = group_count_x * group_count_y * group_count_z;
4282 const uint32_t wg_size = cpd->local_size[0] *
4283 cpd->local_size[1] *
4284 cpd->local_size[2];
4285
4286 uint32_t wgs_per_sg =
4287 v3d_csd_choose_workgroups_per_supergroup(
4288 &cmd_buffer->device->devinfo,
4289 cs_variant->prog_data.cs->has_subgroups,
4290 cs_variant->prog_data.cs->base.has_control_barrier,
4291 cs_variant->prog_data.cs->base.threads,
4292 num_wgs, wg_size);
4293
4294 uint32_t batches_per_sg = DIV_ROUND_UP(wgs_per_sg * wg_size, 16);
4295 uint32_t whole_sgs = num_wgs / wgs_per_sg;
4296 uint32_t rem_wgs = num_wgs - whole_sgs * wgs_per_sg;
4297 uint32_t num_batches = batches_per_sg * whole_sgs +
4298 DIV_ROUND_UP(rem_wgs * wg_size, 16);
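   /* Worked example with hypothetical numbers: local_size = (8, 8, 1) gives
    * wg_size = 64; with group counts (4, 1, 1), num_wgs = 4. If the helper
    * above picks wgs_per_sg = 2, then batches_per_sg = DIV_ROUND_UP(128, 16)
    * = 8, whole_sgs = 2, rem_wgs = 0 and num_batches = 16.
    */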
4299
4300 submit->cfg[3] |= (wgs_per_sg & 0xf) << V3D_CSD_CFG3_WGS_PER_SG_SHIFT;
4301 submit->cfg[3] |= (batches_per_sg - 1) << V3D_CSD_CFG3_BATCHES_PER_SG_M1_SHIFT;
4302 submit->cfg[3] |= (wg_size & 0xff) << V3D_CSD_CFG3_WG_SIZE_SHIFT;
4303 if (wg_size_out)
4304 *wg_size_out = wg_size;
4305
4306 /* V3D 7.1.6 and later don't subtract 1 from the number of batches */
4307 if (device->devinfo.ver < 71 ||
4308 (device->devinfo.ver == 71 && device->devinfo.rev < 6)) {
4309 submit->cfg[4] = num_batches - 1;
4310 } else {
4311 submit->cfg[4] = num_batches;
4312 }
4313 assert(submit->cfg[4] != ~0);
4314
4315 assert(pipeline->shared_data->assembly_bo);
4316 struct v3dv_bo *cs_assembly_bo = pipeline->shared_data->assembly_bo;
4317
4318 submit->cfg[5] = cs_assembly_bo->offset + cs_variant->assembly_offset;
4319 if (cs_variant->prog_data.base->single_seg)
4320 submit->cfg[5] |= V3D_CSD_CFG5_SINGLE_SEG;
4321 if (cs_variant->prog_data.base->threads == 4)
4322 submit->cfg[5] |= V3D_CSD_CFG5_THREADING;
4323 /* V3D 7.x has made the PROPAGATE_NANS bit in CFG5 reserved */
4324 if (device->devinfo.ver < 71)
4325 submit->cfg[5] |= V3D_CSD_CFG5_PROPAGATE_NANS;
4326
4327 if (cs_variant->prog_data.cs->shared_size > 0) {
4328 job->csd.shared_memory =
4329 v3dv_bo_alloc(cmd_buffer->device,
4330 cs_variant->prog_data.cs->shared_size * num_wgs,
4331 "shared_vars", true);
4332 if (!job->csd.shared_memory) {
4333 v3dv_flag_oom(cmd_buffer, NULL);
4334 return job;
4335 }
4336 }
4337
4338 v3dv_job_add_bo_unchecked(job, cs_assembly_bo);
4339 struct v3dv_cl_reloc uniforms =
4340 v3dv_write_uniforms_wg_offsets(cmd_buffer, pipeline,
4341 cs_variant,
4342 wg_uniform_offsets_out);
4343 submit->cfg[6] = uniforms.bo->offset + uniforms.offset;
4344
4345
4346 /* Track VK_KHR_buffer_device_address usage in the job */
4347 job->uses_buffer_device_address |= pipeline->uses_buffer_device_address;
4348
4349 v3dv_job_add_bo(job, uniforms.bo);
4350
4351 return job;
4352 }
4353
4354 static void
4355 cmd_buffer_dispatch(struct v3dv_cmd_buffer *cmd_buffer,
4356 uint32_t base_offset_x,
4357 uint32_t base_offset_y,
4358 uint32_t base_offset_z,
4359 uint32_t group_count_x,
4360 uint32_t group_count_y,
4361 uint32_t group_count_z)
4362 {
4363 if (group_count_x == 0 || group_count_y == 0 || group_count_z == 0)
4364 return;
4365
4366 struct v3dv_job *job =
4367 cmd_buffer_create_csd_job(cmd_buffer,
4368 base_offset_x,
4369 base_offset_y,
4370 base_offset_z,
4371 group_count_x,
4372 group_count_y,
4373 group_count_z,
4374 NULL, NULL);
4375
4376 list_addtail(&job->list_link, &cmd_buffer->jobs);
4377 cmd_buffer->state.job = NULL;
4378 }
4379
4380 VKAPI_ATTR void VKAPI_CALL
4381 v3dv_CmdDispatchBase(VkCommandBuffer commandBuffer,
4382 uint32_t baseGroupX,
4383 uint32_t baseGroupY,
4384 uint32_t baseGroupZ,
4385 uint32_t groupCountX,
4386 uint32_t groupCountY,
4387 uint32_t groupCountZ)
4388 {
4389 V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
4390
4391 cmd_buffer_emit_pre_dispatch(cmd_buffer);
4392 cmd_buffer_dispatch(cmd_buffer,
4393 baseGroupX, baseGroupY, baseGroupZ,
4394 groupCountX, groupCountY, groupCountZ);
4395 }
4396
4397
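/* The indirect buffer is expected to contain a VkDispatchIndirectCommand at
 * `offset`, i.e. three consecutive uint32_t values (x, y, z group counts).
 * The CPU job recorded below reads those three words at submit time and then
 * patches and dispatches the pre-built CSD job accordingly.
 */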
4398 static void
4399 cmd_buffer_dispatch_indirect(struct v3dv_cmd_buffer *cmd_buffer,
4400 struct v3dv_buffer *buffer,
4401 uint32_t offset)
4402 {
4403 /* We can't do indirect dispatches, so instead we record a CPU job that,
4404 * when executed in the queue, will map the indirect buffer, read the
4405 * dispatch parameters, and submit a regular dispatch.
4406 */
4407 struct v3dv_job *job =
4408 v3dv_cmd_buffer_create_cpu_job(cmd_buffer->device,
4409 V3DV_JOB_TYPE_CPU_CSD_INDIRECT,
4410 cmd_buffer, -1);
4411 v3dv_return_if_oom(cmd_buffer, NULL);
4412
4413 /* We need to create a CSD job now, even if we still don't know the actual
4414 * dispatch parameters, because the job setup needs to be done using the
4415 * current command buffer state (i.e. pipeline, descriptor sets, push
4416 * constants, etc.). So we create the job with default dispatch parameters
4417 * and we will rewrite the parts we need at submit time if the indirect
4418 * parameters don't match the ones we used to set up the job.
4419 */
4420 struct v3dv_job *csd_job =
4421 cmd_buffer_create_csd_job(cmd_buffer,
4422 0, 0, 0,
4423 1, 1, 1,
4424 &job->cpu.csd_indirect.wg_uniform_offsets[0],
4425 &job->cpu.csd_indirect.wg_size);
4426 v3dv_return_if_oom(cmd_buffer, NULL);
4427 assert(csd_job);
4428
4429 job->cpu.csd_indirect.buffer = buffer;
4430 job->cpu.csd_indirect.offset = offset;
4431 job->cpu.csd_indirect.csd_job = csd_job;
4432
4433 /* If the compute shader reads the workgroup sizes we will also need to
4434 * rewrite the corresponding uniforms.
4435 */
4436 job->cpu.csd_indirect.needs_wg_uniform_rewrite =
4437 job->cpu.csd_indirect.wg_uniform_offsets[0] ||
4438 job->cpu.csd_indirect.wg_uniform_offsets[1] ||
4439 job->cpu.csd_indirect.wg_uniform_offsets[2];
4440
4441 list_addtail(&job->list_link, &cmd_buffer->jobs);
4442
4443 /* If we have a CPU queue we submit the CPU job directly to the
4444 * queue and the CSD job will be dispatched from within the kernel
4445 * queue; otherwise we will have to dispatch the CSD job manually
4446 * right after the CPU job by adding it to the list of jobs in the
4447 * command buffer.
4448 */
4449 if (!cmd_buffer->device->pdevice->caps.cpu_queue)
4450 list_addtail(&csd_job->list_link, &cmd_buffer->jobs);
4451
4452 cmd_buffer->state.job = NULL;
4453 }
4454
4455 VKAPI_ATTR void VKAPI_CALL
4456 v3dv_CmdDispatchIndirect(VkCommandBuffer commandBuffer,
4457 VkBuffer _buffer,
4458 VkDeviceSize offset)
4459 {
4460 V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
4461 V3DV_FROM_HANDLE(v3dv_buffer, buffer, _buffer);
4462
4463 assert(offset <= UINT32_MAX);
4464
4465 cmd_buffer_emit_pre_dispatch(cmd_buffer);
4466 cmd_buffer_dispatch_indirect(cmd_buffer, buffer, offset);
4467 }
4468