1 /*
2 * Copyright © 2022 Imagination Technologies Ltd.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a copy
5 * of this software and associated documentation files (the "Software"), to deal
6 * in the Software without restriction, including without limitation the rights
7 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
8 * copies of the Software, and to permit persons to whom the Software is
9 * furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 * SOFTWARE.
22 */
23
24 #include <assert.h>
25 #include <limits.h>
26 #include <stdbool.h>
27 #include <stddef.h>
28 #include <stdint.h>
29 #include <string.h>
30 #include <vulkan/vulkan.h>
31
32 #include "hwdef/rogue_hw_defs.h"
33 #include "hwdef/rogue_hw_utils.h"
34 #include "pvr_blit.h"
35 #include "pvr_bo.h"
36 #include "pvr_clear.h"
37 #include "pvr_common.h"
38 #include "pvr_csb.h"
39 #include "pvr_csb_enum_helpers.h"
40 #include "pvr_device_info.h"
41 #include "pvr_formats.h"
42 #include "pvr_hardcode.h"
43 #include "pvr_hw_pass.h"
44 #include "pvr_job_common.h"
45 #include "pvr_job_render.h"
46 #include "pvr_limits.h"
47 #include "pvr_pds.h"
48 #include "pvr_private.h"
49 #include "pvr_tex_state.h"
50 #include "pvr_types.h"
51 #include "pvr_uscgen.h"
52 #include "pvr_winsys.h"
53 #include "util/bitscan.h"
54 #include "util/bitset.h"
55 #include "util/compiler.h"
56 #include "util/list.h"
57 #include "util/macros.h"
58 #include "util/u_dynarray.h"
59 #include "util/u_math.h"
60 #include "util/u_pack_color.h"
61 #include "vk_alloc.h"
62 #include "vk_command_buffer.h"
63 #include "vk_command_pool.h"
64 #include "vk_common_entrypoints.h"
65 #include "vk_format.h"
66 #include "vk_graphics_state.h"
67 #include "vk_log.h"
68 #include "vk_object.h"
69 #include "vk_util.h"
70
71 /* Structure used to pass data into pvr_compute_generate_control_stream()
72 * function.
73 */
74 struct pvr_compute_kernel_info {
75 pvr_dev_addr_t indirect_buffer_addr;
76 bool global_offsets_present;
77 uint32_t usc_common_size;
78 uint32_t usc_unified_size;
79 uint32_t pds_temp_size;
80 uint32_t pds_data_size;
81 enum PVRX(CDMCTRL_USC_TARGET) usc_target;
82 bool is_fence;
83 uint32_t pds_data_offset;
84 uint32_t pds_code_offset;
85 enum PVRX(CDMCTRL_SD_TYPE) sd_type;
86 bool usc_common_shared;
87 uint32_t local_size[PVR_WORKGROUP_DIMENSIONS];
88 uint32_t global_size[PVR_WORKGROUP_DIMENSIONS];
89 uint32_t max_instances;
90 };
91
static void pvr_cmd_buffer_free_sub_cmd(struct pvr_cmd_buffer *cmd_buffer,
                                        struct pvr_sub_cmd *sub_cmd)
94 {
95 if (sub_cmd->owned) {
96 switch (sub_cmd->type) {
97 case PVR_SUB_CMD_TYPE_GRAPHICS:
98 util_dynarray_fini(&sub_cmd->gfx.sec_query_indices);
99 pvr_csb_finish(&sub_cmd->gfx.control_stream);
100 pvr_bo_free(cmd_buffer->device, sub_cmd->gfx.terminate_ctrl_stream);
101 pvr_bo_suballoc_free(sub_cmd->gfx.depth_bias_bo);
102 pvr_bo_suballoc_free(sub_cmd->gfx.scissor_bo);
103 break;
104
105 case PVR_SUB_CMD_TYPE_COMPUTE:
106 case PVR_SUB_CMD_TYPE_OCCLUSION_QUERY:
107 pvr_csb_finish(&sub_cmd->compute.control_stream);
108 break;
109
110 case PVR_SUB_CMD_TYPE_TRANSFER:
111 list_for_each_entry_safe (struct pvr_transfer_cmd,
112 transfer_cmd,
113 sub_cmd->transfer.transfer_cmds,
114 link) {
115 list_del(&transfer_cmd->link);
116 if (!transfer_cmd->is_deferred_clear)
117 vk_free(&cmd_buffer->vk.pool->alloc, transfer_cmd);
118 }
119 break;
120
121 case PVR_SUB_CMD_TYPE_EVENT:
122 if (sub_cmd->event.type == PVR_EVENT_TYPE_WAIT)
123 vk_free(&cmd_buffer->vk.pool->alloc, sub_cmd->event.wait.events);
124 break;
125
126 default:
127 unreachable("Unsupported sub-command type");
128 }
129 }
130
131 list_del(&sub_cmd->link);
132 vk_free(&cmd_buffer->vk.pool->alloc, sub_cmd);
133 }
134
static void pvr_cmd_buffer_free_sub_cmds(struct pvr_cmd_buffer *cmd_buffer)
136 {
137 list_for_each_entry_safe (struct pvr_sub_cmd,
138 sub_cmd,
139 &cmd_buffer->sub_cmds,
140 link) {
141 pvr_cmd_buffer_free_sub_cmd(cmd_buffer, sub_cmd);
142 }
143 }
144
static void pvr_cmd_buffer_free_resources(struct pvr_cmd_buffer *cmd_buffer)
146 {
147 vk_free(&cmd_buffer->vk.pool->alloc,
148 cmd_buffer->state.render_pass_info.attachments);
149 vk_free(&cmd_buffer->vk.pool->alloc,
150 cmd_buffer->state.render_pass_info.clear_values);
151
152 util_dynarray_fini(&cmd_buffer->state.query_indices);
153
154 pvr_cmd_buffer_free_sub_cmds(cmd_buffer);
155
156 list_for_each_entry_safe (struct pvr_suballoc_bo,
157 suballoc_bo,
158 &cmd_buffer->bo_list,
159 link) {
160 list_del(&suballoc_bo->link);
161 pvr_bo_suballoc_free(suballoc_bo);
162 }
163
164 util_dynarray_fini(&cmd_buffer->deferred_clears);
165 util_dynarray_fini(&cmd_buffer->deferred_csb_commands);
166 util_dynarray_fini(&cmd_buffer->scissor_array);
167 util_dynarray_fini(&cmd_buffer->depth_bias_array);
168 }
169
static void pvr_cmd_buffer_reset(struct vk_command_buffer *vk_cmd_buffer,
                                 VkCommandBufferResetFlags flags)
172 {
173 struct pvr_cmd_buffer *cmd_buffer =
174 container_of(vk_cmd_buffer, struct pvr_cmd_buffer, vk);
175
176 /* FIXME: For now we always free all resources as if
177 * VK_COMMAND_BUFFER_RESET_RELEASE_RESOURCES_BIT was set.
178 */
179 pvr_cmd_buffer_free_resources(cmd_buffer);
180
181 vk_command_buffer_reset(&cmd_buffer->vk);
182
183 memset(&cmd_buffer->state, 0, sizeof(cmd_buffer->state));
184 memset(&cmd_buffer->scissor_words, 0, sizeof(cmd_buffer->scissor_words));
185
186 cmd_buffer->usage_flags = 0;
187 }
188
static void pvr_cmd_buffer_destroy(struct vk_command_buffer *vk_cmd_buffer)
190 {
191 struct pvr_cmd_buffer *cmd_buffer =
192 container_of(vk_cmd_buffer, struct pvr_cmd_buffer, vk);
193
194 pvr_cmd_buffer_free_resources(cmd_buffer);
195 vk_command_buffer_finish(&cmd_buffer->vk);
196 vk_free(&cmd_buffer->vk.pool->alloc, cmd_buffer);
197 }
198
199 static const struct vk_command_buffer_ops cmd_buffer_ops = {
200 .reset = pvr_cmd_buffer_reset,
201 .destroy = pvr_cmd_buffer_destroy,
202 };
203
static VkResult pvr_cmd_buffer_create(struct pvr_device *device,
                                      struct vk_command_pool *pool,
                                      VkCommandBufferLevel level,
                                      VkCommandBuffer *pCommandBuffer)
208 {
209 struct pvr_cmd_buffer *cmd_buffer;
210 VkResult result;
211
212 cmd_buffer = vk_zalloc(&pool->alloc,
213 sizeof(*cmd_buffer),
214 8U,
215 VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
216 if (!cmd_buffer)
217 return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
218
219 result =
220 vk_command_buffer_init(pool, &cmd_buffer->vk, &cmd_buffer_ops, level);
221 if (result != VK_SUCCESS) {
222 vk_free(&pool->alloc, cmd_buffer);
223 return result;
224 }
225
226 cmd_buffer->device = device;
227
228 util_dynarray_init(&cmd_buffer->depth_bias_array, NULL);
229 util_dynarray_init(&cmd_buffer->scissor_array, NULL);
230 util_dynarray_init(&cmd_buffer->deferred_csb_commands, NULL);
231 util_dynarray_init(&cmd_buffer->deferred_clears, NULL);
232
233 list_inithead(&cmd_buffer->sub_cmds);
234 list_inithead(&cmd_buffer->bo_list);
235
236 *pCommandBuffer = pvr_cmd_buffer_to_handle(cmd_buffer);
237
238 return VK_SUCCESS;
239 }
240
241 VkResult
pvr_AllocateCommandBuffers(VkDevice _device,
                           const VkCommandBufferAllocateInfo *pAllocateInfo,
                           VkCommandBuffer *pCommandBuffers)
245 {
246 VK_FROM_HANDLE(vk_command_pool, pool, pAllocateInfo->commandPool);
247 PVR_FROM_HANDLE(pvr_device, device, _device);
248 VkResult result = VK_SUCCESS;
249 uint32_t i;
250
251 for (i = 0; i < pAllocateInfo->commandBufferCount; i++) {
252 result = pvr_cmd_buffer_create(device,
253 pool,
254 pAllocateInfo->level,
255 &pCommandBuffers[i]);
256 if (result != VK_SUCCESS)
257 break;
258 }
259
260 if (result != VK_SUCCESS) {
261 while (i--) {
262 VK_FROM_HANDLE(vk_command_buffer, cmd_buffer, pCommandBuffers[i]);
263 pvr_cmd_buffer_destroy(cmd_buffer);
264 }
265
266 for (i = 0; i < pAllocateInfo->commandBufferCount; i++)
267 pCommandBuffers[i] = VK_NULL_HANDLE;
268 }
269
270 return result;
271 }
272
static void pvr_cmd_buffer_update_barriers(struct pvr_cmd_buffer *cmd_buffer,
                                           enum pvr_sub_cmd_type type)
275 {
276 struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
277 uint32_t barriers;
278
279 switch (type) {
280 case PVR_SUB_CMD_TYPE_GRAPHICS:
281 barriers = PVR_PIPELINE_STAGE_GEOM_BIT | PVR_PIPELINE_STAGE_FRAG_BIT;
282 break;
283
284 case PVR_SUB_CMD_TYPE_COMPUTE:
285 barriers = PVR_PIPELINE_STAGE_COMPUTE_BIT;
286 break;
287
288 case PVR_SUB_CMD_TYPE_OCCLUSION_QUERY:
289 case PVR_SUB_CMD_TYPE_TRANSFER:
      /* Compute jobs are used for occlusion queries, but to copy the results
       * we have to sync with transfer jobs because vkCmdCopyQueryPoolResults()
       * is deemed a transfer operation by the spec.
       */
294 barriers = PVR_PIPELINE_STAGE_TRANSFER_BIT;
295 break;
296
297 case PVR_SUB_CMD_TYPE_EVENT:
298 barriers = 0;
299 break;
300
301 default:
302 unreachable("Unsupported sub-command type");
303 }
304
305 for (uint32_t i = 0; i < ARRAY_SIZE(state->barriers_needed); i++)
306 state->barriers_needed[i] |= barriers;
307 }
308
309 static VkResult
pvr_cmd_buffer_upload_tables(struct pvr_device *device,
                             struct pvr_cmd_buffer *cmd_buffer,
                             struct pvr_sub_cmd_gfx *const sub_cmd)
313 {
314 const uint32_t cache_line_size =
315 rogue_get_slc_cache_line_size(&device->pdevice->dev_info);
316 VkResult result;
317
318 assert(!sub_cmd->depth_bias_bo && !sub_cmd->scissor_bo);
319
320 if (cmd_buffer->depth_bias_array.size > 0) {
321 result =
322 pvr_gpu_upload(device,
323 device->heaps.general_heap,
324 util_dynarray_begin(&cmd_buffer->depth_bias_array),
325 cmd_buffer->depth_bias_array.size,
326 cache_line_size,
327 &sub_cmd->depth_bias_bo);
328 if (result != VK_SUCCESS)
329 return result;
330 }
331
332 if (cmd_buffer->scissor_array.size > 0) {
333 result = pvr_gpu_upload(device,
334 device->heaps.general_heap,
335 util_dynarray_begin(&cmd_buffer->scissor_array),
336 cmd_buffer->scissor_array.size,
337 cache_line_size,
338 &sub_cmd->scissor_bo);
339 if (result != VK_SUCCESS)
340 goto err_free_depth_bias_bo;
341 }
342
343 util_dynarray_clear(&cmd_buffer->depth_bias_array);
344 util_dynarray_clear(&cmd_buffer->scissor_array);
345
346 return VK_SUCCESS;
347
348 err_free_depth_bias_bo:
349 pvr_bo_suballoc_free(sub_cmd->depth_bias_bo);
350 sub_cmd->depth_bias_bo = NULL;
351
352 return result;
353 }
354
355 static VkResult
pvr_cmd_buffer_emit_ppp_state(const struct pvr_cmd_buffer *const cmd_buffer,
                              struct pvr_csb *const csb)
358 {
359 const struct pvr_framebuffer *const framebuffer =
360 cmd_buffer->state.render_pass_info.framebuffer;
361
362 assert(csb->stream_type == PVR_CMD_STREAM_TYPE_GRAPHICS ||
363 csb->stream_type == PVR_CMD_STREAM_TYPE_GRAPHICS_DEFERRED);
364
365 pvr_csb_set_relocation_mark(csb);
366
367 pvr_csb_emit (csb, VDMCTRL_PPP_STATE0, state0) {
368 state0.addrmsb = framebuffer->ppp_state_bo->dev_addr;
369 state0.word_count = framebuffer->ppp_state_size;
370 }
371
372 pvr_csb_emit (csb, VDMCTRL_PPP_STATE1, state1) {
373 state1.addrlsb = framebuffer->ppp_state_bo->dev_addr;
374 }
375
376 pvr_csb_clear_relocation_mark(csb);
377
378 return csb->status;
379 }
380
381 VkResult
pvr_cmd_buffer_upload_general(struct pvr_cmd_buffer *const cmd_buffer,
                              const void *const data,
                              const size_t size,
                              struct pvr_suballoc_bo **const pvr_bo_out)
386 {
387 struct pvr_device *const device = cmd_buffer->device;
388 const uint32_t cache_line_size =
389 rogue_get_slc_cache_line_size(&device->pdevice->dev_info);
390 struct pvr_suballoc_bo *suballoc_bo;
391 VkResult result;
392
393 result = pvr_gpu_upload(device,
394 device->heaps.general_heap,
395 data,
396 size,
397 cache_line_size,
398 &suballoc_bo);
399 if (result != VK_SUCCESS)
400 return pvr_cmd_buffer_set_error_unwarned(cmd_buffer, result);
401
402 list_add(&suballoc_bo->link, &cmd_buffer->bo_list);
403
404 *pvr_bo_out = suballoc_bo;
405
406 return VK_SUCCESS;
407 }
408
409 static VkResult
pvr_cmd_buffer_upload_usc(struct pvr_cmd_buffer *const cmd_buffer,
                          const void *const code,
                          const size_t code_size,
                          uint64_t code_alignment,
                          struct pvr_suballoc_bo **const pvr_bo_out)
415 {
416 struct pvr_device *const device = cmd_buffer->device;
417 const uint32_t cache_line_size =
418 rogue_get_slc_cache_line_size(&device->pdevice->dev_info);
419 struct pvr_suballoc_bo *suballoc_bo;
420 VkResult result;
421
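   /* Raise the requested alignment to at least the SLC cache line size. */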
422 code_alignment = MAX2(code_alignment, cache_line_size);
423
424 result =
425 pvr_gpu_upload_usc(device, code, code_size, code_alignment, &suballoc_bo);
426 if (result != VK_SUCCESS)
427 return pvr_cmd_buffer_set_error_unwarned(cmd_buffer, result);
428
429 list_add(&suballoc_bo->link, &cmd_buffer->bo_list);
430
431 *pvr_bo_out = suballoc_bo;
432
433 return VK_SUCCESS;
434 }
435
VkResult pvr_cmd_buffer_upload_pds(struct pvr_cmd_buffer *const cmd_buffer,
                                   const uint32_t *data,
                                   uint32_t data_size_dwords,
                                   uint32_t data_alignment,
                                   const uint32_t *code,
                                   uint32_t code_size_dwords,
                                   uint32_t code_alignment,
                                   uint64_t min_alignment,
                                   struct pvr_pds_upload *const pds_upload_out)
445 {
446 struct pvr_device *const device = cmd_buffer->device;
447 VkResult result;
448
449 result = pvr_gpu_upload_pds(device,
450 data,
451 data_size_dwords,
452 data_alignment,
453 code,
454 code_size_dwords,
455 code_alignment,
456 min_alignment,
457 pds_upload_out);
458 if (result != VK_SUCCESS)
459 return pvr_cmd_buffer_set_error_unwarned(cmd_buffer, result);
460
461 list_add(&pds_upload_out->pvr_bo->link, &cmd_buffer->bo_list);
462
463 return VK_SUCCESS;
464 }
465
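/* Thin wrapper for uploading a data-only PDS program: the code pointer and
 * code size are passed as NULL/0 so only the data segment is uploaded.
 */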
466 static inline VkResult
pvr_cmd_buffer_upload_pds_data(struct pvr_cmd_buffer *const cmd_buffer,
                               const uint32_t *data,
                               uint32_t data_size_dwords,
                               uint32_t data_alignment,
                               struct pvr_pds_upload *const pds_upload_out)
472 {
473 return pvr_cmd_buffer_upload_pds(cmd_buffer,
474 data,
475 data_size_dwords,
476 data_alignment,
477 NULL,
478 0,
479 0,
480 data_alignment,
481 pds_upload_out);
482 }
483
/* pbe_cs_words must be an array of length emit_count, with each element
 * containing ROGUE_NUM_PBESTATE_STATE_WORDS entries.
 */
static VkResult pvr_sub_cmd_gfx_per_job_fragment_programs_create_and_upload(
   struct pvr_cmd_buffer *const cmd_buffer,
   const uint32_t emit_count,
   const uint32_t *pbe_cs_words,
   struct pvr_pds_upload *const pds_upload_out)
492 {
493 struct pvr_pds_event_program pixel_event_program = {
494 /* No data to DMA, just a DOUTU needed. */
495 .num_emit_word_pairs = 0,
496 };
497 const uint32_t staging_buffer_size =
498 PVR_DW_TO_BYTES(cmd_buffer->device->pixel_event_data_size_in_dwords);
499 const VkAllocationCallbacks *const allocator = &cmd_buffer->vk.pool->alloc;
500 struct pvr_device *const device = cmd_buffer->device;
501 struct pvr_suballoc_bo *usc_eot_program = NULL;
502 struct util_dynarray eot_program_bin;
503 uint32_t *staging_buffer;
504 uint32_t usc_temp_count;
505 VkResult result;
506
507 assert(emit_count > 0);
508
509 pvr_uscgen_eot("per-job EOT",
510 emit_count,
511 pbe_cs_words,
512 &usc_temp_count,
513 &eot_program_bin);
514
515 result = pvr_cmd_buffer_upload_usc(cmd_buffer,
516 eot_program_bin.data,
517 eot_program_bin.size,
518 4,
519 &usc_eot_program);
520
521 util_dynarray_fini(&eot_program_bin);
522
523 if (result != VK_SUCCESS)
524 return result;
525
526 pvr_pds_setup_doutu(&pixel_event_program.task_control,
527 usc_eot_program->dev_addr.addr,
528 usc_temp_count,
529 PVRX(PDSINST_DOUTU_SAMPLE_RATE_INSTANCE),
530 false);
531
532 /* TODO: We could skip allocating this and generate directly into the device
533 * buffer thus removing one allocation and memcpy() per job. Would this
534 * speed up things in a noticeable way?
535 */
536 staging_buffer = vk_alloc(allocator,
537 staging_buffer_size,
538 8,
539 VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
540 if (!staging_buffer) {
541 result = vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
542 goto err_free_usc_pixel_program;
543 }
544
545 /* Generate the data segment. The code segment was uploaded earlier when
546 * setting up the PDS static heap data.
547 */
548 pvr_pds_generate_pixel_event_data_segment(&pixel_event_program,
549 staging_buffer,
550 &device->pdevice->dev_info);
551
552 result = pvr_cmd_buffer_upload_pds_data(
553 cmd_buffer,
554 staging_buffer,
555 cmd_buffer->device->pixel_event_data_size_in_dwords,
556 4,
557 pds_upload_out);
558 if (result != VK_SUCCESS)
559 goto err_free_pixel_event_staging_buffer;
560
561 vk_free(allocator, staging_buffer);
562
563 return VK_SUCCESS;
564
565 err_free_pixel_event_staging_buffer:
566 vk_free(allocator, staging_buffer);
567
568 err_free_usc_pixel_program:
569 list_del(&usc_eot_program->link);
570 pvr_bo_suballoc_free(usc_eot_program);
571
572 return result;
573 }
574
static VkResult pvr_sub_cmd_gfx_build_terminate_ctrl_stream(
   struct pvr_device *const device,
   const struct pvr_cmd_buffer *const cmd_buffer,
   struct pvr_sub_cmd_gfx *const gfx_sub_cmd)
579 {
580 struct list_head bo_list;
581 struct pvr_csb csb;
582 VkResult result;
583
584 pvr_csb_init(device, PVR_CMD_STREAM_TYPE_GRAPHICS, &csb);
585
586 result = pvr_cmd_buffer_emit_ppp_state(cmd_buffer, &csb);
587 if (result != VK_SUCCESS)
588 goto err_csb_finish;
589
590 result = pvr_csb_emit_terminate(&csb);
591 if (result != VK_SUCCESS)
592 goto err_csb_finish;
593
594 result = pvr_csb_bake(&csb, &bo_list);
595 if (result != VK_SUCCESS)
596 goto err_csb_finish;
597
   /* This is a trivial control stream; there's no reason it should ever
    * require more memory than a single bo can provide.
    */
601 assert(list_is_singular(&bo_list));
602 gfx_sub_cmd->terminate_ctrl_stream =
603 list_first_entry(&bo_list, struct pvr_bo, link);
604
605 return VK_SUCCESS;
606
607 err_csb_finish:
608 pvr_csb_finish(&csb);
609
610 return result;
611 }
612
static VkResult pvr_setup_texture_state_words(
   struct pvr_device *device,
   struct pvr_combined_image_sampler_descriptor *descriptor,
   const struct pvr_image_view *image_view)
617 {
618 const struct pvr_image *image = vk_to_pvr_image(image_view->vk.image);
619 struct pvr_texture_state_info info = {
620 .format = image_view->vk.format,
621 .mem_layout = image->memlayout,
622 .type = image_view->vk.view_type,
623 .is_cube = image_view->vk.view_type == VK_IMAGE_VIEW_TYPE_CUBE ||
624 image_view->vk.view_type == VK_IMAGE_VIEW_TYPE_CUBE_ARRAY,
625 .tex_state_type = PVR_TEXTURE_STATE_SAMPLE,
626 .extent = image_view->vk.extent,
627 .mip_levels = 1,
628 .sample_count = image_view->vk.image->samples,
629 .stride = image->physical_extent.width,
630 .addr = image->dev_addr,
631 };
632 const uint8_t *const swizzle = pvr_get_format_swizzle(info.format);
633 VkResult result;
634
635 memcpy(&info.swizzle, swizzle, sizeof(info.swizzle));
636
637 /* TODO: Can we use image_view->texture_state instead of generating here? */
638 result = pvr_pack_tex_state(device, &info, descriptor->image);
639 if (result != VK_SUCCESS)
640 return result;
641
642 descriptor->sampler = (union pvr_sampler_descriptor){ 0 };
643
644 pvr_csb_pack (&descriptor->sampler.data.sampler_word,
645 TEXSTATE_SAMPLER,
646 sampler) {
647 sampler.non_normalized_coords = true;
648 sampler.addrmode_v = PVRX(TEXSTATE_ADDRMODE_CLAMP_TO_EDGE);
649 sampler.addrmode_u = PVRX(TEXSTATE_ADDRMODE_CLAMP_TO_EDGE);
650 sampler.minfilter = PVRX(TEXSTATE_FILTER_POINT);
651 sampler.magfilter = PVRX(TEXSTATE_FILTER_POINT);
652 sampler.dadjust = PVRX(TEXSTATE_DADJUST_ZERO_UINT);
653 }
654
655 return VK_SUCCESS;
656 }
657
658 static VkResult
pvr_load_op_constants_create_and_upload(struct pvr_cmd_buffer *cmd_buffer,
                                        const struct pvr_load_op *load_op,
                                        pvr_dev_addr_t *const addr_out)
662 {
663 const struct pvr_render_pass_info *render_pass_info =
664 &cmd_buffer->state.render_pass_info;
665 const struct pvr_render_pass *pass = render_pass_info->pass;
666 const struct pvr_renderpass_hwsetup_render *hw_render = load_op->hw_render;
667 const struct pvr_renderpass_colorinit *color_init =
668 &hw_render->color_init[0];
669 const VkClearValue *clear_value =
670 &render_pass_info->clear_values[color_init->index];
671 struct pvr_suballoc_bo *clear_bo;
672 uint32_t attachment_count;
673 bool has_depth_clear;
674 bool has_depth_load;
675 VkResult result;
676
677 /* These are only setup and never used for now. These will need to be
678 * uploaded into a buffer based on some compiler info.
679 */
680 /* TODO: Remove the above comment once the compiler is hooked up and we're
681 * setting up + uploading the buffer.
682 */
683 struct pvr_combined_image_sampler_descriptor
684 texture_states[PVR_LOAD_OP_CLEARS_LOADS_MAX_RTS];
685 uint32_t texture_count = 0;
686 uint32_t hw_clear_value[PVR_LOAD_OP_CLEARS_LOADS_MAX_RTS *
687 PVR_CLEAR_COLOR_ARRAY_SIZE];
688 uint32_t next_clear_consts = 0;
689
690 if (load_op->is_hw_object)
691 attachment_count = load_op->hw_render->color_init_count;
692 else
693 attachment_count = load_op->subpass->color_count;
694
695 for (uint32_t i = 0; i < attachment_count; i++) {
696 struct pvr_image_view *image_view;
697 uint32_t attachment_idx;
698
699 if (load_op->is_hw_object)
700 attachment_idx = load_op->hw_render->color_init[i].index;
701 else
702 attachment_idx = load_op->subpass->color_attachments[i];
703
704 image_view = render_pass_info->attachments[attachment_idx];
705
706 assert((load_op->clears_loads_state.rt_load_mask &
707 load_op->clears_loads_state.rt_clear_mask) == 0);
708 if (load_op->clears_loads_state.rt_load_mask & BITFIELD_BIT(i)) {
709 result = pvr_setup_texture_state_words(cmd_buffer->device,
710 &texture_states[texture_count],
711 image_view);
712 if (result != VK_SUCCESS)
713 return result;
714
715 texture_count++;
716 } else if (load_op->clears_loads_state.rt_clear_mask & BITFIELD_BIT(i)) {
717 const uint32_t accum_fmt_size =
718 pvr_get_pbe_accum_format_size_in_bytes(image_view->vk.format);
719
720 assert(next_clear_consts +
721 vk_format_get_blocksize(image_view->vk.format) <=
722 ARRAY_SIZE(hw_clear_value));
723
724 /* FIXME: do this at the point we store the clear values? */
725 pvr_get_hw_clear_color(image_view->vk.format,
726 clear_value->color,
727 &hw_clear_value[next_clear_consts]);
728
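         /* Advance by the number of dwords the packed clear colour occupies,
          * e.g. a 4-byte accumulation format consumes one dword and an
          * 8-byte format consumes two.
          */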
729 next_clear_consts += DIV_ROUND_UP(accum_fmt_size, sizeof(uint32_t));
730 }
731 }
732
733 has_depth_load = false;
734 for (uint32_t i = 0;
735 i < ARRAY_SIZE(load_op->clears_loads_state.dest_vk_format);
736 i++) {
737 if (load_op->clears_loads_state.dest_vk_format[i] ==
738 VK_FORMAT_D32_SFLOAT) {
739 has_depth_load = true;
740 break;
741 }
742 }
743
744 has_depth_clear = load_op->clears_loads_state.depth_clear_to_reg != -1;
745
746 assert(!(has_depth_clear && has_depth_load));
747
748 if (has_depth_load) {
749 const struct pvr_render_pass_attachment *attachment;
750 const struct pvr_image_view *image_view;
751
752 assert(load_op->subpass->depth_stencil_attachment !=
753 VK_ATTACHMENT_UNUSED);
754 assert(!load_op->is_hw_object);
755 attachment =
756 &pass->attachments[load_op->subpass->depth_stencil_attachment];
757
758 image_view = render_pass_info->attachments[attachment->index];
759
760 result = pvr_setup_texture_state_words(cmd_buffer->device,
761 &texture_states[texture_count],
762 image_view);
763 if (result != VK_SUCCESS)
764 return result;
765
766 texture_count++;
767 } else if (has_depth_clear) {
768 const struct pvr_render_pass_attachment *attachment;
769 VkClearValue clear_value;
770
771 assert(load_op->subpass->depth_stencil_attachment !=
772 VK_ATTACHMENT_UNUSED);
773 attachment =
774 &pass->attachments[load_op->subpass->depth_stencil_attachment];
775
776 clear_value = render_pass_info->clear_values[attachment->index];
777
778 assert(next_clear_consts < ARRAY_SIZE(hw_clear_value));
779 hw_clear_value[next_clear_consts++] = fui(clear_value.depthStencil.depth);
780 }
781
782 result = pvr_cmd_buffer_upload_general(cmd_buffer,
783 &hw_clear_value[0],
784 sizeof(hw_clear_value),
785 &clear_bo);
786 if (result != VK_SUCCESS)
787 return result;
788
789 *addr_out = clear_bo->dev_addr;
790
791 return VK_SUCCESS;
792 }
793
static VkResult pvr_load_op_pds_data_create_and_upload(
   struct pvr_cmd_buffer *cmd_buffer,
   const struct pvr_load_op *load_op,
   pvr_dev_addr_t constants_addr,
   struct pvr_pds_upload *const pds_upload_out)
799 {
800 struct pvr_device *device = cmd_buffer->device;
801 const struct pvr_device_info *dev_info = &device->pdevice->dev_info;
802 struct pvr_pds_pixel_shader_sa_program program = { 0 };
803 uint32_t staging_buffer_size;
804 uint32_t *staging_buffer;
805 VkResult result;
806
807 program.num_texture_dma_kicks = 1;
808
809 pvr_csb_pack (&program.texture_dma_address[0],
810 PDSINST_DOUT_FIELDS_DOUTD_SRC0,
811 value) {
812 value.sbase = constants_addr;
813 }
814
815 pvr_csb_pack (&program.texture_dma_control[0],
816 PDSINST_DOUT_FIELDS_DOUTD_SRC1,
817 value) {
818 value.dest = PVRX(PDSINST_DOUTD_DEST_COMMON_STORE);
819 value.a0 = load_op->shareds_dest_offset;
820 value.bsize = load_op->shareds_count;
821 }
822
823 pvr_pds_set_sizes_pixel_shader_sa_texture_data(&program, dev_info);
824
825 staging_buffer_size = PVR_DW_TO_BYTES(program.data_size);
826
827 staging_buffer = vk_alloc(&cmd_buffer->vk.pool->alloc,
828 staging_buffer_size,
829 8,
830 VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
831 if (!staging_buffer)
832 return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
833
834 pvr_pds_generate_pixel_shader_sa_texture_state_data(&program,
835 staging_buffer,
836 dev_info);
837
838 result = pvr_cmd_buffer_upload_pds_data(cmd_buffer,
839 staging_buffer,
840 program.data_size,
841 1,
842 pds_upload_out);
843 if (result != VK_SUCCESS) {
844 vk_free(&cmd_buffer->vk.pool->alloc, staging_buffer);
845 return result;
846 }
847
848 vk_free(&cmd_buffer->vk.pool->alloc, staging_buffer);
849
850 return VK_SUCCESS;
851 }
852
853 /* FIXME: Should this function be specific to the HW background object, in
854 * which case its name should be changed, or should it have the load op
855 * structure passed in?
856 */
857 static VkResult
pvr_load_op_data_create_and_upload(struct pvr_cmd_buffer *cmd_buffer,
                                   const struct pvr_load_op *load_op,
                                   struct pvr_pds_upload *const pds_upload_out)
861 {
862 pvr_dev_addr_t constants_addr;
863 VkResult result;
864
865 result = pvr_load_op_constants_create_and_upload(cmd_buffer,
866 load_op,
867 &constants_addr);
868 if (result != VK_SUCCESS)
869 return result;
870
871 return pvr_load_op_pds_data_create_and_upload(cmd_buffer,
872 load_op,
873 constants_addr,
874 pds_upload_out);
875 }
876
static void pvr_pds_bgnd_pack_state(
   const struct pvr_load_op *load_op,
   const struct pvr_pds_upload *load_op_program,
   uint64_t pds_reg_values[static const ROGUE_NUM_CR_PDS_BGRND_WORDS])
881 {
882 pvr_csb_pack (&pds_reg_values[0], CR_PDS_BGRND0_BASE, value) {
883 value.shader_addr = PVR_DEV_ADDR(load_op->pds_frag_prog.data_offset);
884 value.texunicode_addr =
885 PVR_DEV_ADDR(load_op->pds_tex_state_prog.code_offset);
886 }
887
888 pvr_csb_pack (&pds_reg_values[1], CR_PDS_BGRND1_BASE, value) {
889 value.texturedata_addr = PVR_DEV_ADDR(load_op_program->data_offset);
890 }
891
892 pvr_csb_pack (&pds_reg_values[2], CR_PDS_BGRND3_SIZEINFO, value) {
893 value.usc_sharedsize =
894 DIV_ROUND_UP(load_op->const_shareds_count,
895 PVRX(CR_PDS_BGRND3_SIZEINFO_USC_SHAREDSIZE_UNIT_SIZE));
896 value.pds_texturestatesize = DIV_ROUND_UP(
897 load_op_program->data_size,
898 PVRX(CR_PDS_BGRND3_SIZEINFO_PDS_TEXTURESTATESIZE_UNIT_SIZE));
899 value.pds_tempsize =
900 DIV_ROUND_UP(load_op->temps_count,
901 PVRX(CR_PDS_BGRND3_SIZEINFO_PDS_TEMPSIZE_UNIT_SIZE));
902 }
903 }
904
905 /**
906 * \brief Calculates the stride in pixels based on the pitch in bytes and pixel
907 * format.
908 *
909 * \param[in] pitch Width pitch in bytes.
910 * \param[in] vk_format Vulkan image format.
911 * \return Stride in pixels.
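 * For example, a 2048-byte pitch with a 4-byte-per-pixel format such as
 * VK_FORMAT_R8G8B8A8_UNORM gives a stride of 512 pixels.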
912 */
static inline uint32_t pvr_stride_from_pitch(uint32_t pitch, VkFormat vk_format)
914 {
915 const unsigned int cpp = vk_format_get_blocksize(vk_format);
916
917 assert(pitch % cpp == 0);
918
919 return pitch / cpp;
920 }
921
static void pvr_setup_pbe_state(
   const struct pvr_device_info *dev_info,
   const struct pvr_framebuffer *framebuffer,
   uint32_t mrt_index,
   const struct usc_mrt_resource *mrt_resource,
   const struct pvr_image_view *const iview,
   const VkRect2D *render_area,
   const bool down_scale,
   const uint32_t samples,
   uint32_t pbe_cs_words[static const ROGUE_NUM_PBESTATE_STATE_WORDS],
   uint64_t pbe_reg_words[static const ROGUE_NUM_PBESTATE_REG_WORDS])
933 {
934 const struct pvr_image *image = pvr_image_view_get_image(iview);
935 uint32_t level_pitch = image->mip_levels[iview->vk.base_mip_level].pitch;
936
937 struct pvr_pbe_surf_params surface_params;
938 struct pvr_pbe_render_params render_params;
939 bool with_packed_usc_channel;
940 const uint8_t *swizzle;
941 uint32_t position;
942
943 /* down_scale should be true when performing a resolve, in which case there
944 * should be more than one sample.
945 */
946 assert((down_scale && samples > 1U) || (!down_scale && samples == 1U));
947
948 /* Setup surface parameters. */
949
950 if (PVR_HAS_FEATURE(dev_info, usc_f16sop_u8)) {
951 with_packed_usc_channel = vk_format_is_unorm(iview->vk.format) ||
952 vk_format_is_snorm(iview->vk.format);
953 } else {
954 with_packed_usc_channel = false;
955 }
956
957 swizzle = pvr_get_format_swizzle(iview->vk.format);
958 memcpy(surface_params.swizzle, swizzle, sizeof(surface_params.swizzle));
959
960 pvr_pbe_get_src_format_and_gamma(iview->vk.format,
961 PVR_PBE_GAMMA_NONE,
962 with_packed_usc_channel,
963 &surface_params.source_format,
964 &surface_params.gamma);
965
966 surface_params.is_normalized = vk_format_is_normalized(iview->vk.format);
967 surface_params.pbe_packmode = pvr_get_pbe_packmode(iview->vk.format);
968 surface_params.nr_components = vk_format_get_nr_components(iview->vk.format);
969
970 /* FIXME: Should we have an inline function to return the address of a mip
971 * level?
972 */
973 surface_params.addr =
974 PVR_DEV_ADDR_OFFSET(image->vma->dev_addr,
975 image->mip_levels[iview->vk.base_mip_level].offset);
976 surface_params.addr =
977 PVR_DEV_ADDR_OFFSET(surface_params.addr,
978 iview->vk.base_array_layer * image->layer_size);
979
980 surface_params.mem_layout = image->memlayout;
981 surface_params.stride = pvr_stride_from_pitch(level_pitch, iview->vk.format);
982 surface_params.depth = iview->vk.extent.depth;
983 surface_params.width = iview->vk.extent.width;
984 surface_params.height = iview->vk.extent.height;
985 surface_params.z_only_render = false;
986 surface_params.down_scale = down_scale;
987
988 /* Setup render parameters. */
989
990 if (mrt_resource->type == USC_MRT_RESOURCE_TYPE_MEMORY) {
991 position = mrt_resource->mem.offset_dw;
992 } else {
993 assert(mrt_resource->type == USC_MRT_RESOURCE_TYPE_OUTPUT_REG);
994 assert(mrt_resource->reg.offset == 0);
995
996 position = mrt_resource->reg.output_reg;
997 }
998
999 assert(position <= 3 || PVR_HAS_FEATURE(dev_info, eight_output_registers));
1000
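   /* The source start position advances by 32 bits per output register;
    * positions 4-7 (only valid when the eight_output_registers feature is
    * present) wrap back onto the same four start positions.
    */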
1001 switch (position) {
1002 case 0:
1003 case 4:
1004 render_params.source_start = PVR_PBE_STARTPOS_BIT0;
1005 break;
1006 case 1:
1007 case 5:
1008 render_params.source_start = PVR_PBE_STARTPOS_BIT32;
1009 break;
1010 case 2:
1011 case 6:
1012 render_params.source_start = PVR_PBE_STARTPOS_BIT64;
1013 break;
1014 case 3:
1015 case 7:
1016 render_params.source_start = PVR_PBE_STARTPOS_BIT96;
1017 break;
1018 default:
1019 assert(!"Invalid output register");
1020 break;
1021 }
1022
1023 #define PVR_DEC_IF_NOT_ZERO(_v) (((_v) > 0) ? (_v)-1 : 0)
1024
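   /* The clip bounds are inclusive, e.g. a render area at (0, 0) with a
    * 1920x1080 extent in a framebuffer at least that large clips to
    * x in [0, 1919] and y in [0, 1079].
    */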
1025 render_params.min_x_clip = MAX2(0, render_area->offset.x);
1026 render_params.min_y_clip = MAX2(0, render_area->offset.y);
1027 render_params.max_x_clip = MIN2(
1028 framebuffer->width - 1,
1029 PVR_DEC_IF_NOT_ZERO(render_area->offset.x + render_area->extent.width));
1030 render_params.max_y_clip = MIN2(
1031 framebuffer->height - 1,
1032 PVR_DEC_IF_NOT_ZERO(render_area->offset.y + render_area->extent.height));
1033
1034 #undef PVR_DEC_IF_NOT_ZERO
1035
1036 render_params.slice = 0;
1037 render_params.mrt_index = mrt_index;
1038
1039 pvr_pbe_pack_state(dev_info,
1040 &surface_params,
1041 &render_params,
1042 pbe_cs_words,
1043 pbe_reg_words);
1044 }
1045
1046 static struct pvr_render_target *
pvr_get_render_target(const struct pvr_render_pass *pass,
                      const struct pvr_framebuffer *framebuffer,
                      uint32_t idx)
1050 {
1051 const struct pvr_renderpass_hwsetup_render *hw_render =
1052 &pass->hw_setup->renders[idx];
1053 uint32_t rt_idx = 0;
1054
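   /* Sample counts of 1, 2, 4 and 8 select render target indices 0, 1, 2 and
    * 3 respectively (log2 of the sample count).
    */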
1055 switch (hw_render->sample_count) {
1056 case 1:
1057 case 2:
1058 case 4:
1059 case 8:
1060 rt_idx = util_logbase2(hw_render->sample_count);
1061 break;
1062
1063 default:
1064 unreachable("Unsupported sample count");
1065 break;
1066 }
1067
1068 return &framebuffer->render_targets[rt_idx];
1069 }
1070
1071 static uint32_t
pvr_pass_get_pixel_output_width(const struct pvr_render_pass *pass,
                                uint32_t idx,
                                const struct pvr_device_info *dev_info)
1075 {
1076 const struct pvr_renderpass_hwsetup_render *hw_render =
1077 &pass->hw_setup->renders[idx];
1078 /* Default value based on the maximum value found in all existing cores. The
1079 * maximum is used as this is being treated as a lower bound, making it a
1080 * "safer" choice than the minimum value found in all existing cores.
1081 */
1082 const uint32_t min_output_regs =
1083 PVR_GET_FEATURE_VALUE(dev_info, usc_min_output_registers_per_pix, 2U);
1084 const uint32_t width = MAX2(hw_render->output_regs_count, min_output_regs);
1085
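   /* e.g. three output registers round up to a pixel output width of four. */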
1086 return util_next_power_of_two(width);
1087 }
1088
1089 static inline bool
pvr_ds_attachment_requires_zls(const struct pvr_ds_attachment *attachment)
1091 {
1092 bool zls_used;
1093
1094 zls_used = attachment->load.d || attachment->load.s;
1095 zls_used |= attachment->store.d || attachment->store.s;
1096
1097 return zls_used;
1098 }
1099
1100 /**
1101 * \brief If depth and/or stencil attachment dimensions are not tile-aligned,
1102 * then we may need to insert some additional transfer subcommands.
1103 *
1104 * It's worth noting that we check whether the dimensions are smaller than a
1105 * tile here, rather than checking whether they're tile-aligned - this relies
1106 * on the assumption that we can safely use any attachment with dimensions
1107 * larger than a tile. If the attachment is twiddled, it will be over-allocated
1108 * to the nearest power-of-two (which will be tile-aligned). If the attachment
1109 * is not twiddled, we don't need to worry about tile-alignment at all.
1110 */
static bool pvr_sub_cmd_gfx_requires_ds_subtile_alignment(
   const struct pvr_device_info *dev_info,
   const struct pvr_render_job *job)
1114 {
1115 const struct pvr_image *const ds_image =
1116 pvr_image_view_get_image(job->ds.iview);
1117 uint32_t zls_tile_size_x;
1118 uint32_t zls_tile_size_y;
1119
1120 rogue_get_zls_tile_size_xy(dev_info, &zls_tile_size_x, &zls_tile_size_y);
1121
1122 if (ds_image->physical_extent.width >= zls_tile_size_x &&
1123 ds_image->physical_extent.height >= zls_tile_size_y) {
1124 return false;
1125 }
1126
1127 /* If we have the zls_subtile feature, we can skip the alignment iff:
1128 * - The attachment is not multisampled, and
1129 * - The depth and stencil attachments are the same.
1130 */
1131 if (PVR_HAS_FEATURE(dev_info, zls_subtile) &&
1132 ds_image->vk.samples == VK_SAMPLE_COUNT_1_BIT &&
1133 job->has_stencil_attachment == job->has_depth_attachment) {
1134 return false;
1135 }
1136
1137 /* No ZLS functions enabled; nothing to do. */
1138 if ((!job->has_depth_attachment && !job->has_stencil_attachment) ||
1139 !pvr_ds_attachment_requires_zls(&job->ds)) {
1140 return false;
1141 }
1142
1143 return true;
1144 }
1145
1146 static VkResult
pvr_sub_cmd_gfx_align_ds_subtiles(struct pvr_cmd_buffer *const cmd_buffer,
                                  struct pvr_sub_cmd_gfx *const gfx_sub_cmd)
1149 {
1150 struct pvr_sub_cmd *const prev_sub_cmd =
1151 container_of(gfx_sub_cmd, struct pvr_sub_cmd, gfx);
1152 struct pvr_ds_attachment *const ds = &gfx_sub_cmd->job.ds;
1153 const struct pvr_image *const ds_image = pvr_image_view_get_image(ds->iview);
1154 const VkFormat copy_format = pvr_get_raw_copy_format(ds_image->vk.format);
1155
1156 struct pvr_suballoc_bo *buffer;
1157 uint32_t buffer_layer_size;
1158 VkBufferImageCopy2 region;
1159 VkExtent2D zls_tile_size;
1160 VkExtent2D rounded_size;
1161 uint32_t buffer_size;
1162 VkExtent2D scale;
1163 VkResult result;
1164
1165 /* The operations below assume the last command in the buffer was the target
1166 * gfx subcommand. Assert that this is the case.
1167 */
1168 assert(list_last_entry(&cmd_buffer->sub_cmds, struct pvr_sub_cmd, link) ==
1169 prev_sub_cmd);
1170
1171 if (!pvr_ds_attachment_requires_zls(ds))
1172 return VK_SUCCESS;
1173
1174 rogue_get_zls_tile_size_xy(&cmd_buffer->device->pdevice->dev_info,
1175 &zls_tile_size.width,
1176 &zls_tile_size.height);
1177 rogue_get_isp_scale_xy_from_samples(ds_image->vk.samples,
1178 &scale.width,
1179 &scale.height);
1180
1181 rounded_size = (VkExtent2D){
1182 .width = ALIGN_POT(ds_image->physical_extent.width, zls_tile_size.width),
1183 .height =
1184 ALIGN_POT(ds_image->physical_extent.height, zls_tile_size.height),
1185 };
1186
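   /* Size of one layer of the scratch copy: the tile-aligned dimensions
    * scaled by the per-sample ISP factors, times the format's bytes per
    * pixel.
    */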
1187 buffer_layer_size = vk_format_get_blocksize(ds_image->vk.format) *
1188 rounded_size.width * rounded_size.height * scale.width *
1189 scale.height;
1190
1191 if (ds->iview->vk.layer_count > 1)
1192 buffer_layer_size = ALIGN_POT(buffer_layer_size, ds_image->alignment);
1193
1194 buffer_size = buffer_layer_size * ds->iview->vk.layer_count;
1195
1196 result = pvr_cmd_buffer_alloc_mem(cmd_buffer,
1197 cmd_buffer->device->heaps.general_heap,
1198 buffer_size,
1199 &buffer);
1200 if (result != VK_SUCCESS)
1201 return result;
1202
1203 region = (VkBufferImageCopy2){
1204 .sType = VK_STRUCTURE_TYPE_BUFFER_IMAGE_COPY_2,
1205 .pNext = NULL,
1206 .bufferOffset = 0,
1207 .bufferRowLength = rounded_size.width,
1208 .bufferImageHeight = 0,
1209 .imageSubresource = {
1210 .aspectMask = VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT,
1211 .mipLevel = ds->iview->vk.base_mip_level,
1212 .baseArrayLayer = ds->iview->vk.base_array_layer,
1213 .layerCount = ds->iview->vk.layer_count,
1214 },
1215 .imageOffset = { 0 },
1216 .imageExtent = {
1217 .width = ds->iview->vk.extent.width,
1218 .height = ds->iview->vk.extent.height,
1219 .depth = 1,
1220 },
1221 };
1222
1223 if (ds->load.d || ds->load.s) {
1224 cmd_buffer->state.current_sub_cmd = NULL;
1225
1226 result =
1227 pvr_cmd_buffer_start_sub_cmd(cmd_buffer, PVR_SUB_CMD_TYPE_TRANSFER);
1228 if (result != VK_SUCCESS)
1229 return result;
1230
1231 result = pvr_copy_image_to_buffer_region_format(cmd_buffer,
1232 ds_image,
1233 buffer->dev_addr,
                                                     &region,
1235 copy_format,
1236 copy_format);
1237 if (result != VK_SUCCESS)
1238 return result;
1239
1240 cmd_buffer->state.current_sub_cmd->transfer.serialize_with_frag = true;
1241
1242 result = pvr_cmd_buffer_end_sub_cmd(cmd_buffer);
1243 if (result != VK_SUCCESS)
1244 return result;
1245
1246 /* Now we have to fiddle with cmd_buffer to place this transfer command
1247 * *before* the target gfx subcommand.
1248 */
1249 list_move_to(&cmd_buffer->state.current_sub_cmd->link,
1250 &prev_sub_cmd->link);
1251
1252 cmd_buffer->state.current_sub_cmd = prev_sub_cmd;
1253 }
1254
1255 if (ds->store.d || ds->store.s) {
1256 cmd_buffer->state.current_sub_cmd = NULL;
1257
1258 result =
1259 pvr_cmd_buffer_start_sub_cmd(cmd_buffer, PVR_SUB_CMD_TYPE_TRANSFER);
1260 if (result != VK_SUCCESS)
1261 return result;
1262
1263 result = pvr_copy_buffer_to_image_region_format(cmd_buffer,
1264 buffer->dev_addr,
1265 ds_image,
                                                     &region,
1267 copy_format,
1268 copy_format,
1269 0);
1270 if (result != VK_SUCCESS)
1271 return result;
1272
1273 cmd_buffer->state.current_sub_cmd->transfer.serialize_with_frag = true;
1274
1275 result = pvr_cmd_buffer_end_sub_cmd(cmd_buffer);
1276 if (result != VK_SUCCESS)
1277 return result;
1278
1279 cmd_buffer->state.current_sub_cmd = prev_sub_cmd;
1280 }
1281
1282 /* Finally, patch up the target graphics sub_cmd to use the correctly-strided
1283 * buffer.
1284 */
1285 ds->has_alignment_transfers = true;
1286 ds->addr = buffer->dev_addr;
1287 ds->physical_extent = rounded_size;
1288
1289 gfx_sub_cmd->wait_on_previous_transfer = true;
1290
1291 return VK_SUCCESS;
1292 }
1293
1294 struct pvr_emit_state {
1295 uint32_t pbe_cs_words[PVR_MAX_COLOR_ATTACHMENTS]
1296 [ROGUE_NUM_PBESTATE_STATE_WORDS];
1297
1298 uint64_t pbe_reg_words[PVR_MAX_COLOR_ATTACHMENTS]
1299 [ROGUE_NUM_PBESTATE_REG_WORDS];
1300
1301 uint32_t emit_count;
1302 };
1303
1304 static void
pvr_setup_emit_state(const struct pvr_device_info *dev_info,
                     const struct pvr_renderpass_hwsetup_render *hw_render,
                     struct pvr_render_pass_info *render_pass_info,
                     struct pvr_emit_state *emit_state)
1309 {
1310 assert(hw_render->pbe_emits <= PVR_NUM_PBE_EMIT_REGS);
1311
1312 if (hw_render->eot_surface_count == 0) {
1313 emit_state->emit_count = 1;
1314 pvr_csb_pack (&emit_state->pbe_cs_words[0][1],
1315 PBESTATE_STATE_WORD1,
1316 state) {
1317 state.emptytile = true;
1318 }
1319 return;
1320 }
1321
1322 static_assert(USC_MRT_RESOURCE_TYPE_OUTPUT_REG + 1 ==
1323 USC_MRT_RESOURCE_TYPE_MEMORY,
1324 "The loop below needs adjusting.");
1325
1326 emit_state->emit_count = 0;
1327 for (uint32_t resource_type = USC_MRT_RESOURCE_TYPE_OUTPUT_REG;
1328 resource_type <= USC_MRT_RESOURCE_TYPE_MEMORY;
1329 resource_type++) {
1330 for (uint32_t i = 0; i < hw_render->eot_surface_count; i++) {
1331 const struct pvr_framebuffer *framebuffer =
1332 render_pass_info->framebuffer;
1333 const struct pvr_renderpass_hwsetup_eot_surface *surface =
1334 &hw_render->eot_surfaces[i];
1335 const struct pvr_image_view *iview =
1336 render_pass_info->attachments[surface->attachment_idx];
1337 const struct usc_mrt_resource *mrt_resource =
1338 &hw_render->eot_setup.mrt_resources[surface->mrt_idx];
1339 uint32_t samples = 1;
1340
1341 if (mrt_resource->type != resource_type)
1342 continue;
1343
1344 if (surface->need_resolve) {
1345 const struct pvr_image_view *resolve_src =
1346 render_pass_info->attachments[surface->src_attachment_idx];
1347
1348 /* Attachments that are the destination of resolve operations must
1349 * be loaded before their next use.
1350 */
1351 render_pass_info->enable_bg_tag = true;
1352 render_pass_info->process_empty_tiles = true;
1353
1354 if (surface->resolve_type != PVR_RESOLVE_TYPE_PBE)
1355 continue;
1356
1357 samples = (uint32_t)resolve_src->vk.image->samples;
1358 }
1359
1360 assert(emit_state->emit_count < ARRAY_SIZE(emit_state->pbe_cs_words));
1361 assert(emit_state->emit_count < ARRAY_SIZE(emit_state->pbe_reg_words));
1362
1363 pvr_setup_pbe_state(dev_info,
1364 framebuffer,
1365 emit_state->emit_count,
1366 mrt_resource,
1367 iview,
1368 &render_pass_info->render_area,
1369 surface->need_resolve,
1370 samples,
1371 emit_state->pbe_cs_words[emit_state->emit_count],
1372 emit_state->pbe_reg_words[emit_state->emit_count]);
1373 emit_state->emit_count += 1;
1374 }
1375 }
1376
1377 assert(emit_state->emit_count == hw_render->pbe_emits);
1378 }
1379
1380 static inline bool
pvr_is_render_area_tile_aligned(const struct pvr_cmd_buffer *cmd_buffer,
                                const struct pvr_image_view *iview)
1383 {
1384 const VkRect2D *render_area =
1385 &cmd_buffer->state.render_pass_info.render_area;
1386
1387 return render_area->offset.x == 0 && render_area->offset.y == 0 &&
1388 render_area->extent.height == iview->vk.extent.height &&
1389 render_area->extent.width == iview->vk.extent.width;
1390 }
1391
static VkResult pvr_sub_cmd_gfx_job_init(const struct pvr_device_info *dev_info,
                                         struct pvr_cmd_buffer *cmd_buffer,
                                         struct pvr_sub_cmd_gfx *sub_cmd)
1395 {
1396 static const VkClearDepthStencilValue default_ds_clear_value = {
1397 .depth = 1.0f,
1398 .stencil = 0xFFFFFFFF,
1399 };
1400
1401 const struct vk_dynamic_graphics_state *dynamic_state =
1402 &cmd_buffer->vk.dynamic_graphics_state;
1403 struct pvr_render_pass_info *render_pass_info =
1404 &cmd_buffer->state.render_pass_info;
1405 const struct pvr_renderpass_hwsetup_render *hw_render =
1406 &render_pass_info->pass->hw_setup->renders[sub_cmd->hw_render_idx];
1407 struct pvr_render_job *job = &sub_cmd->job;
1408 struct pvr_pds_upload pds_pixel_event_program;
1409 struct pvr_framebuffer *framebuffer = render_pass_info->framebuffer;
1410 struct pvr_spm_bgobj_state *spm_bgobj_state =
1411 &framebuffer->spm_bgobj_state_per_render[sub_cmd->hw_render_idx];
1412 struct pvr_render_target *render_target;
1413 VkResult result;
1414
1415 if (sub_cmd->barrier_store) {
1416 /* There can only ever be one frag job running on the hardware at any one
1417 * time, and a context switch is not allowed mid-tile, so instead of
1418 * allocating a new scratch buffer we can reuse the SPM scratch buffer to
1419 * perform the store.
1420 * So use the SPM EOT program with the SPM PBE reg words in order to store
1421 * the render to the SPM scratch buffer.
1422 */
1423
1424 memcpy(job->pbe_reg_words,
1425 &framebuffer->spm_eot_state_per_render[0].pbe_reg_words,
1426 sizeof(job->pbe_reg_words));
1427 job->pds_pixel_event_data_offset =
1428 framebuffer->spm_eot_state_per_render[0]
1429 .pixel_event_program_data_offset;
1430 } else {
1431 struct pvr_emit_state emit_state = { 0 };
1432
1433 pvr_setup_emit_state(dev_info, hw_render, render_pass_info, &emit_state);
1434
1435 memcpy(job->pbe_reg_words,
1436 emit_state.pbe_reg_words,
1437 sizeof(job->pbe_reg_words));
1438
1439 result = pvr_sub_cmd_gfx_per_job_fragment_programs_create_and_upload(
1440 cmd_buffer,
1441 emit_state.emit_count,
1442 emit_state.pbe_cs_words[0],
1443 &pds_pixel_event_program);
1444 if (result != VK_SUCCESS)
1445 return result;
1446
1447 job->pds_pixel_event_data_offset = pds_pixel_event_program.data_offset;
1448 }
1449
1450 if (sub_cmd->barrier_load) {
1451 job->enable_bg_tag = true;
1452 job->process_empty_tiles = true;
1453
1454 /* Load the previously stored render from the SPM scratch buffer. */
1455
1456 STATIC_ASSERT(ARRAY_SIZE(job->pds_bgnd_reg_values) ==
1457 ARRAY_SIZE(spm_bgobj_state->pds_reg_values));
1458 typed_memcpy(job->pds_bgnd_reg_values,
1459 spm_bgobj_state->pds_reg_values,
1460 ARRAY_SIZE(spm_bgobj_state->pds_reg_values));
1461 } else if (hw_render->load_op) {
1462 const struct pvr_load_op *load_op = hw_render->load_op;
1463 struct pvr_pds_upload load_op_program;
1464
1465 /* Recalculate Background Object(s). */
1466
1467 /* FIXME: Should we free the PDS pixel event data or let it be freed
1468 * when the pool gets emptied?
1469 */
1470 result = pvr_load_op_data_create_and_upload(cmd_buffer,
1471 load_op,
1472 &load_op_program);
1473 if (result != VK_SUCCESS)
1474 return result;
1475
1476 job->enable_bg_tag = render_pass_info->enable_bg_tag;
1477 job->process_empty_tiles = render_pass_info->process_empty_tiles;
1478
1479 pvr_pds_bgnd_pack_state(load_op,
1480 &load_op_program,
1481 job->pds_bgnd_reg_values);
1482 }
1483
1484 /* TODO: In some cases a PR can be removed by storing to the color attachment
1485 * and have the background object load directly from it instead of using the
1486 * scratch buffer. In those cases we can also set this to "false" and avoid
1487 * extra fw overhead.
1488 */
1489 /* The scratch buffer is always needed and allocated to avoid data loss in
1490 * case SPM is hit so set the flag unconditionally.
1491 */
1492 job->requires_spm_scratch_buffer = true;
1493
1494 memcpy(job->pr_pbe_reg_words,
1495 &framebuffer->spm_eot_state_per_render[0].pbe_reg_words,
1496 sizeof(job->pbe_reg_words));
1497 job->pr_pds_pixel_event_data_offset =
1498 framebuffer->spm_eot_state_per_render[0].pixel_event_program_data_offset;
1499
1500 STATIC_ASSERT(ARRAY_SIZE(job->pds_pr_bgnd_reg_values) ==
1501 ARRAY_SIZE(spm_bgobj_state->pds_reg_values));
1502 typed_memcpy(job->pds_pr_bgnd_reg_values,
1503 spm_bgobj_state->pds_reg_values,
1504 ARRAY_SIZE(spm_bgobj_state->pds_reg_values));
1505
1506 render_target = pvr_get_render_target(render_pass_info->pass,
1507 framebuffer,
1508 sub_cmd->hw_render_idx);
1509 job->rt_dataset = render_target->rt_dataset;
1510
1511 job->ctrl_stream_addr = pvr_csb_get_start_address(&sub_cmd->control_stream);
1512
1513 if (sub_cmd->depth_bias_bo)
1514 job->depth_bias_table_addr = sub_cmd->depth_bias_bo->dev_addr;
1515 else
1516 job->depth_bias_table_addr = PVR_DEV_ADDR_INVALID;
1517
1518 if (sub_cmd->scissor_bo)
1519 job->scissor_table_addr = sub_cmd->scissor_bo->dev_addr;
1520 else
1521 job->scissor_table_addr = PVR_DEV_ADDR_INVALID;
1522
1523 job->pixel_output_width =
1524 pvr_pass_get_pixel_output_width(render_pass_info->pass,
1525 sub_cmd->hw_render_idx,
1526 dev_info);
1527
1528 /* Setup depth/stencil job information. */
1529 if (hw_render->ds_attach_idx != VK_ATTACHMENT_UNUSED) {
1530 struct pvr_image_view *ds_iview =
1531 render_pass_info->attachments[hw_render->ds_attach_idx];
1532 const struct pvr_image *ds_image = pvr_image_view_get_image(ds_iview);
1533
1534 job->has_depth_attachment = vk_format_has_depth(ds_image->vk.format);
1535 job->has_stencil_attachment = vk_format_has_stencil(ds_image->vk.format);
1536
1537 if (job->has_depth_attachment || job->has_stencil_attachment) {
1538 uint32_t level_pitch =
1539 ds_image->mip_levels[ds_iview->vk.base_mip_level].pitch;
1540 const bool render_area_is_tile_aligned =
1541 pvr_is_render_area_tile_aligned(cmd_buffer, ds_iview);
1542 bool store_was_optimised_out = false;
1543 bool d_store = false, s_store = false;
1544 bool d_load = false, s_load = false;
1545
1546 job->ds.iview = ds_iview;
1547 job->ds.addr = ds_image->dev_addr;
1548
1549 job->ds.stride =
1550 pvr_stride_from_pitch(level_pitch, ds_iview->vk.format);
1551 job->ds.height = ds_iview->vk.extent.height;
1552 job->ds.physical_extent = (VkExtent2D){
1553 .width = u_minify(ds_image->physical_extent.width,
1554 ds_iview->vk.base_mip_level),
1555 .height = u_minify(ds_image->physical_extent.height,
1556 ds_iview->vk.base_mip_level),
1557 };
1558 job->ds.layer_size = ds_image->layer_size;
1559
1560 job->ds_clear_value = default_ds_clear_value;
1561
1562 if (hw_render->ds_attach_idx < render_pass_info->clear_value_count) {
1563 const VkClearDepthStencilValue *const clear_values =
1564 &render_pass_info->clear_values[hw_render->ds_attach_idx]
1565 .depthStencil;
1566
1567 if (job->has_depth_attachment)
1568 job->ds_clear_value.depth = clear_values->depth;
1569
1570 if (job->has_stencil_attachment)
1571 job->ds_clear_value.stencil = clear_values->stencil;
1572 }
1573
1574 switch (ds_iview->vk.format) {
1575 case VK_FORMAT_D16_UNORM:
1576 job->ds.zls_format = PVRX(CR_ZLS_FORMAT_TYPE_16BITINT);
1577 break;
1578
1579 case VK_FORMAT_S8_UINT:
1580 case VK_FORMAT_D32_SFLOAT:
1581 job->ds.zls_format = PVRX(CR_ZLS_FORMAT_TYPE_F32Z);
1582 break;
1583
1584 case VK_FORMAT_D24_UNORM_S8_UINT:
1585 job->ds.zls_format = PVRX(CR_ZLS_FORMAT_TYPE_24BITINT);
1586 break;
1587
1588 default:
1589 unreachable("Unsupported depth stencil format");
1590 }
1591
1592 job->ds.memlayout = ds_image->memlayout;
1593
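         /* Store depth when the render (or a barrier) requires it, but drop
          * the store if the render area is tile-aligned and nothing modified
          * or cleared depth. Storing a non-tile-aligned render area forces a
          * load so untouched parts of partially covered tiles aren't lost.
          */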
1594 if (job->has_depth_attachment) {
1595 if (hw_render->depth_store || sub_cmd->barrier_store) {
1596 const bool depth_init_is_clear = hw_render->depth_init ==
1597 VK_ATTACHMENT_LOAD_OP_CLEAR;
1598
1599 d_store = true;
1600
1601 if (hw_render->depth_store && render_area_is_tile_aligned &&
1602 !(sub_cmd->modifies_depth || depth_init_is_clear)) {
1603 d_store = false;
1604 store_was_optimised_out = true;
1605 }
1606 }
1607
1608 if (d_store && !render_area_is_tile_aligned) {
1609 d_load = true;
1610 } else if (hw_render->depth_init == VK_ATTACHMENT_LOAD_OP_LOAD) {
1611 enum pvr_depth_stencil_usage depth_usage = sub_cmd->depth_usage;
1612
1613 assert(depth_usage != PVR_DEPTH_STENCIL_USAGE_UNDEFINED);
1614 d_load = (depth_usage != PVR_DEPTH_STENCIL_USAGE_NEVER);
1615 } else {
1616 d_load = sub_cmd->barrier_load;
1617 }
1618 }
1619
1620 if (job->has_stencil_attachment) {
1621 if (hw_render->stencil_store || sub_cmd->barrier_store) {
1622 const bool stencil_init_is_clear = hw_render->stencil_init ==
1623 VK_ATTACHMENT_LOAD_OP_CLEAR;
1624
1625 s_store = true;
1626
1627 if (hw_render->stencil_store && render_area_is_tile_aligned &&
1628 !(sub_cmd->modifies_stencil || stencil_init_is_clear)) {
1629 s_store = false;
1630 store_was_optimised_out = true;
1631 }
1632 }
1633
1634 if (s_store && !render_area_is_tile_aligned) {
1635 s_load = true;
1636 } else if (hw_render->stencil_init == VK_ATTACHMENT_LOAD_OP_LOAD) {
1637 enum pvr_depth_stencil_usage stencil_usage =
1638 sub_cmd->stencil_usage;
1639
1640 assert(stencil_usage != PVR_DEPTH_STENCIL_USAGE_UNDEFINED);
1641 s_load = (stencil_usage != PVR_DEPTH_STENCIL_USAGE_NEVER);
1642 } else {
1643 s_load = sub_cmd->barrier_load;
1644 }
1645 }
1646
1647 job->ds.load.d = d_load;
1648 job->ds.load.s = s_load;
1649 job->ds.store.d = d_store;
1650 job->ds.store.s = s_store;
1651
1652 /* ZLS can't do masked writes for packed depth stencil formats so if
1653 * we store anything, we have to store everything.
1654 */
1655 if ((job->ds.store.d || job->ds.store.s) &&
1656 pvr_zls_format_type_is_packed(job->ds.zls_format)) {
1657 job->ds.store.d = true;
1658 job->ds.store.s = true;
1659
1660 /* If we are only operating on one aspect of the attachment, we need to
1661 * load the other aspect as well so that the forced store writes back its
1662 * original contents instead of corrupting it.
1663 */
1664 if (hw_render->depth_init != VK_ATTACHMENT_LOAD_OP_CLEAR)
1665 job->ds.load.d = true;
1666
1667 if (hw_render->stencil_init != VK_ATTACHMENT_LOAD_OP_CLEAR)
1668 job->ds.load.s = true;
1669 }
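/* Worked example (a hypothetical case, for illustration only): with a packed
 * VK_FORMAT_D24_UNORM_S8_UINT attachment where only the depth store was
 * requested, both stores get forced on above, and the stencil aspect is
 * additionally loaded (unless its init op is CLEAR) so that the forced store
 * writes back the stencil's original contents.
 */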
1670
1671 if (pvr_ds_attachment_requires_zls(&job->ds) ||
1672 store_was_optimised_out) {
1673 job->process_empty_tiles = true;
1674 }
1675
1676 if (pvr_sub_cmd_gfx_requires_ds_subtile_alignment(dev_info, job)) {
1677 result = pvr_sub_cmd_gfx_align_ds_subtiles(cmd_buffer, sub_cmd);
1678 if (result != VK_SUCCESS)
1679 return result;
1680 }
1681 }
1682 } else {
1683 job->has_depth_attachment = false;
1684 job->has_stencil_attachment = false;
1685 job->ds_clear_value = default_ds_clear_value;
1686 }
1687
1688 if (hw_render->ds_attach_idx != VK_ATTACHMENT_UNUSED) {
1689 struct pvr_image_view *iview =
1690 render_pass_info->attachments[hw_render->ds_attach_idx];
1691 const struct pvr_image *image = pvr_image_view_get_image(iview);
1692
1693 /* If the HW render pass has a valid depth/stencil surface, determine the
1694 * sample count from the attachment's image.
1695 */
1696 job->samples = image->vk.samples;
1697 } else if (hw_render->output_regs_count) {
1698 /* If the HW render pass has output registers, we have color attachments
1699 * to write to, so determine the sample count from the count specified for
1700 * every color attachment in this render.
1701 */
1702 job->samples = hw_render->sample_count;
1703 } else if (cmd_buffer->state.gfx_pipeline) {
1704 /* If the HW render pass has no color or depth/stencil attachments, we
1705 * determine the sample count from the count given during pipeline
1706 * creation.
1707 */
1708 job->samples = dynamic_state->ms.rasterization_samples;
1709 } else if (render_pass_info->pass->attachment_count > 0) {
1710 /* If we get here, we have a render pass with subpasses containing no
1711 * attachments. The next best thing is the largest of the sample counts
1712 * specified by the render pass attachment descriptions.
1713 */
1714 job->samples = render_pass_info->pass->max_sample_count;
1715 } else {
1716 /* No appropriate framebuffer attachment is available. */
1717 mesa_logw("Defaulting render job sample count to 1.");
1718 job->samples = VK_SAMPLE_COUNT_1_BIT;
1719 }
1720
1721 if (sub_cmd->max_tiles_in_flight ==
1722 PVR_GET_FEATURE_VALUE(dev_info, isp_max_tiles_in_flight, 1U)) {
1723 /* Use the default limit based on the partition store. */
1724 job->max_tiles_in_flight = 0U;
1725 } else {
1726 job->max_tiles_in_flight = sub_cmd->max_tiles_in_flight;
1727 }
1728
1729 job->frag_uses_atomic_ops = sub_cmd->frag_uses_atomic_ops;
1730 job->disable_compute_overlap = false;
1731 job->max_shared_registers = cmd_buffer->state.max_shared_regs;
1732 job->run_frag = true;
1733 job->geometry_terminate = true;
1734
1735 return VK_SUCCESS;
1736 }
1737
1738 static void
1739 pvr_sub_cmd_compute_job_init(const struct pvr_physical_device *pdevice,
1740 struct pvr_cmd_buffer *cmd_buffer,
1741 struct pvr_sub_cmd_compute *sub_cmd)
1742 {
1743 sub_cmd->num_shared_regs = MAX2(cmd_buffer->device->idfwdf_state.usc_shareds,
1744 cmd_buffer->state.max_shared_regs);
1745
1746 cmd_buffer->state.max_shared_regs = 0U;
1747 }
1748
1749 #define PIXEL_ALLOCATION_SIZE_MAX_IN_BLOCKS \
1750 (1024 / PVRX(CDMCTRL_KERNEL0_USC_COMMON_SIZE_UNIT_SIZE))
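/* For illustration: if PVRX(CDMCTRL_KERNEL0_USC_COMMON_SIZE_UNIT_SIZE) were
 * 64 (a purely illustrative value, not taken from the hardware headers), this
 * macro would evaluate to 1024 / 64 = 16 blocks.
 */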
1751
1752 static uint32_t
1753 pvr_compute_flat_slot_size(const struct pvr_physical_device *pdevice,
1754 uint32_t coeff_regs_count,
1755 bool use_barrier,
1756 uint32_t total_workitems)
1757 {
1758 const struct pvr_device_runtime_info *dev_runtime_info =
1759 &pdevice->dev_runtime_info;
1760 const struct pvr_device_info *dev_info = &pdevice->dev_info;
1761 uint32_t max_workgroups_per_task = ROGUE_CDM_MAX_PACKED_WORKGROUPS_PER_TASK;
1762 uint32_t max_avail_coeff_regs =
1763 dev_runtime_info->cdm_max_local_mem_size_regs;
1764 uint32_t localstore_chunks_count =
1765 DIV_ROUND_UP(PVR_DW_TO_BYTES(coeff_regs_count),
1766 PVRX(CDMCTRL_KERNEL0_USC_COMMON_SIZE_UNIT_SIZE));
1767
1768 /* Ensure that we cannot have more workgroups in a slot than the available
1769 * number of coefficients allows us to have.
1770 */
1771 if (coeff_regs_count > 0U) {
1772 /* If the geometry or fragment jobs can overlap with the compute job, or
1773 * if there is a vertex shader already running, then we need to take this
1774 * into account when calculating the maximum allowed number of work-groups.
1775 */
1776 if (PVR_HAS_QUIRK(dev_info, 52354) &&
1777 (PVR_HAS_FEATURE(dev_info, compute_overlap) ||
1778 PVR_HAS_FEATURE(dev_info, gs_rta_support))) {
1779 /* Solve for n (number of work-groups per task). All values are in
1780 * size of common store alloc blocks:
1781 *
1782 * n + (2n + 7) * (local_memory_size_max - 1) =
1783 * (coefficient_memory_pool_size) - (7 * pixel_allocation_size_max)
1784 * ==>
1785 * n + 2n * (local_memory_size_max - 1) =
1786 * (coefficient_memory_pool_size) - (7 * pixel_allocation_size_max)
1787 * - (7 * (local_memory_size_max - 1))
1788 * ==>
1789 * n * (1 + 2 * (local_memory_size_max - 1)) =
1790 * (coefficient_memory_pool_size) - (7 * pixel_allocation_size_max)
1791 * - (7 * (local_memory_size_max - 1))
1792 * ==>
1793 * n = ((coefficient_memory_pool_size) -
1794 * (7 * pixel_allocation_size_max) -
1795 * (7 * (local_memory_size_max - 1))) /
1796 * (1 + 2 * (local_memory_size_max - 1))
1797 */
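/* Worked example with made-up numbers (purely illustrative, not real device
 * limits): with a coefficient pool of 512 blocks,
 * pixel_allocation_size_max = 16 blocks and local_memory_size_max = 4 blocks:
 *
 *    512 - (7 * 16) = 400
 *    400 - (7 * 3)  = 379
 *    379 / (1 + 2 * 3) = 54 work-groups per task (before the clamp below)
 */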
1798 uint32_t max_common_store_blocks =
1799 DIV_ROUND_UP(max_avail_coeff_regs * 4U,
1800 PVRX(CDMCTRL_KERNEL0_USC_COMMON_SIZE_UNIT_SIZE));
1801
1802 /* (coefficient_memory_pool_size) - (7 * pixel_allocation_size_max)
1803 */
1804 max_common_store_blocks -= ROGUE_MAX_OVERLAPPED_PIXEL_TASK_INSTANCES *
1805 PIXEL_ALLOCATION_SIZE_MAX_IN_BLOCKS;
1806
1807 /* - (7 * (local_memory_size_max - 1)) */
1808 max_common_store_blocks -= (ROGUE_MAX_OVERLAPPED_PIXEL_TASK_INSTANCES *
1809 (localstore_chunks_count - 1U));
1810
1811 /* Divide by (1 + 2 * (local_memory_size_max - 1)) */
1812 max_workgroups_per_task = max_common_store_blocks /
1813 (1U + 2U * (localstore_chunks_count - 1U));
1814
1815 max_workgroups_per_task =
1816 MIN2(max_workgroups_per_task,
1817 ROGUE_CDM_MAX_PACKED_WORKGROUPS_PER_TASK);
1818
1819 } else {
1820 max_workgroups_per_task =
1821 MIN2((max_avail_coeff_regs / coeff_regs_count),
1822 max_workgroups_per_task);
1823 }
1824 }
1825
1826 /* max_workgroups_per_task should be at least one. */
1827 assert(max_workgroups_per_task >= 1U);
1828
1829 if (total_workitems >= ROGUE_MAX_INSTANCES_PER_TASK) {
1830 /* In this case, the work-group size will have been padded up to the next
1831 * multiple of ROGUE_MAX_INSTANCES_PER_TASK, so we just set max instances
1832 * to ROGUE_MAX_INSTANCES_PER_TASK.
1833 */
1834 return ROGUE_MAX_INSTANCES_PER_TASK;
1835 }
1836
1837 /* In this case, the number of instances in the slot must be clamped to
1838 * accommodate whole work-groups only.
1839 */
1840 if (PVR_HAS_QUIRK(dev_info, 49032) || use_barrier) {
1841 max_workgroups_per_task =
1842 MIN2(max_workgroups_per_task,
1843 ROGUE_MAX_INSTANCES_PER_TASK / total_workitems);
1844 return total_workitems * max_workgroups_per_task;
1845 }
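/* For illustration, suppose ROGUE_MAX_INSTANCES_PER_TASK were 32 (an assumed
 * value), total_workitems = 12 and max_workgroups_per_task = 4: on the
 * quirk/barrier path above this gives MIN2(4, 32 / 12) = 2 work-groups, i.e.
 * a slot size of 24 instances, whereas the unclamped return below would give
 * MIN2(12 * 4, 32) = 32.
 */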
1846
1847 return MIN2(total_workitems * max_workgroups_per_task,
1848 ROGUE_MAX_INSTANCES_PER_TASK);
1849 }
1850
1851 static void
1852 pvr_compute_generate_control_stream(struct pvr_csb *csb,
1853 struct pvr_sub_cmd_compute *sub_cmd,
1854 const struct pvr_compute_kernel_info *info)
1855 {
1856 pvr_csb_set_relocation_mark(csb);
1857
1858 /* Compute kernel 0. */
1859 pvr_csb_emit (csb, CDMCTRL_KERNEL0, kernel0) {
1860 kernel0.indirect_present = !!info->indirect_buffer_addr.addr;
1861 kernel0.global_offsets_present = info->global_offsets_present;
1862 kernel0.usc_common_size = info->usc_common_size;
1863 kernel0.usc_unified_size = info->usc_unified_size;
1864 kernel0.pds_temp_size = info->pds_temp_size;
1865 kernel0.pds_data_size = info->pds_data_size;
1866 kernel0.usc_target = info->usc_target;
1867 kernel0.fence = info->is_fence;
1868 }
1869
1870 /* Compute kernel 1. */
1871 pvr_csb_emit (csb, CDMCTRL_KERNEL1, kernel1) {
1872 kernel1.data_addr = PVR_DEV_ADDR(info->pds_data_offset);
1873 kernel1.sd_type = info->sd_type;
1874 kernel1.usc_common_shared = info->usc_common_shared;
1875 }
1876
1877 /* Compute kernel 2. */
1878 pvr_csb_emit (csb, CDMCTRL_KERNEL2, kernel2) {
1879 kernel2.code_addr = PVR_DEV_ADDR(info->pds_code_offset);
1880 }
1881
1882 if (info->indirect_buffer_addr.addr) {
1883 /* Compute kernel 6. */
1884 pvr_csb_emit (csb, CDMCTRL_KERNEL6, kernel6) {
1885 kernel6.indirect_addrmsb = info->indirect_buffer_addr;
1886 }
1887
1888 /* Compute kernel 7. */
1889 pvr_csb_emit (csb, CDMCTRL_KERNEL7, kernel7) {
1890 kernel7.indirect_addrlsb = info->indirect_buffer_addr;
1891 }
1892 } else {
1893 /* Compute kernel 3. */
1894 pvr_csb_emit (csb, CDMCTRL_KERNEL3, kernel3) {
1895 assert(info->global_size[0U] > 0U);
1896 kernel3.workgroup_x = info->global_size[0U] - 1U;
1897 }
1898
1899 /* Compute kernel 4. */
1900 pvr_csb_emit (csb, CDMCTRL_KERNEL4, kernel4) {
1901 assert(info->global_size[1U] > 0U);
1902 kernel4.workgroup_y = info->global_size[1U] - 1U;
1903 }
1904
1905 /* Compute kernel 5. */
1906 pvr_csb_emit (csb, CDMCTRL_KERNEL5, kernel5) {
1907 assert(info->global_size[2U] > 0U);
1908 kernel5.workgroup_z = info->global_size[2U] - 1U;
1909 }
1910 }
1911
1912 /* Compute kernel 8. */
1913 pvr_csb_emit (csb, CDMCTRL_KERNEL8, kernel8) {
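/* Judging by the check below, a max_instances value of 0 in this field is
 * taken to mean "use the hardware maximum" (ROGUE_MAX_INSTANCES_PER_TASK).
 */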
1914 if (info->max_instances == ROGUE_MAX_INSTANCES_PER_TASK)
1915 kernel8.max_instances = 0U;
1916 else
1917 kernel8.max_instances = info->max_instances;
1918
1919 assert(info->local_size[0U] > 0U);
1920 kernel8.workgroup_size_x = info->local_size[0U] - 1U;
1921 assert(info->local_size[1U] > 0U);
1922 kernel8.workgroup_size_y = info->local_size[1U] - 1U;
1923 assert(info->local_size[2U] > 0U);
1924 kernel8.workgroup_size_z = info->local_size[2U] - 1U;
1925 }
1926
1927 pvr_csb_clear_relocation_mark(csb);
1928
1929 /* Track the highest shared register usage in this dispatch. This is used
1930 * by the FW for context switching, so it must be large enough to cover all
1931 * the shared registers that might be in use for this compute job.
1932 * Coefficients don't need to be included, as a context switch will not
1933 * happen within the execution of a single workgroup, so nothing needs to
1934 * be preserved.
1935 */
1936 if (info->usc_common_shared) {
1937 sub_cmd->num_shared_regs =
1938 MAX2(sub_cmd->num_shared_regs, info->usc_common_size);
1939 }
1940 }
1941
1942 /* TODO: This can be pre-packed and uploaded directly. Would that provide any
1943 * speed up?
1944 */
1945 static void
1946 pvr_compute_generate_idfwdf(struct pvr_cmd_buffer *cmd_buffer,
1947 struct pvr_sub_cmd_compute *const sub_cmd)
1948 {
1949 struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
1950 bool *const is_sw_barrier_required =
1951 &state->current_sub_cmd->compute.pds_sw_barrier_requires_clearing;
1952 const struct pvr_physical_device *pdevice = cmd_buffer->device->pdevice;
1953 struct pvr_csb *csb = &sub_cmd->control_stream;
1954 const struct pvr_pds_upload *program;
1955
1956 if (PVR_NEED_SW_COMPUTE_PDS_BARRIER(&pdevice->dev_info) &&
1957 *is_sw_barrier_required) {
1958 *is_sw_barrier_required = false;
1959 program = &cmd_buffer->device->idfwdf_state.sw_compute_barrier_pds;
1960 } else {
1961 program = &cmd_buffer->device->idfwdf_state.pds;
1962 }
1963
1964 struct pvr_compute_kernel_info info = {
1965 .indirect_buffer_addr = PVR_DEV_ADDR_INVALID,
1966 .global_offsets_present = false,
1967 .usc_common_size = DIV_ROUND_UP(
1968 PVR_DW_TO_BYTES(cmd_buffer->device->idfwdf_state.usc_shareds),
1969 PVRX(CDMCTRL_KERNEL0_USC_COMMON_SIZE_UNIT_SIZE)),
1970 .usc_unified_size = 0U,
1971 .pds_temp_size = 0U,
1972 .pds_data_size =
1973 DIV_ROUND_UP(PVR_DW_TO_BYTES(program->data_size),
1974 PVRX(CDMCTRL_KERNEL0_PDS_DATA_SIZE_UNIT_SIZE)),
1975 .usc_target = PVRX(CDMCTRL_USC_TARGET_ALL),
1976 .is_fence = false,
1977 .pds_data_offset = program->data_offset,
1978 .sd_type = PVRX(CDMCTRL_SD_TYPE_USC),
1979 .usc_common_shared = true,
1980 .pds_code_offset = program->code_offset,
1981 .global_size = { 1U, 1U, 1U },
1982 .local_size = { 1U, 1U, 1U },
1983 };
1984
1985 /* We don't need to pad work-group size for this case. */
1986
1987 info.max_instances =
1988 pvr_compute_flat_slot_size(pdevice,
1989 cmd_buffer->device->idfwdf_state.usc_shareds,
1990 false,
1991 1U);
1992
1993 pvr_compute_generate_control_stream(csb, sub_cmd, &info);
1994 }
1995
1996 void pvr_compute_generate_fence(struct pvr_cmd_buffer *cmd_buffer,
1997 struct pvr_sub_cmd_compute *const sub_cmd,
1998 bool deallocate_shareds)
1999 {
2000 const struct pvr_pds_upload *program =
2001 &cmd_buffer->device->pds_compute_fence_program;
2002 const struct pvr_physical_device *pdevice = cmd_buffer->device->pdevice;
2003 struct pvr_csb *csb = &sub_cmd->control_stream;
2004
2005 struct pvr_compute_kernel_info info = {
2006 .indirect_buffer_addr = PVR_DEV_ADDR_INVALID,
2007 .global_offsets_present = false,
2008 .usc_common_size = 0U,
2009 .usc_unified_size = 0U,
2010 .pds_temp_size = 0U,
2011 .pds_data_size =
2012 DIV_ROUND_UP(PVR_DW_TO_BYTES(program->data_size),
2013 PVRX(CDMCTRL_KERNEL0_PDS_DATA_SIZE_UNIT_SIZE)),
2014 .usc_target = PVRX(CDMCTRL_USC_TARGET_ANY),
2015 .is_fence = true,
2016 .pds_data_offset = program->data_offset,
2017 .sd_type = PVRX(CDMCTRL_SD_TYPE_PDS),
2018 .usc_common_shared = deallocate_shareds,
2019 .pds_code_offset = program->code_offset,
2020 .global_size = { 1U, 1U, 1U },
2021 .local_size = { 1U, 1U, 1U },
2022 };
2023
2024 /* We don't need to pad work-group size for this case. */
2025 /* Here we calculate the slot size. This can depend on the use of barriers,
2026 * local memory, BRNs or other factors.
2027 */
2028 info.max_instances = pvr_compute_flat_slot_size(pdevice, 0U, false, 1U);
2029
2030 pvr_compute_generate_control_stream(csb, sub_cmd, &info);
2031 }
2032
2033 static VkResult
2034 pvr_cmd_buffer_process_deferred_clears(struct pvr_cmd_buffer *cmd_buffer)
2035 {
2036 util_dynarray_foreach (&cmd_buffer->deferred_clears,
2037 struct pvr_transfer_cmd,
2038 transfer_cmd) {
2039 VkResult result;
2040
2041 result = pvr_cmd_buffer_add_transfer_cmd(cmd_buffer, transfer_cmd);
2042 if (result != VK_SUCCESS)
2043 return result;
2044
2045 cmd_buffer->state.current_sub_cmd->transfer.serialize_with_frag = true;
2046 }
2047
2048 return VK_SUCCESS;
2049 }
2050
2051 VkResult pvr_cmd_buffer_end_sub_cmd(struct pvr_cmd_buffer *cmd_buffer)
2052 {
2053 struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
2054 struct pvr_sub_cmd *sub_cmd = state->current_sub_cmd;
2055 struct pvr_device *device = cmd_buffer->device;
2056 const struct pvr_query_pool *query_pool = NULL;
2057 struct pvr_suballoc_bo *query_bo = NULL;
2058 size_t query_indices_size = 0;
2059 VkResult result;
2060
2061 /* FIXME: Is this NULL check required because this function is called from
2062 * pvr_resolve_unemitted_resolve_attachments()? See comment about this
2063 * function being called twice in a row in pvr_CmdEndRenderPass().
2064 */
2065 if (!sub_cmd)
2066 return VK_SUCCESS;
2067
2068 if (!sub_cmd->owned) {
2069 state->current_sub_cmd = NULL;
2070 return VK_SUCCESS;
2071 }
2072
2073 switch (sub_cmd->type) {
2074 case PVR_SUB_CMD_TYPE_GRAPHICS: {
2075 struct pvr_sub_cmd_gfx *const gfx_sub_cmd = &sub_cmd->gfx;
2076
2077 query_indices_size =
2078 util_dynarray_num_elements(&state->query_indices, char);
2079
2080 if (query_indices_size > 0) {
2081 const bool secondary_cont =
2082 cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY &&
2083 cmd_buffer->usage_flags &
2084 VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT;
2085
2086 assert(gfx_sub_cmd->query_pool);
2087
2088 if (secondary_cont) {
2089 util_dynarray_append_dynarray(&state->query_indices,
2090 &gfx_sub_cmd->sec_query_indices);
2091 } else {
2092 const void *data = util_dynarray_begin(&state->query_indices);
2093
2094 result = pvr_cmd_buffer_upload_general(cmd_buffer,
2095 data,
2096 query_indices_size,
2097 &query_bo);
2098 if (result != VK_SUCCESS)
2099 return pvr_cmd_buffer_set_error_unwarned(cmd_buffer, result);
2100
2101 query_pool = gfx_sub_cmd->query_pool;
2102 }
2103
2104 gfx_sub_cmd->has_occlusion_query = true;
2105
2106 util_dynarray_clear(&state->query_indices);
2107 }
2108
2109 if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY) {
2110 result = pvr_csb_emit_return(&gfx_sub_cmd->control_stream);
2111 if (result != VK_SUCCESS)
2112 return pvr_cmd_buffer_set_error_unwarned(cmd_buffer, result);
2113
2114 break;
2115 }
2116
2117 /* TODO: Check if the sub_cmd can be skipped based on
2118 * sub_cmd->gfx.empty_cmd flag.
2119 */
2120
2121 /* TODO: Set the state in the functions called with the command buffer
2122 * instead of here.
2123 */
2124
2125 result = pvr_cmd_buffer_upload_tables(device, cmd_buffer, gfx_sub_cmd);
2126 if (result != VK_SUCCESS)
2127 return pvr_cmd_buffer_set_error_unwarned(cmd_buffer, result);
2128
2129 result = pvr_cmd_buffer_emit_ppp_state(cmd_buffer,
2130 &gfx_sub_cmd->control_stream);
2131 if (result != VK_SUCCESS)
2132 return pvr_cmd_buffer_set_error_unwarned(cmd_buffer, result);
2133
2134 result = pvr_csb_emit_terminate(&gfx_sub_cmd->control_stream);
2135 if (result != VK_SUCCESS)
2136 return pvr_cmd_buffer_set_error_unwarned(cmd_buffer, result);
2137
2138 result = pvr_sub_cmd_gfx_job_init(&device->pdevice->dev_info,
2139 cmd_buffer,
2140 gfx_sub_cmd);
2141 if (result != VK_SUCCESS)
2142 return pvr_cmd_buffer_set_error_unwarned(cmd_buffer, result);
2143
2144 if (pvr_sub_cmd_gfx_requires_split_submit(gfx_sub_cmd)) {
2145 result = pvr_sub_cmd_gfx_build_terminate_ctrl_stream(device,
2146 cmd_buffer,
2147 gfx_sub_cmd);
2148 if (result != VK_SUCCESS)
2149 return pvr_cmd_buffer_set_error_unwarned(cmd_buffer, result);
2150 }
2151
2152 break;
2153 }
2154
2155 case PVR_SUB_CMD_TYPE_OCCLUSION_QUERY:
2156 case PVR_SUB_CMD_TYPE_COMPUTE: {
2157 struct pvr_sub_cmd_compute *const compute_sub_cmd = &sub_cmd->compute;
2158
2159 pvr_compute_generate_fence(cmd_buffer, compute_sub_cmd, true);
2160
2161 result = pvr_csb_emit_terminate(&compute_sub_cmd->control_stream);
2162 if (result != VK_SUCCESS)
2163 return pvr_cmd_buffer_set_error_unwarned(cmd_buffer, result);
2164
2165 pvr_sub_cmd_compute_job_init(device->pdevice,
2166 cmd_buffer,
2167 compute_sub_cmd);
2168 break;
2169 }
2170
2171 case PVR_SUB_CMD_TYPE_TRANSFER:
2172 break;
2173
2174 case PVR_SUB_CMD_TYPE_EVENT:
2175 break;
2176
2177 default:
2178 unreachable("Unsupported sub-command type");
2179 }
2180
2181 state->current_sub_cmd = NULL;
2182
2183 /* pvr_cmd_buffer_process_deferred_clears() must be called with a NULL
2184 * current_sub_cmd.
2185 *
2186 * We can start a sub_cmd of a different type from the current sub_cmd only
2187 * after having ended the current sub_cmd. However, we can't end the current
2188 * sub_cmd if this depends on starting sub_cmd(s) of a different type. Hence,
2189 * don't try to start transfer sub_cmd(s) with
2190 * pvr_cmd_buffer_process_deferred_clears() until the current one has ended.
2191 * Failing to do so will cause a circular dependency between
2192 * pvr_cmd_buffer_{end,start}_sub_cmd() and blow the stack.
2193 */
2194 if (sub_cmd->type == PVR_SUB_CMD_TYPE_GRAPHICS) {
2195 result = pvr_cmd_buffer_process_deferred_clears(cmd_buffer);
2196 if (result != VK_SUCCESS)
2197 return pvr_cmd_buffer_set_error_unwarned(cmd_buffer, result);
2198 }
2199
2200 if (query_pool) {
2201 struct pvr_query_info query_info;
2202
2203 assert(query_bo);
2204 assert(query_indices_size);
2205
2206 query_info.type = PVR_QUERY_TYPE_AVAILABILITY_WRITE;
2207
2208 /* sizeof(uint32_t) is for the size of a single query. */
2209 query_info.availability_write.num_query_indices =
2210 query_indices_size / sizeof(uint32_t);
2211 query_info.availability_write.index_bo = query_bo;
2212
2213 query_info.availability_write.num_queries = query_pool->query_count;
2214 query_info.availability_write.availability_bo =
2215 query_pool->availability_buffer;
2216
2217 /* Insert a barrier after the graphics sub command and before the
2218 * query sub command so that the availability write program waits for the
2219 * fragment shader to complete.
2220 */
2221
2222 result = pvr_cmd_buffer_start_sub_cmd(cmd_buffer, PVR_SUB_CMD_TYPE_EVENT);
2223 if (result != VK_SUCCESS)
2224 return result;
2225
2226 cmd_buffer->state.current_sub_cmd->event = (struct pvr_sub_cmd_event){
2227 .type = PVR_EVENT_TYPE_BARRIER,
2228 .barrier = {
2229 .wait_for_stage_mask = PVR_PIPELINE_STAGE_FRAG_BIT,
2230 .wait_at_stage_mask = PVR_PIPELINE_STAGE_OCCLUSION_QUERY_BIT,
2231 },
2232 };
2233
2234 return pvr_add_query_program(cmd_buffer, &query_info);
2235 }
2236
2237 return VK_SUCCESS;
2238 }
2239
2240 void pvr_reset_graphics_dirty_state(struct pvr_cmd_buffer *const cmd_buffer,
2241 bool start_geom)
2242 {
2243 struct vk_dynamic_graphics_state *const dynamic_state =
2244 &cmd_buffer->vk.dynamic_graphics_state;
2245
2246 if (start_geom) {
2247 /*
2248 * Initial geometry phase state.
2249 * It's the driver's responsibility to ensure that the state of the
2250 * hardware is correctly initialized at the start of every geometry
2251 * phase. This is required to prevent stale state from a previous
2252 * geometry phase erroneously affecting the next geometry phase.
2253 *
2254 * If a geometry phase does not contain any geometry, this restriction
2255 * can be ignored. If the first draw call in a geometry phase will only
2256 * update the depth or stencil buffers i.e. ISP_TAGWRITEDISABLE is set
2257 * in the ISP State Control Word, the PDS State Pointers
2258 * (TA_PRES_PDSSTATEPTR*) in the first PPP State Update do not need to
2259 * be supplied, since they will never reach the PDS in the fragment
2260 * phase.
2261 */
2262
2263 cmd_buffer->state.emit_header = (struct PVRX(TA_STATE_HEADER)){
2264 .pres_stream_out_size = true,
2265 .pres_ppp_ctrl = true,
2266 .pres_varying_word2 = true,
2267 .pres_varying_word1 = true,
2268 .pres_varying_word0 = true,
2269 .pres_outselects = true,
2270 .pres_wclamp = true,
2271 .pres_viewport = true,
2272 .pres_region_clip = true,
2273 .pres_pds_state_ptr0 = true,
2274 .pres_ispctl_fb = true,
2275 .pres_ispctl = true,
2276 };
2277 } else {
2278 struct PVRX(TA_STATE_HEADER) *const emit_header =
2279 &cmd_buffer->state.emit_header;
2280
2281 emit_header->pres_ppp_ctrl = true;
2282 emit_header->pres_varying_word1 = true;
2283 emit_header->pres_varying_word0 = true;
2284 emit_header->pres_outselects = true;
2285 emit_header->pres_viewport = true;
2286 emit_header->pres_region_clip = true;
2287 emit_header->pres_pds_state_ptr0 = true;
2288 emit_header->pres_ispctl_fb = true;
2289 emit_header->pres_ispctl = true;
2290 }
2291
2292 memset(&cmd_buffer->state.ppp_state,
2293 0U,
2294 sizeof(cmd_buffer->state.ppp_state));
2295
2296 cmd_buffer->state.dirty.vertex_bindings = true;
2297 cmd_buffer->state.dirty.gfx_pipeline_binding = true;
2298
2299 BITSET_SET(dynamic_state->dirty, MESA_VK_DYNAMIC_VP_VIEWPORTS);
2300 BITSET_SET(dynamic_state->dirty, MESA_VK_DYNAMIC_VP_VIEWPORT_COUNT);
2301 }
2302
2303 static inline bool
2304 pvr_cmd_uses_deferred_cs_cmds(const struct pvr_cmd_buffer *const cmd_buffer)
2305 {
2306 const VkCommandBufferUsageFlags deferred_control_stream_flags =
2307 VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT |
2308 VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT;
2309
2310 return cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY &&
2311 (cmd_buffer->usage_flags & deferred_control_stream_flags) ==
2312 deferred_control_stream_flags;
2313 }
2314
2315 VkResult pvr_cmd_buffer_start_sub_cmd(struct pvr_cmd_buffer *cmd_buffer,
2316 enum pvr_sub_cmd_type type)
2317 {
2318 struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
2319 struct pvr_device *device = cmd_buffer->device;
2320 struct pvr_sub_cmd *sub_cmd;
2321 VkResult result;
2322
2323 /* Check the current status of the buffer. */
2324 if (vk_command_buffer_has_error(&cmd_buffer->vk))
2325 return vk_command_buffer_get_record_result(&cmd_buffer->vk);
2326
2327 pvr_cmd_buffer_update_barriers(cmd_buffer, type);
2328
2329 /* TODO: Add proper support for joining consecutive event sub_cmd? */
2330 if (state->current_sub_cmd) {
2331 if (state->current_sub_cmd->type == type) {
2332 /* Continue adding to the current sub command. */
2333 return VK_SUCCESS;
2334 }
2335
2336 /* End the current sub command. */
2337 result = pvr_cmd_buffer_end_sub_cmd(cmd_buffer);
2338 if (result != VK_SUCCESS)
2339 return result;
2340 }
2341
2342 sub_cmd = vk_zalloc(&cmd_buffer->vk.pool->alloc,
2343 sizeof(*sub_cmd),
2344 8,
2345 VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
2346 if (!sub_cmd) {
2347 return vk_command_buffer_set_error(&cmd_buffer->vk,
2348 VK_ERROR_OUT_OF_HOST_MEMORY);
2349 }
2350
2351 sub_cmd->type = type;
2352 sub_cmd->owned = true;
2353
2354 switch (type) {
2355 case PVR_SUB_CMD_TYPE_GRAPHICS:
2356 sub_cmd->gfx.depth_usage = PVR_DEPTH_STENCIL_USAGE_UNDEFINED;
2357 sub_cmd->gfx.stencil_usage = PVR_DEPTH_STENCIL_USAGE_UNDEFINED;
2358 sub_cmd->gfx.modifies_depth = false;
2359 sub_cmd->gfx.modifies_stencil = false;
2360 sub_cmd->gfx.max_tiles_in_flight =
2361 PVR_GET_FEATURE_VALUE(&device->pdevice->dev_info,
2362 isp_max_tiles_in_flight,
2363 1);
2364 sub_cmd->gfx.hw_render_idx = state->render_pass_info.current_hw_subpass;
2365 sub_cmd->gfx.framebuffer = state->render_pass_info.framebuffer;
2366 sub_cmd->gfx.empty_cmd = true;
2367
2368 if (state->vis_test_enabled)
2369 sub_cmd->gfx.query_pool = state->query_pool;
2370
2371 pvr_reset_graphics_dirty_state(cmd_buffer, true);
2372
2373 if (pvr_cmd_uses_deferred_cs_cmds(cmd_buffer)) {
2374 pvr_csb_init(device,
2375 PVR_CMD_STREAM_TYPE_GRAPHICS_DEFERRED,
2376 &sub_cmd->gfx.control_stream);
2377 } else {
2378 pvr_csb_init(device,
2379 PVR_CMD_STREAM_TYPE_GRAPHICS,
2380 &sub_cmd->gfx.control_stream);
2381 }
2382
2383 util_dynarray_init(&sub_cmd->gfx.sec_query_indices, NULL);
2384 break;
2385
2386 case PVR_SUB_CMD_TYPE_OCCLUSION_QUERY:
2387 case PVR_SUB_CMD_TYPE_COMPUTE:
2388 pvr_csb_init(device,
2389 PVR_CMD_STREAM_TYPE_COMPUTE,
2390 &sub_cmd->compute.control_stream);
2391 break;
2392
2393 case PVR_SUB_CMD_TYPE_TRANSFER:
2394 sub_cmd->transfer.transfer_cmds = &sub_cmd->transfer.transfer_cmds_priv;
2395 list_inithead(sub_cmd->transfer.transfer_cmds);
2396 break;
2397
2398 case PVR_SUB_CMD_TYPE_EVENT:
2399 break;
2400
2401 default:
2402 unreachable("Unsupported sub-command type");
2403 }
2404
2405 list_addtail(&sub_cmd->link, &cmd_buffer->sub_cmds);
2406 state->current_sub_cmd = sub_cmd;
2407
2408 return VK_SUCCESS;
2409 }
2410
2411 VkResult pvr_cmd_buffer_alloc_mem(struct pvr_cmd_buffer *cmd_buffer,
2412 struct pvr_winsys_heap *heap,
2413 uint64_t size,
2414 struct pvr_suballoc_bo **const pvr_bo_out)
2415 {
2416 const uint32_t cache_line_size =
2417 rogue_get_slc_cache_line_size(&cmd_buffer->device->pdevice->dev_info);
2418 struct pvr_suballoc_bo *suballoc_bo;
2419 struct pvr_suballocator *allocator;
2420 VkResult result;
2421
2422 if (heap == cmd_buffer->device->heaps.general_heap)
2423 allocator = &cmd_buffer->device->suballoc_general;
2424 else if (heap == cmd_buffer->device->heaps.pds_heap)
2425 allocator = &cmd_buffer->device->suballoc_pds;
2426 else if (heap == cmd_buffer->device->heaps.transfer_frag_heap)
2427 allocator = &cmd_buffer->device->suballoc_transfer;
2428 else if (heap == cmd_buffer->device->heaps.usc_heap)
2429 allocator = &cmd_buffer->device->suballoc_usc;
2430 else
2431 unreachable("Unknown heap type");
2432
2433 result =
2434 pvr_bo_suballoc(allocator, size, cache_line_size, false, &suballoc_bo);
2435 if (result != VK_SUCCESS)
2436 return pvr_cmd_buffer_set_error_unwarned(cmd_buffer, result);
2437
2438 list_add(&suballoc_bo->link, &cmd_buffer->bo_list);
2439
2440 *pvr_bo_out = suballoc_bo;
2441
2442 return VK_SUCCESS;
2443 }
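/* Typical usage (an illustrative sketch, not a prescribed pattern): allocate
 * scratch memory that lives as long as the command buffer:
 *
 *    struct pvr_suballoc_bo *bo;
 *    VkResult result =
 *       pvr_cmd_buffer_alloc_mem(cmd_buffer,
 *                                cmd_buffer->device->heaps.general_heap,
 *                                size,
 *                                &bo);
 *
 * The allocation is appended to cmd_buffer->bo_list above, so callers do not
 * free it themselves.
 */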
2444
2445 static void pvr_cmd_bind_compute_pipeline(
2446 const struct pvr_compute_pipeline *const compute_pipeline,
2447 struct pvr_cmd_buffer *const cmd_buffer)
2448 {
2449 cmd_buffer->state.compute_pipeline = compute_pipeline;
2450 cmd_buffer->state.dirty.compute_pipeline_binding = true;
2451 }
2452
2453 static void pvr_cmd_bind_graphics_pipeline(
2454 const struct pvr_graphics_pipeline *const gfx_pipeline,
2455 struct pvr_cmd_buffer *const cmd_buffer)
2456 {
2457 cmd_buffer->state.gfx_pipeline = gfx_pipeline;
2458 cmd_buffer->state.dirty.gfx_pipeline_binding = true;
2459
2460 vk_cmd_set_dynamic_graphics_state(&cmd_buffer->vk,
2461 &gfx_pipeline->dynamic_state);
2462 }
2463
2464 void pvr_CmdBindPipeline(VkCommandBuffer commandBuffer,
2465 VkPipelineBindPoint pipelineBindPoint,
2466 VkPipeline _pipeline)
2467 {
2468 PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
2469 PVR_FROM_HANDLE(pvr_pipeline, pipeline, _pipeline);
2470
2471 switch (pipelineBindPoint) {
2472 case VK_PIPELINE_BIND_POINT_COMPUTE:
2473 pvr_cmd_bind_compute_pipeline(to_pvr_compute_pipeline(pipeline),
2474 cmd_buffer);
2475 break;
2476
2477 case VK_PIPELINE_BIND_POINT_GRAPHICS:
2478 pvr_cmd_bind_graphics_pipeline(to_pvr_graphics_pipeline(pipeline),
2479 cmd_buffer);
2480 break;
2481
2482 default:
2483 unreachable("Invalid bind point.");
2484 break;
2485 }
2486 }
2487
2488 #if defined(DEBUG)
2489 static void check_viewport_quirk_70165(const struct pvr_device *device,
2490 const VkViewport *pViewport)
2491 {
2492 const struct pvr_device_info *dev_info = &device->pdevice->dev_info;
2493 float min_vertex_x, max_vertex_x, min_vertex_y, max_vertex_y;
2494 float min_screen_space_value, max_screen_space_value;
2495 float sign_to_unsigned_offset, fixed_point_max;
2496 float guardband_width, guardband_height;
2497
2498 if (PVR_HAS_FEATURE(dev_info, simple_internal_parameter_format)) {
2499 /* Max representable value in 13.4 fixed point format.
2500 * Round-down to avoid precision issues.
2501 * Calculated as (2 ** 13) - 2*(2 ** -4)
2502 */
2503 fixed_point_max = 8192.0f - 2.0f / 16.0f;
2504
2505 if (PVR_HAS_FEATURE(dev_info, screen_size8K)) {
2506 if (pViewport->width <= 4096 && pViewport->height <= 4096) {
2507 guardband_width = pViewport->width / 4.0f;
2508 guardband_height = pViewport->height / 4.0f;
2509
2510 /* 2k of the range is negative */
2511 sign_to_unsigned_offset = 2048.0f;
2512 } else {
2513 guardband_width = 0.0f;
2514 guardband_height = 0.0f;
2515
2516 /* For > 4k renders, the entire range is positive */
2517 sign_to_unsigned_offset = 0.0f;
2518 }
2519 } else {
2520 guardband_width = pViewport->width / 4.0f;
2521 guardband_height = pViewport->height / 4.0f;
2522
2523 /* 2k of the range is negative */
2524 sign_to_unsigned_offset = 2048.0f;
2525 }
2526 } else {
2527 /* Max representable value in 16.8 fixed point format
2528 * Calculated as (2 ** 16) - (2 ** -8)
2529 */
2530 fixed_point_max = 65535.99609375f;
2531 guardband_width = pViewport->width / 4.0f;
2532 guardband_height = pViewport->height / 4.0f;
2533
2534 /* 4k/20k of the range is negative */
2535 sign_to_unsigned_offset = (float)PVR_MAX_NEG_OFFSCREEN_OFFSET;
2536 }
2537
2538 min_screen_space_value = -sign_to_unsigned_offset;
2539 max_screen_space_value = fixed_point_max - sign_to_unsigned_offset;
2540
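/* Worked example (illustrative viewport only): on a core with
 * simple_internal_parameter_format and screen_size8K, a 4096x4096 viewport at
 * the origin gets a 1024-pixel guardband and a 2048 offset, so the screen
 * space range is [-2048, 6143.875]. max_vertex_x = 0 + 4096 + 1024 = 5120,
 * which is in range, so no warning is emitted.
 */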
2541 min_vertex_x = pViewport->x - guardband_width;
2542 max_vertex_x = pViewport->x + pViewport->width + guardband_width;
2543 min_vertex_y = pViewport->y - guardband_height;
2544 max_vertex_y = pViewport->y + pViewport->height + guardband_height;
2545 if (min_vertex_x < min_screen_space_value ||
2546 max_vertex_x > max_screen_space_value ||
2547 min_vertex_y < min_screen_space_value ||
2548 max_vertex_y > max_screen_space_value) {
2549 mesa_logw("Viewport is affected by BRN70165, geometry outside "
2550 "the viewport could be corrupted");
2551 }
2552 }
2553 #endif
2554
2555 void pvr_CmdSetViewport(VkCommandBuffer commandBuffer,
2556 uint32_t firstViewport,
2557 uint32_t viewportCount,
2558 const VkViewport *pViewports)
2559 {
2560 PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
2561 const uint32_t total_count = firstViewport + viewportCount;
2562
2563 assert(firstViewport < PVR_MAX_VIEWPORTS && viewportCount > 0);
2564 assert(total_count >= 1 && total_count <= PVR_MAX_VIEWPORTS);
2565
2566 PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer);
2567
2568 #if defined(DEBUG)
2569 if (PVR_HAS_QUIRK(&cmd_buffer->device->pdevice->dev_info, 70165)) {
2570 for (uint32_t viewport = 0; viewport < viewportCount; viewport++) {
2571 check_viewport_quirk_70165(cmd_buffer->device, &pViewports[viewport]);
2572 }
2573 }
2574 #endif
2575
2576 vk_common_CmdSetViewport(commandBuffer,
2577 firstViewport,
2578 viewportCount,
2579 pViewports);
2580 }
2581
2582 void pvr_CmdSetDepthBounds(VkCommandBuffer commandBuffer,
2583 float minDepthBounds,
2584 float maxDepthBounds)
2585 {
2586 mesa_logd("No support for depth bounds testing.");
2587 }
2588
2589 void pvr_CmdBindDescriptorSets(VkCommandBuffer commandBuffer,
2590 VkPipelineBindPoint pipelineBindPoint,
2591 VkPipelineLayout _layout,
2592 uint32_t firstSet,
2593 uint32_t descriptorSetCount,
2594 const VkDescriptorSet *pDescriptorSets,
2595 uint32_t dynamicOffsetCount,
2596 const uint32_t *pDynamicOffsets)
2597 {
2598 PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
2599 struct pvr_descriptor_state *descriptor_state;
2600
2601 assert(firstSet + descriptorSetCount <= PVR_MAX_DESCRIPTOR_SETS);
2602
2603 PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer);
2604
2605 switch (pipelineBindPoint) {
2606 case VK_PIPELINE_BIND_POINT_GRAPHICS:
2607 case VK_PIPELINE_BIND_POINT_COMPUTE:
2608 break;
2609
2610 default:
2611 unreachable("Unsupported bind point.");
2612 break;
2613 }
2614
2615 if (pipelineBindPoint == VK_PIPELINE_BIND_POINT_GRAPHICS) {
2616 descriptor_state = &cmd_buffer->state.gfx_desc_state;
2617 cmd_buffer->state.dirty.gfx_desc_dirty = true;
2618 } else {
2619 descriptor_state = &cmd_buffer->state.compute_desc_state;
2620 cmd_buffer->state.dirty.compute_desc_dirty = true;
2621 }
2622
2623 for (uint32_t i = 0; i < descriptorSetCount; i++) {
2624 PVR_FROM_HANDLE(pvr_descriptor_set, set, pDescriptorSets[i]);
2625 uint32_t index = firstSet + i;
2626
2627 if (descriptor_state->descriptor_sets[index] != set) {
2628 descriptor_state->descriptor_sets[index] = set;
2629 descriptor_state->valid_mask |= (1u << index);
2630 }
2631 }
2632
2633 if (dynamicOffsetCount > 0) {
2634 PVR_FROM_HANDLE(pvr_pipeline_layout, pipeline_layout, _layout);
2635 uint32_t set_offset = 0;
2636
2637 for (uint32_t set = 0; set < firstSet; set++)
2638 set_offset += pipeline_layout->set_layout[set]->dynamic_buffer_count;
2639
2640 assert(set_offset + dynamicOffsetCount <=
2641 ARRAY_SIZE(descriptor_state->dynamic_offsets));
2642
2643 /* From the Vulkan 1.3.238 spec. :
2644 *
2645 * "If any of the sets being bound include dynamic uniform or storage
2646 * buffers, then pDynamicOffsets includes one element for each array
2647 * element in each dynamic descriptor type binding in each set."
2648 *
2649 */
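/* Illustrative example (hypothetical pipeline layout): if set 0 declares one
 * dynamic buffer and set 1 declares two, then binding with firstSet = 1 and
 * dynamicOffsetCount = 2 computes set_offset = 1 below and writes
 * dynamic_offsets[1] and dynamic_offsets[2].
 */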
2650 for (uint32_t i = 0; i < dynamicOffsetCount; i++)
2651 descriptor_state->dynamic_offsets[set_offset + i] = pDynamicOffsets[i];
2652 }
2653 }
2654
2655 void pvr_CmdBindVertexBuffers(VkCommandBuffer commandBuffer,
2656 uint32_t firstBinding,
2657 uint32_t bindingCount,
2658 const VkBuffer *pBuffers,
2659 const VkDeviceSize *pOffsets)
2660 {
2661 PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
2662 struct pvr_vertex_binding *const vb = cmd_buffer->state.vertex_bindings;
2663
2664 /* We have to defer setting up the vertex buffers since we need the buffer
2665 * stride from the pipeline.
2666 */
2667
2668 assert(firstBinding < PVR_MAX_VERTEX_INPUT_BINDINGS &&
2669 bindingCount <= PVR_MAX_VERTEX_INPUT_BINDINGS);
2670
2671 PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer);
2672
2673 for (uint32_t i = 0; i < bindingCount; i++) {
2674 vb[firstBinding + i].buffer = pvr_buffer_from_handle(pBuffers[i]);
2675 vb[firstBinding + i].offset = pOffsets[i];
2676 }
2677
2678 cmd_buffer->state.dirty.vertex_bindings = true;
2679 }
2680
2681 void pvr_CmdBindIndexBuffer(VkCommandBuffer commandBuffer,
2682 VkBuffer buffer,
2683 VkDeviceSize offset,
2684 VkIndexType indexType)
2685 {
2686 PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
2687 PVR_FROM_HANDLE(pvr_buffer, index_buffer, buffer);
2688 struct pvr_cmd_buffer_state *const state = &cmd_buffer->state;
2689
2690 assert(offset < index_buffer->vk.size);
2691 assert(indexType == VK_INDEX_TYPE_UINT32 ||
2692 indexType == VK_INDEX_TYPE_UINT16);
2693
2694 PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer);
2695
2696 state->index_buffer_binding.buffer = index_buffer;
2697 state->index_buffer_binding.offset = offset;
2698 state->index_buffer_binding.type = indexType;
2699 state->dirty.index_buffer_binding = true;
2700 }
2701
2702 void pvr_CmdPushConstants(VkCommandBuffer commandBuffer,
2703 VkPipelineLayout layout,
2704 VkShaderStageFlags stageFlags,
2705 uint32_t offset,
2706 uint32_t size,
2707 const void *pValues)
2708 {
2709 #if defined(DEBUG)
2710 const uint64_t ending = (uint64_t)offset + (uint64_t)size;
2711 #endif
2712
2713 PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
2714 struct pvr_cmd_buffer_state *const state = &cmd_buffer->state;
2715
2716 PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer);
2717
2718 pvr_assert(ending <= PVR_MAX_PUSH_CONSTANTS_SIZE);
2719
2720 memcpy(&state->push_constants.data[offset], pValues, size);
2721
2722 state->push_constants.dirty_stages |= stageFlags;
2723 state->push_constants.uploaded = false;
2724 }
2725
2726 static VkResult
2727 pvr_cmd_buffer_setup_attachments(struct pvr_cmd_buffer *cmd_buffer,
2728 const struct pvr_render_pass *pass,
2729 const struct pvr_framebuffer *framebuffer)
2730 {
2731 struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
2732 struct pvr_render_pass_info *info = &state->render_pass_info;
2733
2734 assert(pass->attachment_count == framebuffer->attachment_count);
2735
2736 /* Free any previously allocated attachments. */
2737 vk_free(&cmd_buffer->vk.pool->alloc, state->render_pass_info.attachments);
2738
2739 if (pass->attachment_count == 0) {
2740 info->attachments = NULL;
2741 return VK_SUCCESS;
2742 }
2743
2744 info->attachments =
2745 vk_zalloc(&cmd_buffer->vk.pool->alloc,
2746 pass->attachment_count * sizeof(*info->attachments),
2747 8,
2748 VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
2749 if (!info->attachments) {
2750 return vk_command_buffer_set_error(&cmd_buffer->vk,
2751 VK_ERROR_OUT_OF_HOST_MEMORY);
2752 }
2753
2754 for (uint32_t i = 0; i < pass->attachment_count; i++)
2755 info->attachments[i] = framebuffer->attachments[i];
2756
2757 return VK_SUCCESS;
2758 }
2759
2760 static VkResult pvr_init_render_targets(struct pvr_device *device,
2761 struct pvr_render_pass *pass,
2762 struct pvr_framebuffer *framebuffer)
2763 {
2764 for (uint32_t i = 0; i < pass->hw_setup->render_count; i++) {
2765 struct pvr_render_target *render_target =
2766 pvr_get_render_target(pass, framebuffer, i);
2767
2768 pthread_mutex_lock(&render_target->mutex);
2769
2770 if (!render_target->valid) {
2771 const struct pvr_renderpass_hwsetup_render *hw_render =
2772 &pass->hw_setup->renders[i];
2773 VkResult result;
2774
2775 result = pvr_render_target_dataset_create(device,
2776 framebuffer->width,
2777 framebuffer->height,
2778 hw_render->sample_count,
2779 framebuffer->layers,
2780 &render_target->rt_dataset);
2781 if (result != VK_SUCCESS) {
2782 pthread_mutex_unlock(&render_target->mutex);
2783 return result;
2784 }
2785
2786 render_target->valid = true;
2787 }
2788
2789 pthread_mutex_unlock(&render_target->mutex);
2790 }
2791
2792 return VK_SUCCESS;
2793 }
2794
2795 const struct pvr_renderpass_hwsetup_subpass *
2796 pvr_get_hw_subpass(const struct pvr_render_pass *pass, const uint32_t subpass)
2797 {
2798 const struct pvr_renderpass_hw_map *map =
2799 &pass->hw_setup->subpass_map[subpass];
2800
2801 return &pass->hw_setup->renders[map->render].subpasses[map->subpass];
2802 }
2803
2804 static void pvr_perform_start_of_render_attachment_clear(
2805 struct pvr_cmd_buffer *cmd_buffer,
2806 const struct pvr_framebuffer *framebuffer,
2807 uint32_t index,
2808 bool is_depth_stencil,
2809 uint32_t *index_list_clear_mask)
2810 {
2811 ASSERTED static const VkImageAspectFlags dsc_aspect_flags =
2812 VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT |
2813 VK_IMAGE_ASPECT_COLOR_BIT;
2814 struct pvr_render_pass_info *info = &cmd_buffer->state.render_pass_info;
2815 const struct pvr_render_pass *pass = info->pass;
2816 const struct pvr_renderpass_hwsetup *hw_setup = pass->hw_setup;
2817 const struct pvr_renderpass_hwsetup_render *hw_render =
2818 &hw_setup->renders[hw_setup->subpass_map[info->subpass_idx].render];
2819 VkImageAspectFlags image_aspect;
2820 struct pvr_image_view *iview;
2821 uint32_t view_idx;
2822
2823 if (is_depth_stencil) {
2824 bool stencil_clear;
2825 bool depth_clear;
2826 bool is_stencil;
2827 bool is_depth;
2828
2829 assert(hw_render->ds_attach_idx != VK_ATTACHMENT_UNUSED);
2830 assert(index == 0);
2831
2832 view_idx = hw_render->ds_attach_idx;
2833
2834 is_depth = vk_format_has_depth(pass->attachments[view_idx].vk_format);
2835 is_stencil = vk_format_has_stencil(pass->attachments[view_idx].vk_format);
2836 depth_clear = hw_render->depth_init == VK_ATTACHMENT_LOAD_OP_CLEAR;
2837 stencil_clear = hw_render->stencil_init == VK_ATTACHMENT_LOAD_OP_CLEAR;
2838
2839 /* Attempt to clear the ds attachment. Do not erroneously discard an
2840 * attachment that has no depth clear but has a stencil attachment.
2841 */
2842 /* i.e. skip unless (is_depth && depth_clear) || (is_stencil && stencil_clear) */
2843 if (!((is_depth && depth_clear) || (is_stencil && stencil_clear)))
2844 return;
2845 } else if (hw_render->color_init[index].op != VK_ATTACHMENT_LOAD_OP_CLEAR) {
2846 return;
2847 } else {
2848 view_idx = hw_render->color_init[index].index;
2849 }
2850
2851 iview = info->attachments[view_idx];
2852
2853 /* FIXME: It would be nice if this function and pvr_sub_cmd_gfx_job_init()
2854 * were doing the same check (even if it's just an assert) to determine if a
2855 * clear is needed.
2856 */
2857 /* If this is single-layer fullscreen, we already do the clears in
2858 * pvr_sub_cmd_gfx_job_init().
2859 */
2860 if (pvr_is_render_area_tile_aligned(cmd_buffer, iview) &&
2861 framebuffer->layers == 1) {
2862 return;
2863 }
2864
2865 image_aspect = vk_format_aspects(pass->attachments[view_idx].vk_format);
2866 assert((image_aspect & ~dsc_aspect_flags) == 0);
2867
2868 if (image_aspect & VK_IMAGE_ASPECT_DEPTH_BIT &&
2869 hw_render->depth_init != VK_ATTACHMENT_LOAD_OP_CLEAR) {
2870 image_aspect &= ~VK_IMAGE_ASPECT_DEPTH_BIT;
2871 }
2872
2873 if (image_aspect & VK_IMAGE_ASPECT_STENCIL_BIT &&
2874 hw_render->stencil_init != VK_ATTACHMENT_LOAD_OP_CLEAR) {
2875 image_aspect &= ~VK_IMAGE_ASPECT_STENCIL_BIT;
2876 }
2877
2878 if (image_aspect != VK_IMAGE_ASPECT_NONE) {
2879 VkClearAttachment clear_attachment = {
2880 .aspectMask = image_aspect,
2881 .colorAttachment = index,
2882 .clearValue = info->clear_values[view_idx],
2883 };
2884 VkClearRect rect = {
2885 .rect = info->render_area,
2886 .baseArrayLayer = 0,
2887 .layerCount = info->framebuffer->layers,
2888 };
2889
2890 assert(view_idx < info->clear_value_count);
2891
2892 pvr_clear_attachments_render_init(cmd_buffer, &clear_attachment, &rect);
2893
2894 *index_list_clear_mask |= (1 << index);
2895 }
2896 }
2897
2898 static void
2899 pvr_perform_start_of_render_clears(struct pvr_cmd_buffer *cmd_buffer)
2900 {
2901 struct pvr_render_pass_info *info = &cmd_buffer->state.render_pass_info;
2902 const struct pvr_framebuffer *framebuffer = info->framebuffer;
2903 const struct pvr_render_pass *pass = info->pass;
2904 const struct pvr_renderpass_hwsetup *hw_setup = pass->hw_setup;
2905 const struct pvr_renderpass_hwsetup_render *hw_render =
2906 &hw_setup->renders[hw_setup->subpass_map[info->subpass_idx].render];
2907
2908 /* Mask of attachments that are cleared using index lists instead of the
2909 * background object.
2910 */
2911 uint32_t index_list_clear_mask = 0;
2912
2913 for (uint32_t i = 0; i < hw_render->color_init_count; i++) {
2914 pvr_perform_start_of_render_attachment_clear(cmd_buffer,
2915 framebuffer,
2916 i,
2917 false,
2918 &index_list_clear_mask);
2919 }
2920
2921 info->enable_bg_tag = !!hw_render->color_init_count;
2922
2923 /* If we're not using index lists for all clears/loads, then we need to run
2924 * the background object on empty tiles.
2925 */
2926 if (hw_render->color_init_count &&
2927 index_list_clear_mask != ((1u << hw_render->color_init_count) - 1u)) {
2928 info->process_empty_tiles = true;
2929 } else {
2930 info->process_empty_tiles = false;
2931 }
2932
2933 if (hw_render->ds_attach_idx != VK_ATTACHMENT_UNUSED) {
2934 uint32_t ds_index_list = 0;
2935
2936 pvr_perform_start_of_render_attachment_clear(cmd_buffer,
2937 framebuffer,
2938 0,
2939 true,
2940 &ds_index_list);
2941 }
2942
2943 if (index_list_clear_mask)
2944 pvr_finishme("Add support for generating loadops shaders!");
2945 }
2946
2947 static void pvr_stash_depth_format(struct pvr_cmd_buffer_state *state,
2948 struct pvr_sub_cmd_gfx *const sub_cmd)
2949 {
2950 const struct pvr_render_pass *pass = state->render_pass_info.pass;
2951 const struct pvr_renderpass_hwsetup_render *hw_render =
2952 &pass->hw_setup->renders[sub_cmd->hw_render_idx];
2953
2954 if (hw_render->ds_attach_idx != VK_ATTACHMENT_UNUSED) {
2955 struct pvr_image_view **iviews = state->render_pass_info.attachments;
2956
2957 state->depth_format = iviews[hw_render->ds_attach_idx]->vk.format;
2958 }
2959 }
2960
2961 static bool pvr_loadops_contain_clear(struct pvr_renderpass_hwsetup *hw_setup)
2962 {
2963 for (uint32_t i = 0; i < hw_setup->render_count; i++) {
2964 struct pvr_renderpass_hwsetup_render *hw_render = &hw_setup->renders[i];
2965 uint32_t render_targets_count = hw_render->init_setup.num_render_targets;
2966
2967 for (uint32_t j = 0;
2968 j < (hw_render->color_init_count * render_targets_count);
2969 j += render_targets_count) {
2970 for (uint32_t k = 0; k < hw_render->init_setup.num_render_targets;
2971 k++) {
2972 if (hw_render->color_init[j + k].op ==
2973 VK_ATTACHMENT_LOAD_OP_CLEAR) {
2974 return true;
2975 }
2976 }
2977 }
2978 if (hw_render->depth_init == VK_ATTACHMENT_LOAD_OP_CLEAR ||
2979 hw_render->stencil_init == VK_ATTACHMENT_LOAD_OP_CLEAR) {
2980 return true;
2981 }
2982 }
2983
2984 return false;
2985 }
2986
2987 static VkResult
2988 pvr_cmd_buffer_set_clear_values(struct pvr_cmd_buffer *cmd_buffer,
2989 const VkRenderPassBeginInfo *pRenderPassBegin)
2990 {
2991 struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
2992
2993 /* Free any previously allocated clear values. */
2994 vk_free(&cmd_buffer->vk.pool->alloc, state->render_pass_info.clear_values);
2995
2996 if (pRenderPassBegin->clearValueCount) {
2997 const size_t size = pRenderPassBegin->clearValueCount *
2998 sizeof(*state->render_pass_info.clear_values);
2999
3000 state->render_pass_info.clear_values =
3001 vk_zalloc(&cmd_buffer->vk.pool->alloc,
3002 size,
3003 8,
3004 VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
3005 if (!state->render_pass_info.clear_values) {
3006 return vk_command_buffer_set_error(&cmd_buffer->vk,
3007 VK_ERROR_OUT_OF_HOST_MEMORY);
3008 }
3009
3010 memcpy(state->render_pass_info.clear_values,
3011 pRenderPassBegin->pClearValues,
3012 size);
3013 } else {
3014 state->render_pass_info.clear_values = NULL;
3015 }
3016
3017 state->render_pass_info.clear_value_count =
3018 pRenderPassBegin->clearValueCount;
3019
3020 return VK_SUCCESS;
3021 }
3022
3023 /**
3024 * \brief Indicates whether to use the large or normal clear state words.
3025 *
3026 * If the current render area can fit within a quarter of the max framebuffer
3027 * that the device is capable of, we can use the normal clear state words;
3028 * otherwise the large clear state words are needed.
3029 *
3030 * The requirement of a quarter of the max framebuffer comes from the index
3031 * count used in the normal clear state words and the vertices uploaded at
3032 * device creation.
3033 *
3034 * \param[in] cmd_buffer The command buffer for the clear.
3035 * \return true if large clear state words are required.
3036 */
3037 static bool
3038 pvr_is_large_clear_required(const struct pvr_cmd_buffer *const cmd_buffer)
3039 {
3040 const struct pvr_device_info *const dev_info =
3041 &cmd_buffer->device->pdevice->dev_info;
3042 const VkRect2D render_area = cmd_buffer->state.render_pass_info.render_area;
3043 const uint32_t vf_max_x = rogue_get_param_vf_max_x(dev_info);
3044 const uint32_t vf_max_y = rogue_get_param_vf_max_x(dev_info);
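/* The X limit is reused for Y above, which appears to assume the hardware's
 * maximum vertex-format extents are square. For example, if
 * rogue_get_param_vf_max_x() reported 16384 (an illustrative figure, not a
 * real device limit), render areas up to 8191x8191 would use the normal
 * clear state words and anything larger the large variant.
 */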
3045
3046 return (render_area.extent.width > (vf_max_x / 2) - 1) ||
3047 (render_area.extent.height > (vf_max_y / 2) - 1);
3048 }
3049
3050 static void pvr_emit_clear_words(struct pvr_cmd_buffer *const cmd_buffer,
3051 struct pvr_sub_cmd_gfx *const sub_cmd)
3052 {
3053 struct pvr_device *device = cmd_buffer->device;
3054 struct pvr_csb *csb = &sub_cmd->control_stream;
3055 uint32_t vdm_state_size_in_dw;
3056 const uint32_t *vdm_state;
3057 uint32_t *stream;
3058
3059 vdm_state_size_in_dw =
3060 pvr_clear_vdm_state_get_size_in_dw(&device->pdevice->dev_info, 1);
3061
3062 pvr_csb_set_relocation_mark(csb);
3063
3064 stream = pvr_csb_alloc_dwords(csb, vdm_state_size_in_dw);
3065 if (!stream) {
3066 pvr_cmd_buffer_set_error_unwarned(cmd_buffer, csb->status);
3067 return;
3068 }
3069
3070 if (pvr_is_large_clear_required(cmd_buffer))
3071 vdm_state = device->static_clear_state.large_clear_vdm_words;
3072 else
3073 vdm_state = device->static_clear_state.vdm_words;
3074
3075 memcpy(stream, vdm_state, PVR_DW_TO_BYTES(vdm_state_size_in_dw));
3076
3077 pvr_csb_clear_relocation_mark(csb);
3078 }
3079
3080 static VkResult pvr_cs_write_load_op(struct pvr_cmd_buffer *cmd_buffer,
3081 struct pvr_sub_cmd_gfx *sub_cmd,
3082 struct pvr_load_op *load_op,
3083 uint32_t isp_userpass)
3084 {
3085 const struct pvr_device *device = cmd_buffer->device;
3086 struct pvr_static_clear_ppp_template template =
3087 device->static_clear_state.ppp_templates[VK_IMAGE_ASPECT_COLOR_BIT];
3088 uint32_t pds_state[PVR_STATIC_CLEAR_PDS_STATE_COUNT];
3089 struct pvr_pds_upload shareds_update_program;
3090 struct pvr_suballoc_bo *pvr_bo;
3091 VkResult result;
3092
3093 result = pvr_load_op_data_create_and_upload(cmd_buffer,
3094 load_op,
3095 &shareds_update_program);
3096 if (result != VK_SUCCESS)
3097 return result;
3098
3099 template.config.ispctl.upass = isp_userpass;
3100
3101 /* It might look odd that we aren't specifying the code segment's
3102 * address anywhere. This is because the hardware always assumes that the
3103 * data size is 2 128-bit words and the code segment starts after that.
3104 */
3105 pvr_csb_pack (&pds_state[PVR_STATIC_CLEAR_PPP_PDS_TYPE_SHADERBASE],
3106 TA_STATE_PDS_SHADERBASE,
3107 shaderbase) {
3108 shaderbase.addr = PVR_DEV_ADDR(load_op->pds_frag_prog.data_offset);
3109 }
3110
3111 pvr_csb_pack (&pds_state[PVR_STATIC_CLEAR_PPP_PDS_TYPE_TEXUNICODEBASE],
3112 TA_STATE_PDS_TEXUNICODEBASE,
3113 texunicodebase) {
3114 texunicodebase.addr =
3115 PVR_DEV_ADDR(load_op->pds_tex_state_prog.code_offset);
3116 }
3117
3118 pvr_csb_pack (&pds_state[PVR_STATIC_CLEAR_PPP_PDS_TYPE_SIZEINFO1],
3119 TA_STATE_PDS_SIZEINFO1,
3120 sizeinfo1) {
3121 /* Dummy coefficient loading program. */
3122 sizeinfo1.pds_varyingsize = 0;
3123
3124 sizeinfo1.pds_texturestatesize = DIV_ROUND_UP(
3125 shareds_update_program.data_size,
3126 PVRX(TA_STATE_PDS_SIZEINFO1_PDS_TEXTURESTATESIZE_UNIT_SIZE));
3127
3128 sizeinfo1.pds_tempsize =
3129 DIV_ROUND_UP(load_op->temps_count,
3130 PVRX(TA_STATE_PDS_SIZEINFO1_PDS_TEMPSIZE_UNIT_SIZE));
3131 }
3132
3133 pvr_csb_pack (&pds_state[PVR_STATIC_CLEAR_PPP_PDS_TYPE_SIZEINFO2],
3134 TA_STATE_PDS_SIZEINFO2,
3135 sizeinfo2) {
3136 sizeinfo2.usc_sharedsize =
3137 DIV_ROUND_UP(load_op->const_shareds_count,
3138 PVRX(TA_STATE_PDS_SIZEINFO2_USC_SHAREDSIZE_UNIT_SIZE));
3139 }
3140
3141 /* Dummy coefficient loading program. */
3142 pds_state[PVR_STATIC_CLEAR_PPP_PDS_TYPE_VARYINGBASE] = 0;
3143
3144 pvr_csb_pack (&pds_state[PVR_STATIC_CLEAR_PPP_PDS_TYPE_TEXTUREDATABASE],
3145 TA_STATE_PDS_TEXTUREDATABASE,
3146 texturedatabase) {
3147 texturedatabase.addr = PVR_DEV_ADDR(shareds_update_program.data_offset);
3148 }
3149
3150 template.config.pds_state = &pds_state;
3151
3152 pvr_emit_ppp_from_template(&sub_cmd->control_stream, &template, &pvr_bo);
3153 list_add(&pvr_bo->link, &cmd_buffer->bo_list);
3154
3155 pvr_emit_clear_words(cmd_buffer, sub_cmd);
3156
3157 pvr_reset_graphics_dirty_state(cmd_buffer, false);
3158
3159 return VK_SUCCESS;
3160 }
3161
3162 void pvr_CmdBeginRenderPass2(VkCommandBuffer commandBuffer,
3163 const VkRenderPassBeginInfo *pRenderPassBeginInfo,
3164 const VkSubpassBeginInfo *pSubpassBeginInfo)
3165 {
3166 PVR_FROM_HANDLE(pvr_framebuffer,
3167 framebuffer,
3168 pRenderPassBeginInfo->framebuffer);
3169 PVR_FROM_HANDLE(pvr_render_pass, pass, pRenderPassBeginInfo->renderPass);
3170 PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
3171 const struct pvr_renderpass_hwsetup_subpass *hw_subpass;
3172 struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
3173 VkResult result;
3174
3175 PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer);
3176
3177 assert(!state->render_pass_info.pass);
3178 assert(cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_PRIMARY);
3179
3180 /* FIXME: Create a separate function for everything using pass->subpasses,
3181 * look at cmd_buffer_begin_subpass() for example. */
3182 state->render_pass_info.pass = pass;
3183 state->render_pass_info.framebuffer = framebuffer;
3184 state->render_pass_info.subpass_idx = 0;
3185 state->render_pass_info.render_area = pRenderPassBeginInfo->renderArea;
3186 state->render_pass_info.current_hw_subpass = 0;
3187 state->render_pass_info.pipeline_bind_point =
3188 pass->subpasses[0].pipeline_bind_point;
3189 state->render_pass_info.isp_userpass = pass->subpasses[0].isp_userpass;
3190 state->dirty.isp_userpass = true;
3191
3192 result = pvr_cmd_buffer_setup_attachments(cmd_buffer, pass, framebuffer);
3193 if (result != VK_SUCCESS)
3194 return;
3195
3196 result = pvr_init_render_targets(cmd_buffer->device, pass, framebuffer);
3197 if (result != VK_SUCCESS) {
3198 pvr_cmd_buffer_set_error_unwarned(cmd_buffer, result);
3199 return;
3200 }
3201
3202 result = pvr_cmd_buffer_set_clear_values(cmd_buffer, pRenderPassBeginInfo);
3203 if (result != VK_SUCCESS)
3204 return;
3205
3206 assert(pass->subpasses[0].pipeline_bind_point ==
3207 VK_PIPELINE_BIND_POINT_GRAPHICS);
3208
3209 result = pvr_cmd_buffer_start_sub_cmd(cmd_buffer, PVR_SUB_CMD_TYPE_GRAPHICS);
3210 if (result != VK_SUCCESS)
3211 return;
3212
3213 /* Run subpass 0 "soft" background object after the actual background
3214 * object.
3215 */
3216 hw_subpass = pvr_get_hw_subpass(pass, 0);
3217 if (hw_subpass->load_op) {
3218 result = pvr_cs_write_load_op(cmd_buffer,
3219 &cmd_buffer->state.current_sub_cmd->gfx,
3220 hw_subpass->load_op,
3221 0);
3222 if (result != VK_SUCCESS)
3223 return;
3224 }
3225
3226 pvr_perform_start_of_render_clears(cmd_buffer);
3227 pvr_stash_depth_format(&cmd_buffer->state,
3228 &cmd_buffer->state.current_sub_cmd->gfx);
3229 }
3230
3231 VkResult pvr_BeginCommandBuffer(VkCommandBuffer commandBuffer,
3232 const VkCommandBufferBeginInfo *pBeginInfo)
3233 {
3234 PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
3235 struct pvr_cmd_buffer_state *state;
3236 VkResult result;
3237
3238 vk_command_buffer_begin(&cmd_buffer->vk, pBeginInfo);
3239
3240 cmd_buffer->usage_flags = pBeginInfo->flags;
3241 state = &cmd_buffer->state;
3242
3243 /* VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT must be ignored for
3244 * primary level command buffers.
3245 *
3246 * From the Vulkan 1.0 spec:
3247 *
3248 * VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT specifies that a
3249 * secondary command buffer is considered to be entirely inside a render
3250 * pass. If this is a primary command buffer, then this bit is ignored.
3251 */
3252 if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_PRIMARY) {
3253 cmd_buffer->usage_flags &=
3254 ~VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT;
3255 }
3256
3257 if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY) {
3258 if (cmd_buffer->usage_flags &
3259 VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT) {
3260 const VkCommandBufferInheritanceInfo *inheritance_info =
3261 pBeginInfo->pInheritanceInfo;
3262 struct pvr_render_pass *pass;
3263
3264 pass = pvr_render_pass_from_handle(inheritance_info->renderPass);
3265 state->render_pass_info.pass = pass;
3266 state->render_pass_info.framebuffer =
3267 pvr_framebuffer_from_handle(inheritance_info->framebuffer);
3268 state->render_pass_info.subpass_idx = inheritance_info->subpass;
3269 state->render_pass_info.isp_userpass =
3270 pass->subpasses[inheritance_info->subpass].isp_userpass;
3271
3272 result =
3273 pvr_cmd_buffer_start_sub_cmd(cmd_buffer, PVR_SUB_CMD_TYPE_GRAPHICS);
3274 if (result != VK_SUCCESS)
3275 return result;
3276
3277 state->vis_test_enabled = inheritance_info->occlusionQueryEnable;
3278 }
3279
3280 state->dirty.isp_userpass = true;
3281 }
3282
3283 util_dynarray_init(&state->query_indices, NULL);
3284
3285 memset(state->barriers_needed,
3286 0xFF,
3287 sizeof(*state->barriers_needed) * ARRAY_SIZE(state->barriers_needed));
3288
3289 return VK_SUCCESS;
3290 }
3291
3292 VkResult pvr_cmd_buffer_add_transfer_cmd(struct pvr_cmd_buffer *cmd_buffer,
3293 struct pvr_transfer_cmd *transfer_cmd)
3294 {
3295 struct pvr_sub_cmd_transfer *sub_cmd;
3296 VkResult result;
3297
3298 result = pvr_cmd_buffer_start_sub_cmd(cmd_buffer, PVR_SUB_CMD_TYPE_TRANSFER);
3299 if (result != VK_SUCCESS)
3300 return result;
3301
3302 sub_cmd = &cmd_buffer->state.current_sub_cmd->transfer;
3303
3304 list_addtail(&transfer_cmd->link, sub_cmd->transfer_cmds);
3305
3306 return VK_SUCCESS;
3307 }
3308
3309 static VkResult
3310 pvr_setup_vertex_buffers(struct pvr_cmd_buffer *cmd_buffer,
3311 const struct pvr_graphics_pipeline *const gfx_pipeline)
3312 {
3313 const struct pvr_vertex_shader_state *const vertex_state =
3314 &gfx_pipeline->shader_state.vertex;
3315 struct pvr_cmd_buffer_state *const state = &cmd_buffer->state;
3316 const struct pvr_pds_info *const pds_info = state->pds_shader.info;
3317 struct pvr_suballoc_bo *pvr_bo;
3318 const uint8_t *entries;
3319 uint32_t *dword_buffer;
3320 uint64_t *qword_buffer;
3321 VkResult result;
3322
3323 result =
3324 pvr_cmd_buffer_alloc_mem(cmd_buffer,
3325 cmd_buffer->device->heaps.pds_heap,
3326 PVR_DW_TO_BYTES(pds_info->data_size_in_dwords),
3327 &pvr_bo);
3328 if (result != VK_SUCCESS)
3329 return result;
3330
3331 dword_buffer = (uint32_t *)pvr_bo_suballoc_get_map_addr(pvr_bo);
3332 qword_buffer = (uint64_t *)pvr_bo_suballoc_get_map_addr(pvr_bo);
3333
3334 entries = (uint8_t *)pds_info->entries;
3335
3336 for (uint32_t i = 0; i < pds_info->entry_count; i++) {
3337 const struct pvr_const_map_entry *const entry_header =
3338 (struct pvr_const_map_entry *)entries;
3339
3340 switch (entry_header->type) {
3341 case PVR_PDS_CONST_MAP_ENTRY_TYPE_LITERAL32: {
3342 const struct pvr_const_map_entry_literal32 *const literal =
3343 (struct pvr_const_map_entry_literal32 *)entries;
3344
3345 PVR_WRITE(dword_buffer,
3346 literal->literal_value,
3347 literal->const_offset,
3348 pds_info->data_size_in_dwords);
3349
3350 entries += sizeof(*literal);
3351 break;
3352 }
3353
3354 case PVR_PDS_CONST_MAP_ENTRY_TYPE_DOUTU_ADDRESS: {
3355 const struct pvr_const_map_entry_doutu_address *const doutu_addr =
3356 (struct pvr_const_map_entry_doutu_address *)entries;
3357 const pvr_dev_addr_t exec_addr =
3358 PVR_DEV_ADDR_OFFSET(vertex_state->bo->dev_addr,
3359 vertex_state->entry_offset);
3360 uint64_t addr = 0ULL;
3361
3362 pvr_set_usc_execution_address64(&addr, exec_addr.addr);
3363
3364 PVR_WRITE(qword_buffer,
3365 addr | doutu_addr->doutu_control,
3366 doutu_addr->const_offset,
3367 pds_info->data_size_in_dwords);
3368
3369 entries += sizeof(*doutu_addr);
3370 break;
3371 }
3372
3373 case PVR_PDS_CONST_MAP_ENTRY_TYPE_BASE_INSTANCE: {
3374 const struct pvr_const_map_entry_base_instance *const base_instance =
3375 (struct pvr_const_map_entry_base_instance *)entries;
3376
3377 PVR_WRITE(dword_buffer,
3378 state->draw_state.base_instance,
3379 base_instance->const_offset,
3380 pds_info->data_size_in_dwords);
3381
3382 entries += sizeof(*base_instance);
3383 break;
3384 }
3385
3386 case PVR_PDS_CONST_MAP_ENTRY_TYPE_BASE_VERTEX: {
3387 const struct pvr_const_map_entry_base_instance *const base_instance =
3388 (struct pvr_const_map_entry_base_instance *)entries;
3389
3390 PVR_WRITE(dword_buffer,
3391 state->draw_state.base_vertex,
3392 base_instance->const_offset,
3393 pds_info->data_size_in_dwords);
3394
3395 entries += sizeof(*base_instance);
3396 break;
3397 }
3398
3399 case PVR_PDS_CONST_MAP_ENTRY_TYPE_VERTEX_ATTRIBUTE_ADDRESS: {
3400 const struct pvr_const_map_entry_vertex_attribute_address
3401 *const attribute =
3402 (struct pvr_const_map_entry_vertex_attribute_address *)entries;
3403 const struct pvr_vertex_binding *const binding =
3404 &state->vertex_bindings[attribute->binding_index];
3405 /* In relation to the Vulkan spec. 22.4. Vertex Input Address
3406 * Calculation:
3407 * Adding binding->offset corresponds to calculating the
3408 * `bufferBindingAddress`. Adding attribute->offset corresponds to
3409 * adding the `attribDesc.offset`. The `effectiveVertexOffset` is
3410 * taken care of by the PDS program itself with a DDMAD which will
3411 * multiply the vertex/instance idx with the binding's stride and
3412 * add that to the address provided here.
3413 */
3414 const pvr_dev_addr_t addr =
3415 PVR_DEV_ADDR_OFFSET(binding->buffer->dev_addr,
3416 binding->offset + attribute->offset);
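         /* Worked example with assumed values (not taken from the driver):
          * with binding->buffer->dev_addr = 0x1000, binding->offset = 256 and
          * attribute->offset = 12, the address written below is 0x110c. For
          * vertex index v with a binding stride of 32, the PDS DDMAD later
          * computes 0x110c + v * 32 as the actual fetch address.
          */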
3417
3418 PVR_WRITE(qword_buffer,
3419 addr.addr,
3420 attribute->const_offset,
3421 pds_info->data_size_in_dwords);
3422
3423 entries += sizeof(*attribute);
3424 break;
3425 }
3426
3427 case PVR_PDS_CONST_MAP_ENTRY_TYPE_ROBUST_VERTEX_ATTRIBUTE_ADDRESS: {
3428 const struct pvr_const_map_entry_robust_vertex_attribute_address
3429 *const attribute =
3430 (struct pvr_const_map_entry_robust_vertex_attribute_address *)
3431 entries;
3432 const struct pvr_vertex_binding *const binding =
3433 &state->vertex_bindings[attribute->binding_index];
3434 pvr_dev_addr_t addr;
3435
3436 if (binding->buffer->vk.size <
3437 (attribute->offset + attribute->component_size_in_bytes)) {
3438 /* Replace with load from robustness buffer when no attribute is in
3439 * range
3440 */
3441 addr = PVR_DEV_ADDR_OFFSET(
3442 cmd_buffer->device->robustness_buffer->vma->dev_addr,
3443 attribute->robustness_buffer_offset);
3444 } else {
3445 addr = PVR_DEV_ADDR_OFFSET(binding->buffer->dev_addr,
3446 binding->offset + attribute->offset);
3447 }
3448
3449 PVR_WRITE(qword_buffer,
3450 addr.addr,
3451 attribute->const_offset,
3452 pds_info->data_size_in_dwords);
3453
3454 entries += sizeof(*attribute);
3455 break;
3456 }
3457
3458 case PVR_PDS_CONST_MAP_ENTRY_TYPE_VERTEX_ATTRIBUTE_MAX_INDEX: {
3459 const struct pvr_const_map_entry_vertex_attribute_max_index *attribute =
3460 (struct pvr_const_map_entry_vertex_attribute_max_index *)entries;
3461 const struct pvr_vertex_binding *const binding =
3462 &state->vertex_bindings[attribute->binding_index];
3463 const uint64_t bound_size = binding->buffer->vk.size - binding->offset;
3464 const uint32_t attribute_end =
3465 attribute->offset + attribute->component_size_in_bytes;
3466 uint32_t max_index;
3467
3468 if (PVR_HAS_FEATURE(&cmd_buffer->device->pdevice->dev_info,
3469 pds_ddmadt)) {
3470 /* TODO: PVR_PDS_CONST_MAP_ENTRY_TYPE_VERTEX_ATTRIBUTE_MAX_INDEX
3471 * has the same define value as
3472 * PVR_PDS_CONST_MAP_ENTRY_TYPE_VERTEX_ATTR_DDMADT_OOB_BUFFER_SIZE
3473 * so maybe we want to remove one of the defines or change the
3474 * values.
3475 */
3476 pvr_finishme("Unimplemented robust buffer access with DDMADT");
3477 assert(false);
3478 }
3479
3480 /* If the stride is 0 then all attributes use the same single element
3481 * from the binding so the index can only be up to 0.
3482 */
3483 if (bound_size < attribute_end || attribute->stride == 0) {
3484 max_index = 0;
3485 } else {
3486 max_index = (uint32_t)(bound_size / attribute->stride) - 1;
3487
3488 /* There's one last attribute that can fit in. */
3489 if (bound_size % attribute->stride >= attribute_end)
3490 max_index++;
3491 }
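         /* Worked example with assumed values: bound_size = 100 bytes,
          * attribute->stride = 32, attribute_end = 16.
          *   max_index = 100 / 32 - 1 = 2
          *   100 % 32 = 4, which is < 16, so no extra trailing attribute fits
          *   and max_index stays at 2 (i.e. three addressable attributes).
          */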
3492
3493 PVR_WRITE(dword_buffer,
3494 max_index,
3495 attribute->const_offset,
3496 pds_info->data_size_in_dwords);
3497
3498 entries += sizeof(*attribute);
3499 break;
3500 }
3501
3502 default:
3503 unreachable("Unsupported data section map");
3504 break;
3505 }
3506 }
3507
3508 state->pds_vertex_attrib_offset =
3509 pvr_bo->dev_addr.addr -
3510 cmd_buffer->device->heaps.pds_heap->base_addr.addr;
3511
3512 return VK_SUCCESS;
3513 }
3514
3515 static VkResult pvr_setup_descriptor_mappings_old(
3516 struct pvr_cmd_buffer *const cmd_buffer,
3517 enum pvr_stage_allocation stage,
3518 const struct pvr_stage_allocation_descriptor_state *descriptor_state,
3519 const pvr_dev_addr_t *const num_worgroups_buff_addr,
3520 uint32_t *const descriptor_data_offset_out)
3521 {
3522 const struct pvr_pds_info *const pds_info = &descriptor_state->pds_info;
3523 const struct pvr_descriptor_state *desc_state;
3524 struct pvr_suballoc_bo *pvr_bo;
3525 const uint8_t *entries;
3526 uint32_t *dword_buffer;
3527 uint64_t *qword_buffer;
3528 VkResult result;
3529
3530 if (!pds_info->data_size_in_dwords)
3531 return VK_SUCCESS;
3532
3533 result =
3534 pvr_cmd_buffer_alloc_mem(cmd_buffer,
3535 cmd_buffer->device->heaps.pds_heap,
3536 PVR_DW_TO_BYTES(pds_info->data_size_in_dwords),
3537 &pvr_bo);
3538 if (result != VK_SUCCESS)
3539 return result;
3540
3541 dword_buffer = (uint32_t *)pvr_bo_suballoc_get_map_addr(pvr_bo);
3542 qword_buffer = (uint64_t *)pvr_bo_suballoc_get_map_addr(pvr_bo);
3543
3544 entries = (uint8_t *)pds_info->entries;
3545
3546 switch (stage) {
3547 case PVR_STAGE_ALLOCATION_VERTEX_GEOMETRY:
3548 case PVR_STAGE_ALLOCATION_FRAGMENT:
3549 desc_state = &cmd_buffer->state.gfx_desc_state;
3550 break;
3551
3552 case PVR_STAGE_ALLOCATION_COMPUTE:
3553 desc_state = &cmd_buffer->state.compute_desc_state;
3554 break;
3555
3556 default:
3557 unreachable("Unsupported stage.");
3558 break;
3559 }
3560
3561 for (uint32_t i = 0; i < pds_info->entry_count; i++) {
3562 const struct pvr_const_map_entry *const entry_header =
3563 (struct pvr_const_map_entry *)entries;
3564
3565 switch (entry_header->type) {
3566 case PVR_PDS_CONST_MAP_ENTRY_TYPE_LITERAL32: {
3567 const struct pvr_const_map_entry_literal32 *const literal =
3568 (struct pvr_const_map_entry_literal32 *)entries;
3569
3570 PVR_WRITE(dword_buffer,
3571 literal->literal_value,
3572 literal->const_offset,
3573 pds_info->data_size_in_dwords);
3574
3575 entries += sizeof(*literal);
3576 break;
3577 }
3578
3579 case PVR_PDS_CONST_MAP_ENTRY_TYPE_CONSTANT_BUFFER: {
3580 const struct pvr_const_map_entry_constant_buffer *const_buffer_entry =
3581 (struct pvr_const_map_entry_constant_buffer *)entries;
3582 const uint32_t desc_set = const_buffer_entry->desc_set;
3583 const uint32_t binding = const_buffer_entry->binding;
3584 const struct pvr_descriptor_set *descriptor_set;
3585 const struct pvr_descriptor *descriptor;
3586 pvr_dev_addr_t buffer_addr;
3587
3588 assert(desc_set < PVR_MAX_DESCRIPTOR_SETS);
3589 descriptor_set = desc_state->descriptor_sets[desc_set];
3590
3591 /* TODO: Handle dynamic buffers. */
3592 descriptor = &descriptor_set->descriptors[binding];
3593 assert(descriptor->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER);
3594
3595 assert(descriptor->buffer_desc_range ==
3596 PVR_DW_TO_BYTES(const_buffer_entry->size_in_dwords));
3597 assert(descriptor->buffer_whole_range ==
3598 PVR_DW_TO_BYTES(const_buffer_entry->size_in_dwords));
3599
3600 buffer_addr =
3601 PVR_DEV_ADDR_OFFSET(descriptor->buffer_dev_addr,
3602 const_buffer_entry->offset * sizeof(uint32_t));
3603
3604 PVR_WRITE(qword_buffer,
3605 buffer_addr.addr,
3606 const_buffer_entry->const_offset,
3607 pds_info->data_size_in_dwords);
3608
3609 entries += sizeof(*const_buffer_entry);
3610 break;
3611 }
3612
3613 case PVR_PDS_CONST_MAP_ENTRY_TYPE_DESCRIPTOR_SET: {
3614 const struct pvr_const_map_entry_descriptor_set *desc_set_entry =
3615 (struct pvr_const_map_entry_descriptor_set *)entries;
3616 const uint32_t desc_set_num = desc_set_entry->descriptor_set;
3617 const struct pvr_descriptor_set *descriptor_set;
3618 pvr_dev_addr_t desc_set_addr;
3619 uint64_t desc_portion_offset;
3620
3621 assert(desc_set_num < PVR_MAX_DESCRIPTOR_SETS);
3622
3623 /* TODO: Remove this when the compiler provides us with usage info?
3624 */
3625 /* We skip DMAing unbound descriptor sets. */
3626 if (!(desc_state->valid_mask & BITFIELD_BIT(desc_set_num))) {
3627 const struct pvr_const_map_entry_literal32 *literal;
3628 uint32_t zero_literal_value;
3629
3630 /* The code segment contains a DOUT instruction so in the data
3631 * section we have to write a DOUTD_SRC0 and DOUTD_SRC1.
3632 * We'll write 0 for DOUTD_SRC0 since we don't have a buffer to DMA.
3633 * We're expecting a LITERAL32 entry containing the value for
3634 * DOUTD_SRC1 next, so let's make sure we get it and write it
3635 * with BSIZE set to 0, disabling the DMA operation.
3636 * We don't want the LITERAL32 to be processed as normal, otherwise
3637 * we'd be DMAing from an address of 0.
3638 */
3639
3640 entries += sizeof(*desc_set_entry);
3641 literal = (struct pvr_const_map_entry_literal32 *)entries;
3642
3643 assert(literal->type == PVR_PDS_CONST_MAP_ENTRY_TYPE_LITERAL32);
3644
3645 zero_literal_value =
3646 literal->literal_value &
3647 PVR_ROGUE_PDSINST_DOUT_FIELDS_DOUTD_SRC1_BSIZE_CLRMSK;
3648
3649 PVR_WRITE(qword_buffer,
3650 UINT64_C(0),
3651 desc_set_entry->const_offset,
3652 pds_info->data_size_in_dwords);
3653
3654 PVR_WRITE(dword_buffer,
3655 zero_literal_value,
3656 desc_set_entry->const_offset,
3657 pds_info->data_size_in_dwords);
3658
3659 entries += sizeof(*literal);
3660 i++;
3661 continue;
3662 }
3663
3664 descriptor_set = desc_state->descriptor_sets[desc_set_num];
3665
3666 desc_set_addr = descriptor_set->pvr_bo->dev_addr;
3667
3668 if (desc_set_entry->primary) {
3669 desc_portion_offset =
3670 descriptor_set->layout->memory_layout_in_dwords_per_stage[stage]
3671 .primary_offset;
3672 } else {
3673 desc_portion_offset =
3674 descriptor_set->layout->memory_layout_in_dwords_per_stage[stage]
3675 .secondary_offset;
3676 }
3677 desc_portion_offset = PVR_DW_TO_BYTES(desc_portion_offset);
3678
3679 desc_set_addr =
3680 PVR_DEV_ADDR_OFFSET(desc_set_addr, desc_portion_offset);
3681
3682 desc_set_addr = PVR_DEV_ADDR_OFFSET(
3683 desc_set_addr,
3684 PVR_DW_TO_BYTES((uint64_t)desc_set_entry->offset_in_dwords));
3685
3686 PVR_WRITE(qword_buffer,
3687 desc_set_addr.addr,
3688 desc_set_entry->const_offset,
3689 pds_info->data_size_in_dwords);
3690
3691 entries += sizeof(*desc_set_entry);
3692 break;
3693 }
3694
3695 case PVR_PDS_CONST_MAP_ENTRY_TYPE_SPECIAL_BUFFER: {
3696 const struct pvr_const_map_entry_special_buffer *special_buff_entry =
3697 (struct pvr_const_map_entry_special_buffer *)entries;
3698
3699 switch (special_buff_entry->buffer_type) {
3700 case PVR_BUFFER_TYPE_COMPILE_TIME: {
3701 uint64_t addr = descriptor_state->static_consts->dev_addr.addr;
3702
3703 PVR_WRITE(qword_buffer,
3704 addr,
3705 special_buff_entry->const_offset,
3706 pds_info->data_size_in_dwords);
3707 break;
3708 }
3709
3710 case PVR_BUFFER_TYPE_BLEND_CONSTS:
3711 /* TODO: See if instead of reusing the blend constant buffer type
3712 * entry, we can setup a new buffer type specifically for
3713 * num_workgroups or other built-in variables. The mappings are
3714 * setup at pipeline creation when creating the descriptor program.
3715 */
3716 if (stage == PVR_STAGE_ALLOCATION_COMPUTE) {
3717 assert(num_worgroups_buff_addr->addr);
3718
3719 /* TODO: Check if we need to offset this (e.g. for just y and z),
3720 * or cope with any reordering?
3721 */
3722 PVR_WRITE(qword_buffer,
3723 num_worgroups_buff_addr->addr,
3724 special_buff_entry->const_offset,
3725 pds_info->data_size_in_dwords);
3726 } else {
3727 pvr_finishme("Add blend constants support.");
3728 }
3729 break;
3730
3731 default:
3732 unreachable("Unsupported special buffer type.");
3733 }
3734
3735 entries += sizeof(*special_buff_entry);
3736 break;
3737 }
3738
3739 default:
3740 unreachable("Unsupported map entry type.");
3741 }
3742 }
3743
3744 *descriptor_data_offset_out =
3745 pvr_bo->dev_addr.addr -
3746 cmd_buffer->device->heaps.pds_heap->base_addr.addr;
3747
3748 return VK_SUCCESS;
3749 }
3750
3751 /* Note that the descriptor set doesn't have any space for dynamic buffer
3752 * descriptors so this works on the assumption that you have a buffer with space
3753 * for them at the end.
3754 */
3755 static uint16_t pvr_get_dynamic_descriptor_primary_offset(
3756 const struct pvr_device *device,
3757 const struct pvr_descriptor_set_layout *layout,
3758 const struct pvr_descriptor_set_layout_binding *binding,
3759 const uint32_t stage,
3760 const uint32_t desc_idx)
3761 {
3762 struct pvr_descriptor_size_info size_info;
3763 uint32_t offset;
3764
3765 assert(binding->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC ||
3766 binding->type == VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC);
3767 assert(desc_idx < binding->descriptor_count);
3768
3769 pvr_descriptor_size_info_init(device, binding->type, &size_info);
3770
3771 offset = layout->total_size_in_dwords;
3772 offset += binding->per_stage_offset_in_dwords[stage].primary;
3773 offset += (desc_idx * size_info.primary);
3774
3775 /* Offset must fit in 16 bits. */
3776 assert(offset < UINT16_MAX);
3777
3778 return (uint16_t)offset;
3779 }
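/* Worked example with assumed values: if layout->total_size_in_dwords = 64,
 * the binding's per-stage primary offset is 8 and size_info.primary is 2
 * dwords per descriptor, then descriptor index 3 lands at
 * 64 + 8 + 3 * 2 = 78 dwords from the start of the patched set.
 */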
3780
3781 /* Note that the descriptor set doesn't have any space for dynamic buffer
3782 * descriptors so this works on the assumption that you have a buffer with space
3783 * for them at the end.
3784 */
3785 static uint16_t pvr_get_dynamic_descriptor_secondary_offset(
3786 const struct pvr_device *device,
3787 const struct pvr_descriptor_set_layout *layout,
3788 const struct pvr_descriptor_set_layout_binding *binding,
3789 const uint32_t stage,
3790 const uint32_t desc_idx)
3791 {
3792 struct pvr_descriptor_size_info size_info;
3793 uint32_t offset;
3794
3795 assert(binding->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC ||
3796 binding->type == VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC);
3797 assert(desc_idx < binding->descriptor_count);
3798
3799 pvr_descriptor_size_info_init(device, binding->type, &size_info);
3800
3801 offset = layout->total_size_in_dwords;
3802 offset +=
3803 layout->memory_layout_in_dwords_per_stage[stage].primary_dynamic_size;
3804 offset += binding->per_stage_offset_in_dwords[stage].secondary;
3805 offset += (desc_idx * size_info.secondary);
3806
3807 /* Offset must fit in 16 bits. */
3808 assert(offset < UINT16_MAX);
3809
3810 return (uint16_t)offset;
3811 }
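/* Taken together with the primary-offset helper above, the patched set
 * uploaded by pvr_cmd_buffer_upload_patched_desc_set() below is laid out
 * roughly as follows (a sketch derived from these offset calculations, not
 * an authoritative memory map):
 *
 *   [ normal set contents:  total_size_in_dwords               ]
 *   [ dynamic primaries:    primary_dynamic_size per stage     ]
 *   [ dynamic secondaries:  remaining dynamic dwords per stage ]
 */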
3812
3813 /**
3814 * \brief Upload a copy of the descriptor set with dynamic buffer offsets
3815 * applied.
3816 */
3817 /* TODO: We should probably make the compiler aware of the dynamic descriptors.
3818 * We could use push constants like Anv seems to do. This would avoid having to
3819 * duplicate all sets containing dynamic descriptors each time the offsets are
3820 * updated.
3821 */
3822 static VkResult pvr_cmd_buffer_upload_patched_desc_set(
3823 struct pvr_cmd_buffer *cmd_buffer,
3824 const struct pvr_descriptor_set *desc_set,
3825 const uint32_t *dynamic_offsets,
3826 struct pvr_suballoc_bo **const bo_out)
3827 {
3828 const struct pvr_descriptor_set_layout *layout = desc_set->layout;
3829 const uint64_t normal_desc_set_size =
3830 PVR_DW_TO_BYTES(layout->total_size_in_dwords);
3831 const uint64_t dynamic_descs_size =
3832 PVR_DW_TO_BYTES(layout->total_dynamic_size_in_dwords);
3833 struct pvr_descriptor_size_info dynamic_uniform_buffer_size_info;
3834 struct pvr_descriptor_size_info dynamic_storage_buffer_size_info;
3835 struct pvr_device *device = cmd_buffer->device;
3836 struct pvr_suballoc_bo *patched_desc_set_bo;
3837 uint32_t *src_mem_ptr, *dst_mem_ptr;
3838 uint32_t desc_idx_offset = 0;
3839 VkResult result;
3840
3841 assert(desc_set->layout->dynamic_buffer_count > 0);
3842
3843 pvr_descriptor_size_info_init(device,
3844 VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC,
3845 &dynamic_uniform_buffer_size_info);
3846 pvr_descriptor_size_info_init(device,
3847 VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC,
3848 &dynamic_storage_buffer_size_info);
3849
3850 /* TODO: In the descriptor set we don't account for dynamic buffer
3851 * descriptors and take care of them in the pipeline layout. The pipeline
3852 * layout allocates them at the beginning but let's put them at the end just
3853 * because it makes things a bit easier. Ideally we should be using the
3854 * pipeline layout and use the offsets from the pipeline layout to patch
3855 * descriptors.
3856 */
3857 result = pvr_cmd_buffer_alloc_mem(cmd_buffer,
3858 cmd_buffer->device->heaps.general_heap,
3859 normal_desc_set_size + dynamic_descs_size,
3860 &patched_desc_set_bo);
3861 if (result != VK_SUCCESS)
3862 return result;
3863
3864 src_mem_ptr = (uint32_t *)pvr_bo_suballoc_get_map_addr(desc_set->pvr_bo);
3865 dst_mem_ptr = (uint32_t *)pvr_bo_suballoc_get_map_addr(patched_desc_set_bo);
3866
3867 memcpy(dst_mem_ptr, src_mem_ptr, normal_desc_set_size);
3868
3869 for (uint32_t i = 0; i < desc_set->layout->binding_count; i++) {
3870 const struct pvr_descriptor_set_layout_binding *binding =
3871 &desc_set->layout->bindings[i];
3872 const struct pvr_descriptor *descriptors =
3873 &desc_set->descriptors[binding->descriptor_index];
3874 const struct pvr_descriptor_size_info *size_info;
3875
3876 if (binding->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC)
3877 size_info = &dynamic_uniform_buffer_size_info;
3878 else if (binding->type == VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC)
3879 size_info = &dynamic_storage_buffer_size_info;
3880 else
3881 continue;
3882
3883 for (uint32_t stage = 0; stage < PVR_STAGE_ALLOCATION_COUNT; stage++) {
3884 uint32_t primary_offset;
3885 uint32_t secondary_offset;
3886
3887 if (!(binding->shader_stage_mask & BITFIELD_BIT(stage)))
3888 continue;
3889
3890 /* Get the offsets for the first dynamic descriptor in the current
3891 * binding.
3892 */
3893 primary_offset =
3894 pvr_get_dynamic_descriptor_primary_offset(device,
3895 desc_set->layout,
3896 binding,
3897 stage,
3898 0);
3899 secondary_offset =
3900 pvr_get_dynamic_descriptor_secondary_offset(device,
3901 desc_set->layout,
3902 binding,
3903 stage,
3904 0);
3905
3906 /* clang-format off */
3907 for (uint32_t desc_idx = 0;
3908 desc_idx < binding->descriptor_count;
3909 desc_idx++) {
3910 /* clang-format on */
3911 const pvr_dev_addr_t addr =
3912 PVR_DEV_ADDR_OFFSET(descriptors[desc_idx].buffer_dev_addr,
3913 dynamic_offsets[desc_idx + desc_idx_offset]);
3914 const VkDeviceSize range =
3915 MIN2(descriptors[desc_idx].buffer_desc_range,
3916 descriptors[desc_idx].buffer_whole_range -
3917 dynamic_offsets[desc_idx]);
3918
3919 #if defined(DEBUG)
3920 uint32_t desc_primary_offset;
3921 uint32_t desc_secondary_offset;
3922
3923 desc_primary_offset =
3924 pvr_get_dynamic_descriptor_primary_offset(device,
3925 desc_set->layout,
3926 binding,
3927 stage,
3928 desc_idx);
3929 desc_secondary_offset =
3930 pvr_get_dynamic_descriptor_secondary_offset(device,
3931 desc_set->layout,
3932 binding,
3933 stage,
3934 desc_idx);
3935
3936 /* Check the assumption that the descriptors within a binding, for
3937 * a particular stage, are allocated consecutively.
3938 */
3939 assert(desc_primary_offset ==
3940 primary_offset + size_info->primary * desc_idx);
3941 assert(desc_secondary_offset ==
3942 secondary_offset + size_info->secondary * desc_idx);
3943 #endif
3944
3945 assert(descriptors[desc_idx].type == binding->type);
3946
3947 memcpy(dst_mem_ptr + primary_offset + size_info->primary * desc_idx,
3948 &addr.addr,
3949 PVR_DW_TO_BYTES(size_info->primary));
3950 memcpy(dst_mem_ptr + secondary_offset +
3951 size_info->secondary * desc_idx,
3952 &range,
3953 PVR_DW_TO_BYTES(size_info->secondary));
3954 }
3955 }
3956
3957 desc_idx_offset += binding->descriptor_count;
3958 }
3959
3960 *bo_out = patched_desc_set_bo;
3961
3962 return VK_SUCCESS;
3963 }
3964
3965 #define PVR_SELECT(_geom, _frag, _compute) \
3966 (stage == PVR_STAGE_ALLOCATION_VERTEX_GEOMETRY) \
3967 ? (_geom) \
3968 : (stage == PVR_STAGE_ALLOCATION_FRAGMENT) ? (_frag) : (_compute)
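/* For example: with stage == PVR_STAGE_ALLOCATION_FRAGMENT,
 * PVR_SELECT(a, b, c) evaluates to b; with
 * PVR_STAGE_ALLOCATION_VERTEX_GEOMETRY it evaluates to a; for any other
 * stage (in practice compute) it evaluates to c.
 */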
3969
3970 static VkResult
3971 pvr_cmd_buffer_upload_desc_set_table(struct pvr_cmd_buffer *const cmd_buffer,
3972 enum pvr_stage_allocation stage,
3973 pvr_dev_addr_t *addr_out)
3974 {
3975 uint64_t bound_desc_sets[PVR_MAX_DESCRIPTOR_SETS];
3976 const struct pvr_descriptor_state *desc_state;
3977 struct pvr_suballoc_bo *suballoc_bo;
3978 uint32_t dynamic_offset_idx = 0;
3979 VkResult result;
3980
3981 switch (stage) {
3982 case PVR_STAGE_ALLOCATION_VERTEX_GEOMETRY:
3983 case PVR_STAGE_ALLOCATION_FRAGMENT:
3984 case PVR_STAGE_ALLOCATION_COMPUTE:
3985 break;
3986
3987 default:
3988 unreachable("Unsupported stage.");
3989 break;
3990 }
3991
3992 desc_state = PVR_SELECT(&cmd_buffer->state.gfx_desc_state,
3993 &cmd_buffer->state.gfx_desc_state,
3994 &cmd_buffer->state.compute_desc_state);
3995
3996 for (uint32_t set = 0; set < ARRAY_SIZE(bound_desc_sets); set++)
3997 bound_desc_sets[set] = ~0;
3998
3999 assert(util_last_bit(desc_state->valid_mask) <= ARRAY_SIZE(bound_desc_sets));
4000 for (uint32_t set = 0; set < util_last_bit(desc_state->valid_mask); set++) {
4001 const struct pvr_descriptor_set *desc_set;
4002
4003 if (!(desc_state->valid_mask & BITFIELD_BIT(set))) {
4004 const struct pvr_pipeline_layout *pipeline_layout =
4005 PVR_SELECT(cmd_buffer->state.gfx_pipeline->base.layout,
4006 cmd_buffer->state.gfx_pipeline->base.layout,
4007 cmd_buffer->state.compute_pipeline->base.layout);
4008 const struct pvr_descriptor_set_layout *set_layout;
4009
4010 assert(set <= pipeline_layout->set_count);
4011
4012 set_layout = pipeline_layout->set_layout[set];
4013 dynamic_offset_idx += set_layout->dynamic_buffer_count;
4014
4015 continue;
4016 }
4017
4018 desc_set = desc_state->descriptor_sets[set];
4019
4020 /* TODO: Is it better if we don't set the valid_mask for empty sets? */
4021 if (desc_set->layout->descriptor_count == 0)
4022 continue;
4023
4024 if (desc_set->layout->dynamic_buffer_count > 0) {
4025 struct pvr_suballoc_bo *new_desc_set_bo;
4026
4027 assert(dynamic_offset_idx + desc_set->layout->dynamic_buffer_count <=
4028 ARRAY_SIZE(desc_state->dynamic_offsets));
4029
4030 result = pvr_cmd_buffer_upload_patched_desc_set(
4031 cmd_buffer,
4032 desc_set,
4033 &desc_state->dynamic_offsets[dynamic_offset_idx],
4034 &new_desc_set_bo);
4035 if (result != VK_SUCCESS)
4036 return result;
4037
4038 dynamic_offset_idx += desc_set->layout->dynamic_buffer_count;
4039
4040 bound_desc_sets[set] = new_desc_set_bo->dev_addr.addr;
4041 } else {
4042 bound_desc_sets[set] = desc_set->pvr_bo->dev_addr.addr;
4043 }
4044 }
4045
4046 result = pvr_cmd_buffer_upload_general(cmd_buffer,
4047 bound_desc_sets,
4048 sizeof(bound_desc_sets),
4049 &suballoc_bo);
4050 if (result != VK_SUCCESS)
4051 return result;
4052
4053 *addr_out = suballoc_bo->dev_addr;
4054 return VK_SUCCESS;
4055 }
4056
4057 static VkResult
4058 pvr_process_addr_literal(struct pvr_cmd_buffer *cmd_buffer,
4059 enum pvr_pds_addr_literal_type addr_literal_type,
4060 enum pvr_stage_allocation stage,
4061 pvr_dev_addr_t *addr_out)
4062 {
4063 VkResult result;
4064
4065 switch (addr_literal_type) {
4066 case PVR_PDS_ADDR_LITERAL_DESC_SET_ADDRS_TABLE: {
4067 /* TODO: Maybe we want to free pvr_bo, and only link all the BOs to
4068 * the command buffer once the data section has been written
4069 * successfully.
4070 */
4071 result =
4072 pvr_cmd_buffer_upload_desc_set_table(cmd_buffer, stage, addr_out);
4073 if (result != VK_SUCCESS)
4074 return result;
4075
4076 break;
4077 }
4078
4079 case PVR_PDS_ADDR_LITERAL_PUSH_CONSTS: {
4080 const struct pvr_pipeline_layout *layout =
4081 PVR_SELECT(cmd_buffer->state.gfx_pipeline->base.layout,
4082 cmd_buffer->state.gfx_pipeline->base.layout,
4083 cmd_buffer->state.compute_pipeline->base.layout);
4084 const uint32_t push_constants_offset =
4085 PVR_SELECT(layout->vert_push_constants_offset,
4086 layout->frag_push_constants_offset,
4087 layout->compute_push_constants_offset);
4088
4089 *addr_out = PVR_DEV_ADDR_OFFSET(cmd_buffer->state.push_constants.dev_addr,
4090 push_constants_offset);
4091 break;
4092 }
4093
4094 case PVR_PDS_ADDR_LITERAL_BLEND_CONSTANTS: {
4095 float *blend_consts =
4096 cmd_buffer->vk.dynamic_graphics_state.cb.blend_constants;
4097 size_t size =
4098 sizeof(cmd_buffer->vk.dynamic_graphics_state.cb.blend_constants);
4099 struct pvr_suballoc_bo *blend_consts_bo;
4100
4101 result = pvr_cmd_buffer_upload_general(cmd_buffer,
4102 blend_consts,
4103 size,
4104 &blend_consts_bo);
4105 if (result != VK_SUCCESS)
4106 return result;
4107
4108 *addr_out = blend_consts_bo->dev_addr;
4109
4110 break;
4111 }
4112
4113 default:
4114 unreachable("Invalid add literal type.");
4115 }
4116
4117 return VK_SUCCESS;
4118 }
4119
4120 #undef PVR_SELECT
4121
4122 static VkResult pvr_setup_descriptor_mappings_new(
4123 struct pvr_cmd_buffer *const cmd_buffer,
4124 enum pvr_stage_allocation stage,
4125 const struct pvr_stage_allocation_descriptor_state *descriptor_state,
4126 uint32_t *const descriptor_data_offset_out)
4127 {
4128 const struct pvr_pds_info *const pds_info = &descriptor_state->pds_info;
4129 struct pvr_suballoc_bo *pvr_bo;
4130 const uint8_t *entries;
4131 uint32_t *dword_buffer;
4132 uint64_t *qword_buffer;
4133 VkResult result;
4134
4135 if (!pds_info->data_size_in_dwords)
4136 return VK_SUCCESS;
4137
4138 result =
4139 pvr_cmd_buffer_alloc_mem(cmd_buffer,
4140 cmd_buffer->device->heaps.pds_heap,
4141 PVR_DW_TO_BYTES(pds_info->data_size_in_dwords),
4142 &pvr_bo);
4143 if (result != VK_SUCCESS)
4144 return result;
4145
4146 dword_buffer = (uint32_t *)pvr_bo_suballoc_get_map_addr(pvr_bo);
4147 qword_buffer = (uint64_t *)pvr_bo_suballoc_get_map_addr(pvr_bo);
4148
4149 entries = (uint8_t *)pds_info->entries;
4150
4151 switch (stage) {
4152 case PVR_STAGE_ALLOCATION_VERTEX_GEOMETRY:
4153 case PVR_STAGE_ALLOCATION_FRAGMENT:
4154 case PVR_STAGE_ALLOCATION_COMPUTE:
4155 break;
4156
4157 default:
4158 unreachable("Unsupported stage.");
4159 break;
4160 }
4161
4162 for (uint32_t i = 0; i < pds_info->entry_count; i++) {
4163 const struct pvr_const_map_entry *const entry_header =
4164 (struct pvr_const_map_entry *)entries;
4165
4166 switch (entry_header->type) {
4167 case PVR_PDS_CONST_MAP_ENTRY_TYPE_LITERAL32: {
4168 const struct pvr_const_map_entry_literal32 *const literal =
4169 (struct pvr_const_map_entry_literal32 *)entries;
4170
4171 PVR_WRITE(dword_buffer,
4172 literal->literal_value,
4173 literal->const_offset,
4174 pds_info->data_size_in_dwords);
4175
4176 entries += sizeof(*literal);
4177 break;
4178 }
4179
4180 case PVR_PDS_CONST_MAP_ENTRY_TYPE_ADDR_LITERAL_BUFFER: {
4181 const struct pvr_pds_const_map_entry_addr_literal_buffer
4182 *const addr_literal_buffer_entry =
4183 (struct pvr_pds_const_map_entry_addr_literal_buffer *)entries;
4184 struct pvr_device *device = cmd_buffer->device;
4185 struct pvr_suballoc_bo *addr_literal_buffer_bo;
4186 uint32_t addr_literal_count = 0;
4187 uint64_t *addr_literal_buffer;
4188
4189 result = pvr_cmd_buffer_alloc_mem(cmd_buffer,
4190 device->heaps.general_heap,
4191 addr_literal_buffer_entry->size,
4192 &addr_literal_buffer_bo);
4193 if (result != VK_SUCCESS)
4194 return result;
4195
4196 addr_literal_buffer =
4197 (uint64_t *)pvr_bo_suballoc_get_map_addr(addr_literal_buffer_bo);
4198
4199 entries += sizeof(*addr_literal_buffer_entry);
4200
4201 PVR_WRITE(qword_buffer,
4202 addr_literal_buffer_bo->dev_addr.addr,
4203 addr_literal_buffer_entry->const_offset,
4204 pds_info->data_size_in_dwords);
4205
4206 for (uint32_t j = i + 1; j < pds_info->entry_count; j++) {
4207 const struct pvr_const_map_entry *const entry_header =
4208 (struct pvr_const_map_entry *)entries;
4209 const struct pvr_pds_const_map_entry_addr_literal *addr_literal;
4210 pvr_dev_addr_t dev_addr;
4211
4212 if (entry_header->type != PVR_PDS_CONST_MAP_ENTRY_TYPE_ADDR_LITERAL)
4213 break;
4214
4215 addr_literal =
4216 (struct pvr_pds_const_map_entry_addr_literal *)entries;
4217
4218 result = pvr_process_addr_literal(cmd_buffer,
4219 addr_literal->addr_type,
4220 stage,
4221 &dev_addr);
4222 if (result != VK_SUCCESS)
4223 return result;
4224
4225 addr_literal_buffer[addr_literal_count++] = dev_addr.addr;
4226
4227 entries += sizeof(*addr_literal);
4228 }
4229
4230 assert(addr_literal_count * sizeof(uint64_t) ==
4231 addr_literal_buffer_entry->size);
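            /* The inner loop above consumed addr_literal_count ADDR_LITERAL
             * entries, each contributing a single 64-bit device address to
             * the buffer we just filled, so skip them in the outer loop as
             * well.
             */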
4232
4233 i += addr_literal_count;
4234
4235 break;
4236 }
4237
4238 default:
4239 unreachable("Unsupported map entry type.");
4240 }
4241 }
4242
4243 *descriptor_data_offset_out =
4244 pvr_bo->dev_addr.addr -
4245 cmd_buffer->device->heaps.pds_heap->base_addr.addr;
4246
4247 return VK_SUCCESS;
4248 }
4249
4250 static VkResult pvr_setup_descriptor_mappings(
4251 struct pvr_cmd_buffer *const cmd_buffer,
4252 enum pvr_stage_allocation stage,
4253 const struct pvr_stage_allocation_descriptor_state *descriptor_state,
4254 const pvr_dev_addr_t *const num_worgroups_buff_addr,
4255 uint32_t *const descriptor_data_offset_out)
4256 {
4257 const bool old_path =
4258 pvr_has_hard_coded_shaders(&cmd_buffer->device->pdevice->dev_info);
4259
4260 if (old_path) {
4261 return pvr_setup_descriptor_mappings_old(cmd_buffer,
4262 stage,
4263 descriptor_state,
4264 num_worgroups_buff_addr,
4265 descriptor_data_offset_out);
4266 }
4267
4268 return pvr_setup_descriptor_mappings_new(cmd_buffer,
4269 stage,
4270 descriptor_state,
4271 descriptor_data_offset_out);
4272 }
4273
4274 static void pvr_compute_update_shared(struct pvr_cmd_buffer *cmd_buffer,
4275 struct pvr_sub_cmd_compute *const sub_cmd)
4276 {
4277 const struct pvr_device *device = cmd_buffer->device;
4278 const struct pvr_physical_device *pdevice = device->pdevice;
4279 struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
4280 struct pvr_csb *csb = &sub_cmd->control_stream;
4281 const struct pvr_compute_pipeline *pipeline = state->compute_pipeline;
4282 const uint32_t const_shared_regs =
4283 pipeline->shader_state.const_shared_reg_count;
4284 struct pvr_compute_kernel_info info;
4285
4286 /* No shared regs, no need to use an allocation kernel. */
4287 if (!const_shared_regs)
4288 return;
4289
4290 /* Accumulate the MAX number of shared registers across the kernels in this
4291 * dispatch. This is used by the FW for context switching, so must be large
4292 * enough to contain all the shared registers that might be in use for this
4293 * compute job. Coefficients don't need to be included as the context switch
4294 * will not happen within the execution of a single workgroup, thus nothing
4295 * needs to be preserved.
4296 */
4297 state->max_shared_regs = MAX2(state->max_shared_regs, const_shared_regs);
4298
4299 info = (struct pvr_compute_kernel_info){
4300 .indirect_buffer_addr = PVR_DEV_ADDR_INVALID,
4301 .sd_type = PVRX(CDMCTRL_SD_TYPE_NONE),
4302
4303 .usc_target = PVRX(CDMCTRL_USC_TARGET_ALL),
4304 .usc_common_shared = true,
4305 .usc_common_size =
4306 DIV_ROUND_UP(const_shared_regs,
4307 PVRX(CDMCTRL_KERNEL0_USC_COMMON_SIZE_UNIT_SIZE)),
4308
4309 .local_size = { 1, 1, 1 },
4310 .global_size = { 1, 1, 1 },
4311 };
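   /* Illustrative arithmetic, assuming a USC common size unit of 4 registers
    * (the actual unit is the hardware-defined
    * CDMCTRL_KERNEL0_USC_COMMON_SIZE_UNIT_SIZE): const_shared_regs = 36 would
    * give usc_common_size = DIV_ROUND_UP(36, 4) = 9 allocation units for this
    * shared-register allocation kernel.
    */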
4312
4313 /* Sometimes we don't have a secondary program if there were no constants to
4314 * write, but we still need to run a PDS program to accomplish the
4315 * allocation of the local/common store shared registers. Use the
4316 * pre-uploaded empty PDS program in this instance.
4317 */
4318 if (pipeline->descriptor_state.pds_info.code_size_in_dwords) {
4319 uint32_t pds_data_size_in_dwords =
4320 pipeline->descriptor_state.pds_info.data_size_in_dwords;
4321
4322 info.pds_data_offset = state->pds_compute_descriptor_data_offset;
4323 info.pds_data_size =
4324 DIV_ROUND_UP(PVR_DW_TO_BYTES(pds_data_size_in_dwords),
4325 PVRX(CDMCTRL_KERNEL0_PDS_DATA_SIZE_UNIT_SIZE));
4326
4327 /* Check that we have uploaded the code section. */
4328 assert(pipeline->descriptor_state.pds_code.code_size);
4329 info.pds_code_offset = pipeline->descriptor_state.pds_code.code_offset;
4330 } else {
4331 const struct pvr_pds_upload *program = &device->pds_compute_empty_program;
4332
4333 info.pds_data_offset = program->data_offset;
4334 info.pds_data_size =
4335 DIV_ROUND_UP(PVR_DW_TO_BYTES(program->data_size),
4336 PVRX(CDMCTRL_KERNEL0_PDS_DATA_SIZE_UNIT_SIZE));
4337 info.pds_code_offset = program->code_offset;
4338 }
4339
4340 /* We don't need to pad the workgroup size. */
4341
4342 info.max_instances =
4343 pvr_compute_flat_slot_size(pdevice, const_shared_regs, false, 1U);
4344
4345 pvr_compute_generate_control_stream(csb, sub_cmd, &info);
4346 }
4347
4348 void pvr_compute_update_shared_private(
4349 struct pvr_cmd_buffer *cmd_buffer,
4350 struct pvr_sub_cmd_compute *const sub_cmd,
4351 struct pvr_private_compute_pipeline *pipeline)
4352 {
4353 const struct pvr_physical_device *pdevice = cmd_buffer->device->pdevice;
4354 struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
4355 const uint32_t const_shared_regs = pipeline->const_shared_regs_count;
4356 struct pvr_csb *csb = &sub_cmd->control_stream;
4357 struct pvr_compute_kernel_info info;
4358
4359 /* No shared regs, no need to use an allocation kernel. */
4360 if (!const_shared_regs)
4361 return;
4362
4363 /* See comment in pvr_compute_update_shared() for details on this. */
4364 state->max_shared_regs = MAX2(state->max_shared_regs, const_shared_regs);
4365
4366 info = (struct pvr_compute_kernel_info){
4367 .indirect_buffer_addr = PVR_DEV_ADDR_INVALID,
4368 .usc_common_size =
4369 DIV_ROUND_UP(const_shared_regs,
4370 PVRX(CDMCTRL_KERNEL0_USC_COMMON_SIZE_UNIT_SIZE)),
4371 .pds_data_size =
4372 DIV_ROUND_UP(PVR_DW_TO_BYTES(pipeline->pds_shared_update_data_size_dw),
4373 PVRX(CDMCTRL_KERNEL0_PDS_DATA_SIZE_UNIT_SIZE)),
4374 .usc_target = PVRX(CDMCTRL_USC_TARGET_ALL),
4375 .pds_data_offset = pipeline->pds_shared_update_data_offset,
4376 .pds_code_offset = pipeline->pds_shared_update_code_offset,
4377 .sd_type = PVRX(CDMCTRL_SD_TYPE_NONE),
4378 .usc_common_shared = true,
4379 .local_size = { 1, 1, 1 },
4380 .global_size = { 1, 1, 1 },
4381 };
4382
4383 /* We don't need to pad the workgroup size. */
4384
4385 info.max_instances =
4386 pvr_compute_flat_slot_size(pdevice, const_shared_regs, false, 1U);
4387
4388 pvr_compute_generate_control_stream(csb, sub_cmd, &info);
4389 }
4390
4391 static uint32_t
4392 pvr_compute_flat_pad_workgroup_size(const struct pvr_physical_device *pdevice,
4393 uint32_t workgroup_size,
4394 uint32_t coeff_regs_count)
4395 {
4396 const struct pvr_device_runtime_info *dev_runtime_info =
4397 &pdevice->dev_runtime_info;
4398 const struct pvr_device_info *dev_info = &pdevice->dev_info;
4399 uint32_t max_avail_coeff_regs =
4400 dev_runtime_info->cdm_max_local_mem_size_regs;
4401 uint32_t coeff_regs_count_aligned =
4402 ALIGN_POT(coeff_regs_count,
4403 PVRX(CDMCTRL_KERNEL0_USC_COMMON_SIZE_UNIT_SIZE) >> 2U);
4404
4405 /* If the work group size is greater than ROGUE_MAX_INSTANCES_PER_TASK, we
4406 * always pad the work group size to the next multiple of
4407 * ROGUE_MAX_INSTANCES_PER_TASK.
4408 *
4409 * We do the same if we use more than 1/8th of the max coefficient
4410 * registers, rounding the work group size up to the next multiple.
4411 */
4412 /* TODO: See if this can be optimized. */
4413 if (workgroup_size > ROGUE_MAX_INSTANCES_PER_TASK ||
4414 coeff_regs_count_aligned > (max_avail_coeff_regs / 8)) {
4415 assert(workgroup_size < rogue_get_compute_max_work_group_size(dev_info));
4416
4417 return ALIGN_POT(workgroup_size, ROGUE_MAX_INSTANCES_PER_TASK);
4418 }
4419
4420 return workgroup_size;
4421 }
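/* Worked example with assumed values: if ROGUE_MAX_INSTANCES_PER_TASK is 32
 * and max_avail_coeff_regs is 512, then a workgroup of 48 instances (> 32)
 * is padded to ALIGN_POT(48, 32) = 64; a workgroup of 24 using 80 aligned
 * coefficient registers (> 512 / 8 = 64) is padded to 32; and a workgroup of
 * 24 using 16 coefficient registers is returned unchanged.
 */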
4422
4423 void pvr_compute_update_kernel_private(
4424 struct pvr_cmd_buffer *cmd_buffer,
4425 struct pvr_sub_cmd_compute *const sub_cmd,
4426 struct pvr_private_compute_pipeline *pipeline,
4427 const uint32_t global_workgroup_size[static const PVR_WORKGROUP_DIMENSIONS])
4428 {
4429 const struct pvr_physical_device *pdevice = cmd_buffer->device->pdevice;
4430 const struct pvr_device_runtime_info *dev_runtime_info =
4431 &pdevice->dev_runtime_info;
4432 struct pvr_csb *csb = &sub_cmd->control_stream;
4433
4434 struct pvr_compute_kernel_info info = {
4435 .indirect_buffer_addr = PVR_DEV_ADDR_INVALID,
4436 .usc_target = PVRX(CDMCTRL_USC_TARGET_ANY),
4437 .pds_temp_size =
4438 DIV_ROUND_UP(pipeline->pds_temps_used << 2U,
4439 PVRX(CDMCTRL_KERNEL0_PDS_TEMP_SIZE_UNIT_SIZE)),
4440
4441 .pds_data_size =
4442 DIV_ROUND_UP(PVR_DW_TO_BYTES(pipeline->pds_data_size_dw),
4443 PVRX(CDMCTRL_KERNEL0_PDS_DATA_SIZE_UNIT_SIZE)),
4444 .pds_data_offset = pipeline->pds_data_offset,
4445 .pds_code_offset = pipeline->pds_code_offset,
4446
4447 .sd_type = PVRX(CDMCTRL_SD_TYPE_NONE),
4448
4449 .usc_unified_size =
4450 DIV_ROUND_UP(pipeline->unified_store_regs_count << 2U,
4451 PVRX(CDMCTRL_KERNEL0_USC_UNIFIED_SIZE_UNIT_SIZE)),
4452
4453 /* clang-format off */
4454 .global_size = {
4455 global_workgroup_size[0],
4456 global_workgroup_size[1],
4457 global_workgroup_size[2]
4458 },
4459 /* clang-format on */
4460 };
4461
4462 uint32_t work_size = pipeline->workgroup_size.width *
4463 pipeline->workgroup_size.height *
4464 pipeline->workgroup_size.depth;
4465 uint32_t coeff_regs;
4466
4467 if (work_size > ROGUE_MAX_INSTANCES_PER_TASK) {
4468 /* Enforce a single workgroup per cluster through allocation starvation.
4469 */
4470 coeff_regs = dev_runtime_info->cdm_max_local_mem_size_regs;
4471 } else {
4472 coeff_regs = pipeline->coeff_regs_count;
4473 }
4474
4475 info.usc_common_size =
4476 DIV_ROUND_UP(PVR_DW_TO_BYTES(coeff_regs),
4477 PVRX(CDMCTRL_KERNEL0_USC_COMMON_SIZE_UNIT_SIZE));
4478
4479 /* Use a whole slot per workgroup. */
4480 work_size = MAX2(work_size, ROGUE_MAX_INSTANCES_PER_TASK);
4481
4482 coeff_regs += pipeline->const_shared_regs_count;
4483
4484 if (pipeline->const_shared_regs_count > 0)
4485 info.sd_type = PVRX(CDMCTRL_SD_TYPE_USC);
4486
4487 work_size =
4488 pvr_compute_flat_pad_workgroup_size(pdevice, work_size, coeff_regs);
4489
4490 info.local_size[0] = work_size;
4491 info.local_size[1] = 1U;
4492 info.local_size[2] = 1U;
4493
4494 info.max_instances =
4495 pvr_compute_flat_slot_size(pdevice, coeff_regs, false, work_size);
4496
4497 pvr_compute_generate_control_stream(csb, sub_cmd, &info);
4498 }
4499
4500 /* TODO: Wire up the base_workgroup variant program when implementing
4501 * VK_KHR_device_group. The values will also need patching into the program.
4502 */
4503 static void pvr_compute_update_kernel(
4504 struct pvr_cmd_buffer *cmd_buffer,
4505 struct pvr_sub_cmd_compute *const sub_cmd,
4506 pvr_dev_addr_t indirect_addr,
4507 const uint32_t global_workgroup_size[static const PVR_WORKGROUP_DIMENSIONS])
4508 {
4509 const struct pvr_physical_device *pdevice = cmd_buffer->device->pdevice;
4510 const struct pvr_device_runtime_info *dev_runtime_info =
4511 &pdevice->dev_runtime_info;
4512 struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
4513 struct pvr_csb *csb = &sub_cmd->control_stream;
4514 const struct pvr_compute_pipeline *pipeline = state->compute_pipeline;
4515 const struct pvr_compute_shader_state *shader_state =
4516 &pipeline->shader_state;
4517 const struct pvr_pds_info *program_info = &pipeline->primary_program_info;
4518
4519 struct pvr_compute_kernel_info info = {
4520 .indirect_buffer_addr = indirect_addr,
4521 .usc_target = PVRX(CDMCTRL_USC_TARGET_ANY),
4522 .pds_temp_size =
4523 DIV_ROUND_UP(program_info->temps_required << 2U,
4524 PVRX(CDMCTRL_KERNEL0_PDS_TEMP_SIZE_UNIT_SIZE)),
4525
4526 .pds_data_size =
4527 DIV_ROUND_UP(PVR_DW_TO_BYTES(program_info->data_size_in_dwords),
4528 PVRX(CDMCTRL_KERNEL0_PDS_DATA_SIZE_UNIT_SIZE)),
4529 .pds_data_offset = pipeline->primary_program.data_offset,
4530 .pds_code_offset = pipeline->primary_program.code_offset,
4531
4532 .sd_type = PVRX(CDMCTRL_SD_TYPE_NONE),
4533
4534 .usc_unified_size =
4535 DIV_ROUND_UP(shader_state->input_register_count << 2U,
4536 PVRX(CDMCTRL_KERNEL0_USC_UNIFIED_SIZE_UNIT_SIZE)),
4537
4538 /* clang-format off */
4539 .global_size = {
4540 global_workgroup_size[0],
4541 global_workgroup_size[1],
4542 global_workgroup_size[2]
4543 },
4544 /* clang-format on */
4545 };
4546
4547 uint32_t work_size = shader_state->work_size;
4548 uint32_t coeff_regs;
4549
4550 if (work_size > ROGUE_MAX_INSTANCES_PER_TASK) {
4551 /* Enforce a single workgroup per cluster through allocation starvation.
4552 */
4553 coeff_regs = dev_runtime_info->cdm_max_local_mem_size_regs;
4554 } else {
4555 coeff_regs = shader_state->coefficient_register_count;
4556 }
4557
4558 info.usc_common_size =
4559 DIV_ROUND_UP(PVR_DW_TO_BYTES(coeff_regs),
4560 PVRX(CDMCTRL_KERNEL0_USC_COMMON_SIZE_UNIT_SIZE));
4561
4562 /* Use a whole slot per workgroup. */
4563 work_size = MAX2(work_size, ROGUE_MAX_INSTANCES_PER_TASK);
4564
4565 coeff_regs += shader_state->const_shared_reg_count;
4566
4567 if (shader_state->const_shared_reg_count > 0)
4568 info.sd_type = PVRX(CDMCTRL_SD_TYPE_USC);
4569
4570 work_size =
4571 pvr_compute_flat_pad_workgroup_size(pdevice, work_size, coeff_regs);
4572
4573 info.local_size[0] = work_size;
4574 info.local_size[1] = 1U;
4575 info.local_size[2] = 1U;
4576
4577 info.max_instances =
4578 pvr_compute_flat_slot_size(pdevice, coeff_regs, false, work_size);
4579
4580 pvr_compute_generate_control_stream(csb, sub_cmd, &info);
4581 }
4582
4583 static VkResult pvr_cmd_upload_push_consts(struct pvr_cmd_buffer *cmd_buffer)
4584 {
4585 struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
4586 struct pvr_suballoc_bo *suballoc_bo;
4587 VkResult result;
4588
4589 /* TODO: Here are some possible optimizations/things to consider:
4590 *
4591 * - Currently we upload maxPushConstantsSize. The application might only
4592 * be using a portion of that so we might end up with unused memory.
4593 * Should we be smarter about this? If we intend to upload the push
4594 * consts into shareds, we definitely want to avoid reserving unused
4595 * regs.
4596 *
4597 * - For now we have to upload to a new buffer each time since the shaders
4598 * access the push constants from memory. If we were to reuse the same
4599 * buffer we might update the contents out of sync with job submission
4600 * and the shaders will see the updated contents while the command
4601 * buffer was still being recorded and not yet submitted.
4602 * If we were to upload the push constants directly to shared regs we
4603 * could reuse the same buffer (avoiding extra allocation overhead)
4604 * since the contents will be DMAed only on job submission when the
4605 * control stream is processed and the PDS program is executed. This
4606 * approach would also allow us to avoid regenerating the PDS data
4607 * section in some cases since the buffer address will be constant.
4608 */
4609
4610 if (cmd_buffer->state.push_constants.uploaded)
4611 return VK_SUCCESS;
4612
4613 result = pvr_cmd_buffer_upload_general(cmd_buffer,
4614 state->push_constants.data,
4615 sizeof(state->push_constants.data),
4616 &suballoc_bo);
4617 if (result != VK_SUCCESS)
4618 return result;
4619
4620 cmd_buffer->state.push_constants.dev_addr = suballoc_bo->dev_addr;
4621 cmd_buffer->state.push_constants.uploaded = true;
4622
4623 return VK_SUCCESS;
4624 }
4625
4626 static void pvr_cmd_dispatch(
4627 struct pvr_cmd_buffer *const cmd_buffer,
4628 const pvr_dev_addr_t indirect_addr,
4629 const uint32_t workgroup_size[static const PVR_WORKGROUP_DIMENSIONS])
4630 {
4631 struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
4632 const struct pvr_compute_pipeline *compute_pipeline =
4633 state->compute_pipeline;
4634 struct pvr_sub_cmd_compute *sub_cmd;
4635 VkResult result;
4636
4637 result = pvr_cmd_buffer_start_sub_cmd(cmd_buffer, PVR_SUB_CMD_TYPE_COMPUTE);
if (result != VK_SUCCESS)
return;
4638
4639 sub_cmd = &state->current_sub_cmd->compute;
4640 sub_cmd->uses_atomic_ops |= compute_pipeline->shader_state.uses_atomic_ops;
4641 sub_cmd->uses_barrier |= compute_pipeline->shader_state.uses_barrier;
4642
4643 if (state->push_constants.dirty_stages & VK_SHADER_STAGE_COMPUTE_BIT) {
4644 result = pvr_cmd_upload_push_consts(cmd_buffer);
4645 if (result != VK_SUCCESS)
4646 return;
4647
4648 /* Regenerate the PDS program to use the new push consts buffer. */
4649 state->dirty.compute_desc_dirty = true;
4650
4651 state->push_constants.dirty_stages &= ~VK_SHADER_STAGE_COMPUTE_BIT;
4652 }
4653
4654 if (compute_pipeline->shader_state.uses_num_workgroups) {
4655 pvr_dev_addr_t descriptor_data_offset_out;
4656
4657 if (indirect_addr.addr) {
4658 descriptor_data_offset_out = indirect_addr;
4659 } else {
4660 struct pvr_suballoc_bo *num_workgroups_bo;
4661
4662 result = pvr_cmd_buffer_upload_general(cmd_buffer,
4663 workgroup_size,
4664 sizeof(*workgroup_size) *
4665 PVR_WORKGROUP_DIMENSIONS,
4666 &num_workgroups_bo);
4667 if (result != VK_SUCCESS)
4668 return;
4669
4670 descriptor_data_offset_out = num_workgroups_bo->dev_addr;
4671 }
4672
4673 result = pvr_setup_descriptor_mappings(
4674 cmd_buffer,
4675 PVR_STAGE_ALLOCATION_COMPUTE,
4676 &compute_pipeline->descriptor_state,
4677 &descriptor_data_offset_out,
4678 &state->pds_compute_descriptor_data_offset);
4679 if (result != VK_SUCCESS)
4680 return;
4681 } else if ((compute_pipeline->base.layout
4682 ->per_stage_descriptor_masks[PVR_STAGE_ALLOCATION_COMPUTE] &&
4683 state->dirty.compute_desc_dirty) ||
4684 state->dirty.compute_pipeline_binding) {
4685 result = pvr_setup_descriptor_mappings(
4686 cmd_buffer,
4687 PVR_STAGE_ALLOCATION_COMPUTE,
4688 &compute_pipeline->descriptor_state,
4689 NULL,
4690 &state->pds_compute_descriptor_data_offset);
4691 if (result != VK_SUCCESS)
4692 return;
4693 }
4694
4695 pvr_compute_update_shared(cmd_buffer, sub_cmd);
4696 pvr_compute_update_kernel(cmd_buffer, sub_cmd, indirect_addr, workgroup_size);
4697 }
4698
4699 void pvr_CmdDispatch(VkCommandBuffer commandBuffer,
4700 uint32_t groupCountX,
4701 uint32_t groupCountY,
4702 uint32_t groupCountZ)
4703 {
4704 PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
4705
4706 PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer);
4707
4708 if (!groupCountX || !groupCountY || !groupCountZ)
4709 return;
4710
4711 pvr_cmd_dispatch(cmd_buffer,
4712 PVR_DEV_ADDR_INVALID,
4713 (uint32_t[]){ groupCountX, groupCountY, groupCountZ });
4714 }
4715
4716 void pvr_CmdDispatchIndirect(VkCommandBuffer commandBuffer,
4717 VkBuffer _buffer,
4718 VkDeviceSize offset)
4719 {
4720 PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
4721 PVR_FROM_HANDLE(pvr_buffer, buffer, _buffer);
4722
4723 PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer);
4724
4725 pvr_cmd_dispatch(cmd_buffer,
4726 PVR_DEV_ADDR_OFFSET(buffer->dev_addr, offset),
4727 (uint32_t[]){ 1, 1, 1 });
4728 }
4729
4730 static void
4731 pvr_update_draw_state(struct pvr_cmd_buffer_state *const state,
4732 const struct pvr_cmd_buffer_draw_state *const draw_state)
4733 {
4734 /* We don't have a piece of state to tell us that base_instance is in use,
4735 * so base_instance itself doubles as a boolean - 0 means we'll use a PDS
4736 * program that skips the base instance addition. If a non-zero
4737 * base_instance gets used (and the last draw's base_instance was 0) then
4738 * we switch to the BASE_INSTANCE attrib program.
4739 *
4740 * If base_instance changes then we only need to update the data section.
4741 *
4742 * The only draw call state that doesn't really matter is the start vertex
4743 * as that is handled properly in the VDM state in all cases.
4744 */
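/* A quick illustration of the transitions handled below (hypothetical draw
 * sequence): a draw with base_instance 0 followed by one with base_instance 5
 * marks the draw variant dirty (PDS attrib program switch), while a further
 * draw with base_instance 7 only marks draw_base_instance dirty (data section
 * update only).
 */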
4745 if ((state->draw_state.draw_indexed != draw_state->draw_indexed) ||
4746 (state->draw_state.draw_indirect != draw_state->draw_indirect) ||
4747 (state->draw_state.base_instance == 0 &&
4748 draw_state->base_instance != 0)) {
4749 state->dirty.draw_variant = true;
4750 } else if (state->draw_state.base_instance != draw_state->base_instance) {
4751 state->dirty.draw_base_instance = true;
4752 }
4753
4754 state->draw_state = *draw_state;
4755 }
4756
4757 static uint32_t pvr_calc_shared_regs_count(
4758 const struct pvr_graphics_pipeline *const gfx_pipeline)
4759 {
4760 const struct pvr_pipeline_stage_state *const vertex_state =
4761 &gfx_pipeline->shader_state.vertex.stage_state;
4762
4763 uint32_t shared_regs = vertex_state->const_shared_reg_count +
4764 vertex_state->const_shared_reg_offset;
4765
4766 if (gfx_pipeline->shader_state.fragment.bo) {
4767 const struct pvr_pipeline_stage_state *const fragment_state =
4768 &gfx_pipeline->shader_state.fragment.stage_state;
4769
4770 uint32_t fragment_regs = fragment_state->const_shared_reg_count +
4771 fragment_state->const_shared_reg_offset;
4772
4773 shared_regs = MAX2(shared_regs, fragment_regs);
4774 }
4775
4776 return shared_regs;
4777 }
4778
4779 static void
4780 pvr_emit_dirty_pds_state(const struct pvr_cmd_buffer *const cmd_buffer,
4781 struct pvr_sub_cmd_gfx *const sub_cmd,
4782 const uint32_t pds_vertex_descriptor_data_offset)
4783 {
4784 const struct pvr_cmd_buffer_state *const state = &cmd_buffer->state;
4785 const struct pvr_stage_allocation_descriptor_state
4786 *const vertex_descriptor_state =
4787 &state->gfx_pipeline->shader_state.vertex.descriptor_state;
4788 const struct pvr_pipeline_stage_state *const vertex_stage_state =
4789 &state->gfx_pipeline->shader_state.vertex.stage_state;
4790 struct pvr_csb *const csb = &sub_cmd->control_stream;
4791
4792 if (!vertex_descriptor_state->pds_info.code_size_in_dwords)
4793 return;
4794
4795 pvr_csb_set_relocation_mark(csb);
4796
4797 pvr_csb_emit (csb, VDMCTRL_PDS_STATE0, state0) {
4798 state0.usc_target = PVRX(VDMCTRL_USC_TARGET_ALL);
4799
4800 state0.usc_common_size =
4801 DIV_ROUND_UP(vertex_stage_state->const_shared_reg_count << 2,
4802 PVRX(VDMCTRL_PDS_STATE0_USC_COMMON_SIZE_UNIT_SIZE));
4803
4804 state0.pds_data_size = DIV_ROUND_UP(
4805 PVR_DW_TO_BYTES(vertex_descriptor_state->pds_info.data_size_in_dwords),
4806 PVRX(VDMCTRL_PDS_STATE0_PDS_DATA_SIZE_UNIT_SIZE));
4807 }
4808
4809 pvr_csb_emit (csb, VDMCTRL_PDS_STATE1, state1) {
4810 state1.pds_data_addr = PVR_DEV_ADDR(pds_vertex_descriptor_data_offset);
4811 state1.sd_type = PVRX(VDMCTRL_SD_TYPE_NONE);
4812 }
4813
4814 pvr_csb_emit (csb, VDMCTRL_PDS_STATE2, state2) {
4815 state2.pds_code_addr =
4816 PVR_DEV_ADDR(vertex_descriptor_state->pds_code.code_offset);
4817 }
4818
4819 pvr_csb_clear_relocation_mark(csb);
4820 }
4821
4822 static void pvr_setup_output_select(struct pvr_cmd_buffer *const cmd_buffer)
4823 {
4824 const struct pvr_graphics_pipeline *const gfx_pipeline =
4825 cmd_buffer->state.gfx_pipeline;
4826 const struct pvr_vertex_shader_state *const vertex_state =
4827 &gfx_pipeline->shader_state.vertex;
4828 struct vk_dynamic_graphics_state *const dynamic_state =
4829 &cmd_buffer->vk.dynamic_graphics_state;
4830 struct PVRX(TA_STATE_HEADER) *const header = &cmd_buffer->state.emit_header;
4831 struct pvr_ppp_state *const ppp_state = &cmd_buffer->state.ppp_state;
4832 uint32_t output_selects;
4833
4834 /* TODO: Handle vertex and fragment shader state flags. */
4835
4836 pvr_csb_pack (&output_selects, TA_OUTPUT_SEL, state) {
4837 state.rhw_pres = true;
4838 state.vtxsize = DIV_ROUND_UP(vertex_state->vertex_output_size, 4U);
4839 state.psprite_size_pres = (dynamic_state->ia.primitive_topology ==
4840 VK_PRIMITIVE_TOPOLOGY_POINT_LIST);
4841 }
4842
4843 if (ppp_state->output_selects != output_selects) {
4844 ppp_state->output_selects = output_selects;
4845 header->pres_outselects = true;
4846 }
4847
4848 if (ppp_state->varying_word[0] != vertex_state->varying[0]) {
4849 ppp_state->varying_word[0] = vertex_state->varying[0];
4850 header->pres_varying_word0 = true;
4851 }
4852
4853 if (ppp_state->varying_word[1] != vertex_state->varying[1]) {
4854 ppp_state->varying_word[1] = vertex_state->varying[1];
4855 header->pres_varying_word1 = true;
4856 }
4857 }
4858
4859 static void
4860 pvr_setup_isp_faces_and_control(struct pvr_cmd_buffer *const cmd_buffer,
4861 struct PVRX(TA_STATE_ISPA) *const ispa_out)
4862 {
4863 struct PVRX(TA_STATE_HEADER) *const header = &cmd_buffer->state.emit_header;
4864 const struct pvr_fragment_shader_state *const fragment_shader_state =
4865 &cmd_buffer->state.gfx_pipeline->shader_state.fragment;
4866 const struct pvr_render_pass_info *const pass_info =
4867 &cmd_buffer->state.render_pass_info;
4868 struct vk_dynamic_graphics_state *dynamic_state =
4869 &cmd_buffer->vk.dynamic_graphics_state;
4870 struct pvr_ppp_state *const ppp_state = &cmd_buffer->state.ppp_state;
4871
4872 const bool rasterizer_discard = dynamic_state->rs.rasterizer_discard_enable;
4873 const uint32_t subpass_idx = pass_info->subpass_idx;
4874 const uint32_t depth_stencil_attachment_idx =
4875 pass_info->pass->subpasses[subpass_idx].depth_stencil_attachment;
4876 const struct pvr_render_pass_attachment *const attachment =
4877 depth_stencil_attachment_idx != VK_ATTACHMENT_UNUSED
4878 ? &pass_info->pass->attachments[depth_stencil_attachment_idx]
4879 : NULL;
4880
4881 const enum PVRX(TA_OBJTYPE)
4882 obj_type = pvr_ta_objtype(dynamic_state->ia.primitive_topology);
4883
4884 const VkImageAspectFlags ds_aspects =
4885 (!rasterizer_discard && attachment)
4886 ? vk_format_aspects(attachment->vk_format) &
4887 (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT)
4888 : VK_IMAGE_ASPECT_NONE;
4889
4890 /* This is deliberately a full copy rather than a pointer because
4891 * vk_optimize_depth_stencil_state() can only be run once against any given
4892 * instance of vk_depth_stencil_state.
4893 */
4894 struct vk_depth_stencil_state ds_state = dynamic_state->ds;
4895
4896 uint32_t ispb_stencil_off;
4897 bool is_two_sided = false;
4898 uint32_t isp_control;
4899
4900 uint32_t line_width;
4901 uint32_t common_a;
4902 uint32_t front_a;
4903 uint32_t front_b;
4904 uint32_t back_a;
4905 uint32_t back_b;
4906
4907 vk_optimize_depth_stencil_state(&ds_state, ds_aspects, true);
4908
4909 /* Convert to 4.4 fixed point format. */
4910 line_width = util_unsigned_fixed(dynamic_state->rs.line.width, 4);
4911
4912 /* Subtract 1 to shift values from range [0=0,256=16] to [0=1/16,255=16].
4913 * If 0 it stays at 0, otherwise we subtract 1.
4914 */
4915 line_width = (!!line_width) * (line_width - 1);
4916
4917 line_width = MIN2(line_width, PVRX(TA_STATE_ISPA_POINTLINEWIDTH_SIZE_MAX));
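/* Worked example of the packing above, assuming a line width of 1.0f:
 * util_unsigned_fixed(1.0f, 4) gives 16 in 4.4 fixed point, the non-zero
 * value is then shifted down to 15, and 15 in the [0=1/16, 255=16] range
 * decodes back to (15 + 1) / 16 = 1.0. A width of 16.0f packs to 256 and
 * shifts to 255, the top of the encodable range.
 */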
4918
4919 /* TODO: Part of the logic in this function is duplicated in another part
4920 * of the code. E.g. the dcmpmode, and sop1/2/3. Could we do this earlier?
4921 */
4922
4923 pvr_csb_pack (&common_a, TA_STATE_ISPA, ispa) {
4924 ispa.pointlinewidth = line_width;
4925
4926 ispa.dcmpmode = pvr_ta_cmpmode(ds_state.depth.compare_op);
4927 ispa.dwritedisable = !ds_state.depth.write_enable;
4928
4929 ispa.passtype = fragment_shader_state->pass_type;
4930
4931 ispa.objtype = obj_type;
4932
4933 /* Return unpacked ispa structure. dcmpmode, dwritedisable, passtype and
4934 * objtype are needed by pvr_setup_triangle_merging_flag.
4935 */
4936 if (ispa_out)
4937 *ispa_out = ispa;
4938 }
4939
4940 /* TODO: Does this actually represent the ispb control word on stencil off?
4941 * If not, rename the variable.
4942 */
4943 pvr_csb_pack (&ispb_stencil_off, TA_STATE_ISPB, ispb) {
4944 ispb.sop3 = PVRX(TA_ISPB_STENCILOP_KEEP);
4945 ispb.sop2 = PVRX(TA_ISPB_STENCILOP_KEEP);
4946 ispb.sop1 = PVRX(TA_ISPB_STENCILOP_KEEP);
4947 ispb.scmpmode = PVRX(TA_CMPMODE_ALWAYS);
4948 }
4949
4950 /* FIXME: This logic should be redone and improved. Can we also get rid of
4951 * the front and back variants?
4952 */
4953
4954 front_a = common_a;
4955 back_a = common_a;
4956
4957 if (ds_state.stencil.test_enable) {
4958 uint32_t front_a_sref;
4959 uint32_t back_a_sref;
4960
4961 pvr_csb_pack (&front_a_sref, TA_STATE_ISPA, ispa) {
4962 ispa.sref = ds_state.stencil.front.reference;
4963 }
4964 front_a |= front_a_sref;
4965
4966 pvr_csb_pack (&back_a_sref, TA_STATE_ISPA, ispa) {
4967 ispa.sref = ds_state.stencil.back.reference;
4968 }
4969 back_a |= back_a_sref;
4970
4971 pvr_csb_pack (&front_b, TA_STATE_ISPB, ispb) {
4972 const struct vk_stencil_test_face_state *const front =
4973 &ds_state.stencil.front;
4974
4975 if (ds_state.stencil.write_enable)
4976 ispb.swmask = front->write_mask;
4977
4978 ispb.scmpmask = front->compare_mask;
4979
4980 ispb.sop3 = pvr_ta_stencilop(front->op.pass);
4981 ispb.sop2 = pvr_ta_stencilop(front->op.depth_fail);
4982 ispb.sop1 = pvr_ta_stencilop(front->op.fail);
4983 ispb.scmpmode = pvr_ta_cmpmode(front->op.compare);
4984 }
4985
4986 pvr_csb_pack (&back_b, TA_STATE_ISPB, ispb) {
4987 const struct vk_stencil_test_face_state *const back =
4988 &ds_state.stencil.back;
4989
4990 if (ds_state.stencil.write_enable)
4991 ispb.swmask = back->write_mask;
4992
4993 ispb.scmpmask = back->compare_mask;
4994
4995 ispb.sop3 = pvr_ta_stencilop(back->op.pass);
4996 ispb.sop2 = pvr_ta_stencilop(back->op.depth_fail);
4997 ispb.sop1 = pvr_ta_stencilop(back->op.fail);
4998 ispb.scmpmode = pvr_ta_cmpmode(back->op.compare);
4999 }
5000 } else {
5001 front_b = ispb_stencil_off;
5002 back_b = ispb_stencil_off;
5003 }
5004
5005 if (front_a != back_a || front_b != back_b) {
5006 if (dynamic_state->rs.cull_mode & VK_CULL_MODE_BACK_BIT) {
5007 /* Single face, using front state. */
5008 } else if (dynamic_state->rs.cull_mode & VK_CULL_MODE_FRONT_BIT) {
5009 /* Single face, using back state. */
5010
5011 front_a = back_a;
5012 front_b = back_b;
5013 } else {
5014 /* Both faces. */
5015
5016 header->pres_ispctl_ba = is_two_sided = true;
5017
5018 if (dynamic_state->rs.front_face == VK_FRONT_FACE_COUNTER_CLOCKWISE) {
5019 uint32_t tmp = front_a;
5020
5021 front_a = back_a;
5022 back_a = tmp;
5023
5024 tmp = front_b;
5025 front_b = back_b;
5026 back_b = tmp;
5027 }
5028
5029 /* HW defaults to stencil off. */
5030 if (back_b != ispb_stencil_off) {
5031 header->pres_ispctl_fb = true;
5032 header->pres_ispctl_bb = true;
5033 }
5034 }
5035 }
5036
5037 if (ds_state.stencil.test_enable && front_b != ispb_stencil_off)
5038 header->pres_ispctl_fb = true;
5039
5040 pvr_csb_pack (&isp_control, TA_STATE_ISPCTL, ispctl) {
5041 ispctl.upass = pass_info->isp_userpass;
5042
5043 /* TODO: is bo ever NULL? Figure out what to do. */
5044 ispctl.tagwritedisable = rasterizer_discard || !fragment_shader_state->bo;
5045
5046 ispctl.two_sided = is_two_sided;
5047 ispctl.bpres = header->pres_ispctl_fb || header->pres_ispctl_bb;
5048
5049 ispctl.dbenable = !rasterizer_discard &&
5050 dynamic_state->rs.depth_bias.enable &&
5051 obj_type == PVRX(TA_OBJTYPE_TRIANGLE);
5052 if (!rasterizer_discard && cmd_buffer->state.vis_test_enabled) {
5053 ispctl.vistest = true;
5054 ispctl.visreg = cmd_buffer->state.vis_reg;
5055 }
5056
5057 ispctl.scenable = !rasterizer_discard;
5058
5059 ppp_state->isp.control_struct = ispctl;
5060 }
5061
5062 header->pres_ispctl = true;
5063
5064 ppp_state->isp.control = isp_control;
5065 ppp_state->isp.front_a = front_a;
5066 ppp_state->isp.front_b = front_b;
5067 ppp_state->isp.back_a = back_a;
5068 ppp_state->isp.back_b = back_b;
5069 }
5070
5071 static float
5072 pvr_calculate_final_depth_bias_contant_factor(struct pvr_device_info *dev_info,
5073 VkFormat format,
5074 float depth_bias)
5075 {
5076 /* Information for future modifiers of these depth bias calculations.
5077 * ==================================================================
5078 * Specified depth bias equations scale the specified constant factor by a
5079 * value 'r' that is guaranteed to cause a resolvable difference in depth
5080 * across the entire range of depth values.
5081 * For floating point depth formats 'r' is calculated by taking the maximum
5082 * exponent across the triangle.
5083 * For UNORM formats 'r' is constant.
5084 * Here 'n' is the number of mantissa bits stored in the floating point
5085 * representation (23 for F32).
5086 *
5087 * UNORM Format -> z += dbcf * r + slope
5088 * FLOAT Format -> z += dbcf * 2^(e-n) + slope
5089 *
5090 * HW Variations.
5091 * ==============
5092 * The HW either always performs the F32 depth bias equation (exponent based
5093 * r), or in the case of HW that correctly supports the integer depth bias
5094 * equation for UNORM depth formats, we can select between both equations
5095 * using the ROGUE_CR_ISP_CTL.dbias_is_int flag - this is required to
5096 * correctly perform Vulkan UNORM depth bias (constant r).
5097 *
5098 * if ern42307:
5099 * if DBIAS_IS_INT_EN:
5100 * z += dbcf + slope
5101 * else:
5102 * z += dbcf * 2^(e-n) + slope
5103 * else:
5104 * z += dbcf * 2^(e-n) + slope
5105 *
5106 */
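/* As a rough illustration of the float equation above: with an F32 depth
 * attachment (n = 23) and a triangle whose maximum depth exponent e is -1,
 * r = 2^(e - n) = 2^-24, so a constant factor of 1.0 offsets z by roughly
 * one F32 ULP at that depth magnitude.
 */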
5107
5108 float nudge_factor;
5109
5110 if (PVR_HAS_ERN(dev_info, 42307)) {
5111 switch (format) {
5112 case VK_FORMAT_D16_UNORM:
5113 return depth_bias / (1 << 15);
5114
5115 case VK_FORMAT_D24_UNORM_S8_UINT:
5116 case VK_FORMAT_X8_D24_UNORM_PACK32:
5117 return depth_bias / (1 << 23);
5118
5119 default:
5120 return depth_bias;
5121 }
5122 }
5123
5124 /* We clamp/nudge the value here because UNORM depth formats can have
5125 * higher precision than our underlying D32F representation for some depth
5126 * ranges.
5127 *
5128 * When the HW scales the depth bias value by 2^(e-n) [the 'r' term], a
5129 * depth bias of 1 can result in a value smaller than one F32 ULP, which
5130 * will get quantized to 0 - resulting in no bias.
5131 *
5132 * Biasing small values away from zero ensures that a small depth bias of 1
5133 * still yields a result and overcomes Z-fighting.
5134 */
5135 switch (format) {
5136 case VK_FORMAT_D16_UNORM:
5137 depth_bias *= 512.0f;
5138 nudge_factor = 1.0f;
5139 break;
5140
5141 case VK_FORMAT_D24_UNORM_S8_UINT:
5142 case VK_FORMAT_X8_D24_UNORM_PACK32:
5143 depth_bias *= 2.0f;
5144 nudge_factor = 2.0f;
5145 break;
5146
5147 default:
5148 nudge_factor = 0.0f;
5149 break;
5150 }
5151
5152 if (nudge_factor != 0.0f) {
5153 if (depth_bias < 0.0f && depth_bias > -nudge_factor)
5154 depth_bias -= nudge_factor;
5155 else if (depth_bias > 0.0f && depth_bias < nudge_factor)
5156 depth_bias += nudge_factor;
5157 }
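/* Example trace of the nudge, assuming VK_FORMAT_D16_UNORM and a constant
 * factor of 0.001: the bias scales to 0.512, which lies in (0, nudge_factor),
 * so it is pushed up to 1.512 and survives the HW's 2^(e-n) scaling instead
 * of quantizing to zero.
 */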
5158
5159 return depth_bias;
5160 }
5161
5162 static void pvr_get_viewport_scissor_overlap(const VkViewport *const viewport,
5163 const VkRect2D *const scissor,
5164 VkRect2D *const rect_out)
5165 {
5166 /* TODO: See if we can remove this struct. */
5167 struct pvr_rect {
5168 int32_t x0, y0;
5169 int32_t x1, y1;
5170 };
5171
5172 /* TODO: Worry about overflow? */
5173 const struct pvr_rect scissor_rect = {
5174 .x0 = scissor->offset.x,
5175 .y0 = scissor->offset.y,
5176 .x1 = scissor->offset.x + scissor->extent.width,
5177 .y1 = scissor->offset.y + scissor->extent.height
5178 };
5179 struct pvr_rect viewport_rect = { 0 };
5180
5181 assert(viewport->width >= 0.0f);
5182 assert(scissor_rect.x0 >= 0);
5183 assert(scissor_rect.y0 >= 0);
5184
5185 if (scissor->extent.width == 0 || scissor->extent.height == 0) {
5186 *rect_out = (VkRect2D){ 0 };
5187 return;
5188 }
5189
5190 viewport_rect.x0 = (int32_t)viewport->x;
5191 viewport_rect.x1 = (int32_t)viewport->x + (int32_t)viewport->width;
5192
5193 /* TODO: Is there a mathematical way of doing all this and then clamp at
5194 * the end?
5195 */
5196 /* We flip the y0 and y1 when height is negative. */
5197 viewport_rect.y0 = (int32_t)viewport->y + MIN2(0, (int32_t)viewport->height);
5198 viewport_rect.y1 = (int32_t)viewport->y + MAX2(0, (int32_t)viewport->height);
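/* For instance, a flipped viewport with y = 100 and height = -50 gives
 * y0 = 100 + MIN2(0, -50) = 50 and y1 = 100 + MAX2(0, -50) = 100, i.e. the
 * same rectangle as y = 50, height = 50.
 */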
5199
5200 if (scissor_rect.x1 <= viewport_rect.x0 ||
5201 scissor_rect.y1 <= viewport_rect.y0 ||
5202 scissor_rect.x0 >= viewport_rect.x1 ||
5203 scissor_rect.y0 >= viewport_rect.y1) {
5204 *rect_out = (VkRect2D){ 0 };
5205 return;
5206 }
5207
5208 /* Determine the overlapping rectangle. */
5209 viewport_rect.x0 = MAX2(viewport_rect.x0, scissor_rect.x0);
5210 viewport_rect.y0 = MAX2(viewport_rect.y0, scissor_rect.y0);
5211 viewport_rect.x1 = MIN2(viewport_rect.x1, scissor_rect.x1);
5212 viewport_rect.y1 = MIN2(viewport_rect.y1, scissor_rect.y1);
5213
5214 /* TODO: Is this conversion safe? Is this logic right? */
5215 rect_out->offset.x = (uint32_t)viewport_rect.x0;
5216 rect_out->offset.y = (uint32_t)viewport_rect.y0;
5217 rect_out->extent.height = (uint32_t)(viewport_rect.y1 - viewport_rect.y0);
5218 rect_out->extent.width = (uint32_t)(viewport_rect.x1 - viewport_rect.x0);
5219 }
5220
5221 static inline uint32_t
5222 pvr_get_geom_region_clip_align_size(struct pvr_device_info *const dev_info)
5223 {
5224 /* TODO: This should come from rogue_ppp.xml. */
5225 return 16U + 16U * (!PVR_HAS_FEATURE(dev_info, tile_size_16x16));
5226 }
5227
5228 static void
5229 pvr_setup_isp_depth_bias_scissor_state(struct pvr_cmd_buffer *const cmd_buffer)
5230 {
5231 struct PVRX(TA_STATE_HEADER) *const header = &cmd_buffer->state.emit_header;
5232 struct pvr_ppp_state *const ppp_state = &cmd_buffer->state.ppp_state;
5233 struct vk_dynamic_graphics_state *const dynamic_state =
5234 &cmd_buffer->vk.dynamic_graphics_state;
5235 const struct PVRX(TA_STATE_ISPCTL) *const ispctl =
5236 &ppp_state->isp.control_struct;
5237 struct pvr_device_info *const dev_info =
5238 &cmd_buffer->device->pdevice->dev_info;
5239
5240 if (ispctl->dbenable &&
5241 (BITSET_TEST(dynamic_state->dirty,
5242 MESA_VK_DYNAMIC_RS_DEPTH_BIAS_FACTORS) ||
5243 cmd_buffer->depth_bias_array.size == 0)) {
5244 struct pvr_depth_bias_state depth_bias = {
5245 .constant_factor = pvr_calculate_final_depth_bias_contant_factor(
5246 dev_info,
5247 cmd_buffer->state.depth_format,
5248 dynamic_state->rs.depth_bias.constant),
5249 .slope_factor = dynamic_state->rs.depth_bias.slope,
5250 .clamp = dynamic_state->rs.depth_bias.clamp,
5251 };
5252
5253 ppp_state->depthbias_scissor_indices.depthbias_index =
5254 util_dynarray_num_elements(&cmd_buffer->depth_bias_array,
5255 __typeof__(depth_bias));
5256
5257 util_dynarray_append(&cmd_buffer->depth_bias_array,
5258 __typeof__(depth_bias),
5259 depth_bias);
5260
5261 header->pres_ispctl_dbsc = true;
5262 }
5263
5264 if (ispctl->scenable) {
5265 const uint32_t region_clip_align_size =
5266 pvr_get_geom_region_clip_align_size(dev_info);
5267 const VkViewport *const viewport = &dynamic_state->vp.viewports[0];
5268 const VkRect2D *const scissor = &dynamic_state->vp.scissors[0];
5269 struct pvr_scissor_words scissor_words;
5270 VkRect2D overlap_rect;
5271 uint32_t height;
5272 uint32_t width;
5273 uint32_t x;
5274 uint32_t y;
5275
5276 /* For region clip. */
5277 uint32_t bottom;
5278 uint32_t right;
5279 uint32_t left;
5280 uint32_t top;
5281
5282 /* We don't support multiple viewport calculations. */
5283 assert(dynamic_state->vp.viewport_count == 1);
5284 /* We don't support multiple scissor calculations. */
5285 assert(dynamic_state->vp.scissor_count == 1);
5286
5287 pvr_get_viewport_scissor_overlap(viewport, scissor, &overlap_rect);
5288
5289 x = overlap_rect.offset.x;
5290 y = overlap_rect.offset.y;
5291 width = overlap_rect.extent.width;
5292 height = overlap_rect.extent.height;
5293
5294 pvr_csb_pack (&scissor_words.w0, IPF_SCISSOR_WORD_0, word0) {
5295 word0.scw0_xmax = x + width;
5296 word0.scw0_xmin = x;
5297 }
5298
5299 pvr_csb_pack (&scissor_words.w1, IPF_SCISSOR_WORD_1, word1) {
5300 word1.scw1_ymax = y + height;
5301 word1.scw1_ymin = y;
5302 }
5303
5304 if (cmd_buffer->scissor_array.size &&
5305 cmd_buffer->scissor_words.w0 == scissor_words.w0 &&
5306 cmd_buffer->scissor_words.w1 == scissor_words.w1) {
5307 return;
5308 }
5309
5310 cmd_buffer->scissor_words = scissor_words;
5311
5312 /* Calculate region clip. */
5313
5314 left = x / region_clip_align_size;
5315 top = y / region_clip_align_size;
5316
5317 /* Guard against right and bottom underflowing to -1 when x + width or
5318 * y + height is 0. TODO: Is there a better way of doing this? */
5319 if ((x + width) != 0U)
5320 right = DIV_ROUND_UP(x + width, region_clip_align_size) - 1;
5321 else
5322 right = 0;
5323
5324 if ((y + height) != 0U)
5325 bottom = DIV_ROUND_UP(y + height, region_clip_align_size) - 1;
5326 else
5327 bottom = 0U;
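/* Example with a 16 pixel region clip alignment (illustrative values):
 * x = 10 and width = 100 give left = 0 and
 * right = DIV_ROUND_UP(110, 16) - 1 = 6, so region blocks 0..6 are kept and
 * everything beyond them is rejected by the OUTSIDE mode programmed below.
 */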
5328
5329 /* Setup region clip to clip everything outside what was calculated. */
5330
5331 /* FIXME: Should we mask to prevent writing over other words? */
5332 pvr_csb_pack (&ppp_state->region_clipping.word0, TA_REGION_CLIP0, word0) {
5333 word0.right = right;
5334 word0.left = left;
5335 word0.mode = PVRX(TA_REGION_CLIP_MODE_OUTSIDE);
5336 }
5337
5338 pvr_csb_pack (&ppp_state->region_clipping.word1, TA_REGION_CLIP1, word1) {
5339 word1.bottom = bottom;
5340 word1.top = top;
5341 }
5342
5343 ppp_state->depthbias_scissor_indices.scissor_index =
5344 util_dynarray_num_elements(&cmd_buffer->scissor_array,
5345 struct pvr_scissor_words);
5346
5347 util_dynarray_append(&cmd_buffer->scissor_array,
5348 struct pvr_scissor_words,
5349 cmd_buffer->scissor_words);
5350
5351 header->pres_ispctl_dbsc = true;
5352 header->pres_region_clip = true;
5353 }
5354 }
5355
5356 static void
5357 pvr_setup_triangle_merging_flag(struct pvr_cmd_buffer *const cmd_buffer,
5358 struct PVRX(TA_STATE_ISPA) * ispa)
5359 {
5360 struct PVRX(TA_STATE_HEADER) *const header = &cmd_buffer->state.emit_header;
5361 struct pvr_ppp_state *const ppp_state = &cmd_buffer->state.ppp_state;
5362 uint32_t merge_word;
5363 uint32_t mask;
5364
5365 pvr_csb_pack (&merge_word, TA_STATE_PDS_SIZEINFO2, size_info) {
5366 /* Disable for lines or punch-through, or when depth writes are disabled
5367 * (DWD) and the depth compare mode is ALWAYS.
5368 */
5369 if (ispa->objtype == PVRX(TA_OBJTYPE_LINE) ||
5370 ispa->passtype == PVRX(TA_PASSTYPE_PUNCH_THROUGH) ||
5371 (ispa->dwritedisable && ispa->dcmpmode == PVRX(TA_CMPMODE_ALWAYS))) {
5372 size_info.pds_tri_merge_disable = true;
5373 }
5374 }
5375
5376 pvr_csb_pack (&mask, TA_STATE_PDS_SIZEINFO2, size_info) {
5377 size_info.pds_tri_merge_disable = true;
5378 }
5379
5380 merge_word |= ppp_state->pds.size_info2 & ~mask;
5381
5382 if (merge_word != ppp_state->pds.size_info2) {
5383 ppp_state->pds.size_info2 = merge_word;
5384 header->pres_pds_state_ptr0 = true;
5385 }
5386 }
5387
5388 static void
5389 pvr_setup_fragment_state_pointers(struct pvr_cmd_buffer *const cmd_buffer,
5390 struct pvr_sub_cmd_gfx *const sub_cmd)
5391 {
5392 struct pvr_cmd_buffer_state *const state = &cmd_buffer->state;
5393
5394 const struct pvr_fragment_shader_state *const fragment =
5395 &state->gfx_pipeline->shader_state.fragment;
5396 const struct pvr_stage_allocation_descriptor_state *descriptor_shader_state =
5397 &fragment->descriptor_state;
5398 const struct pvr_pipeline_stage_state *fragment_state =
5399 &fragment->stage_state;
5400 const struct pvr_pds_upload *pds_coeff_program =
5401 &fragment->pds_coeff_program;
5402
5403 const struct pvr_physical_device *pdevice = cmd_buffer->device->pdevice;
5404 struct PVRX(TA_STATE_HEADER) *const header = &state->emit_header;
5405 struct pvr_ppp_state *const ppp_state = &state->ppp_state;
5406
5407 const uint32_t pds_uniform_size =
5408 DIV_ROUND_UP(descriptor_shader_state->pds_info.data_size_in_dwords,
5409 PVRX(TA_STATE_PDS_SIZEINFO1_PDS_UNIFORMSIZE_UNIT_SIZE));
5410
5411 const uint32_t pds_varying_state_size =
5412 DIV_ROUND_UP(pds_coeff_program->data_size,
5413 PVRX(TA_STATE_PDS_SIZEINFO1_PDS_VARYINGSIZE_UNIT_SIZE));
5414
5415 const uint32_t usc_varying_size =
5416 DIV_ROUND_UP(fragment_state->coefficient_size,
5417 PVRX(TA_STATE_PDS_SIZEINFO1_USC_VARYINGSIZE_UNIT_SIZE));
5418
5419 const uint32_t pds_temp_size =
5420 DIV_ROUND_UP(fragment_state->pds_temps_count,
5421 PVRX(TA_STATE_PDS_SIZEINFO1_PDS_TEMPSIZE_UNIT_SIZE));
5422
5423 const uint32_t usc_shared_size =
5424 DIV_ROUND_UP(fragment_state->const_shared_reg_count,
5425 PVRX(TA_STATE_PDS_SIZEINFO2_USC_SHAREDSIZE_UNIT_SIZE));
5426
5427 const uint32_t max_tiles_in_flight =
5428 pvr_calc_fscommon_size_and_tiles_in_flight(
5429 &pdevice->dev_info,
5430 &pdevice->dev_runtime_info,
5431 usc_shared_size *
5432 PVRX(TA_STATE_PDS_SIZEINFO2_USC_SHAREDSIZE_UNIT_SIZE),
5433 1);
5434 uint32_t size_info_mask;
5435 uint32_t size_info2;
5436
5437 if (max_tiles_in_flight < sub_cmd->max_tiles_in_flight)
5438 sub_cmd->max_tiles_in_flight = max_tiles_in_flight;
5439
5440 pvr_csb_pack (&ppp_state->pds.pixel_shader_base,
5441 TA_STATE_PDS_SHADERBASE,
5442 shader_base) {
5443 const struct pvr_pds_upload *const pds_upload =
5444 &fragment->pds_fragment_program;
5445
5446 shader_base.addr = PVR_DEV_ADDR(pds_upload->data_offset);
5447 }
5448
5449 if (descriptor_shader_state->pds_code.pvr_bo) {
5450 pvr_csb_pack (&ppp_state->pds.texture_uniform_code_base,
5451 TA_STATE_PDS_TEXUNICODEBASE,
5452 tex_base) {
5453 tex_base.addr =
5454 PVR_DEV_ADDR(descriptor_shader_state->pds_code.code_offset);
5455 }
5456 } else {
5457 ppp_state->pds.texture_uniform_code_base = 0U;
5458 }
5459
5460 pvr_csb_pack (&ppp_state->pds.size_info1, TA_STATE_PDS_SIZEINFO1, info1) {
5461 info1.pds_uniformsize = pds_uniform_size;
5462 info1.pds_texturestatesize = 0U;
5463 info1.pds_varyingsize = pds_varying_state_size;
5464 info1.usc_varyingsize = usc_varying_size;
5465 info1.pds_tempsize = pds_temp_size;
5466 }
5467
5468 pvr_csb_pack (&size_info_mask, TA_STATE_PDS_SIZEINFO2, mask) {
5469 mask.pds_tri_merge_disable = true;
5470 }
5471
5472 ppp_state->pds.size_info2 &= size_info_mask;
5473
5474 pvr_csb_pack (&size_info2, TA_STATE_PDS_SIZEINFO2, info2) {
5475 info2.usc_sharedsize = usc_shared_size;
5476 }
5477
5478 ppp_state->pds.size_info2 |= size_info2;
5479
5480 if (pds_coeff_program->pvr_bo) {
5481 header->pres_pds_state_ptr1 = true;
5482
5483 pvr_csb_pack (&ppp_state->pds.varying_base,
5484 TA_STATE_PDS_VARYINGBASE,
5485 base) {
5486 base.addr = PVR_DEV_ADDR(pds_coeff_program->data_offset);
5487 }
5488 } else {
5489 ppp_state->pds.varying_base = 0U;
5490 }
5491
5492 pvr_csb_pack (&ppp_state->pds.uniform_state_data_base,
5493 TA_STATE_PDS_UNIFORMDATABASE,
5494 base) {
5495 base.addr = PVR_DEV_ADDR(state->pds_fragment_descriptor_data_offset);
5496 }
5497
5498 header->pres_pds_state_ptr0 = true;
5499 header->pres_pds_state_ptr3 = true;
5500 }
5501
5502 static void pvr_setup_viewport(struct pvr_cmd_buffer *const cmd_buffer)
5503 {
5504 struct pvr_cmd_buffer_state *const state = &cmd_buffer->state;
5505 struct PVRX(TA_STATE_HEADER) *const header = &state->emit_header;
5506 struct vk_dynamic_graphics_state *const dynamic_state =
5507 &cmd_buffer->vk.dynamic_graphics_state;
5508 struct pvr_ppp_state *const ppp_state = &state->ppp_state;
5509
5510 if (ppp_state->viewport_count != dynamic_state->vp.viewport_count) {
5511 ppp_state->viewport_count = dynamic_state->vp.viewport_count;
5512 header->pres_viewport = true;
5513 }
5514
5515 if (dynamic_state->rs.rasterizer_discard_enable) {
5516 /* We don't want to emit any viewport data as it'll just get thrown
5517 * away. It's after the previous condition because we still want to
5518 * stash the viewport_count as it's our trigger for when
5519 * rasterizer discard gets disabled.
5520 */
5521 header->pres_viewport = false;
5522 return;
5523 }
5524
5525 for (uint32_t i = 0; i < ppp_state->viewport_count; i++) {
5526 VkViewport *viewport = &dynamic_state->vp.viewports[i];
5527 uint32_t x_scale = fui(viewport->width * 0.5f);
5528 uint32_t y_scale = fui(viewport->height * 0.5f);
5529 uint32_t z_scale = fui(viewport->maxDepth - viewport->minDepth);
5530 uint32_t x_center = fui(viewport->x + viewport->width * 0.5f);
5531 uint32_t y_center = fui(viewport->y + viewport->height * 0.5f);
5532 uint32_t z_center = fui(viewport->minDepth);
5533
5534 if (ppp_state->viewports[i].a0 != x_center ||
5535 ppp_state->viewports[i].m0 != x_scale ||
5536 ppp_state->viewports[i].a1 != y_center ||
5537 ppp_state->viewports[i].m1 != y_scale ||
5538 ppp_state->viewports[i].a2 != z_center ||
5539 ppp_state->viewports[i].m2 != z_scale) {
5540 ppp_state->viewports[i].a0 = x_center;
5541 ppp_state->viewports[i].m0 = x_scale;
5542 ppp_state->viewports[i].a1 = y_center;
5543 ppp_state->viewports[i].m1 = y_scale;
5544 ppp_state->viewports[i].a2 = z_center;
5545 ppp_state->viewports[i].m2 = z_scale;
5546
5547 header->pres_viewport = true;
5548 }
5549 }
5550 }
5551
5552 static void pvr_setup_ppp_control(struct pvr_cmd_buffer *const cmd_buffer)
5553 {
5554 struct vk_dynamic_graphics_state *const dynamic_state =
5555 &cmd_buffer->vk.dynamic_graphics_state;
5556 const VkPrimitiveTopology topology = dynamic_state->ia.primitive_topology;
5557 struct pvr_cmd_buffer_state *const state = &cmd_buffer->state;
5558 struct PVRX(TA_STATE_HEADER) *const header = &state->emit_header;
5559 struct pvr_ppp_state *const ppp_state = &state->ppp_state;
5560 uint32_t ppp_control;
5561
5562 pvr_csb_pack (&ppp_control, TA_STATE_PPP_CTRL, control) {
5563 control.drawclippededges = true;
5564 control.wclampen = true;
5565
5566 if (topology == VK_PRIMITIVE_TOPOLOGY_TRIANGLE_FAN)
5567 control.flatshade_vtx = PVRX(TA_FLATSHADE_VTX_VERTEX_1);
5568 else
5569 control.flatshade_vtx = PVRX(TA_FLATSHADE_VTX_VERTEX_0);
5570
5571 if (dynamic_state->rs.depth_clamp_enable)
5572 control.clip_mode = PVRX(TA_CLIP_MODE_NO_FRONT_OR_REAR);
5573 else
5574 control.clip_mode = PVRX(TA_CLIP_MODE_FRONT_REAR);
5575
5576 /* +--- FrontIsCCW?
5577 * | +--- Cull Front?
5578 * v v
5579 * 0|0 CULLMODE_CULL_CCW,
5580 * 0|1 CULLMODE_CULL_CW,
5581 * 1|0 CULLMODE_CULL_CW,
5582 * 1|1 CULLMODE_CULL_CCW,
5583 */
5584 switch (dynamic_state->rs.cull_mode) {
5585 case VK_CULL_MODE_BACK_BIT:
5586 case VK_CULL_MODE_FRONT_BIT:
5587 if ((dynamic_state->rs.front_face == VK_FRONT_FACE_COUNTER_CLOCKWISE) ^
5588 (dynamic_state->rs.cull_mode == VK_CULL_MODE_FRONT_BIT)) {
5589 control.cullmode = PVRX(TA_CULLMODE_CULL_CW);
5590 } else {
5591 control.cullmode = PVRX(TA_CULLMODE_CULL_CCW);
5592 }
5593
5594 break;
5595
5596 case VK_CULL_MODE_FRONT_AND_BACK:
5597 case VK_CULL_MODE_NONE:
5598 control.cullmode = PVRX(TA_CULLMODE_NO_CULLING);
5599 break;
5600
5601 default:
5602 unreachable("Unsupported cull mode!");
5603 }
5604 }
5605
5606 if (ppp_control != ppp_state->ppp_control) {
5607 ppp_state->ppp_control = ppp_control;
5608 header->pres_ppp_ctrl = true;
5609 }
5610 }
5611
5612 /* Largest valid PPP State update in words = 31
5613 * 1 - Header
5614 * 3 - Stream Out Config words 0, 1 and 2
5615 * 1 - PPP Control word
5616 * 3 - Varying Config words 0, 1 and 2
5617 * 1 - Output Select
5618 * 1 - WClamp
5619 * 6 - Viewport Transform words
5620 * 2 - Region Clip words
5621 * 3 - PDS State for fragment phase (PDSSTATEPTR 1-3)
5622 * 4 - PDS State for fragment phase (PDSSTATEPTR0)
5623 * 6 - ISP Control Words
5624 */
5625 #define PVR_MAX_PPP_STATE_DWORDS 31
5626
5627 static VkResult pvr_emit_ppp_state(struct pvr_cmd_buffer *const cmd_buffer,
5628 struct pvr_sub_cmd_gfx *const sub_cmd)
5629 {
5630 const bool deferred_secondary = pvr_cmd_uses_deferred_cs_cmds(cmd_buffer);
5631 struct pvr_cmd_buffer_state *const state = &cmd_buffer->state;
5632 struct PVRX(TA_STATE_HEADER) *const header = &state->emit_header;
5633 struct pvr_csb *const control_stream = &sub_cmd->control_stream;
5634 struct pvr_ppp_state *const ppp_state = &state->ppp_state;
5635 uint32_t ppp_state_words[PVR_MAX_PPP_STATE_DWORDS];
5636 const bool emit_dbsc = header->pres_ispctl_dbsc;
5637 uint32_t *buffer_ptr = ppp_state_words;
5638 uint32_t dbsc_patching_offset = 0;
5639 uint32_t ppp_state_words_count;
5640 struct pvr_suballoc_bo *pvr_bo;
5641 VkResult result;
5642
5643 #if !defined(NDEBUG)
5644 struct PVRX(TA_STATE_HEADER) emit_mask = *header;
5645 uint32_t packed_emit_mask;
5646
5647 static_assert(pvr_cmd_length(TA_STATE_HEADER) == 1,
5648 "EMIT_MASK_IS_CLEAR assumes 1 dword sized header.");
5649
5650 # define EMIT_MASK_GET(field) (emit_mask.field)
5651 # define EMIT_MASK_SET(field, value) (emit_mask.field = (value))
5652 # define EMIT_MASK_IS_CLEAR \
5653 (pvr_cmd_pack(TA_STATE_HEADER)(&packed_emit_mask, &emit_mask), \
5654 packed_emit_mask == 0)
5655 #else
5656 # define EMIT_MASK_GET(field)
5657 # define EMIT_MASK_SET(field, value)
5658 #endif
5659
5660 header->view_port_count =
5661 (ppp_state->viewport_count == 0) ? 0U : (ppp_state->viewport_count - 1);
5662 header->pres_ispctl_fa = header->pres_ispctl;
5663
5664 /* If deferred_secondary is true then we do a separate state update
5665 * which gets patched in vkCmdExecuteCommands().
5666 */
5667 header->pres_ispctl_dbsc &= !deferred_secondary;
5668
5669 pvr_csb_write_struct(buffer_ptr, TA_STATE_HEADER, header);
5670
5671 static_assert(pvr_cmd_length(TA_STATE_HEADER) == 1,
5672 "Following header check assumes 1 dword sized header.");
5673 /* If the header is empty we exit early and prevent a bo alloc of 0 size. */
5674 if (ppp_state_words[0] == 0)
5675 return VK_SUCCESS;
5676
5677 if (header->pres_ispctl) {
5678 pvr_csb_write_value(buffer_ptr, TA_STATE_ISPCTL, ppp_state->isp.control);
5679
5680 assert(header->pres_ispctl_fa);
5681 /* This is not a mistake. FA, BA have the ISPA format, and FB, BB have the
5682 * ISPB format.
5683 */
5684 pvr_csb_write_value(buffer_ptr, TA_STATE_ISPA, ppp_state->isp.front_a);
5685 EMIT_MASK_SET(pres_ispctl_fa, false);
5686
5687 if (header->pres_ispctl_fb) {
5688 pvr_csb_write_value(buffer_ptr, TA_STATE_ISPB, ppp_state->isp.front_b);
5689 EMIT_MASK_SET(pres_ispctl_fb, false);
5690 }
5691
5692 if (header->pres_ispctl_ba) {
5693 pvr_csb_write_value(buffer_ptr, TA_STATE_ISPA, ppp_state->isp.back_a);
5694 EMIT_MASK_SET(pres_ispctl_ba, false);
5695 }
5696
5697 if (header->pres_ispctl_bb) {
5698 pvr_csb_write_value(buffer_ptr, TA_STATE_ISPB, ppp_state->isp.back_b);
5699 EMIT_MASK_SET(pres_ispctl_bb, false);
5700 }
5701
5702 EMIT_MASK_SET(pres_ispctl, false);
5703 }
5704
5705 if (header->pres_ispctl_dbsc) {
5706 assert(!deferred_secondary);
5707
5708 dbsc_patching_offset = buffer_ptr - ppp_state_words;
5709
5710 pvr_csb_pack (buffer_ptr, TA_STATE_ISPDBSC, ispdbsc) {
5711 ispdbsc.dbindex = ppp_state->depthbias_scissor_indices.depthbias_index;
5712 ispdbsc.scindex = ppp_state->depthbias_scissor_indices.scissor_index;
5713 }
5714 buffer_ptr += pvr_cmd_length(TA_STATE_ISPDBSC);
5715
5716 EMIT_MASK_SET(pres_ispctl_dbsc, false);
5717 }
5718
5719 if (header->pres_pds_state_ptr0) {
5720 pvr_csb_write_value(buffer_ptr,
5721 TA_STATE_PDS_SHADERBASE,
5722 ppp_state->pds.pixel_shader_base);
5723
5724 pvr_csb_write_value(buffer_ptr,
5725 TA_STATE_PDS_TEXUNICODEBASE,
5726 ppp_state->pds.texture_uniform_code_base);
5727
5728 pvr_csb_write_value(buffer_ptr,
5729 TA_STATE_PDS_SIZEINFO1,
5730 ppp_state->pds.size_info1);
5731 pvr_csb_write_value(buffer_ptr,
5732 TA_STATE_PDS_SIZEINFO2,
5733 ppp_state->pds.size_info2);
5734
5735 EMIT_MASK_SET(pres_pds_state_ptr0, false);
5736 }
5737
5738 if (header->pres_pds_state_ptr1) {
5739 pvr_csb_write_value(buffer_ptr,
5740 TA_STATE_PDS_VARYINGBASE,
5741 ppp_state->pds.varying_base);
5742 EMIT_MASK_SET(pres_pds_state_ptr1, false);
5743 }
5744
5745 /* We don't use the pds_state_ptr2 (texture state programs) control word,
5746 * but that doesn't mean we have to emit it as 0 either: the hardware only
5747 * runs the texture state program when
5748 * ROGUE_TA_STATE_PDS_SIZEINFO1.pds_texturestatesize is non-zero.
5749 */
5750 assert(pvr_csb_unpack(&ppp_state->pds.size_info1, TA_STATE_PDS_SIZEINFO1)
5751 .pds_texturestatesize == 0);
5752
5753 if (header->pres_pds_state_ptr3) {
5754 pvr_csb_write_value(buffer_ptr,
5755 TA_STATE_PDS_UNIFORMDATABASE,
5756 ppp_state->pds.uniform_state_data_base);
5757 EMIT_MASK_SET(pres_pds_state_ptr3, false);
5758 }
5759
5760 if (header->pres_region_clip) {
5761 pvr_csb_write_value(buffer_ptr,
5762 TA_REGION_CLIP0,
5763 ppp_state->region_clipping.word0);
5764 pvr_csb_write_value(buffer_ptr,
5765 TA_REGION_CLIP1,
5766 ppp_state->region_clipping.word1);
5767
5768 EMIT_MASK_SET(pres_region_clip, false);
5769 }
5770
5771 if (header->pres_viewport) {
5772 const uint32_t viewports = MAX2(1, ppp_state->viewport_count);
5773 EMIT_MASK_SET(view_port_count, viewports);
5774
5775 for (uint32_t i = 0; i < viewports; i++) {
5776 /* These don't have any definitions in the csbgen xml files and none
5777 * will be added.
5778 */
5779 *buffer_ptr++ = ppp_state->viewports[i].a0;
5780 *buffer_ptr++ = ppp_state->viewports[i].m0;
5781 *buffer_ptr++ = ppp_state->viewports[i].a1;
5782 *buffer_ptr++ = ppp_state->viewports[i].m1;
5783 *buffer_ptr++ = ppp_state->viewports[i].a2;
5784 *buffer_ptr++ = ppp_state->viewports[i].m2;
5785
5786 EMIT_MASK_SET(view_port_count, EMIT_MASK_GET(view_port_count) - 1);
5787 }
5788
5789 EMIT_MASK_SET(pres_viewport, false);
5790 }
5791
5792 if (header->pres_wclamp) {
5793 pvr_csb_pack (buffer_ptr, TA_WCLAMP, wclamp) {
5794 wclamp.val = fui(0.00001f);
5795 }
5796 buffer_ptr += pvr_cmd_length(TA_WCLAMP);
5797 EMIT_MASK_SET(pres_wclamp, false);
5798 }
5799
5800 if (header->pres_outselects) {
5801 pvr_csb_write_value(buffer_ptr, TA_OUTPUT_SEL, ppp_state->output_selects);
5802 EMIT_MASK_SET(pres_outselects, false);
5803 }
5804
5805 if (header->pres_varying_word0) {
5806 pvr_csb_write_value(buffer_ptr,
5807 TA_STATE_VARYING0,
5808 ppp_state->varying_word[0]);
5809 EMIT_MASK_SET(pres_varying_word0, false);
5810 }
5811
5812 if (header->pres_varying_word1) {
5813 pvr_csb_write_value(buffer_ptr,
5814 TA_STATE_VARYING1,
5815 ppp_state->varying_word[1]);
5816 EMIT_MASK_SET(pres_varying_word1, false);
5817 }
5818
5819 /* We only emit this on the first draw of a render job to prevent us from
5820 * inheriting a non-zero value set elsewhere.
5821 */
5822 if (header->pres_varying_word2) {
5823 pvr_csb_write_value(buffer_ptr, TA_STATE_VARYING2, 0);
5824 EMIT_MASK_SET(pres_varying_word2, false);
5825 }
5826
5827 if (header->pres_ppp_ctrl) {
5828 pvr_csb_write_value(buffer_ptr,
5829 TA_STATE_PPP_CTRL,
5830 ppp_state->ppp_control);
5831 EMIT_MASK_SET(pres_ppp_ctrl, false);
5832 }
5833
5834 /* We only emit this on the first draw of a render job to prevent us from
5835 * inheriting a non-zero value set elsewhere.
5836 */
5837 if (header->pres_stream_out_size) {
5838 pvr_csb_write_value(buffer_ptr, TA_STATE_STREAM_OUT0, 0);
5839 EMIT_MASK_SET(pres_stream_out_size, false);
5840 }
5841
5842 assert(EMIT_MASK_IS_CLEAR);
5843
5844 #undef EMIT_MASK_GET
5845 #undef EMIT_MASK_SET
5846 #if !defined(NDEBUG)
5847 # undef EMIT_MASK_IS_CLEAR
5848 #endif
5849
5850 ppp_state_words_count = buffer_ptr - ppp_state_words;
5851 assert(ppp_state_words_count <= PVR_MAX_PPP_STATE_DWORDS);
5852
5853 result = pvr_cmd_buffer_alloc_mem(cmd_buffer,
5854 cmd_buffer->device->heaps.general_heap,
5855 PVR_DW_TO_BYTES(ppp_state_words_count),
5856 &pvr_bo);
5857 if (result != VK_SUCCESS)
5858 return result;
5859
5860 memcpy(pvr_bo_suballoc_get_map_addr(pvr_bo),
5861 ppp_state_words,
5862 PVR_DW_TO_BYTES(ppp_state_words_count));
5863
5864 pvr_csb_set_relocation_mark(control_stream);
5865
5866 /* Write the VDM state update into the VDM control stream. */
5867 pvr_csb_emit (control_stream, VDMCTRL_PPP_STATE0, state0) {
5868 state0.word_count = ppp_state_words_count;
5869 state0.addrmsb = pvr_bo->dev_addr;
5870 }
5871
5872 pvr_csb_emit (control_stream, VDMCTRL_PPP_STATE1, state1) {
5873 state1.addrlsb = pvr_bo->dev_addr;
5874 }
5875
5876 pvr_csb_clear_relocation_mark(control_stream);
5877
5878 if (emit_dbsc && cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY) {
5879 struct pvr_deferred_cs_command cmd;
5880
5881 if (deferred_secondary) {
5882 const uint32_t num_dwords = pvr_cmd_length(VDMCTRL_PPP_STATE0) +
5883 pvr_cmd_length(VDMCTRL_PPP_STATE1);
5884 uint32_t *vdm_state;
5885
5886 pvr_csb_set_relocation_mark(control_stream);
5887
5888 vdm_state = pvr_csb_alloc_dwords(control_stream, num_dwords);
5889 if (!vdm_state) {
5890 result = pvr_csb_get_status(control_stream);
5891 return pvr_cmd_buffer_set_error_unwarned(cmd_buffer, result);
5892 }
5893
5894 pvr_csb_clear_relocation_mark(control_stream);
5895
5896 cmd = (struct pvr_deferred_cs_command){
5897 .type = PVR_DEFERRED_CS_COMMAND_TYPE_DBSC,
5898 .dbsc = {
5899 .state = ppp_state->depthbias_scissor_indices,
5900 .vdm_state = vdm_state,
5901 },
5902 };
5903 } else {
5904 cmd = (struct pvr_deferred_cs_command){
5905 .type = PVR_DEFERRED_CS_COMMAND_TYPE_DBSC2,
5906 .dbsc2 = {
5907 .state = ppp_state->depthbias_scissor_indices,
5908 .ppp_cs_bo = pvr_bo,
5909 .patch_offset = dbsc_patching_offset,
5910 },
5911 };
5912 }
5913
5914 util_dynarray_append(&cmd_buffer->deferred_csb_commands,
5915 struct pvr_deferred_cs_command,
5916 cmd);
5917 }
5918
5919 state->emit_header = (struct PVRX(TA_STATE_HEADER)){ 0 };
5920
5921 return VK_SUCCESS;
5922 }
5923
5924 static inline bool
5925 pvr_ppp_state_update_required(const struct pvr_cmd_buffer *cmd_buffer)
5926 {
5927 const BITSET_WORD *const dynamic_dirty =
5928 cmd_buffer->vk.dynamic_graphics_state.dirty;
5929 const struct pvr_cmd_buffer_state *const state = &cmd_buffer->state;
5930 const struct PVRX(TA_STATE_HEADER) *const header = &state->emit_header;
5931
5932 /* For push constants we only need to worry if they are updated for the
5933 * fragment stage since we're only updating the pds programs used in the
5934 * fragment stage.
5935 */
5936
5937 return header->pres_ppp_ctrl || header->pres_ispctl ||
5938 header->pres_ispctl_fb || header->pres_ispctl_ba ||
5939 header->pres_ispctl_bb || header->pres_ispctl_dbsc ||
5940 header->pres_pds_state_ptr0 || header->pres_pds_state_ptr1 ||
5941 header->pres_pds_state_ptr2 || header->pres_pds_state_ptr3 ||
5942 header->pres_region_clip || header->pres_viewport ||
5943 header->pres_wclamp || header->pres_outselects ||
5944 header->pres_varying_word0 || header->pres_varying_word1 ||
5945 header->pres_varying_word2 || header->pres_stream_out_program ||
5946 state->dirty.fragment_descriptors || state->dirty.vis_test ||
5947 state->dirty.gfx_pipeline_binding || state->dirty.isp_userpass ||
5948 state->push_constants.dirty_stages & VK_SHADER_STAGE_FRAGMENT_BIT ||
5949 BITSET_TEST(dynamic_dirty, MESA_VK_DYNAMIC_DS_STENCIL_COMPARE_MASK) ||
5950 BITSET_TEST(dynamic_dirty, MESA_VK_DYNAMIC_DS_STENCIL_WRITE_MASK) ||
5951 BITSET_TEST(dynamic_dirty, MESA_VK_DYNAMIC_DS_STENCIL_REFERENCE) ||
5952 BITSET_TEST(dynamic_dirty, MESA_VK_DYNAMIC_RS_DEPTH_BIAS_ENABLE) ||
5953 BITSET_TEST(dynamic_dirty, MESA_VK_DYNAMIC_RS_DEPTH_BIAS_FACTORS) ||
5954 BITSET_TEST(dynamic_dirty, MESA_VK_DYNAMIC_RS_LINE_WIDTH) ||
5955 BITSET_TEST(dynamic_dirty, MESA_VK_DYNAMIC_VP_SCISSORS) ||
5956 BITSET_TEST(dynamic_dirty, MESA_VK_DYNAMIC_VP_SCISSOR_COUNT) ||
5957 BITSET_TEST(dynamic_dirty, MESA_VK_DYNAMIC_VP_VIEWPORTS) ||
5958 BITSET_TEST(dynamic_dirty, MESA_VK_DYNAMIC_VP_VIEWPORT_COUNT);
5959 }
5960
5961 static VkResult
5962 pvr_emit_dirty_ppp_state(struct pvr_cmd_buffer *const cmd_buffer,
5963 struct pvr_sub_cmd_gfx *const sub_cmd)
5964 {
5965 struct pvr_cmd_buffer_state *const state = &cmd_buffer->state;
5966 struct vk_dynamic_graphics_state *const dynamic_state =
5967 &cmd_buffer->vk.dynamic_graphics_state;
5968 VkResult result;
5969
5970 /* TODO: The emit_header will be dirty only if
5971 * pvr_reset_graphics_dirty_state() was called before this (so when command
5972 * buffer begins recording or when it's reset). Otherwise it will have been
5973 * zeroed out by the previous pvr_emit_ppp_state(). We can probably set a
5974 * flag in there and check it here instead of checking the header.
5975 * Check if this is true and implement the flag.
5976 */
5977 if (!pvr_ppp_state_update_required(cmd_buffer))
5978 return VK_SUCCESS;
5979
5980 if (state->dirty.gfx_pipeline_binding) {
5981 struct PVRX(TA_STATE_ISPA) ispa;
5982
5983 pvr_setup_output_select(cmd_buffer);
5984 pvr_setup_isp_faces_and_control(cmd_buffer, &ispa);
5985 pvr_setup_triangle_merging_flag(cmd_buffer, &ispa);
5986 } else if (BITSET_TEST(dynamic_state->dirty,
5987 MESA_VK_DYNAMIC_DS_STENCIL_COMPARE_MASK) ||
5988 BITSET_TEST(dynamic_state->dirty,
5989 MESA_VK_DYNAMIC_DS_STENCIL_REFERENCE) ||
5990 BITSET_TEST(dynamic_state->dirty,
5991 MESA_VK_DYNAMIC_DS_STENCIL_WRITE_MASK) ||
5992 BITSET_TEST(dynamic_state->dirty,
5993 MESA_VK_DYNAMIC_RS_LINE_WIDTH) ||
5994 state->dirty.isp_userpass || state->dirty.vis_test) {
5995 pvr_setup_isp_faces_and_control(cmd_buffer, NULL);
5996 }
5997
5998 if (!dynamic_state->rs.rasterizer_discard_enable &&
5999 state->dirty.fragment_descriptors &&
6000 state->gfx_pipeline->shader_state.fragment.bo) {
6001 pvr_setup_fragment_state_pointers(cmd_buffer, sub_cmd);
6002 }
6003
6004 pvr_setup_isp_depth_bias_scissor_state(cmd_buffer);
6005
6006 if (BITSET_TEST(dynamic_state->dirty, MESA_VK_DYNAMIC_VP_VIEWPORTS) ||
6007 BITSET_TEST(dynamic_state->dirty, MESA_VK_DYNAMIC_VP_VIEWPORT_COUNT))
6008 pvr_setup_viewport(cmd_buffer);
6009
6010 pvr_setup_ppp_control(cmd_buffer);
6011
6012 /* The hardware doesn't have an explicit mode for this so we use a
6013 * negative viewport to make sure all objects are culled out early.
6014 */
6015 if (dynamic_state->rs.cull_mode == VK_CULL_MODE_FRONT_AND_BACK) {
6016 /* Shift the viewport out of the guard-band, culling everything. */
6017 const uint32_t negative_vp_val = fui(-2.0f);
6018
6019 state->ppp_state.viewports[0].a0 = negative_vp_val;
6020 state->ppp_state.viewports[0].m0 = 0;
6021 state->ppp_state.viewports[0].a1 = negative_vp_val;
6022 state->ppp_state.viewports[0].m1 = 0;
6023 state->ppp_state.viewports[0].a2 = negative_vp_val;
6024 state->ppp_state.viewports[0].m2 = 0;
6025
6026 state->ppp_state.viewport_count = 1;
6027
6028 state->emit_header.pres_viewport = true;
6029 }
6030
6031 result = pvr_emit_ppp_state(cmd_buffer, sub_cmd);
6032 if (result != VK_SUCCESS)
6033 return result;
6034
6035 return VK_SUCCESS;
6036 }
6037
6038 void pvr_calculate_vertex_cam_size(const struct pvr_device_info *dev_info,
6039 const uint32_t vs_output_size,
6040 const bool raster_enable,
6041 uint32_t *const cam_size_out,
6042 uint32_t *const vs_max_instances_out)
6043 {
6044 /* First work out the size of a vertex in the UVS and multiply by 4 for
6045 * column ordering.
6046 */
6047 const uint32_t uvs_vertex_vector_size_in_dwords =
6048 (vs_output_size + 1U + raster_enable * 4U) * 4U;
6049 const uint32_t vdm_cam_size =
6050 PVR_GET_FEATURE_VALUE(dev_info, vdm_cam_size, 32U);
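/* As an example, a vertex shader writing 8 output dwords with rasterization
 * enabled gives (8 + 1 + 4) * 4 = 52 dwords per vertex vector, which falls
 * under the first threshold (14 * 4 and 32 * 4 respectively) on both paths
 * below.
 */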
6051
6052 /* This is a proxy for 8XE. */
6053 if (PVR_HAS_FEATURE(dev_info, simple_internal_parameter_format) &&
6054 vdm_cam_size < 96U) {
6055 /* Comparisons are based on size including scratch per vertex vector. */
6056 if (uvs_vertex_vector_size_in_dwords < (14U * 4U)) {
6057 *cam_size_out = MIN2(31U, vdm_cam_size - 1U);
6058 *vs_max_instances_out = 16U;
6059 } else if (uvs_vertex_vector_size_in_dwords < (20U * 4U)) {
6060 *cam_size_out = 15U;
6061 *vs_max_instances_out = 16U;
6062 } else if (uvs_vertex_vector_size_in_dwords < (28U * 4U)) {
6063 *cam_size_out = 11U;
6064 *vs_max_instances_out = 12U;
6065 } else if (uvs_vertex_vector_size_in_dwords < (44U * 4U)) {
6066 *cam_size_out = 7U;
6067 *vs_max_instances_out = 8U;
6068 } else if (PVR_HAS_FEATURE(dev_info,
6069 simple_internal_parameter_format_v2) ||
6070 uvs_vertex_vector_size_in_dwords < (64U * 4U)) {
6071 *cam_size_out = 7U;
6072 *vs_max_instances_out = 4U;
6073 } else {
6074 *cam_size_out = 3U;
6075 *vs_max_instances_out = 2U;
6076 }
6077 } else {
6078 /* Comparisons are based on size including scratch per vertex vector. */
6079 if (uvs_vertex_vector_size_in_dwords <= (32U * 4U)) {
6080 /* output size <= 27 + 5 scratch. */
6081 *cam_size_out = MIN2(95U, vdm_cam_size - 1U);
6082 *vs_max_instances_out = 0U;
6083 } else if (uvs_vertex_vector_size_in_dwords <= 48U * 4U) {
6084 /* output size <= 43 + 5 scratch */
6085 *cam_size_out = 63U;
6086 if (PVR_GET_FEATURE_VALUE(dev_info, uvs_vtx_entries, 144U) < 288U)
6087 *vs_max_instances_out = 16U;
6088 else
6089 *vs_max_instances_out = 0U;
6090 } else if (uvs_vertex_vector_size_in_dwords <= 64U * 4U) {
6091 /* output size <= 59 + 5 scratch. */
6092 *cam_size_out = 31U;
6093 if (PVR_GET_FEATURE_VALUE(dev_info, uvs_vtx_entries, 144U) < 288U)
6094 *vs_max_instances_out = 16U;
6095 else
6096 *vs_max_instances_out = 0U;
6097 } else {
6098 *cam_size_out = 15U;
6099 *vs_max_instances_out = 16U;
6100 }
6101 }
6102 }
6103
6104 static void pvr_emit_dirty_vdm_state(struct pvr_cmd_buffer *const cmd_buffer,
6105 struct pvr_sub_cmd_gfx *const sub_cmd)
6106 {
6107 /* FIXME: Assume all state is dirty for the moment. */
6108 struct pvr_device_info *const dev_info =
6109 &cmd_buffer->device->pdevice->dev_info;
6110 ASSERTED const uint32_t max_user_vertex_output_components =
6111 pvr_get_max_user_vertex_output_components(dev_info);
6112 struct PVRX(VDMCTRL_VDM_STATE0)
6113 header = { pvr_cmd_header(VDMCTRL_VDM_STATE0) };
6114 struct vk_dynamic_graphics_state *const dynamic_state =
6115 &cmd_buffer->vk.dynamic_graphics_state;
6116 const struct pvr_cmd_buffer_state *const state = &cmd_buffer->state;
6117 const struct pvr_vertex_shader_state *const vertex_shader_state =
6118 &state->gfx_pipeline->shader_state.vertex;
6119 struct pvr_csb *const csb = &sub_cmd->control_stream;
6120 uint32_t vs_output_size;
6121 uint32_t max_instances;
6122 uint32_t cam_size;
6123
6124 /* CAM Calculations and HW state take vertex size aligned to DWORDS. */
6125 vs_output_size =
6126 DIV_ROUND_UP(vertex_shader_state->vertex_output_size,
6127 PVRX(VDMCTRL_VDM_STATE4_VS_OUTPUT_SIZE_UNIT_SIZE));
6128
6129 assert(vs_output_size <= max_user_vertex_output_components);
6130
6131 pvr_calculate_vertex_cam_size(dev_info,
6132 vs_output_size,
6133 true,
6134 &cam_size,
6135 &max_instances);
6136
6137 pvr_csb_set_relocation_mark(csb);
6138
6139 pvr_csb_emit (csb, VDMCTRL_VDM_STATE0, state0) {
6140 state0.cam_size = cam_size;
6141
6142 if (dynamic_state->ia.primitive_restart_enable) {
6143 state0.cut_index_enable = true;
6144 state0.cut_index_present = true;
6145 }
6146
6147 switch (dynamic_state->ia.primitive_topology) {
6148 case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_FAN:
6149 state0.flatshade_control = PVRX(VDMCTRL_FLATSHADE_CONTROL_VERTEX_1);
6150 break;
6151
6152 default:
6153 state0.flatshade_control = PVRX(VDMCTRL_FLATSHADE_CONTROL_VERTEX_0);
6154 break;
6155 }
6156
6157 /* If we've bound a different vertex buffer, or this draw-call requires
6158 * a different PDS attrib data-section from the last draw call (changed
6159 * base_instance) then we need to specify a new data section. This is
6160 * also the case if we've switched pipeline or attrib program as the
6161 * data-section layout will be different.
6162 */
6163 state0.vs_data_addr_present =
6164 state->dirty.gfx_pipeline_binding || state->dirty.vertex_bindings ||
6165 state->dirty.draw_base_instance || state->dirty.draw_variant;
6166
6167 /* Need to specify new PDS Attrib program if we've bound a different
6168 * pipeline or we needed a different PDS Attrib variant for this
6169 * draw-call.
6170 */
6171 state0.vs_other_present = state->dirty.gfx_pipeline_binding ||
6172 state->dirty.draw_variant;
6173
6174 /* UVB_SCRATCH_SELECT_ONE with no rasterization is only valid when
6175 * stream output is enabled. We use UVB_SCRATCH_SELECT_FIVE because
6176 * Vulkan doesn't support stream output and the vertex position is
6177 * always emitted to the UVB.
6178 */
6179 state0.uvs_scratch_size_select =
6180 PVRX(VDMCTRL_UVS_SCRATCH_SIZE_SELECT_FIVE);
6181
6182 header = state0;
6183 }
6184
6185 if (header.cut_index_present) {
6186 pvr_csb_emit (csb, VDMCTRL_VDM_STATE1, state1) {
6187 switch (state->index_buffer_binding.type) {
6188 case VK_INDEX_TYPE_UINT32:
6189 /* FIXME: Defines for these? These seem to come from the Vulkan
6190 * spec. for VkPipelineInputAssemblyStateCreateInfo
6191 * primitiveRestartEnable.
6192 */
6193 state1.cut_index = 0xFFFFFFFF;
6194 break;
6195
6196 case VK_INDEX_TYPE_UINT16:
6197 state1.cut_index = 0xFFFF;
6198 break;
6199
6200 default:
6201 unreachable("Invalid index type");
6202 }
6203 }
6204 }
6205
6206 if (header.vs_data_addr_present) {
6207 pvr_csb_emit (csb, VDMCTRL_VDM_STATE2, state2) {
6208 state2.vs_pds_data_base_addr =
6209 PVR_DEV_ADDR(state->pds_vertex_attrib_offset);
6210 }
6211 }
6212
6213 if (header.vs_other_present) {
6214 const uint32_t usc_unified_store_size_in_bytes =
6215 vertex_shader_state->vertex_input_size << 2;
6216
6217 pvr_csb_emit (csb, VDMCTRL_VDM_STATE3, state3) {
6218 state3.vs_pds_code_base_addr =
6219 PVR_DEV_ADDR(state->pds_shader.code_offset);
6220 }
6221
6222 pvr_csb_emit (csb, VDMCTRL_VDM_STATE4, state4) {
6223 state4.vs_output_size = vs_output_size;
6224 }
6225
6226 pvr_csb_emit (csb, VDMCTRL_VDM_STATE5, state5) {
6227 state5.vs_max_instances = max_instances;
6228 state5.vs_usc_common_size = 0U;
6229 state5.vs_usc_unified_size = DIV_ROUND_UP(
6230 usc_unified_store_size_in_bytes,
6231 PVRX(VDMCTRL_VDM_STATE5_VS_USC_UNIFIED_SIZE_UNIT_SIZE));
6232 state5.vs_pds_temp_size =
6233 DIV_ROUND_UP(state->pds_shader.info->temps_required << 2,
6234 PVRX(VDMCTRL_VDM_STATE5_VS_PDS_TEMP_SIZE_UNIT_SIZE));
6235 state5.vs_pds_data_size = DIV_ROUND_UP(
6236 PVR_DW_TO_BYTES(state->pds_shader.info->data_size_in_dwords),
6237 PVRX(VDMCTRL_VDM_STATE5_VS_PDS_DATA_SIZE_UNIT_SIZE));
6238 }
6239 }
6240
6241 pvr_csb_clear_relocation_mark(csb);
6242 }
6243
6244 static VkResult pvr_validate_draw_state(struct pvr_cmd_buffer *cmd_buffer)
6245 {
6246 struct pvr_cmd_buffer_state *const state = &cmd_buffer->state;
6247 struct vk_dynamic_graphics_state *const dynamic_state =
6248 &cmd_buffer->vk.dynamic_graphics_state;
6249 const struct pvr_graphics_pipeline *const gfx_pipeline = state->gfx_pipeline;
6250 const struct pvr_pipeline_stage_state *const fragment_state =
6251 &gfx_pipeline->shader_state.fragment.stage_state;
6252 const struct pvr_pipeline_stage_state *const vertex_state =
6253 &gfx_pipeline->shader_state.vertex.stage_state;
6254 const struct pvr_pipeline_layout *const pipeline_layout =
6255 gfx_pipeline->base.layout;
6256 struct pvr_sub_cmd_gfx *sub_cmd;
6257 bool fstencil_writemask_zero;
6258 bool bstencil_writemask_zero;
6259 bool fstencil_keep;
6260 bool bstencil_keep;
6261 VkResult result;
6262
6263 result = pvr_cmd_buffer_start_sub_cmd(cmd_buffer, PVR_SUB_CMD_TYPE_GRAPHICS);
 if (result != VK_SUCCESS)
 return result;
6264
6265 sub_cmd = &state->current_sub_cmd->gfx;
6266 sub_cmd->empty_cmd = false;
6267
6268 /* Determine pipeline depth/stencil usage. If a pipeline uses depth or
6269 * stencil testing, those attachments are using their loaded values, and
6270 * the loadOps cannot be optimized out.
6271 */
6272 /* Pipeline uses depth testing. */
6273 if (sub_cmd->depth_usage == PVR_DEPTH_STENCIL_USAGE_UNDEFINED &&
6274 dynamic_state->ds.depth.compare_op != VK_COMPARE_OP_ALWAYS) {
6275 sub_cmd->depth_usage = PVR_DEPTH_STENCIL_USAGE_NEEDED;
6276 }
6277
6278 /* Pipeline uses stencil testing. */
6279 if (sub_cmd->stencil_usage == PVR_DEPTH_STENCIL_USAGE_UNDEFINED &&
6280 (dynamic_state->ds.stencil.front.op.compare != VK_COMPARE_OP_ALWAYS ||
6281 dynamic_state->ds.stencil.back.op.compare != VK_COMPARE_OP_ALWAYS)) {
6282 sub_cmd->stencil_usage = PVR_DEPTH_STENCIL_USAGE_NEEDED;
6283 }
6284
6285 if (PVR_HAS_FEATURE(&cmd_buffer->device->pdevice->dev_info,
6286 compute_overlap)) {
6287 uint32_t coefficient_size =
6288 DIV_ROUND_UP(fragment_state->coefficient_size,
6289 PVRX(TA_STATE_PDS_SIZEINFO1_USC_VARYINGSIZE_UNIT_SIZE));
6290
6291 if (coefficient_size >
6292 PVRX(TA_STATE_PDS_SIZEINFO1_USC_VARYINGSIZE_MAX_SIZE))
6293 sub_cmd->disable_compute_overlap = true;
6294 }
6295
6296 sub_cmd->frag_uses_atomic_ops |= fragment_state->uses_atomic_ops;
6297 sub_cmd->frag_has_side_effects |= fragment_state->has_side_effects;
6298 sub_cmd->frag_uses_texture_rw |= fragment_state->uses_texture_rw;
6299 sub_cmd->vertex_uses_texture_rw |= vertex_state->uses_texture_rw;
6300
6301 sub_cmd->job.get_vis_results = state->vis_test_enabled;
6302
6303 fstencil_keep =
6304 (dynamic_state->ds.stencil.front.op.fail == VK_STENCIL_OP_KEEP) &&
6305 (dynamic_state->ds.stencil.front.op.pass == VK_STENCIL_OP_KEEP);
6306 bstencil_keep =
6307 (dynamic_state->ds.stencil.back.op.fail == VK_STENCIL_OP_KEEP) &&
6308 (dynamic_state->ds.stencil.back.op.pass == VK_STENCIL_OP_KEEP);
6309 fstencil_writemask_zero = (dynamic_state->ds.stencil.front.write_mask == 0);
6310 bstencil_writemask_zero = (dynamic_state->ds.stencil.back.write_mask == 0);
6311
6312 /* Set the stencil modified flag unless either:
6313  * - both the front and back-facing stencil ops are KEEP for fail/pass, or
6314  * - both the front and back-facing stencil write masks are zero.
6315  */
6316 if (!(fstencil_keep && bstencil_keep) &&
6317 !(fstencil_writemask_zero && bstencil_writemask_zero)) {
6318 sub_cmd->modifies_stencil = true;
6319 }
6320
6321 /* Set depth modified flag if depth write is enabled. */
6322 if (dynamic_state->ds.depth.write_enable)
6323 sub_cmd->modifies_depth = true;
6324
6325 /* If either the data or code changes for pds vertex attribs, regenerate the
6326 * data segment.
6327 */
6328 if (state->dirty.vertex_bindings || state->dirty.gfx_pipeline_binding ||
6329 state->dirty.draw_variant || state->dirty.draw_base_instance) {
6330 enum pvr_pds_vertex_attrib_program_type prog_type;
6331 const struct pvr_pds_attrib_program *program;
6332
6333 if (state->draw_state.draw_indirect)
6334 prog_type = PVR_PDS_VERTEX_ATTRIB_PROGRAM_DRAW_INDIRECT;
6335 else if (state->draw_state.base_instance)
6336 prog_type = PVR_PDS_VERTEX_ATTRIB_PROGRAM_BASE_INSTANCE;
6337 else
6338 prog_type = PVR_PDS_VERTEX_ATTRIB_PROGRAM_BASIC;
6339
6340 program =
6341 &gfx_pipeline->shader_state.vertex.pds_attrib_programs[prog_type];
6342 state->pds_shader.info = &program->info;
6343 state->pds_shader.code_offset = program->program.code_offset;
6344
6345 state->max_shared_regs =
6346 MAX2(state->max_shared_regs, pvr_calc_shared_regs_count(gfx_pipeline));
6347
6348 pvr_setup_vertex_buffers(cmd_buffer, gfx_pipeline);
6349 }
6350
6351 if (state->push_constants.dirty_stages & VK_SHADER_STAGE_ALL_GRAPHICS) {
6352 result = pvr_cmd_upload_push_consts(cmd_buffer);
6353 if (result != VK_SUCCESS)
6354 return result;
6355 }
6356
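   /* Binding a new graphics pipeline invalidates the descriptor state for
    * both the vertex and fragment stages; the checks below can mark them
    * dirty for other reasons as well.
    */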
6357 state->dirty.vertex_descriptors = state->dirty.gfx_pipeline_binding;
6358 state->dirty.fragment_descriptors = state->dirty.vertex_descriptors;
6359
6360 /* Account for dirty descriptor set. */
6361 state->dirty.vertex_descriptors |=
6362 state->dirty.gfx_desc_dirty &&
6363 pipeline_layout
6364 ->per_stage_descriptor_masks[PVR_STAGE_ALLOCATION_VERTEX_GEOMETRY];
6365 state->dirty.fragment_descriptors |=
6366 state->dirty.gfx_desc_dirty &&
6367 pipeline_layout->per_stage_descriptor_masks[PVR_STAGE_ALLOCATION_FRAGMENT];
6368
6369 if (BITSET_TEST(dynamic_state->dirty, MESA_VK_DYNAMIC_CB_BLEND_CONSTANTS))
6370 state->dirty.fragment_descriptors = true;
6371
6372 state->dirty.vertex_descriptors |=
6373 state->push_constants.dirty_stages &
6374 (VK_SHADER_STAGE_ALL_GRAPHICS & ~VK_SHADER_STAGE_FRAGMENT_BIT);
6375 state->dirty.fragment_descriptors |= state->push_constants.dirty_stages &
6376 VK_SHADER_STAGE_FRAGMENT_BIT;
6377
6378 if (state->dirty.fragment_descriptors) {
6379 result = pvr_setup_descriptor_mappings(
6380 cmd_buffer,
6381 PVR_STAGE_ALLOCATION_FRAGMENT,
6382 &state->gfx_pipeline->shader_state.fragment.descriptor_state,
6383 NULL,
6384 &state->pds_fragment_descriptor_data_offset);
6385 if (result != VK_SUCCESS) {
6386 mesa_loge("Could not setup fragment descriptor mappings.");
6387 return result;
6388 }
6389 }
6390
6391 if (state->dirty.vertex_descriptors) {
6392 uint32_t pds_vertex_descriptor_data_offset;
6393
6394 result = pvr_setup_descriptor_mappings(
6395 cmd_buffer,
6396 PVR_STAGE_ALLOCATION_VERTEX_GEOMETRY,
6397 &state->gfx_pipeline->shader_state.vertex.descriptor_state,
6398 NULL,
6399 &pds_vertex_descriptor_data_offset);
6400 if (result != VK_SUCCESS) {
6401 mesa_loge("Could not setup vertex descriptor mappings.");
6402 return result;
6403 }
6404
6405 pvr_emit_dirty_pds_state(cmd_buffer,
6406 sub_cmd,
6407 pds_vertex_descriptor_data_offset);
6408 }
6409
6410 pvr_emit_dirty_ppp_state(cmd_buffer, sub_cmd);
6411 pvr_emit_dirty_vdm_state(cmd_buffer, sub_cmd);
6412
6413 vk_dynamic_graphics_state_clear_dirty(dynamic_state);
6414 state->dirty.gfx_desc_dirty = false;
6415 state->dirty.draw_base_instance = false;
6416 state->dirty.draw_variant = false;
6417 state->dirty.fragment_descriptors = false;
6418 state->dirty.gfx_pipeline_binding = false;
6419 state->dirty.isp_userpass = false;
6420 state->dirty.vertex_bindings = false;
6421 state->dirty.vis_test = false;
6422
6423 state->push_constants.dirty_stages &= ~VK_SHADER_STAGE_ALL_GRAPHICS;
6424
6425 return VK_SUCCESS;
6426 }
6427
6428 static uint32_t pvr_get_hw_primitive_topology(VkPrimitiveTopology topology)
6429 {
6430 switch (topology) {
6431 case VK_PRIMITIVE_TOPOLOGY_POINT_LIST:
6432 return PVRX(VDMCTRL_PRIMITIVE_TOPOLOGY_POINT_LIST);
6433 case VK_PRIMITIVE_TOPOLOGY_LINE_LIST:
6434 return PVRX(VDMCTRL_PRIMITIVE_TOPOLOGY_LINE_LIST);
6435 case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP:
6436 return PVRX(VDMCTRL_PRIMITIVE_TOPOLOGY_LINE_STRIP);
6437 case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST:
6438 return PVRX(VDMCTRL_PRIMITIVE_TOPOLOGY_TRI_LIST);
6439 case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP:
6440 return PVRX(VDMCTRL_PRIMITIVE_TOPOLOGY_TRI_STRIP);
6441 case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_FAN:
6442 return PVRX(VDMCTRL_PRIMITIVE_TOPOLOGY_TRI_FAN);
6443 case VK_PRIMITIVE_TOPOLOGY_LINE_LIST_WITH_ADJACENCY:
6444 return PVRX(VDMCTRL_PRIMITIVE_TOPOLOGY_LINE_LIST_ADJ);
6445 case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP_WITH_ADJACENCY:
6446 return PVRX(VDMCTRL_PRIMITIVE_TOPOLOGY_LINE_STRIP_ADJ);
6447 case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST_WITH_ADJACENCY:
6448 return PVRX(VDMCTRL_PRIMITIVE_TOPOLOGY_TRI_LIST_ADJ);
6449 case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP_WITH_ADJACENCY:
6450 return PVRX(VDMCTRL_PRIMITIVE_TOPOLOGY_TRI_STRIP_ADJ);
6451 case VK_PRIMITIVE_TOPOLOGY_PATCH_LIST:
6452 return PVRX(VDMCTRL_PRIMITIVE_TOPOLOGY_PATCH_LIST);
6453 default:
6454 unreachable("Undefined primitive topology");
6455 }
6456 }
6457
6458 /* TODO: Rewrite this in terms of ALIGN_POT() and pvr_cmd_length(). */
6459 /* Aligned to 128 bit for PDS loads / stores */
6460 #define DUMMY_VDM_CONTROL_STREAM_BLOCK_SIZE 8
6461
6462 static VkResult
6463 pvr_write_draw_indirect_vdm_stream(struct pvr_cmd_buffer *cmd_buffer,
6464 struct pvr_csb *const csb,
6465 pvr_dev_addr_t idx_buffer_addr,
6466 uint32_t idx_stride,
6467 struct PVRX(VDMCTRL_INDEX_LIST0) * list_hdr,
6468 struct pvr_buffer *buffer,
6469 VkDeviceSize offset,
6470 uint32_t count,
6471 uint32_t stride)
6472 {
6473 struct pvr_pds_drawindirect_program pds_prog = { 0 };
6474 uint32_t word0;
6475
6476 /* Draw indirect always has index offset and instance count. */
6477 list_hdr->index_offset_present = true;
6478 list_hdr->index_instance_count_present = true;
6479
6480 pvr_cmd_pack(VDMCTRL_INDEX_LIST0)(&word0, list_hdr);
6481
6482 pds_prog.support_base_instance = true;
6483 pds_prog.arg_buffer = buffer->dev_addr.addr + offset;
6484 pds_prog.index_buffer = idx_buffer_addr.addr;
6485 pds_prog.index_block_header = word0;
6486 pds_prog.index_stride = idx_stride;
6487 pds_prog.num_views = 1U;
6488
6489 /* TODO: See if we can pre-upload the code section of all the pds programs
6490 * and reuse them here.
6491 */
6492 /* Generate and upload the PDS programs (code + data). */
6493 for (uint32_t i = 0U; i < count; i++) {
6494 const struct pvr_device_info *dev_info =
6495 &cmd_buffer->device->pdevice->dev_info;
6496 struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
6497 struct pvr_suballoc_bo *dummy_bo;
6498 struct pvr_suballoc_bo *pds_bo;
6499 uint32_t *dummy_stream;
6500 uint32_t *pds_base;
6501 uint32_t pds_size;
6502 VkResult result;
6503
6504 /* TODO: Move this outside the loop and allocate all of them in one go? */
6505 result = pvr_cmd_buffer_alloc_mem(cmd_buffer,
6506 cmd_buffer->device->heaps.general_heap,
6507 DUMMY_VDM_CONTROL_STREAM_BLOCK_SIZE,
6508 &dummy_bo);
6509 if (result != VK_SUCCESS)
6510 return result;
6511
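      /* The first indirect draw in the sequence uses the current draw ID
       * unchanged; every following iteration increments it.
       */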
6512 pds_prog.increment_draw_id = (i != 0);
6513 pds_prog.index_list_addr_buffer = dummy_bo->dev_addr.addr;
6514
6515 if (state->draw_state.draw_indexed) {
6516 pvr_pds_generate_draw_elements_indirect(&pds_prog,
6517 0,
6518 PDS_GENERATE_SIZES,
6519 dev_info);
6520 } else {
6521 pvr_pds_generate_draw_arrays_indirect(&pds_prog,
6522 0,
6523 PDS_GENERATE_SIZES,
6524 dev_info);
6525 }
6526
6527 pds_size = PVR_DW_TO_BYTES(pds_prog.program.data_size_aligned +
6528 pds_prog.program.code_size_aligned);
6529
6530 result = pvr_cmd_buffer_alloc_mem(cmd_buffer,
6531 cmd_buffer->device->heaps.pds_heap,
6532 pds_size,
6533 &pds_bo);
6534 if (result != VK_SUCCESS)
6535 return result;
6536
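      /* Code and data share the allocation: copy the pre-generated code
       * section first, then generate the data segment directly after it.
       */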
6537 pds_base = pvr_bo_suballoc_get_map_addr(pds_bo);
6538 memcpy(pds_base,
6539 pds_prog.program.code,
6540 PVR_DW_TO_BYTES(pds_prog.program.code_size_aligned));
6541
6542 if (state->draw_state.draw_indexed) {
6543 pvr_pds_generate_draw_elements_indirect(
6544 &pds_prog,
6545 pds_base + pds_prog.program.code_size_aligned,
6546 PDS_GENERATE_DATA_SEGMENT,
6547 dev_info);
6548 } else {
6549 pvr_pds_generate_draw_arrays_indirect(
6550 &pds_prog,
6551 pds_base + pds_prog.program.code_size_aligned,
6552 PDS_GENERATE_DATA_SEGMENT,
6553 dev_info);
6554 }
6555
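      /* Emit the VDM PDS state words for the program just uploaded. The data
       * and code addresses are programmed as offsets from the PDS heap base.
       */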
6556 pvr_csb_set_relocation_mark(csb);
6557
6558 pvr_csb_emit (csb, VDMCTRL_PDS_STATE0, state0) {
6559 state0.usc_target = PVRX(VDMCTRL_USC_TARGET_ANY);
6560
6561 state0.pds_temp_size =
6562 DIV_ROUND_UP(PVR_DW_TO_BYTES(pds_prog.program.temp_size_aligned),
6563 PVRX(VDMCTRL_PDS_STATE0_PDS_TEMP_SIZE_UNIT_SIZE));
6564
6565 state0.pds_data_size =
6566 DIV_ROUND_UP(PVR_DW_TO_BYTES(pds_prog.program.data_size_aligned),
6567 PVRX(VDMCTRL_PDS_STATE0_PDS_DATA_SIZE_UNIT_SIZE));
6568 }
6569
6570 pvr_csb_emit (csb, VDMCTRL_PDS_STATE1, state1) {
6571 const uint32_t data_offset =
6572 pds_bo->dev_addr.addr +
6573 PVR_DW_TO_BYTES(pds_prog.program.code_size_aligned) -
6574 cmd_buffer->device->heaps.pds_heap->base_addr.addr;
6575
6576 state1.pds_data_addr = PVR_DEV_ADDR(data_offset);
6577 state1.sd_type = PVRX(VDMCTRL_SD_TYPE_PDS);
6578 state1.sd_next_type = PVRX(VDMCTRL_SD_TYPE_NONE);
6579 }
6580
6581 pvr_csb_emit (csb, VDMCTRL_PDS_STATE2, state2) {
6582 const uint32_t code_offset =
6583 pds_bo->dev_addr.addr -
6584 cmd_buffer->device->heaps.pds_heap->base_addr.addr;
6585
6586 state2.pds_code_addr = PVR_DEV_ADDR(code_offset);
6587 }
6588
6589 pvr_csb_clear_relocation_mark(csb);
6590
6591 /* We don't really need to set the relocation mark since the following
6592 * state update is just one emit but let's be nice and use it.
6593 */
6594 pvr_csb_set_relocation_mark(csb);
6595
6596 /* Sync task to ensure the VDM doesn't start reading the dummy blocks
6597 * before they are ready.
6598 */
6599 pvr_csb_emit (csb, VDMCTRL_INDEX_LIST0, list0) {
6600 list0.primitive_topology = PVRX(VDMCTRL_PRIMITIVE_TOPOLOGY_TRI_LIST);
6601 }
6602
6603 pvr_csb_clear_relocation_mark(csb);
6604
6605 dummy_stream = pvr_bo_suballoc_get_map_addr(dummy_bo);
6606
6607 /* For non-indexed draw cmds fill in the dummy's header here (it won't
6608  * change based on the indirect args) and advance past the in-use words of
6609  * each dummy block. For indexed draws the PDS program writes the header
6610  * along with the index buffer address, so skip all of its in-use words.
 */
6611 if (!state->draw_state.draw_indexed) {
6612 dummy_stream[0] = word0;
6613 dummy_stream += 4;
6614 } else {
6615 dummy_stream += 5;
6616 }
6617
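      /* Terminate the dummy block with a stream return so the VDM resumes
       * the main control stream after following the link emitted below.
       */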
6618 /* clang-format off */
6619 pvr_csb_pack (dummy_stream, VDMCTRL_STREAM_RETURN, word);
6620 /* clang-format on */
6621
6622 pvr_csb_set_relocation_mark(csb);
6623
6624 /* Stream link to the first dummy which forces the VDM to discard any
6625 * prefetched (dummy) control stream.
6626 */
6627 pvr_csb_emit (csb, VDMCTRL_STREAM_LINK0, link) {
6628 link.with_return = true;
6629 link.link_addrmsb = dummy_bo->dev_addr;
6630 }
6631
6632 pvr_csb_emit (csb, VDMCTRL_STREAM_LINK1, link) {
6633 link.link_addrlsb = dummy_bo->dev_addr;
6634 }
6635
6636 pvr_csb_clear_relocation_mark(csb);
6637
6638 /* Point the pds program to the next argument buffer and the next VDM
6639 * dummy buffer.
6640 */
6641 pds_prog.arg_buffer += stride;
6642 }
6643
6644 return VK_SUCCESS;
6645 }
6646
6647 #undef DUMMY_VDM_CONTROL_STREAM_BLOCK_SIZE
6648
6649 static void pvr_emit_vdm_index_list(struct pvr_cmd_buffer *cmd_buffer,
6650 struct pvr_sub_cmd_gfx *const sub_cmd,
6651 VkPrimitiveTopology topology,
6652 uint32_t index_offset,
6653 uint32_t first_index,
6654 uint32_t index_count,
6655 uint32_t instance_count,
6656 struct pvr_buffer *buffer,
6657 VkDeviceSize offset,
6658 uint32_t count,
6659 uint32_t stride)
6660 {
6661 struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
6662 const bool vertex_shader_has_side_effects =
6663 state->gfx_pipeline->shader_state.vertex.stage_state.has_side_effects;
6664 struct PVRX(VDMCTRL_INDEX_LIST0)
6665 list_hdr = { pvr_cmd_header(VDMCTRL_INDEX_LIST0) };
6666 pvr_dev_addr_t index_buffer_addr = PVR_DEV_ADDR_INVALID;
6667 struct pvr_csb *const csb = &sub_cmd->control_stream;
6668 unsigned int index_stride = 0;
6669
6670 list_hdr.primitive_topology = pvr_get_hw_primitive_topology(topology);
6671
6672 /* firstInstance is not handled here in the VDM state, it's implemented as
6673 * an addition in the PDS vertex fetch using
6674 * PVR_PDS_CONST_MAP_ENTRY_TYPE_BASE_INSTANCE entry type.
6675 */
6676
6677 list_hdr.index_count_present = true;
6678
6679 if (instance_count > 1)
6680 list_hdr.index_instance_count_present = true;
6681
6682 if (index_offset)
6683 list_hdr.index_offset_present = true;
6684
6685 if (state->draw_state.draw_indexed) {
6686 switch (state->index_buffer_binding.type) {
6687 case VK_INDEX_TYPE_UINT32:
6688 list_hdr.index_size = PVRX(VDMCTRL_INDEX_SIZE_B32);
6689 index_stride = 4;
6690 break;
6691
6692 case VK_INDEX_TYPE_UINT16:
6693 list_hdr.index_size = PVRX(VDMCTRL_INDEX_SIZE_B16);
6694 index_stride = 2;
6695 break;
6696
6697 default:
6698 unreachable("Invalid index type");
6699 }
6700
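      /* Offset the index buffer address so fetching starts at firstIndex. */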
6701 index_buffer_addr = PVR_DEV_ADDR_OFFSET(
6702 state->index_buffer_binding.buffer->dev_addr,
6703 state->index_buffer_binding.offset + first_index * index_stride);
6704
6705 list_hdr.index_addr_present = true;
6706 list_hdr.index_base_addrmsb = index_buffer_addr;
6707 }
6708
6709 list_hdr.degen_cull_enable =
6710 PVR_HAS_FEATURE(&cmd_buffer->device->pdevice->dev_info,
6711 vdm_degenerate_culling) &&
6712 !vertex_shader_has_side_effects;
6713
6714 if (state->draw_state.draw_indirect) {
6715 assert(buffer);
6716 pvr_write_draw_indirect_vdm_stream(cmd_buffer,
6717 csb,
6718 index_buffer_addr,
6719 index_stride,
6720 &list_hdr,
6721 buffer,
6722 offset,
6723 count,
6724 stride);
6725 return;
6726 }
6727
6728 pvr_csb_set_relocation_mark(csb);
6729
6730 pvr_csb_emit (csb, VDMCTRL_INDEX_LIST0, list0) {
6731 list0 = list_hdr;
6732 }
6733
6734 if (list_hdr.index_addr_present) {
6735 pvr_csb_emit (csb, VDMCTRL_INDEX_LIST1, list1) {
6736 list1.index_base_addrlsb = index_buffer_addr;
6737 }
6738 }
6739
6740 if (list_hdr.index_count_present) {
6741 pvr_csb_emit (csb, VDMCTRL_INDEX_LIST2, list2) {
6742 list2.index_count = index_count;
6743 }
6744 }
6745
6746 if (list_hdr.index_instance_count_present) {
6747 pvr_csb_emit (csb, VDMCTRL_INDEX_LIST3, list3) {
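         /* The hardware field is encoded as instanceCount - 1. */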
6748 list3.instance_count = instance_count - 1;
6749 }
6750 }
6751
6752 if (list_hdr.index_offset_present) {
6753 pvr_csb_emit (csb, VDMCTRL_INDEX_LIST4, list4) {
6754 list4.index_offset = index_offset;
6755 }
6756 }
6757
6758 pvr_csb_clear_relocation_mark(csb);
6759 }
6760
6761 void pvr_CmdDraw(VkCommandBuffer commandBuffer,
6762 uint32_t vertexCount,
6763 uint32_t instanceCount,
6764 uint32_t firstVertex,
6765 uint32_t firstInstance)
6766 {
6767 const struct pvr_cmd_buffer_draw_state draw_state = {
6768 .base_vertex = firstVertex,
6769 .base_instance = firstInstance,
6770 };
6771
6772 PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
6773 struct vk_dynamic_graphics_state *const dynamic_state =
6774 &cmd_buffer->vk.dynamic_graphics_state;
6775 struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
6776 VkResult result;
6777
6778 PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer);
6779
6780 pvr_update_draw_state(state, &draw_state);
6781
6782 result = pvr_validate_draw_state(cmd_buffer);
6783 if (result != VK_SUCCESS)
6784 return;
6785
6786 /* Write the VDM control stream for the primitive. */
6787 pvr_emit_vdm_index_list(cmd_buffer,
6788 &state->current_sub_cmd->gfx,
6789 dynamic_state->ia.primitive_topology,
6790 firstVertex,
6791 0U,
6792 vertexCount,
6793 instanceCount,
6794 NULL,
6795 0U,
6796 0U,
6797 0U);
6798 }
6799
6800 void pvr_CmdDrawIndexed(VkCommandBuffer commandBuffer,
6801 uint32_t indexCount,
6802 uint32_t instanceCount,
6803 uint32_t firstIndex,
6804 int32_t vertexOffset,
6805 uint32_t firstInstance)
6806 {
6807 const struct pvr_cmd_buffer_draw_state draw_state = {
6808 .base_vertex = vertexOffset,
6809 .base_instance = firstInstance,
6810 .draw_indexed = true,
6811 };
6812
6813 PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
6814 struct vk_dynamic_graphics_state *const dynamic_state =
6815 &cmd_buffer->vk.dynamic_graphics_state;
6816 struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
6817 VkResult result;
6818
6819 PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer);
6820
6821 pvr_update_draw_state(state, &draw_state);
6822
6823 result = pvr_validate_draw_state(cmd_buffer);
6824 if (result != VK_SUCCESS)
6825 return;
6826
6827 /* Write the VDM control stream for the primitive. */
6828 pvr_emit_vdm_index_list(cmd_buffer,
6829 &state->current_sub_cmd->gfx,
6830 dynamic_state->ia.primitive_topology,
6831 vertexOffset,
6832 firstIndex,
6833 indexCount,
6834 instanceCount,
6835 NULL,
6836 0U,
6837 0U,
6838 0U);
6839 }
6840
6841 void pvr_CmdDrawIndexedIndirect(VkCommandBuffer commandBuffer,
6842 VkBuffer _buffer,
6843 VkDeviceSize offset,
6844 uint32_t drawCount,
6845 uint32_t stride)
6846 {
6847 const struct pvr_cmd_buffer_draw_state draw_state = {
6848 .draw_indirect = true,
6849 .draw_indexed = true,
6850 };
6851
6852 PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
6853 struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
6854 struct vk_dynamic_graphics_state *const dynamic_state =
6855 &cmd_buffer->vk.dynamic_graphics_state;
6856 PVR_FROM_HANDLE(pvr_buffer, buffer, _buffer);
6857 VkResult result;
6858
6859 PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer);
6860
6861 pvr_update_draw_state(state, &draw_state);
6862
6863 result = pvr_validate_draw_state(cmd_buffer);
6864 if (result != VK_SUCCESS)
6865 return;
6866
6867 /* Write the VDM control stream for the primitive. */
6868 pvr_emit_vdm_index_list(cmd_buffer,
6869 &state->current_sub_cmd->gfx,
6870 dynamic_state->ia.primitive_topology,
6871 0U,
6872 0U,
6873 0U,
6874 0U,
6875 buffer,
6876 offset,
6877 drawCount,
6878 stride);
6879 }
6880
6881 void pvr_CmdDrawIndirect(VkCommandBuffer commandBuffer,
6882 VkBuffer _buffer,
6883 VkDeviceSize offset,
6884 uint32_t drawCount,
6885 uint32_t stride)
6886 {
6887 const struct pvr_cmd_buffer_draw_state draw_state = {
6888 .draw_indirect = true,
6889 };
6890
6891 PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
6892 struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
6893 PVR_FROM_HANDLE(pvr_buffer, buffer, _buffer);
6894 struct vk_dynamic_graphics_state *const dynamic_state =
6895 &cmd_buffer->vk.dynamic_graphics_state;
6896 VkResult result;
6897
6898 PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer);
6899
6900 pvr_update_draw_state(state, &draw_state);
6901
6902 result = pvr_validate_draw_state(cmd_buffer);
6903 if (result != VK_SUCCESS)
6904 return;
6905
6906 /* Write the VDM control stream for the primitive. */
6907 pvr_emit_vdm_index_list(cmd_buffer,
6908 &state->current_sub_cmd->gfx,
6909 dynamic_state->ia.primitive_topology,
6910 0U,
6911 0U,
6912 0U,
6913 0U,
6914 buffer,
6915 offset,
6916 drawCount,
6917 stride);
6918 }
6919
6920 static VkResult
6921 pvr_resolve_unemitted_resolve_attachments(struct pvr_cmd_buffer *cmd_buffer,
6922 struct pvr_render_pass_info *info)
6923 {
6924 struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
6925 const struct pvr_renderpass_hwsetup_render *hw_render =
6926 &state->render_pass_info.pass->hw_setup->renders[info->current_hw_subpass];
6927
6928 for (uint32_t i = 0U; i < hw_render->eot_surface_count; i++) {
6929 const struct pvr_renderpass_hwsetup_eot_surface *surface =
6930 &hw_render->eot_surfaces[i];
6931 const uint32_t color_attach_idx = surface->src_attachment_idx;
6932 const uint32_t resolve_attach_idx = surface->attachment_idx;
6933 VkImageSubresourceLayers src_subresource;
6934 VkImageSubresourceLayers dst_subresource;
6935 struct pvr_image_view *dst_view;
6936 struct pvr_image_view *src_view;
6937 VkFormat src_format;
6938 VkFormat dst_format;
6939 VkImageCopy2 region;
6940 VkResult result;
6941
6942 if (!surface->need_resolve ||
6943 surface->resolve_type != PVR_RESOLVE_TYPE_TRANSFER)
6944 continue;
6945
6946 dst_view = info->attachments[resolve_attach_idx];
6947 src_view = info->attachments[color_attach_idx];
6948
6949 src_subresource.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT;
6950 src_subresource.mipLevel = src_view->vk.base_mip_level;
6951 src_subresource.baseArrayLayer = src_view->vk.base_array_layer;
6952 src_subresource.layerCount = src_view->vk.layer_count;
6953
6954 dst_subresource.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT;
6955 dst_subresource.mipLevel = dst_view->vk.base_mip_level;
6956 dst_subresource.baseArrayLayer = dst_view->vk.base_array_layer;
6957 dst_subresource.layerCount = dst_view->vk.layer_count;
6958
6959 region.srcOffset = (VkOffset3D){ info->render_area.offset.x,
6960 info->render_area.offset.y,
6961 0 };
6962 region.dstOffset = (VkOffset3D){ info->render_area.offset.x,
6963 info->render_area.offset.y,
6964 0 };
6965 region.extent = (VkExtent3D){ info->render_area.extent.width,
6966 info->render_area.extent.height,
6967 1 };
6968
6969 region.srcSubresource = src_subresource;
6970 region.dstSubresource = dst_subresource;
6971
6972 /* TODO: if ERN_46863 is supported, Depth and stencil are sampled
6973 * separately from images with combined depth+stencil. Add logic here to
6974 * handle it using appropriate format from image view.
6975 */
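      /* Temporarily swap each image's format for its view format so the
       * copy/resolve below uses the view's interpretation, then restore the
       * originals afterwards.
       */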
6976 src_format = src_view->vk.image->format;
6977 dst_format = dst_view->vk.image->format;
6978 src_view->vk.image->format = src_view->vk.format;
6979 dst_view->vk.image->format = dst_view->vk.format;
6980
6981 result = pvr_copy_or_resolve_color_image_region(
6982 cmd_buffer,
6983 vk_to_pvr_image(src_view->vk.image),
6984 vk_to_pvr_image(dst_view->vk.image),
6985 &region);
6986
6987 src_view->vk.image->format = src_format;
6988 dst_view->vk.image->format = dst_format;
6989
6990 state->current_sub_cmd->transfer.serialize_with_frag = true;
6991
6992 if (result != VK_SUCCESS)
6993 return result;
6994 }
6995
6996 return pvr_cmd_buffer_end_sub_cmd(cmd_buffer);
6997 }
6998
6999 void pvr_CmdEndRenderPass2(VkCommandBuffer commandBuffer,
7000 const VkSubpassEndInfo *pSubpassEndInfo)
7001 {
7002 PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
7003 struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
7004 struct pvr_image_view **attachments;
7005 VkClearValue *clear_values;
7006 VkResult result;
7007
7008 PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer);
7009
7010 assert(state->render_pass_info.pass);
7011 assert(state->render_pass_info.framebuffer);
7012
7013 result = pvr_cmd_buffer_end_sub_cmd(cmd_buffer);
7014 if (result != VK_SUCCESS)
7015 return;
7016
7017 result = pvr_resolve_unemitted_resolve_attachments(cmd_buffer,
7018 &state->render_pass_info);
7019 if (result != VK_SUCCESS)
7020 return;
7021
7022 /* Save the required fields before clearing render_pass_info struct. */
7023 attachments = state->render_pass_info.attachments;
7024 clear_values = state->render_pass_info.clear_values;
7025
7026 memset(&state->render_pass_info, 0, sizeof(state->render_pass_info));
7027
7028 state->render_pass_info.attachments = attachments;
7029 state->render_pass_info.clear_values = clear_values;
7030 }
7031
7032 static VkResult
7033 pvr_execute_deferred_cmd_buffer(struct pvr_cmd_buffer *cmd_buffer,
7034 const struct pvr_cmd_buffer *sec_cmd_buffer)
7035 {
7036 struct vk_dynamic_graphics_state *const dynamic_state =
7037 &cmd_buffer->vk.dynamic_graphics_state;
7038 const uint32_t prim_db_elems =
7039 util_dynarray_num_elements(&cmd_buffer->depth_bias_array,
7040 struct pvr_depth_bias_state);
7041 const uint32_t prim_scissor_elems =
7042 util_dynarray_num_elements(&cmd_buffer->scissor_array,
7043 struct pvr_scissor_words);
7044
7045 util_dynarray_foreach (&sec_cmd_buffer->deferred_csb_commands,
7046 struct pvr_deferred_cs_command,
7047 cmd) {
7048 switch (cmd->type) {
7049 case PVR_DEFERRED_CS_COMMAND_TYPE_DBSC: {
7050 const uint32_t scissor_idx =
7051 prim_scissor_elems + cmd->dbsc.state.scissor_index;
7052 const uint32_t db_idx =
7053 prim_db_elems + cmd->dbsc.state.depthbias_index;
7054 const uint32_t num_dwords =
7055 pvr_cmd_length(TA_STATE_HEADER) + pvr_cmd_length(TA_STATE_ISPDBSC);
7056 struct pvr_suballoc_bo *suballoc_bo;
7057 uint32_t ppp_state[num_dwords];
7058 VkResult result;
7059
7060 pvr_csb_pack (&ppp_state[0], TA_STATE_HEADER, header) {
7061 header.pres_ispctl_dbsc = true;
7062 }
7063
7064 pvr_csb_pack (&ppp_state[1], TA_STATE_ISPDBSC, ispdbsc) {
7065 ispdbsc.dbindex = db_idx;
7066 ispdbsc.scindex = scissor_idx;
7067 }
7068
7069 result = pvr_cmd_buffer_upload_general(cmd_buffer,
7070 &ppp_state[0],
7071 sizeof(ppp_state),
7072 &suballoc_bo);
7073 if (result != VK_SUCCESS)
7074 return result;
7075
7076 pvr_csb_pack (&cmd->dbsc.vdm_state[0], VDMCTRL_PPP_STATE0, state) {
7077 state.word_count = num_dwords;
7078 state.addrmsb = suballoc_bo->dev_addr;
7079 }
7080
7081 pvr_csb_pack (&cmd->dbsc.vdm_state[1], VDMCTRL_PPP_STATE1, state) {
7082 state.addrlsb = suballoc_bo->dev_addr;
7083 }
7084
7085 break;
7086 }
7087
7088 case PVR_DEFERRED_CS_COMMAND_TYPE_DBSC2: {
7089 const uint32_t scissor_idx =
7090 prim_scissor_elems + cmd->dbsc2.state.scissor_index;
7091 const uint32_t db_idx =
7092 prim_db_elems + cmd->dbsc2.state.depthbias_index;
7093
7094 uint32_t *const addr =
7095 (uint32_t *)pvr_bo_suballoc_get_map_addr(cmd->dbsc2.ppp_cs_bo) +
7096 cmd->dbsc2.patch_offset;
7097
7098 assert(pvr_bo_suballoc_get_map_addr(cmd->dbsc2.ppp_cs_bo));
7099
7100 pvr_csb_pack (addr, TA_STATE_ISPDBSC, ispdbsc) {
7101 ispdbsc.dbindex = db_idx;
7102 ispdbsc.scindex = scissor_idx;
7103 }
7104
7105 break;
7106 }
7107
7108 default:
7109 unreachable("Invalid deferred control stream command type.");
7110 break;
7111 }
7112 }
7113
7114 util_dynarray_append_dynarray(&cmd_buffer->depth_bias_array,
7115 &sec_cmd_buffer->depth_bias_array);
7116
7117 util_dynarray_append_dynarray(&cmd_buffer->scissor_array,
7118 &sec_cmd_buffer->scissor_array);
7119
7120 BITSET_SET(dynamic_state->dirty, MESA_VK_DYNAMIC_RS_DEPTH_BIAS_FACTORS);
7121 cmd_buffer->scissor_words = (struct pvr_scissor_words){ 0 };
7122
7123 return VK_SUCCESS;
7124 }
7125
7126 /* Caller needs to make sure that it ends the current sub_cmd. This function
7127 * only creates a copy of sec_sub_cmd and links it to the cmd_buffer's
7128 * sub_cmd list.
7129 */
7130 static VkResult pvr_execute_sub_cmd(struct pvr_cmd_buffer *cmd_buffer,
7131 struct pvr_sub_cmd *sec_sub_cmd)
7132 {
7133 struct pvr_sub_cmd *primary_sub_cmd =
7134 vk_zalloc(&cmd_buffer->vk.pool->alloc,
7135 sizeof(*primary_sub_cmd),
7136 8,
7137 VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
7138 if (!primary_sub_cmd) {
7139 return vk_command_buffer_set_error(&cmd_buffer->vk,
7140 VK_ERROR_OUT_OF_HOST_MEMORY);
7141 }
7142
7143 primary_sub_cmd->type = sec_sub_cmd->type;
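   /* The secondary command buffer keeps ownership of the sub_cmd's
    * resources; mark this copy as not owned so they aren't freed again when
    * the primary command buffer is destroyed.
    */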
7144 primary_sub_cmd->owned = false;
7145
7146 list_addtail(&primary_sub_cmd->link, &cmd_buffer->sub_cmds);
7147
7148 switch (sec_sub_cmd->type) {
7149 case PVR_SUB_CMD_TYPE_GRAPHICS:
7150 primary_sub_cmd->gfx = sec_sub_cmd->gfx;
7151 break;
7152
7153 case PVR_SUB_CMD_TYPE_OCCLUSION_QUERY:
7154 case PVR_SUB_CMD_TYPE_COMPUTE:
7155 primary_sub_cmd->compute = sec_sub_cmd->compute;
7156 break;
7157
7158 case PVR_SUB_CMD_TYPE_TRANSFER:
7159 primary_sub_cmd->transfer = sec_sub_cmd->transfer;
7160 break;
7161
7162 case PVR_SUB_CMD_TYPE_EVENT:
7163 primary_sub_cmd->event = sec_sub_cmd->event;
7164 break;
7165
7166 default:
7167 unreachable("Unsupported sub-command type");
7168 }
7169
7170 return VK_SUCCESS;
7171 }
7172
7173 static VkResult
7174 pvr_execute_graphics_cmd_buffer(struct pvr_cmd_buffer *cmd_buffer,
7175 const struct pvr_cmd_buffer *sec_cmd_buffer)
7176 {
7177 const struct pvr_device_info *dev_info =
7178 &cmd_buffer->device->pdevice->dev_info;
7179 struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
7180 struct pvr_sub_cmd *primary_sub_cmd = state->current_sub_cmd;
7181 struct pvr_sub_cmd *first_sec_cmd;
7182 VkResult result;
7183
7184 /* Inherited queries are not supported. */
7185 assert(!state->vis_test_enabled);
7186
7187 if (list_is_empty(&sec_cmd_buffer->sub_cmds))
7188 return VK_SUCCESS;
7189
7190 first_sec_cmd =
7191 list_first_entry(&sec_cmd_buffer->sub_cmds, struct pvr_sub_cmd, link);
7192
7193 /* Kick a render if the secondary uses a different query pool, i.e. the
 * query results have a new base address.
 */
7194 if (primary_sub_cmd->gfx.query_pool && first_sec_cmd->gfx.query_pool &&
7195 primary_sub_cmd->gfx.query_pool != first_sec_cmd->gfx.query_pool) {
7196 state->current_sub_cmd->gfx.barrier_store = true;
7197
7198 result = pvr_cmd_buffer_end_sub_cmd(cmd_buffer);
7199 if (result != VK_SUCCESS)
7200 return result;
7201
7202 result =
7203 pvr_cmd_buffer_start_sub_cmd(cmd_buffer, PVR_SUB_CMD_TYPE_GRAPHICS);
7204 if (result != VK_SUCCESS)
7205 return result;
7206
7207 primary_sub_cmd = state->current_sub_cmd;
7208
7209 /* Use existing render setup, but load color attachments from HW
7210 * Background object.
7211 */
7212 primary_sub_cmd->gfx.barrier_load = true;
7213 primary_sub_cmd->gfx.barrier_store = false;
7214 }
7215
7216 list_for_each_entry (struct pvr_sub_cmd,
7217 sec_sub_cmd,
7218 &sec_cmd_buffer->sub_cmds,
7219 link) {
7220 /* Only graphics secondary execution supported within a renderpass. */
7221 assert(sec_sub_cmd->type == PVR_SUB_CMD_TYPE_GRAPHICS);
7222
7223 if (!sec_sub_cmd->gfx.empty_cmd)
7224 primary_sub_cmd->gfx.empty_cmd = false;
7225
7226 if (sec_sub_cmd->gfx.query_pool) {
7227 primary_sub_cmd->gfx.query_pool = sec_sub_cmd->gfx.query_pool;
7228
7229 util_dynarray_append_dynarray(&state->query_indices,
7230 &sec_sub_cmd->gfx.sec_query_indices);
7231 }
7232
7233 if (pvr_cmd_uses_deferred_cs_cmds(sec_cmd_buffer)) {
7234 /* TODO: If the secondary command buffer was created with
7235  * VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT, we patch the stream and
7236  * copy it to the primary stream using pvr_csb_copy below. This will
7237  * need locking if the same secondary command buffer is executed in
7238  * multiple primary buffers at the same time.
7239  */
7240 result = pvr_execute_deferred_cmd_buffer(cmd_buffer, sec_cmd_buffer);
7241 if (result != VK_SUCCESS)
7242 return result;
7243
7244 result = pvr_csb_copy(&primary_sub_cmd->gfx.control_stream,
7245 &sec_sub_cmd->gfx.control_stream);
7246 if (result != VK_SUCCESS)
7247 return pvr_cmd_buffer_set_error_unwarned(cmd_buffer, result);
7248 } else {
7249 result = pvr_execute_deferred_cmd_buffer(cmd_buffer, sec_cmd_buffer);
7250 if (result != VK_SUCCESS)
7251 return result;
7252
7253 pvr_csb_emit_link(
7254 &primary_sub_cmd->gfx.control_stream,
7255 pvr_csb_get_start_address(&sec_sub_cmd->gfx.control_stream),
7256 true);
7257 }
7258
7259 if (PVR_HAS_FEATURE(&cmd_buffer->device->pdevice->dev_info,
7260 compute_overlap)) {
7261 primary_sub_cmd->gfx.job.disable_compute_overlap |=
7262 sec_sub_cmd->gfx.job.disable_compute_overlap;
7263 }
7264
7265 primary_sub_cmd->gfx.max_tiles_in_flight =
7266 MIN2(primary_sub_cmd->gfx.max_tiles_in_flight,
7267 sec_sub_cmd->gfx.max_tiles_in_flight);
7268
7269 /* Pass loaded depth/stencil usage from secondary command buffer. */
7270 if (sec_sub_cmd->gfx.depth_usage == PVR_DEPTH_STENCIL_USAGE_NEEDED)
7271 primary_sub_cmd->gfx.depth_usage = PVR_DEPTH_STENCIL_USAGE_NEEDED;
7272
7273 if (sec_sub_cmd->gfx.stencil_usage == PVR_DEPTH_STENCIL_USAGE_NEEDED)
7274 primary_sub_cmd->gfx.stencil_usage = PVR_DEPTH_STENCIL_USAGE_NEEDED;
7275
7276 /* Pass depth/stencil modification state from secondary command buffer. */
7277 if (sec_sub_cmd->gfx.modifies_depth)
7278 primary_sub_cmd->gfx.modifies_depth = true;
7279
7280 if (sec_sub_cmd->gfx.modifies_stencil)
7281 primary_sub_cmd->gfx.modifies_stencil = true;
7282
7283 if (sec_sub_cmd->gfx.barrier_store) {
7284 struct pvr_sub_cmd *sec_next =
7285 list_entry(sec_sub_cmd->link.next, struct pvr_sub_cmd, link);
7286
7287 /* This shouldn't be the last sub cmd. There should be a barrier load
7288 * subsequent to the barrier store.
7289 */
7290 assert(list_last_entry(&sec_cmd_buffer->sub_cmds,
7291 struct pvr_sub_cmd,
7292 link) != sec_sub_cmd);
7293
7294 /* Kick render to store stencil. */
7295 state->current_sub_cmd->gfx.barrier_store = true;
7296 state->current_sub_cmd->gfx.empty_cmd = false;
7297
7298 result = pvr_cmd_buffer_end_sub_cmd(cmd_buffer);
7299 if (result != VK_SUCCESS)
7300 return result;
7301
7302 result =
7303 pvr_cmd_buffer_start_sub_cmd(cmd_buffer, PVR_SUB_CMD_TYPE_GRAPHICS);
7304 if (result != VK_SUCCESS)
7305 return result;
7306
7307 primary_sub_cmd = state->current_sub_cmd;
7308
7309 /* Use existing render setup, but load color attachments from HW
7310 * Background object.
7311 */
7312 primary_sub_cmd->gfx.barrier_load = sec_next->gfx.barrier_load;
7313 primary_sub_cmd->gfx.barrier_store = sec_next->gfx.barrier_store;
7314 primary_sub_cmd->gfx.empty_cmd = false;
7315 }
7316
7317 if (!PVR_HAS_FEATURE(dev_info, gs_rta_support)) {
7318 util_dynarray_append_dynarray(&cmd_buffer->deferred_clears,
7319 &sec_cmd_buffer->deferred_clears);
7320 }
7321 }
7322
7323 return VK_SUCCESS;
7324 }
7325
7326 void pvr_CmdExecuteCommands(VkCommandBuffer commandBuffer,
7327 uint32_t commandBufferCount,
7328 const VkCommandBuffer *pCommandBuffers)
7329 {
7330 PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
7331 struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
7332 struct pvr_cmd_buffer *last_cmd_buffer;
7333 VkResult result;
7334
7335 PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer);
7336
7337 assert(cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_PRIMARY);
7338
7339 /* Reset the CPU copy of the most recent PPP state of the primary command
7340 * buffer.
7341 *
7342 * The next draw call in the primary after CmdExecuteCommands may send
7343 * redundant state, if it all goes in the same geom job.
7344 *
7345 * Can't just copy state from the secondary because the recording state of
7346 * the secondary command buffers would have been deleted at this point.
7347 */
7348 pvr_reset_graphics_dirty_state(cmd_buffer, false);
7349
7350 if (state->current_sub_cmd &&
7351 state->current_sub_cmd->type == PVR_SUB_CMD_TYPE_GRAPHICS) {
7352 for (uint32_t i = 0; i < commandBufferCount; i++) {
7353 PVR_FROM_HANDLE(pvr_cmd_buffer, sec_cmd_buffer, pCommandBuffers[i]);
7354
7355 assert(sec_cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY);
7356
7357 result = pvr_execute_graphics_cmd_buffer(cmd_buffer, sec_cmd_buffer);
7358 if (result != VK_SUCCESS)
7359 return;
7360 }
7361
7362 last_cmd_buffer =
7363 pvr_cmd_buffer_from_handle(pCommandBuffers[commandBufferCount - 1]);
7364
7365 /* Set barriers from the final secondary command buffer. */
7366 for (uint32_t i = 0; i != PVR_NUM_SYNC_PIPELINE_STAGES; i++) {
7367 state->barriers_needed[i] |=
7368 last_cmd_buffer->state.barriers_needed[i] &
7369 PVR_PIPELINE_STAGE_ALL_GRAPHICS_BITS;
7370 }
7371 } else {
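      /* Not inside a graphics sub_cmd: end whatever sub_cmd is current and
       * link copies of the secondary's sub_cmds straight into the primary's
       * list.
       */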
7372 for (uint32_t i = 0; i < commandBufferCount; i++) {
7373 PVR_FROM_HANDLE(pvr_cmd_buffer, sec_cmd_buffer, pCommandBuffers[i]);
7374
7375 assert(sec_cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY);
7376
7377 result = pvr_cmd_buffer_end_sub_cmd(cmd_buffer);
7378 if (result != VK_SUCCESS)
7379 return;
7380
7381 list_for_each_entry_safe (struct pvr_sub_cmd,
7382 sec_sub_cmd,
7383 &sec_cmd_buffer->sub_cmds,
7384 link) {
7385 result = pvr_execute_sub_cmd(cmd_buffer, sec_sub_cmd);
7386 if (result != VK_SUCCESS)
7387 return;
7388 }
7389 }
7390
7391 last_cmd_buffer =
7392 pvr_cmd_buffer_from_handle(pCommandBuffers[commandBufferCount - 1]);
7393
7394 memcpy(state->barriers_needed,
7395 last_cmd_buffer->state.barriers_needed,
7396 sizeof(state->barriers_needed));
7397 }
7398 }
7399
7400 static void pvr_insert_transparent_obj(struct pvr_cmd_buffer *const cmd_buffer,
7401 struct pvr_sub_cmd_gfx *const sub_cmd)
7402 {
7403 struct pvr_device *const device = cmd_buffer->device;
7404 /* Yes we want a copy. The user could be recording multiple command buffers
7405 * in parallel so writing the template in place could cause problems.
7406 */
7407 struct pvr_static_clear_ppp_template clear =
7408 device->static_clear_state.ppp_templates[VK_IMAGE_ASPECT_COLOR_BIT];
7409 uint32_t pds_state[PVR_STATIC_CLEAR_PDS_STATE_COUNT] = { 0 };
7410 struct pvr_csb *csb = &sub_cmd->control_stream;
7411 struct pvr_suballoc_bo *ppp_bo;
7412
7413 assert(clear.requires_pds_state);
7414
7415 /* Patch the template. */
7416
7417 pvr_csb_pack (&pds_state[PVR_STATIC_CLEAR_PPP_PDS_TYPE_SHADERBASE],
7418 TA_STATE_PDS_SHADERBASE,
7419 shaderbase) {
7420 shaderbase.addr = PVR_DEV_ADDR(device->nop_program.pds.data_offset);
7421 }
7422
7423 clear.config.pds_state = &pds_state;
7424
7425 clear.config.ispctl.upass = cmd_buffer->state.render_pass_info.isp_userpass;
7426
7427 /* Emit PPP state from template. */
7428
7429 pvr_emit_ppp_from_template(csb, &clear, &ppp_bo);
7430 list_add(&ppp_bo->link, &cmd_buffer->bo_list);
7431
7432 /* Emit VDM state. */
7433
7434 pvr_emit_clear_words(cmd_buffer, sub_cmd);
7435
7436 /* Reset graphics state. */
7437 pvr_reset_graphics_dirty_state(cmd_buffer, false);
7438 }
7439
7440 static inline struct pvr_render_subpass *
7441 pvr_get_current_subpass(const struct pvr_cmd_buffer_state *const state)
7442 {
7443 const uint32_t subpass_idx = state->render_pass_info.subpass_idx;
7444
7445 return &state->render_pass_info.pass->subpasses[subpass_idx];
7446 }
7447
7448 void pvr_CmdNextSubpass2(VkCommandBuffer commandBuffer,
7449 const VkSubpassBeginInfo *pSubpassBeginInfo,
7450 const VkSubpassEndInfo *pSubpassEndInfo)
7451 {
7452 PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
7453 struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
7454 struct pvr_render_pass_info *rp_info = &state->render_pass_info;
7455 const struct pvr_renderpass_hwsetup_subpass *hw_subpass;
7456 struct pvr_renderpass_hwsetup_render *next_hw_render;
7457 const struct pvr_render_pass *pass = rp_info->pass;
7458 const struct pvr_renderpass_hw_map *current_map;
7459 const struct pvr_renderpass_hw_map *next_map;
7460 struct pvr_load_op *hw_subpass_load_op;
7461 VkResult result;
7462
7463 PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer);
7464
7465 current_map = &pass->hw_setup->subpass_map[rp_info->subpass_idx];
7466 next_map = &pass->hw_setup->subpass_map[rp_info->subpass_idx + 1];
7467 next_hw_render = &pass->hw_setup->renders[next_map->render];
7468
7469 if (current_map->render != next_map->render) {
7470 result = pvr_cmd_buffer_end_sub_cmd(cmd_buffer);
7471 if (result != VK_SUCCESS)
7472 return;
7473
7474 result = pvr_resolve_unemitted_resolve_attachments(cmd_buffer, rp_info);
7475 if (result != VK_SUCCESS)
7476 return;
7477
7478 rp_info->current_hw_subpass = next_map->render;
7479
7480 result =
7481 pvr_cmd_buffer_start_sub_cmd(cmd_buffer, PVR_SUB_CMD_TYPE_GRAPHICS);
7482 if (result != VK_SUCCESS)
7483 return;
7484
7485 rp_info->enable_bg_tag = false;
7486 rp_info->process_empty_tiles = false;
7487
7488 /* If this subpass contains any load ops the HW Background Object must be
7489 * run to do the clears/loads.
7490 */
7491 if (next_hw_render->color_init_count > 0) {
7492 rp_info->enable_bg_tag = true;
7493
7494 for (uint32_t i = 0; i < next_hw_render->color_init_count; i++) {
7495 /* Empty tiles need to be cleared too. */
7496 if (next_hw_render->color_init[i].op ==
7497 VK_ATTACHMENT_LOAD_OP_CLEAR) {
7498 rp_info->process_empty_tiles = true;
7499 break;
7500 }
7501 }
7502 }
7503
7504 /* Set isp_userpass to zero for new hw_render. This will be used to set
7505 * ROGUE_CR_ISP_CTL::upass_start.
7506 */
7507 rp_info->isp_userpass = 0;
7508 }
7509
7510 hw_subpass = &next_hw_render->subpasses[next_map->subpass];
7511 hw_subpass_load_op = hw_subpass->load_op;
7512
7513 if (hw_subpass_load_op) {
7514 result = pvr_cs_write_load_op(cmd_buffer,
7515 &state->current_sub_cmd->gfx,
7516 hw_subpass_load_op,
7517 rp_info->isp_userpass);
 if (result != VK_SUCCESS)
 return;
7518 }
7519
7520 /* Pipelines are created for a particular subpass so unbind but leave the
7521 * vertex and descriptor bindings intact as they are orthogonal to the
7522 * subpass.
7523 */
7524 state->gfx_pipeline = NULL;
7525
7526 /* User-pass spawn is 4 bits so if the driver has to wrap it, it will emit a
7527 * full screen transparent object to flush all tags up until now, then the
7528 * user-pass spawn value will implicitly be reset to 0 because
7529 * pvr_render_subpass::isp_userpass values are stored ANDed with
7530 * ROGUE_CR_ISP_CTL_UPASS_START_SIZE_MAX.
7531 */
7532 /* If hw_subpass_load_op is valid then pvr_write_load_op_control_stream
7533 * has already done a full-screen transparent object.
7534 */
7535 if (rp_info->isp_userpass == PVRX(CR_ISP_CTL_UPASS_START_SIZE_MAX) &&
7536 !hw_subpass_load_op) {
7537 pvr_insert_transparent_obj(cmd_buffer, &state->current_sub_cmd->gfx);
7538 }
7539
7540 rp_info->subpass_idx++;
7541
7542 rp_info->isp_userpass = pass->subpasses[rp_info->subpass_idx].isp_userpass;
7543 state->dirty.isp_userpass = true;
7544
7545 rp_info->pipeline_bind_point =
7546 pass->subpasses[rp_info->subpass_idx].pipeline_bind_point;
7547
7548 pvr_stash_depth_format(state, &state->current_sub_cmd->gfx);
7549 }
7550
7551 static bool
7552 pvr_stencil_has_self_dependency(const struct pvr_cmd_buffer_state *const state)
7553 {
7554 const struct pvr_render_subpass *const current_subpass =
7555 pvr_get_current_subpass(state);
7556 const uint32_t *const input_attachments = current_subpass->input_attachments;
7557
7558 if (current_subpass->depth_stencil_attachment == VK_ATTACHMENT_UNUSED)
7559 return false;
7560
7561 /* We only need to check the current software subpass as we don't support
7562 * merging to/from a subpass with self-dep stencil.
7563 */
7564
7565 for (uint32_t i = 0; i < current_subpass->input_count; i++) {
7566 if (input_attachments[i] == current_subpass->depth_stencil_attachment)
7567 return true;
7568 }
7569
7570 return false;
7571 }
7572
7573 static bool pvr_is_stencil_store_load_needed(
7574 const struct pvr_cmd_buffer *const cmd_buffer,
7575 VkPipelineStageFlags2 vk_src_stage_mask,
7576 VkPipelineStageFlags2 vk_dst_stage_mask,
7577 uint32_t memory_barrier_count,
7578 const VkMemoryBarrier2 *const memory_barriers,
7579 uint32_t image_barrier_count,
7580 const VkImageMemoryBarrier2 *const image_barriers)
7581 {
7582 const struct pvr_cmd_buffer_state *const state = &cmd_buffer->state;
7583 const uint32_t fragment_test_stages =
7584 VK_PIPELINE_STAGE_EARLY_FRAGMENT_TESTS_BIT |
7585 VK_PIPELINE_STAGE_LATE_FRAGMENT_TESTS_BIT;
7586 const struct pvr_render_pass *const pass = state->render_pass_info.pass;
7587 const struct pvr_renderpass_hwsetup_render *hw_render;
7588 struct pvr_image_view **const attachments =
7589 state->render_pass_info.attachments;
7590 const struct pvr_image_view *attachment;
7591 uint32_t hw_render_idx;
7592
7593 if (!pass)
7594 return false;
7595
7596 hw_render_idx = state->current_sub_cmd->gfx.hw_render_idx;
7597 hw_render = &pass->hw_setup->renders[hw_render_idx];
7598
7599 if (hw_render->ds_attach_idx == VK_ATTACHMENT_UNUSED)
7600 return false;
7601
7602 if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_PRIMARY) {
7603 attachment = attachments[hw_render->ds_attach_idx];
7604 } else {
7605 assert(!attachments);
7606 attachment = NULL;
7607 }
7608
7609 if (!(vk_src_stage_mask & fragment_test_stages) &&
7610 vk_dst_stage_mask & VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT)
7611 return false;
7612
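   /* A stencil store/load round trip is only needed when a barrier orders a
    * stencil write against an input attachment read and the current subpass
    * reads its own depth/stencil attachment.
    */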
7613 for (uint32_t i = 0; i < memory_barrier_count; i++) {
7614 const uint32_t stencil_write_bit =
7615 VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT;
7616 const uint32_t input_attachment_read_bit =
7617 VK_ACCESS_INPUT_ATTACHMENT_READ_BIT;
7618
7619 if (!(memory_barriers[i].srcAccessMask & stencil_write_bit))
7620 continue;
7621
7622 if (!(memory_barriers[i].dstAccessMask & input_attachment_read_bit))
7623 continue;
7624
7625 return pvr_stencil_has_self_dependency(state);
7626 }
7627
7628 for (uint32_t i = 0; i < image_barrier_count; i++) {
7629 PVR_FROM_HANDLE(pvr_image, image, image_barriers[i].image);
7630 const uint32_t stencil_bit = VK_IMAGE_ASPECT_STENCIL_BIT;
7631
7632 if (!(image_barriers[i].subresourceRange.aspectMask & stencil_bit))
7633 continue;
7634
7635 if (attachment && image != vk_to_pvr_image(attachment->vk.image))
7636 continue;
7637
7638 if (!vk_format_has_stencil(image->vk.format))
7639 continue;
7640
7641 return pvr_stencil_has_self_dependency(state);
7642 }
7643
7644 return false;
7645 }
7646
7647 static VkResult
7648 pvr_cmd_buffer_insert_mid_frag_barrier_event(struct pvr_cmd_buffer *cmd_buffer,
7649 uint32_t src_stage_mask,
7650 uint32_t dst_stage_mask)
7651 {
7652 VkResult result;
7653
7654 assert(cmd_buffer->state.current_sub_cmd->type == PVR_SUB_CMD_TYPE_GRAPHICS);
7655
7656 cmd_buffer->state.current_sub_cmd->gfx.empty_cmd = false;
7657
7658 /* Submit graphics job to store stencil. */
7659 cmd_buffer->state.current_sub_cmd->gfx.barrier_store = true;
7660
7661 pvr_cmd_buffer_end_sub_cmd(cmd_buffer);
7662 result = pvr_cmd_buffer_start_sub_cmd(cmd_buffer, PVR_SUB_CMD_TYPE_EVENT);
7663 if (result != VK_SUCCESS)
7664 return result;
7665
7666 cmd_buffer->state.current_sub_cmd->event = (struct pvr_sub_cmd_event){
7667 .type = PVR_EVENT_TYPE_BARRIER,
7668 .barrier = {
7669 .in_render_pass = true,
7670 .wait_for_stage_mask = src_stage_mask,
7671 .wait_at_stage_mask = dst_stage_mask,
7672 },
7673 };
7674
7675 pvr_cmd_buffer_end_sub_cmd(cmd_buffer);
7676 pvr_cmd_buffer_start_sub_cmd(cmd_buffer, PVR_SUB_CMD_TYPE_GRAPHICS);
7677
7678 /* Use existing render setup, but load color attachments from HW BGOBJ */
7679 cmd_buffer->state.current_sub_cmd->gfx.barrier_load = true;
7680 cmd_buffer->state.current_sub_cmd->gfx.barrier_store = false;
7681
7682 return VK_SUCCESS;
7683 }
7684
7685 static VkResult
7686 pvr_cmd_buffer_insert_barrier_event(struct pvr_cmd_buffer *cmd_buffer,
7687 uint32_t src_stage_mask,
7688 uint32_t dst_stage_mask)
7689 {
7690 VkResult result;
7691
7692 result = pvr_cmd_buffer_start_sub_cmd(cmd_buffer, PVR_SUB_CMD_TYPE_EVENT);
7693 if (result != VK_SUCCESS)
7694 return result;
7695
7696 cmd_buffer->state.current_sub_cmd->event = (struct pvr_sub_cmd_event){
7697 .type = PVR_EVENT_TYPE_BARRIER,
7698 .barrier = {
7699 .wait_for_stage_mask = src_stage_mask,
7700 .wait_at_stage_mask = dst_stage_mask,
7701 },
7702 };
7703
7704 return pvr_cmd_buffer_end_sub_cmd(cmd_buffer);
7705 }
7706
7707 /* This is just enough to handle vkCmdPipelineBarrier().
7708 * TODO: Complete?
7709 */
7710 void pvr_CmdPipelineBarrier2(VkCommandBuffer commandBuffer,
7711 const VkDependencyInfo *pDependencyInfo)
7712 {
7713 PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
7714 struct pvr_cmd_buffer_state *const state = &cmd_buffer->state;
7715 const struct pvr_render_pass *const render_pass =
7716 state->render_pass_info.pass;
7717 VkPipelineStageFlags vk_src_stage_mask = 0U;
7718 VkPipelineStageFlags vk_dst_stage_mask = 0U;
7719 bool is_stencil_store_load_needed;
7720 uint32_t required_stage_mask = 0U;
7721 uint32_t src_stage_mask;
7722 uint32_t dst_stage_mask;
7723 bool is_barrier_needed;
7724
7725 PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer);
7726
7727 for (uint32_t i = 0; i < pDependencyInfo->memoryBarrierCount; i++) {
7728 vk_src_stage_mask |= pDependencyInfo->pMemoryBarriers[i].srcStageMask;
7729 vk_dst_stage_mask |= pDependencyInfo->pMemoryBarriers[i].dstStageMask;
7730 }
7731
7732 for (uint32_t i = 0; i < pDependencyInfo->bufferMemoryBarrierCount; i++) {
7733 vk_src_stage_mask |=
7734 pDependencyInfo->pBufferMemoryBarriers[i].srcStageMask;
7735 vk_dst_stage_mask |=
7736 pDependencyInfo->pBufferMemoryBarriers[i].dstStageMask;
7737 }
7738
7739 for (uint32_t i = 0; i < pDependencyInfo->imageMemoryBarrierCount; i++) {
7740 vk_src_stage_mask |=
7741 pDependencyInfo->pImageMemoryBarriers[i].srcStageMask;
7742 vk_dst_stage_mask |=
7743 pDependencyInfo->pImageMemoryBarriers[i].dstStageMask;
7744 }
7745
7746 src_stage_mask = pvr_stage_mask_src(vk_src_stage_mask);
7747 dst_stage_mask = pvr_stage_mask_dst(vk_dst_stage_mask);
7748
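   /* Collect the stages that earlier work flagged as needing a barrier
    * before any of the requested destination stages can run.
    */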
7749 for (uint32_t stage = 0U; stage != PVR_NUM_SYNC_PIPELINE_STAGES; stage++) {
7750 if (!(dst_stage_mask & BITFIELD_BIT(stage)))
7751 continue;
7752
7753 required_stage_mask |= state->barriers_needed[stage];
7754 }
7755
7756 src_stage_mask &= required_stage_mask;
7757 for (uint32_t stage = 0U; stage != PVR_NUM_SYNC_PIPELINE_STAGES; stage++) {
7758 if (!(dst_stage_mask & BITFIELD_BIT(stage)))
7759 continue;
7760
7761 state->barriers_needed[stage] &= ~src_stage_mask;
7762 }
7763
7764 if (src_stage_mask == 0 || dst_stage_mask == 0) {
7765 is_barrier_needed = false;
7766 } else if (src_stage_mask == PVR_PIPELINE_STAGE_GEOM_BIT &&
7767 dst_stage_mask == PVR_PIPELINE_STAGE_FRAG_BIT) {
7768 /* This is implicit so no need to barrier. */
7769 is_barrier_needed = false;
7770 } else if (src_stage_mask == dst_stage_mask &&
7771 util_bitcount(src_stage_mask) == 1) {
7772 struct pvr_sub_cmd *const current_sub_cmd = state->current_sub_cmd;
7773
7774 switch (src_stage_mask) {
7775 case PVR_PIPELINE_STAGE_FRAG_BIT:
7776 is_barrier_needed = !render_pass;
7777
7778 if (is_barrier_needed)
7779 break;
7780
7781 assert(current_sub_cmd->type == PVR_SUB_CMD_TYPE_GRAPHICS);
7782
7783 /* Flush all fragment work up to this point. */
7784 pvr_insert_transparent_obj(cmd_buffer, &current_sub_cmd->gfx);
7785 break;
7786
7787 case PVR_PIPELINE_STAGE_COMPUTE_BIT:
7788 is_barrier_needed = false;
7789
7790 if (!current_sub_cmd ||
7791 current_sub_cmd->type != PVR_SUB_CMD_TYPE_COMPUTE) {
7792 break;
7793 }
7794
7795 /* Multiple dispatches can be merged into a single job. When back to
7796 * back dispatches have a sequential dependency (Compute -> compute
7797 * pipeline barrier) we need to do the following.
7798 * - Dispatch a kernel which fences all previous memory writes and
7799 * flushes the MADD cache.
7800 * - Issue a compute fence which ensures all previous tasks emitted
7801 * by the compute data master are completed before starting
7802 * anything new.
7803 */
7804
7805 /* Issue Data Fence, Wait for Data Fence (IDFWDF) makes the PDS wait
7806 * for data.
7807 */
7808 pvr_compute_generate_idfwdf(cmd_buffer, &current_sub_cmd->compute);
7809
7810 pvr_compute_generate_fence(cmd_buffer,
7811 &current_sub_cmd->compute,
7812 false);
7813 break;
7814
7815 default:
7816 is_barrier_needed = false;
7817 break;
7818 };
7819 } else {
7820 is_barrier_needed = true;
7821 }
7822
7823 is_stencil_store_load_needed =
7824 pvr_is_stencil_store_load_needed(cmd_buffer,
7825 vk_src_stage_mask,
7826 vk_dst_stage_mask,
7827 pDependencyInfo->memoryBarrierCount,
7828 pDependencyInfo->pMemoryBarriers,
7829 pDependencyInfo->imageMemoryBarrierCount,
7830 pDependencyInfo->pImageMemoryBarriers);
7831
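   /* If satisfying the barrier requires the stencil attachment to be stored
    * and reloaded, the current fragment job is split with a mid frag barrier
    * event; otherwise a regular barrier event is inserted, if one is needed
    * at all.
    */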
   if (is_stencil_store_load_needed) {
      VkResult result;

      result = pvr_cmd_buffer_insert_mid_frag_barrier_event(cmd_buffer,
                                                            src_stage_mask,
                                                            dst_stage_mask);
      if (result != VK_SUCCESS)
         mesa_loge("Failed to insert mid frag barrier event.");
   } else {
      if (is_barrier_needed) {
         VkResult result;

         result = pvr_cmd_buffer_insert_barrier_event(cmd_buffer,
                                                      src_stage_mask,
                                                      dst_stage_mask);
         if (result != VK_SUCCESS)
            mesa_loge("Failed to insert pipeline barrier event.");
      }
   }
}

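/* Records an event sub command which resets the given event once the work in
 * the source stages of stageMask has completed.
 */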
void pvr_CmdResetEvent2(VkCommandBuffer commandBuffer,
                        VkEvent _event,
                        VkPipelineStageFlags2 stageMask)
{
   PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
   PVR_FROM_HANDLE(pvr_event, event, _event);
   VkResult result;

   PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer);

   result = pvr_cmd_buffer_start_sub_cmd(cmd_buffer, PVR_SUB_CMD_TYPE_EVENT);
   if (result != VK_SUCCESS)
      return;

   cmd_buffer->state.current_sub_cmd->event = (struct pvr_sub_cmd_event){
      .type = PVR_EVENT_TYPE_RESET,
      .set_reset = {
         .event = event,
         .wait_for_stage_mask = pvr_stage_mask_src(stageMask),
      },
   };

   pvr_cmd_buffer_end_sub_cmd(cmd_buffer);
}

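/* Records an event sub command which sets the given event once the source
 * stages collected from every barrier in pDependencyInfo have completed.
 */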
void pvr_CmdSetEvent2(VkCommandBuffer commandBuffer,
                      VkEvent _event,
                      const VkDependencyInfo *pDependencyInfo)
{
   PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
   PVR_FROM_HANDLE(pvr_event, event, _event);
   VkPipelineStageFlags2 stage_mask = 0;
   VkResult result;

   PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer);

   result = pvr_cmd_buffer_start_sub_cmd(cmd_buffer, PVR_SUB_CMD_TYPE_EVENT);
   if (result != VK_SUCCESS)
      return;

   for (uint32_t i = 0; i < pDependencyInfo->memoryBarrierCount; i++)
      stage_mask |= pDependencyInfo->pMemoryBarriers[i].srcStageMask;

   for (uint32_t i = 0; i < pDependencyInfo->bufferMemoryBarrierCount; i++)
      stage_mask |= pDependencyInfo->pBufferMemoryBarriers[i].srcStageMask;

   for (uint32_t i = 0; i < pDependencyInfo->imageMemoryBarrierCount; i++)
      stage_mask |= pDependencyInfo->pImageMemoryBarriers[i].srcStageMask;

   cmd_buffer->state.current_sub_cmd->event = (struct pvr_sub_cmd_event){
      .type = PVR_EVENT_TYPE_SET,
      .set_reset = {
         .event = event,
         .wait_for_stage_mask = pvr_stage_mask_dst(stage_mask),
      },
   };

   pvr_cmd_buffer_end_sub_cmd(cmd_buffer);
}

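/* Records an event sub command which waits for each event in pEvents to be
 * set before the destination stages collected from the matching entry in
 * pDependencyInfos are allowed to proceed.
 */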
void pvr_CmdWaitEvents2(VkCommandBuffer commandBuffer,
                        uint32_t eventCount,
                        const VkEvent *pEvents,
                        const VkDependencyInfo *pDependencyInfos)
{
   PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
   struct pvr_event **events_array;
   uint32_t *stage_masks;
   VkResult result;

   PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer);

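   /* Allocate the event handle array and the per event stage mask array as a
    * single allocation; both are stored in the wait sub command below.
    */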
   VK_MULTIALLOC(ma);
   vk_multialloc_add(&ma, &events_array, __typeof__(*events_array), eventCount);
   vk_multialloc_add(&ma, &stage_masks, __typeof__(*stage_masks), eventCount);

   if (!vk_multialloc_alloc(&ma,
                            &cmd_buffer->vk.pool->alloc,
                            VK_SYSTEM_ALLOCATION_SCOPE_OBJECT)) {
      vk_command_buffer_set_error(&cmd_buffer->vk, VK_ERROR_OUT_OF_HOST_MEMORY);
      return;
   }

   result = pvr_cmd_buffer_start_sub_cmd(cmd_buffer, PVR_SUB_CMD_TYPE_EVENT);
   if (result != VK_SUCCESS) {
      vk_free(&cmd_buffer->vk.pool->alloc, events_array);
      return;
   }

   memcpy(events_array, pEvents, sizeof(*events_array) * eventCount);

   for (uint32_t i = 0; i < eventCount; i++) {
      const VkDependencyInfo *info = &pDependencyInfos[i];
      VkPipelineStageFlags2 mask = 0;

      for (uint32_t j = 0; j < info->memoryBarrierCount; j++)
         mask |= info->pMemoryBarriers[j].dstStageMask;

      for (uint32_t j = 0; j < info->bufferMemoryBarrierCount; j++)
         mask |= info->pBufferMemoryBarriers[j].dstStageMask;

      for (uint32_t j = 0; j < info->imageMemoryBarrierCount; j++)
         mask |= info->pImageMemoryBarriers[j].dstStageMask;

      stage_masks[i] = pvr_stage_mask_dst(mask);
   }

   cmd_buffer->state.current_sub_cmd->event = (struct pvr_sub_cmd_event){
      .type = PVR_EVENT_TYPE_WAIT,
      .wait = {
         .count = eventCount,
         .events = events_array,
         .wait_at_stage_masks = stage_masks,
      },
   };

   pvr_cmd_buffer_end_sub_cmd(cmd_buffer);
}

void pvr_CmdWriteTimestamp2(VkCommandBuffer commandBuffer,
                            VkPipelineStageFlags2 stage,
                            VkQueryPool queryPool,
                            uint32_t query)
{
   unreachable("Timestamp queries are not supported.");
}

VkResult pvr_EndCommandBuffer(VkCommandBuffer commandBuffer)
{
   PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
   struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
   VkResult result;

   if (vk_command_buffer_has_error(&cmd_buffer->vk))
      return vk_command_buffer_end(&cmd_buffer->vk);

   /* TODO: We should free all the resources allocated for recording here. */
   util_dynarray_fini(&state->query_indices);

   result = pvr_cmd_buffer_end_sub_cmd(cmd_buffer);
   if (result != VK_SUCCESS)
      pvr_cmd_buffer_set_error_unwarned(cmd_buffer, result);

   return vk_command_buffer_end(&cmd_buffer->vk);
}