1 /*
2 * Copyright © 2022 Imagination Technologies Ltd.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a copy
5 * of this software and associated documentation files (the "Software"), to deal
6 * in the Software without restriction, including without limitation the rights
7 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
8 * copies of the Software, and to permit persons to whom the Software is
9 * furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 * SOFTWARE.
22 */
23
24 #include <assert.h>
25 #include <limits.h>
26 #include <stdbool.h>
27 #include <stddef.h>
28 #include <stdint.h>
29 #include <string.h>
30 #include <vulkan/vulkan.h>
31
32 #include "hwdef/rogue_hw_defs.h"
33 #include "hwdef/rogue_hw_utils.h"
34 #include "pvr_blit.h"
35 #include "pvr_bo.h"
36 #include "pvr_clear.h"
37 #include "pvr_common.h"
38 #include "pvr_csb.h"
39 #include "pvr_csb_enum_helpers.h"
40 #include "pvr_device_info.h"
41 #include "pvr_formats.h"
42 #include "pvr_hardcode.h"
43 #include "pvr_hw_pass.h"
44 #include "pvr_job_common.h"
45 #include "pvr_job_render.h"
46 #include "pvr_limits.h"
47 #include "pvr_pds.h"
48 #include "pvr_private.h"
49 #include "pvr_tex_state.h"
50 #include "pvr_types.h"
51 #include "usc/pvr_uscgen.h"
52 #include "pvr_winsys.h"
53 #include "util/bitscan.h"
54 #include "util/bitset.h"
55 #include "util/compiler.h"
56 #include "util/list.h"
57 #include "util/macros.h"
58 #include "util/u_dynarray.h"
59 #include "util/u_math.h"
60 #include "util/u_pack_color.h"
61 #include "vk_alloc.h"
62 #include "vk_command_buffer.h"
63 #include "vk_command_pool.h"
64 #include "vk_common_entrypoints.h"
65 #include "vk_format.h"
66 #include "vk_graphics_state.h"
67 #include "vk_log.h"
68 #include "vk_object.h"
69 #include "vk_util.h"
70
71 /* Structure used to pass data into the
72 * pvr_compute_generate_control_stream() function.
73 */
74 struct pvr_compute_kernel_info {
75 pvr_dev_addr_t indirect_buffer_addr;
76 bool global_offsets_present;
77 uint32_t usc_common_size;
78 uint32_t usc_unified_size;
79 uint32_t pds_temp_size;
80 uint32_t pds_data_size;
81 enum ROGUE_CDMCTRL_USC_TARGET usc_target;
82 bool is_fence;
83 uint32_t pds_data_offset;
84 uint32_t pds_code_offset;
85 enum ROGUE_CDMCTRL_SD_TYPE sd_type;
86 bool usc_common_shared;
87 uint32_t global_size[PVR_WORKGROUP_DIMENSIONS];
88 uint32_t local_size[PVR_WORKGROUP_DIMENSIONS];
89 uint32_t max_instances;
90 };
91
92 static void pvr_cmd_buffer_free_sub_cmd(struct pvr_cmd_buffer *cmd_buffer,
93 struct pvr_sub_cmd *sub_cmd)
94 {
95 if (sub_cmd->owned) {
96 switch (sub_cmd->type) {
97 case PVR_SUB_CMD_TYPE_GRAPHICS:
98 util_dynarray_fini(&sub_cmd->gfx.sec_query_indices);
99 pvr_csb_finish(&sub_cmd->gfx.control_stream);
100 pvr_bo_free(cmd_buffer->device, sub_cmd->gfx.terminate_ctrl_stream);
101 pvr_bo_suballoc_free(sub_cmd->gfx.depth_bias_bo);
102 pvr_bo_suballoc_free(sub_cmd->gfx.scissor_bo);
103 break;
104
105 case PVR_SUB_CMD_TYPE_COMPUTE:
106 case PVR_SUB_CMD_TYPE_OCCLUSION_QUERY:
107 pvr_csb_finish(&sub_cmd->compute.control_stream);
108 break;
109
110 case PVR_SUB_CMD_TYPE_TRANSFER:
111 list_for_each_entry_safe (struct pvr_transfer_cmd,
112 transfer_cmd,
113 sub_cmd->transfer.transfer_cmds,
114 link) {
115 list_del(&transfer_cmd->link);
116 if (!transfer_cmd->is_deferred_clear)
117 vk_free(&cmd_buffer->vk.pool->alloc, transfer_cmd);
118 }
119 break;
120
121 case PVR_SUB_CMD_TYPE_EVENT:
122 if (sub_cmd->event.type == PVR_EVENT_TYPE_WAIT)
123 vk_free(&cmd_buffer->vk.pool->alloc, sub_cmd->event.wait.events);
124 break;
125
126 default:
127 unreachable("Unsupported sub-command type");
128 }
129 }
130
131 list_del(&sub_cmd->link);
132 vk_free(&cmd_buffer->vk.pool->alloc, sub_cmd);
133 }
134
135 static void pvr_cmd_buffer_free_sub_cmds(struct pvr_cmd_buffer *cmd_buffer)
136 {
137 list_for_each_entry_safe (struct pvr_sub_cmd,
138 sub_cmd,
139 &cmd_buffer->sub_cmds,
140 link) {
141 pvr_cmd_buffer_free_sub_cmd(cmd_buffer, sub_cmd);
142 }
143 }
144
145 static void pvr_cmd_buffer_free_resources(struct pvr_cmd_buffer *cmd_buffer)
146 {
147 vk_free(&cmd_buffer->vk.pool->alloc,
148 cmd_buffer->state.render_pass_info.attachments);
149 vk_free(&cmd_buffer->vk.pool->alloc,
150 cmd_buffer->state.render_pass_info.clear_values);
151
152 util_dynarray_fini(&cmd_buffer->state.query_indices);
153
154 pvr_cmd_buffer_free_sub_cmds(cmd_buffer);
155
156 list_for_each_entry_safe (struct pvr_suballoc_bo,
157 suballoc_bo,
158 &cmd_buffer->bo_list,
159 link) {
160 list_del(&suballoc_bo->link);
161 pvr_bo_suballoc_free(suballoc_bo);
162 }
163
164 util_dynarray_fini(&cmd_buffer->deferred_clears);
165 util_dynarray_fini(&cmd_buffer->deferred_csb_commands);
166 util_dynarray_fini(&cmd_buffer->scissor_array);
167 util_dynarray_fini(&cmd_buffer->depth_bias_array);
168 }
169
170 static void pvr_cmd_buffer_reset(struct vk_command_buffer *vk_cmd_buffer,
171 VkCommandBufferResetFlags flags)
172 {
173 struct pvr_cmd_buffer *cmd_buffer =
174 container_of(vk_cmd_buffer, struct pvr_cmd_buffer, vk);
175
176 /* FIXME: For now we always free all resources as if
177 * VK_COMMAND_BUFFER_RESET_RELEASE_RESOURCES_BIT was set.
178 */
179 pvr_cmd_buffer_free_resources(cmd_buffer);
180
181 vk_command_buffer_reset(&cmd_buffer->vk);
182
183 memset(&cmd_buffer->state, 0, sizeof(cmd_buffer->state));
184 memset(&cmd_buffer->scissor_words, 0, sizeof(cmd_buffer->scissor_words));
185
186 cmd_buffer->usage_flags = 0;
187 }
188
189 static void pvr_cmd_buffer_destroy(struct vk_command_buffer *vk_cmd_buffer)
190 {
191 struct pvr_cmd_buffer *cmd_buffer =
192 container_of(vk_cmd_buffer, struct pvr_cmd_buffer, vk);
193
194 pvr_cmd_buffer_free_resources(cmd_buffer);
195 vk_command_buffer_finish(&cmd_buffer->vk);
196 vk_free(&cmd_buffer->vk.pool->alloc, cmd_buffer);
197 }
198
199 static const struct vk_command_buffer_ops cmd_buffer_ops = {
200 .reset = pvr_cmd_buffer_reset,
201 .destroy = pvr_cmd_buffer_destroy,
202 };
203
204 static VkResult pvr_cmd_buffer_create(struct pvr_device *device,
205 struct vk_command_pool *pool,
206 VkCommandBufferLevel level,
207 VkCommandBuffer *pCommandBuffer)
208 {
209 struct pvr_cmd_buffer *cmd_buffer;
210 VkResult result;
211
212 cmd_buffer = vk_zalloc(&pool->alloc,
213 sizeof(*cmd_buffer),
214 8U,
215 VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
216 if (!cmd_buffer)
217 return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
218
219 result =
220 vk_command_buffer_init(pool, &cmd_buffer->vk, &cmd_buffer_ops, level);
221 if (result != VK_SUCCESS) {
222 vk_free(&pool->alloc, cmd_buffer);
223 return result;
224 }
225
226 cmd_buffer->device = device;
227
228 util_dynarray_init(&cmd_buffer->depth_bias_array, NULL);
229 util_dynarray_init(&cmd_buffer->scissor_array, NULL);
230 util_dynarray_init(&cmd_buffer->deferred_csb_commands, NULL);
231 util_dynarray_init(&cmd_buffer->deferred_clears, NULL);
232
233 list_inithead(&cmd_buffer->sub_cmds);
234 list_inithead(&cmd_buffer->bo_list);
235
236 *pCommandBuffer = pvr_cmd_buffer_to_handle(cmd_buffer);
237
238 return VK_SUCCESS;
239 }
240
241 VkResult
242 pvr_AllocateCommandBuffers(VkDevice _device,
243 const VkCommandBufferAllocateInfo *pAllocateInfo,
244 VkCommandBuffer *pCommandBuffers)
245 {
246 VK_FROM_HANDLE(vk_command_pool, pool, pAllocateInfo->commandPool);
247 PVR_FROM_HANDLE(pvr_device, device, _device);
248 VkResult result = VK_SUCCESS;
249 uint32_t i;
250
251 for (i = 0; i < pAllocateInfo->commandBufferCount; i++) {
252 result = pvr_cmd_buffer_create(device,
253 pool,
254 pAllocateInfo->level,
255 &pCommandBuffers[i]);
256 if (result != VK_SUCCESS)
257 break;
258 }
259
260 if (result != VK_SUCCESS) {
261 while (i--) {
262 VK_FROM_HANDLE(vk_command_buffer, cmd_buffer, pCommandBuffers[i]);
263 pvr_cmd_buffer_destroy(cmd_buffer);
264 }
265
266 for (i = 0; i < pAllocateInfo->commandBufferCount; i++)
267 pCommandBuffers[i] = VK_NULL_HANDLE;
268 }
269
270 return result;
271 }
272
273 static void pvr_cmd_buffer_update_barriers(struct pvr_cmd_buffer *cmd_buffer,
274 enum pvr_sub_cmd_type type)
275 {
276 struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
277 uint32_t barriers;
278
279 switch (type) {
280 case PVR_SUB_CMD_TYPE_GRAPHICS:
281 barriers = PVR_PIPELINE_STAGE_GEOM_BIT | PVR_PIPELINE_STAGE_FRAG_BIT;
282 break;
283
284 case PVR_SUB_CMD_TYPE_COMPUTE:
285 barriers = PVR_PIPELINE_STAGE_COMPUTE_BIT;
286 break;
287
288 case PVR_SUB_CMD_TYPE_OCCLUSION_QUERY:
289 case PVR_SUB_CMD_TYPE_TRANSFER:
290 /* Compute jobs are used for occlusion queries, but to copy the results we
291 * have to sync with transfer jobs because the spec classifies
292 * vkCmdCopyQueryPoolResults() as a transfer operation.
293 */
294 barriers = PVR_PIPELINE_STAGE_TRANSFER_BIT;
295 break;
296
297 case PVR_SUB_CMD_TYPE_EVENT:
298 barriers = 0;
299 break;
300
301 default:
302 unreachable("Unsupported sub-command type");
303 }
304
305 for (uint32_t i = 0; i < ARRAY_SIZE(state->barriers_needed); i++)
306 state->barriers_needed[i] |= barriers;
307 }
308
309 static VkResult
310 pvr_cmd_buffer_upload_tables(struct pvr_device *device,
311 struct pvr_cmd_buffer *cmd_buffer,
312 struct pvr_sub_cmd_gfx *const sub_cmd)
313 {
314 const uint32_t cache_line_size =
315 rogue_get_slc_cache_line_size(&device->pdevice->dev_info);
316 VkResult result;
317
318 assert(!sub_cmd->depth_bias_bo && !sub_cmd->scissor_bo);
319
320 if (cmd_buffer->depth_bias_array.size > 0) {
321 result =
322 pvr_gpu_upload(device,
323 device->heaps.general_heap,
324 util_dynarray_begin(&cmd_buffer->depth_bias_array),
325 cmd_buffer->depth_bias_array.size,
326 cache_line_size,
327 &sub_cmd->depth_bias_bo);
328 if (result != VK_SUCCESS)
329 return result;
330 }
331
332 if (cmd_buffer->scissor_array.size > 0) {
333 result = pvr_gpu_upload(device,
334 device->heaps.general_heap,
335 util_dynarray_begin(&cmd_buffer->scissor_array),
336 cmd_buffer->scissor_array.size,
337 cache_line_size,
338 &sub_cmd->scissor_bo);
339 if (result != VK_SUCCESS)
340 goto err_free_depth_bias_bo;
341 }
342
343 util_dynarray_clear(&cmd_buffer->depth_bias_array);
344 util_dynarray_clear(&cmd_buffer->scissor_array);
345
346 return VK_SUCCESS;
347
348 err_free_depth_bias_bo:
349 pvr_bo_suballoc_free(sub_cmd->depth_bias_bo);
350 sub_cmd->depth_bias_bo = NULL;
351
352 return result;
353 }
354
355 static VkResult
356 pvr_cmd_buffer_emit_ppp_state(const struct pvr_cmd_buffer *const cmd_buffer,
357 struct pvr_csb *const csb)
358 {
359 const struct pvr_framebuffer *const framebuffer =
360 cmd_buffer->state.render_pass_info.framebuffer;
361
362 assert(csb->stream_type == PVR_CMD_STREAM_TYPE_GRAPHICS ||
363 csb->stream_type == PVR_CMD_STREAM_TYPE_GRAPHICS_DEFERRED);
364
365 pvr_csb_set_relocation_mark(csb);
366
367 pvr_csb_emit (csb, VDMCTRL_PPP_STATE0, state0) {
368 state0.addrmsb = framebuffer->ppp_state_bo->dev_addr;
369 state0.word_count = framebuffer->ppp_state_size;
370 }
371
372 pvr_csb_emit (csb, VDMCTRL_PPP_STATE1, state1) {
373 state1.addrlsb = framebuffer->ppp_state_bo->dev_addr;
374 }
375
376 pvr_csb_clear_relocation_mark(csb);
377
378 return csb->status;
379 }
380
381 VkResult
382 pvr_cmd_buffer_upload_general(struct pvr_cmd_buffer *const cmd_buffer,
383 const void *const data,
384 const size_t size,
385 struct pvr_suballoc_bo **const pvr_bo_out)
386 {
387 struct pvr_device *const device = cmd_buffer->device;
388 const uint32_t cache_line_size =
389 rogue_get_slc_cache_line_size(&device->pdevice->dev_info);
390 struct pvr_suballoc_bo *suballoc_bo;
391 VkResult result;
392
393 result = pvr_gpu_upload(device,
394 device->heaps.general_heap,
395 data,
396 size,
397 cache_line_size,
398 &suballoc_bo);
399 if (result != VK_SUCCESS)
400 return pvr_cmd_buffer_set_error_unwarned(cmd_buffer, result);
401
402 list_add(&suballoc_bo->link, &cmd_buffer->bo_list);
403
404 *pvr_bo_out = suballoc_bo;
405
406 return VK_SUCCESS;
407 }
408
409 static VkResult
410 pvr_cmd_buffer_upload_usc(struct pvr_cmd_buffer *const cmd_buffer,
411 const void *const code,
412 const size_t code_size,
413 uint64_t code_alignment,
414 struct pvr_suballoc_bo **const pvr_bo_out)
415 {
416 struct pvr_device *const device = cmd_buffer->device;
417 const uint32_t cache_line_size =
418 rogue_get_slc_cache_line_size(&device->pdevice->dev_info);
419 struct pvr_suballoc_bo *suballoc_bo;
420 VkResult result;
421
422 code_alignment = MAX2(code_alignment, cache_line_size);
423
424 result =
425 pvr_gpu_upload_usc(device, code, code_size, code_alignment, &suballoc_bo);
426 if (result != VK_SUCCESS)
427 return pvr_cmd_buffer_set_error_unwarned(cmd_buffer, result);
428
429 list_add(&suballoc_bo->link, &cmd_buffer->bo_list);
430
431 *pvr_bo_out = suballoc_bo;
432
433 return VK_SUCCESS;
434 }
435
436 VkResult pvr_cmd_buffer_upload_pds(struct pvr_cmd_buffer *const cmd_buffer,
437 const uint32_t *data,
438 uint32_t data_size_dwords,
439 uint32_t data_alignment,
440 const uint32_t *code,
441 uint32_t code_size_dwords,
442 uint32_t code_alignment,
443 uint64_t min_alignment,
444 struct pvr_pds_upload *const pds_upload_out)
445 {
446 struct pvr_device *const device = cmd_buffer->device;
447 VkResult result;
448
449 result = pvr_gpu_upload_pds(device,
450 data,
451 data_size_dwords,
452 data_alignment,
453 code,
454 code_size_dwords,
455 code_alignment,
456 min_alignment,
457 pds_upload_out);
458 if (result != VK_SUCCESS)
459 return pvr_cmd_buffer_set_error_unwarned(cmd_buffer, result);
460
461 list_add(&pds_upload_out->pvr_bo->link, &cmd_buffer->bo_list);
462
463 return VK_SUCCESS;
464 }
465
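/* Convenience wrapper for uploading a data-only PDS program: no code segment
 * is passed, so only the data section is written to the PDS heap.
 */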
466 static inline VkResult
467 pvr_cmd_buffer_upload_pds_data(struct pvr_cmd_buffer *const cmd_buffer,
468 const uint32_t *data,
469 uint32_t data_size_dwords,
470 uint32_t data_alignment,
471 struct pvr_pds_upload *const pds_upload_out)
472 {
473 return pvr_cmd_buffer_upload_pds(cmd_buffer,
474 data,
475 data_size_dwords,
476 data_alignment,
477 NULL,
478 0,
479 0,
480 data_alignment,
481 pds_upload_out);
482 }
483
484 /* pbe_cs_words must be an array of emit_count entries, each containing
485 * ROGUE_NUM_PBESTATE_STATE_WORDS state words.
486 */
487 static VkResult pvr_sub_cmd_gfx_per_job_fragment_programs_create_and_upload(
488 struct pvr_cmd_buffer *const cmd_buffer,
489 const uint32_t emit_count,
490 const uint32_t *pbe_cs_words,
491 struct pvr_pds_upload *const pds_upload_out)
492 {
493 struct pvr_pds_event_program pixel_event_program = {
494 /* No data to DMA, just a DOUTU needed. */
495 .num_emit_word_pairs = 0,
496 };
497 const uint32_t staging_buffer_size =
498 PVR_DW_TO_BYTES(cmd_buffer->device->pixel_event_data_size_in_dwords);
499 const VkAllocationCallbacks *const allocator = &cmd_buffer->vk.pool->alloc;
500 struct pvr_device *const device = cmd_buffer->device;
501 struct pvr_suballoc_bo *usc_eot_program = NULL;
502 struct util_dynarray eot_program_bin;
503 uint32_t *staging_buffer;
504 uint32_t usc_temp_count;
505 VkResult result;
506
507 assert(emit_count > 0);
508
509 pvr_uscgen_eot("per-job EOT",
510 emit_count,
511 pbe_cs_words,
512 &usc_temp_count,
513 &eot_program_bin);
514
515 result = pvr_cmd_buffer_upload_usc(cmd_buffer,
516 eot_program_bin.data,
517 eot_program_bin.size,
518 4,
519 &usc_eot_program);
520
521 util_dynarray_fini(&eot_program_bin);
522
523 if (result != VK_SUCCESS)
524 return result;
525
526 pvr_pds_setup_doutu(&pixel_event_program.task_control,
527 usc_eot_program->dev_addr.addr,
528 usc_temp_count,
529 ROGUE_PDSINST_DOUTU_SAMPLE_RATE_INSTANCE,
530 false);
531
532 /* TODO: We could skip allocating this and generate directly into the device
533 * buffer thus removing one allocation and memcpy() per job. Would this
534 * speed up things in a noticeable way?
535 */
536 staging_buffer = vk_alloc(allocator,
537 staging_buffer_size,
538 8,
539 VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
540 if (!staging_buffer) {
541 result = vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
542 goto err_free_usc_pixel_program;
543 }
544
545 /* Generate the data segment. The code segment was uploaded earlier when
546 * setting up the PDS static heap data.
547 */
548 pvr_pds_generate_pixel_event_data_segment(&pixel_event_program,
549 staging_buffer,
550 &device->pdevice->dev_info);
551
552 result = pvr_cmd_buffer_upload_pds_data(
553 cmd_buffer,
554 staging_buffer,
555 cmd_buffer->device->pixel_event_data_size_in_dwords,
556 4,
557 pds_upload_out);
558 if (result != VK_SUCCESS)
559 goto err_free_pixel_event_staging_buffer;
560
561 vk_free(allocator, staging_buffer);
562
563 return VK_SUCCESS;
564
565 err_free_pixel_event_staging_buffer:
566 vk_free(allocator, staging_buffer);
567
568 err_free_usc_pixel_program:
569 list_del(&usc_eot_program->link);
570 pvr_bo_suballoc_free(usc_eot_program);
571
572 return result;
573 }
574
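/* Build a standalone control stream containing only the framebuffer's PPP
 * state followed by a terminate, and keep its single backing bo as the
 * sub-command's terminate control stream.
 */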
575 static VkResult pvr_sub_cmd_gfx_build_terminate_ctrl_stream(
576 struct pvr_device *const device,
577 const struct pvr_cmd_buffer *const cmd_buffer,
578 struct pvr_sub_cmd_gfx *const gfx_sub_cmd)
579 {
580 struct list_head bo_list;
581 struct pvr_csb csb;
582 VkResult result;
583
584 pvr_csb_init(device, PVR_CMD_STREAM_TYPE_GRAPHICS, &csb);
585
586 result = pvr_cmd_buffer_emit_ppp_state(cmd_buffer, &csb);
587 if (result != VK_SUCCESS)
588 goto err_csb_finish;
589
590 result = pvr_csb_emit_terminate(&csb);
591 if (result != VK_SUCCESS)
592 goto err_csb_finish;
593
594 result = pvr_csb_bake(&csb, &bo_list);
595 if (result != VK_SUCCESS)
596 goto err_csb_finish;
597
598 /* This is a trivial control stream; there is no reason it should ever
599 * require more memory than a single bo can provide.
600 */
601 assert(list_is_singular(&bo_list));
602 gfx_sub_cmd->terminate_ctrl_stream =
603 list_first_entry(&bo_list, struct pvr_bo, link);
604
605 return VK_SUCCESS;
606
607 err_csb_finish:
608 pvr_csb_finish(&csb);
609
610 return result;
611 }
612
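/* Populate a combined image + sampler descriptor for a load op: texture state
 * for sampling the attachment image, plus a point-sampled, clamp-to-edge,
 * non-normalized-coordinate sampler.
 */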
613 static VkResult pvr_setup_texture_state_words(
614 struct pvr_device *device,
615 struct pvr_combined_image_sampler_descriptor *descriptor,
616 const struct pvr_image_view *image_view)
617 {
618 const struct pvr_image *image = vk_to_pvr_image(image_view->vk.image);
619 struct pvr_texture_state_info info = {
620 .format = image_view->vk.format,
621 .mem_layout = image->memlayout,
622 .type = image_view->vk.view_type,
623 .is_cube = image_view->vk.view_type == VK_IMAGE_VIEW_TYPE_CUBE ||
624 image_view->vk.view_type == VK_IMAGE_VIEW_TYPE_CUBE_ARRAY,
625 .tex_state_type = PVR_TEXTURE_STATE_SAMPLE,
626 .extent = image_view->vk.extent,
627 .mip_levels = 1,
628 .sample_count = image_view->vk.image->samples,
629 .stride = image->physical_extent.width,
630 .addr = image->dev_addr,
631 };
632 const uint8_t *const swizzle = pvr_get_format_swizzle(info.format);
633 VkResult result;
634
635 memcpy(&info.swizzle, swizzle, sizeof(info.swizzle));
636
637 /* TODO: Can we use image_view->texture_state instead of generating here? */
638 result = pvr_pack_tex_state(device, &info, descriptor->image);
639 if (result != VK_SUCCESS)
640 return result;
641
642 descriptor->sampler = (union pvr_sampler_descriptor){ 0 };
643
644 pvr_csb_pack (&descriptor->sampler.data.sampler_word,
645 TEXSTATE_SAMPLER,
646 sampler) {
647 sampler.non_normalized_coords = true;
648 sampler.addrmode_v = ROGUE_TEXSTATE_ADDRMODE_CLAMP_TO_EDGE;
649 sampler.addrmode_u = ROGUE_TEXSTATE_ADDRMODE_CLAMP_TO_EDGE;
650 sampler.minfilter = ROGUE_TEXSTATE_FILTER_POINT;
651 sampler.magfilter = ROGUE_TEXSTATE_FILTER_POINT;
652 sampler.dadjust = ROGUE_TEXSTATE_DADJUST_ZERO_UINT;
653 }
654
655 return VK_SUCCESS;
656 }
657
658 static VkResult
659 pvr_load_op_constants_create_and_upload(struct pvr_cmd_buffer *cmd_buffer,
660 const struct pvr_load_op *load_op,
661 pvr_dev_addr_t *const addr_out)
662 {
663 const struct pvr_render_pass_info *render_pass_info =
664 &cmd_buffer->state.render_pass_info;
665 const struct pvr_render_pass *pass = render_pass_info->pass;
666 const struct pvr_renderpass_hwsetup_render *hw_render = load_op->hw_render;
667 const struct pvr_renderpass_colorinit *color_init =
668 &hw_render->color_init[0];
669 const VkClearValue *clear_value =
670 &render_pass_info->clear_values[color_init->index];
671 struct pvr_suballoc_bo *clear_bo;
672 uint32_t attachment_count;
673 bool has_depth_clear;
674 bool has_depth_load;
675 VkResult result;
676
677 /* These are only set up and never used for now. They will need to be
678 * uploaded into a buffer based on some compiler info.
679 */
680 /* TODO: Remove the above comment once the compiler is hooked up and we're
681 * setting up + uploading the buffer.
682 */
683 struct pvr_combined_image_sampler_descriptor
684 texture_states[PVR_LOAD_OP_CLEARS_LOADS_MAX_RTS];
685 uint32_t texture_count = 0;
686 uint32_t hw_clear_value[PVR_LOAD_OP_CLEARS_LOADS_MAX_RTS *
687 PVR_CLEAR_COLOR_ARRAY_SIZE];
688 uint32_t next_clear_consts = 0;
689
690 if (load_op->is_hw_object)
691 attachment_count = load_op->hw_render->color_init_count;
692 else
693 attachment_count = load_op->subpass->color_count;
694
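/* Walk the color attachments: loads get a texture state entry, clears get
 * their color packed into hw_clear_value, with each clear consuming as many
 * 32-bit words as its PBE accumulation format requires.
 */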
695 for (uint32_t i = 0; i < attachment_count; i++) {
696 struct pvr_image_view *image_view;
697 uint32_t attachment_idx;
698
699 if (load_op->is_hw_object)
700 attachment_idx = load_op->hw_render->color_init[i].index;
701 else
702 attachment_idx = load_op->subpass->color_attachments[i];
703
704 image_view = render_pass_info->attachments[attachment_idx];
705
706 assert((load_op->clears_loads_state.rt_load_mask &
707 load_op->clears_loads_state.rt_clear_mask) == 0);
708 if (load_op->clears_loads_state.rt_load_mask & BITFIELD_BIT(i)) {
709 result = pvr_setup_texture_state_words(cmd_buffer->device,
710 &texture_states[texture_count],
711 image_view);
712 if (result != VK_SUCCESS)
713 return result;
714
715 texture_count++;
716 } else if (load_op->clears_loads_state.rt_clear_mask & BITFIELD_BIT(i)) {
717 const uint32_t accum_fmt_size =
718 pvr_get_pbe_accum_format_size_in_bytes(image_view->vk.format);
719
720 assert(next_clear_consts +
721 vk_format_get_blocksize(image_view->vk.format) <=
722 ARRAY_SIZE(hw_clear_value));
723
724 /* FIXME: do this at the point we store the clear values? */
725 pvr_get_hw_clear_color(image_view->vk.format,
726 clear_value->color,
727 &hw_clear_value[next_clear_consts]);
728
729 next_clear_consts += DIV_ROUND_UP(accum_fmt_size, sizeof(uint32_t));
730 }
731 }
732
733 has_depth_load = false;
734 for (uint32_t i = 0;
735 i < ARRAY_SIZE(load_op->clears_loads_state.dest_vk_format);
736 i++) {
737 if (load_op->clears_loads_state.dest_vk_format[i] ==
738 VK_FORMAT_D32_SFLOAT) {
739 has_depth_load = true;
740 break;
741 }
742 }
743
744 has_depth_clear = load_op->clears_loads_state.depth_clear_to_reg != -1;
745
746 assert(!(has_depth_clear && has_depth_load));
747
748 if (has_depth_load) {
749 const struct pvr_render_pass_attachment *attachment;
750 const struct pvr_image_view *image_view;
751
752 assert(load_op->subpass->depth_stencil_attachment !=
753 VK_ATTACHMENT_UNUSED);
754 assert(!load_op->is_hw_object);
755 attachment =
756 &pass->attachments[load_op->subpass->depth_stencil_attachment];
757
758 image_view = render_pass_info->attachments[attachment->index];
759
760 result = pvr_setup_texture_state_words(cmd_buffer->device,
761 &texture_states[texture_count],
762 image_view);
763 if (result != VK_SUCCESS)
764 return result;
765
766 texture_count++;
767 } else if (has_depth_clear) {
768 const struct pvr_render_pass_attachment *attachment;
769 VkClearValue clear_value;
770
771 assert(load_op->subpass->depth_stencil_attachment !=
772 VK_ATTACHMENT_UNUSED);
773 attachment =
774 &pass->attachments[load_op->subpass->depth_stencil_attachment];
775
776 clear_value = render_pass_info->clear_values[attachment->index];
777
778 assert(next_clear_consts < ARRAY_SIZE(hw_clear_value));
779 hw_clear_value[next_clear_consts++] = fui(clear_value.depthStencil.depth);
780 }
781
782 result = pvr_cmd_buffer_upload_general(cmd_buffer,
783 &hw_clear_value[0],
784 sizeof(hw_clear_value),
785 &clear_bo);
786 if (result != VK_SUCCESS)
787 return result;
788
789 *addr_out = clear_bo->dev_addr;
790
791 return VK_SUCCESS;
792 }
793
794 static VkResult pvr_load_op_pds_data_create_and_upload(
795 struct pvr_cmd_buffer *cmd_buffer,
796 const struct pvr_load_op *load_op,
797 pvr_dev_addr_t constants_addr,
798 struct pvr_pds_upload *const pds_upload_out)
799 {
800 struct pvr_device *device = cmd_buffer->device;
801 const struct pvr_device_info *dev_info = &device->pdevice->dev_info;
802 struct pvr_pds_pixel_shader_sa_program program = { 0 };
803 uint32_t staging_buffer_size;
804 uint32_t *staging_buffer;
805 VkResult result;
806
807 program.num_texture_dma_kicks = 1;
808
809 pvr_csb_pack (&program.texture_dma_address[0],
810 PDSINST_DOUT_FIELDS_DOUTD_SRC0,
811 value) {
812 value.sbase = constants_addr;
813 }
814
815 pvr_csb_pack (&program.texture_dma_control[0],
816 PDSINST_DOUT_FIELDS_DOUTD_SRC1,
817 value) {
818 value.dest = ROGUE_PDSINST_DOUTD_DEST_COMMON_STORE;
819 value.a0 = load_op->shareds_dest_offset;
820 value.bsize = load_op->shareds_count;
821 }
822
823 pvr_pds_set_sizes_pixel_shader_sa_texture_data(&program, dev_info);
824
825 staging_buffer_size = PVR_DW_TO_BYTES(program.data_size);
826
827 staging_buffer = vk_alloc(&cmd_buffer->vk.pool->alloc,
828 staging_buffer_size,
829 8,
830 VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
831 if (!staging_buffer)
832 return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
833
834 pvr_pds_generate_pixel_shader_sa_texture_state_data(&program,
835 staging_buffer,
836 dev_info);
837
838 result = pvr_cmd_buffer_upload_pds_data(cmd_buffer,
839 staging_buffer,
840 program.data_size,
841 1,
842 pds_upload_out);
843 if (result != VK_SUCCESS) {
844 vk_free(&cmd_buffer->vk.pool->alloc, staging_buffer);
845 return result;
846 }
847
848 vk_free(&cmd_buffer->vk.pool->alloc, staging_buffer);
849
850 return VK_SUCCESS;
851 }
852
853 /* FIXME: Should this function be specific to the HW background object, in
854 * which case its name should be changed, or should it have the load op
855 * structure passed in?
856 */
857 static VkResult
858 pvr_load_op_data_create_and_upload(struct pvr_cmd_buffer *cmd_buffer,
859 const struct pvr_load_op *load_op,
860 struct pvr_pds_upload *const pds_upload_out)
861 {
862 pvr_dev_addr_t constants_addr;
863 VkResult result;
864
865 result = pvr_load_op_constants_create_and_upload(cmd_buffer,
866 load_op,
867 &constants_addr);
868 if (result != VK_SUCCESS)
869 return result;
870
871 return pvr_load_op_pds_data_create_and_upload(cmd_buffer,
872 load_op,
873 constants_addr,
874 pds_upload_out);
875 }
876
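/* Pack the background object PDS control words (CR_PDS_BGRND0/1/3) from the
 * load op's PDS programs and the freshly uploaded texture data segment.
 */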
877 static void pvr_pds_bgnd_pack_state(
878 const struct pvr_load_op *load_op,
879 const struct pvr_pds_upload *load_op_program,
880 uint64_t pds_reg_values[static const ROGUE_NUM_CR_PDS_BGRND_WORDS])
881 {
882 pvr_csb_pack (&pds_reg_values[0], CR_PDS_BGRND0_BASE, value) {
883 value.shader_addr = PVR_DEV_ADDR(load_op->pds_frag_prog.data_offset);
884 value.texunicode_addr =
885 PVR_DEV_ADDR(load_op->pds_tex_state_prog.code_offset);
886 }
887
888 pvr_csb_pack (&pds_reg_values[1], CR_PDS_BGRND1_BASE, value) {
889 value.texturedata_addr = PVR_DEV_ADDR(load_op_program->data_offset);
890 }
891
892 pvr_csb_pack (&pds_reg_values[2], CR_PDS_BGRND3_SIZEINFO, value) {
893 value.usc_sharedsize =
894 DIV_ROUND_UP(load_op->const_shareds_count,
895 ROGUE_CR_PDS_BGRND3_SIZEINFO_USC_SHAREDSIZE_UNIT_SIZE);
896 value.pds_texturestatesize = DIV_ROUND_UP(
897 load_op_program->data_size,
898 ROGUE_CR_PDS_BGRND3_SIZEINFO_PDS_TEXTURESTATESIZE_UNIT_SIZE);
899 value.pds_tempsize =
900 DIV_ROUND_UP(load_op->temps_count,
901 ROGUE_CR_PDS_BGRND3_SIZEINFO_PDS_TEMPSIZE_UNIT_SIZE);
902 }
903 }
904
905 /**
906 * \brief Calculates the stride in pixels based on the pitch in bytes and pixel
907 * format.
908 *
909 * \param[in] pitch Width pitch in bytes.
910 * \param[in] vk_format Vulkan image format.
911 * \return Stride in pixels.
912 */
913 static inline uint32_t pvr_stride_from_pitch(uint32_t pitch, VkFormat vk_format)
914 {
915 const unsigned int cpp = vk_format_get_blocksize(vk_format);
916
917 assert(pitch % cpp == 0);
918
919 return pitch / cpp;
920 }
921
922 static void pvr_setup_pbe_state(
923 const struct pvr_device_info *dev_info,
924 const struct pvr_framebuffer *framebuffer,
925 uint32_t mrt_index,
926 const struct usc_mrt_resource *mrt_resource,
927 const struct pvr_image_view *const iview,
928 const VkRect2D *render_area,
929 const bool down_scale,
930 const uint32_t samples,
931 uint32_t pbe_cs_words[static const ROGUE_NUM_PBESTATE_STATE_WORDS],
932 uint64_t pbe_reg_words[static const ROGUE_NUM_PBESTATE_REG_WORDS])
933 {
934 const struct pvr_image *image = pvr_image_view_get_image(iview);
935 uint32_t level_pitch = image->mip_levels[iview->vk.base_mip_level].pitch;
936
937 struct pvr_pbe_surf_params surface_params;
938 struct pvr_pbe_render_params render_params;
939 bool with_packed_usc_channel;
940 const uint8_t *swizzle;
941 uint32_t position;
942
943 /* down_scale should be true when performing a resolve, in which case there
944 * should be more than one sample.
945 */
946 assert((down_scale && samples > 1U) || (!down_scale && samples == 1U));
947
948 /* Setup surface parameters. */
949
950 if (PVR_HAS_FEATURE(dev_info, usc_f16sop_u8)) {
951 with_packed_usc_channel = vk_format_is_unorm(iview->vk.format) ||
952 vk_format_is_snorm(iview->vk.format);
953 } else {
954 with_packed_usc_channel = false;
955 }
956
957 swizzle = pvr_get_format_swizzle(iview->vk.format);
958 memcpy(surface_params.swizzle, swizzle, sizeof(surface_params.swizzle));
959
960 pvr_pbe_get_src_format_and_gamma(iview->vk.format,
961 PVR_PBE_GAMMA_NONE,
962 with_packed_usc_channel,
963 &surface_params.source_format,
964 &surface_params.gamma);
965
966 surface_params.is_normalized =
967 pvr_vk_format_is_fully_normalized(iview->vk.format);
968 surface_params.pbe_packmode = pvr_get_pbe_packmode(iview->vk.format);
969 surface_params.nr_components = vk_format_get_nr_components(iview->vk.format);
970
971 /* FIXME: Should we have an inline function to return the address of a mip
972 * level?
973 */
974 surface_params.addr =
975 PVR_DEV_ADDR_OFFSET(image->vma->dev_addr,
976 image->mip_levels[iview->vk.base_mip_level].offset);
977 surface_params.addr =
978 PVR_DEV_ADDR_OFFSET(surface_params.addr,
979 iview->vk.base_array_layer * image->layer_size);
980
981 surface_params.mem_layout = image->memlayout;
982 surface_params.stride = pvr_stride_from_pitch(level_pitch, iview->vk.format);
983 surface_params.depth = iview->vk.extent.depth;
984 surface_params.width = iview->vk.extent.width;
985 surface_params.height = iview->vk.extent.height;
986 surface_params.z_only_render = false;
987 surface_params.down_scale = down_scale;
988
989 /* Setup render parameters. */
990
991 if (mrt_resource->type == USC_MRT_RESOURCE_TYPE_MEMORY) {
992 position = mrt_resource->mem.offset_dw;
993 } else {
994 assert(mrt_resource->type == USC_MRT_RESOURCE_TYPE_OUTPUT_REG);
995 assert(mrt_resource->reg.offset == 0);
996
997 position = mrt_resource->reg.output_reg;
998 }
999
1000 assert(position <= 3 || PVR_HAS_FEATURE(dev_info, eight_output_registers));
1001
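/* Each output register is 32 bits wide, so the PBE reads source data starting
 * at bit 32 * (position % 4); positions 4-7 are only reachable on cores with
 * the eight_output_registers feature.
 */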
1002 switch (position) {
1003 case 0:
1004 case 4:
1005 render_params.source_start = PVR_PBE_STARTPOS_BIT0;
1006 break;
1007 case 1:
1008 case 5:
1009 render_params.source_start = PVR_PBE_STARTPOS_BIT32;
1010 break;
1011 case 2:
1012 case 6:
1013 render_params.source_start = PVR_PBE_STARTPOS_BIT64;
1014 break;
1015 case 3:
1016 case 7:
1017 render_params.source_start = PVR_PBE_STARTPOS_BIT96;
1018 break;
1019 default:
1020 assert(!"Invalid output register");
1021 break;
1022 }
1023
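/* The PBE clip bounds are inclusive, so convert the exclusive render area
 * bound (offset + extent) to an inclusive maximum and clamp it to the
 * framebuffer size. PVR_DEC_IF_NOT_ZERO avoids underflow when the bound is
 * zero.
 */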
1024 #define PVR_DEC_IF_NOT_ZERO(_v) (((_v) > 0) ? (_v)-1 : 0)
1025
1026 render_params.min_x_clip = MAX2(0, render_area->offset.x);
1027 render_params.min_y_clip = MAX2(0, render_area->offset.y);
1028 render_params.max_x_clip = MIN2(
1029 framebuffer->width - 1,
1030 PVR_DEC_IF_NOT_ZERO(render_area->offset.x + render_area->extent.width));
1031 render_params.max_y_clip = MIN2(
1032 framebuffer->height - 1,
1033 PVR_DEC_IF_NOT_ZERO(render_area->offset.y + render_area->extent.height));
1034
1035 #undef PVR_DEC_IF_NOT_ZERO
1036
1037 render_params.slice = 0;
1038 render_params.mrt_index = mrt_index;
1039
1040 pvr_pbe_pack_state(dev_info,
1041 &surface_params,
1042 &render_params,
1043 pbe_cs_words,
1044 pbe_reg_words);
1045 }
1046
1047 static struct pvr_render_target *
1048 pvr_get_render_target(const struct pvr_render_pass *pass,
1049 const struct pvr_framebuffer *framebuffer,
1050 uint32_t idx)
1051 {
1052 const struct pvr_renderpass_hwsetup_render *hw_render =
1053 &pass->hw_setup->renders[idx];
1054 uint32_t rt_idx = 0;
1055
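/* The framebuffer's render targets are indexed by log2 of the sample count
 * (1/2/4/8 samples map to indices 0-3).
 */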
1056 switch (hw_render->sample_count) {
1057 case 1:
1058 case 2:
1059 case 4:
1060 case 8:
1061 rt_idx = util_logbase2(hw_render->sample_count);
1062 break;
1063
1064 default:
1065 unreachable("Unsupported sample count");
1066 break;
1067 }
1068
1069 return &framebuffer->render_targets[rt_idx];
1070 }
1071
1072 static uint32_t
1073 pvr_pass_get_pixel_output_width(const struct pvr_render_pass *pass,
1074 uint32_t idx,
1075 const struct pvr_device_info *dev_info)
1076 {
1077 const struct pvr_renderpass_hwsetup_render *hw_render =
1078 &pass->hw_setup->renders[idx];
1079 /* Default value based on the maximum found across all existing cores. The
1080 * maximum is used because this value acts as a lower bound, making it a
1081 * "safer" choice than the minimum found across all existing cores.
1082 */
1083 const uint32_t min_output_regs =
1084 PVR_GET_FEATURE_VALUE(dev_info, usc_min_output_registers_per_pix, 2U);
1085 const uint32_t width = MAX2(hw_render->output_regs_count, min_output_regs);
1086
1087 return util_next_power_of_two(width);
1088 }
1089
1090 static inline bool
1091 pvr_ds_attachment_requires_zls(const struct pvr_ds_attachment *attachment)
1092 {
1093 bool zls_used;
1094
1095 zls_used = attachment->load.d || attachment->load.s;
1096 zls_used |= attachment->store.d || attachment->store.s;
1097
1098 return zls_used;
1099 }
1100
1101 /**
1102 * \brief If depth and/or stencil attachment dimensions are not tile-aligned,
1103 * then we may need to insert some additional transfer subcommands.
1104 *
1105 * It's worth noting that we check whether the dimensions are smaller than a
1106 * tile here, rather than checking whether they're tile-aligned - this relies
1107 * on the assumption that we can safely use any attachment with dimensions
1108 * larger than a tile. If the attachment is twiddled, it will be over-allocated
1109 * to the nearest power-of-two (which will be tile-aligned). If the attachment
1110 * is not twiddled, we don't need to worry about tile-alignment at all.
1111 */
1112 static bool pvr_sub_cmd_gfx_requires_ds_subtile_alignment(
1113 const struct pvr_device_info *dev_info,
1114 const struct pvr_render_job *job)
1115 {
1116 const struct pvr_image *const ds_image =
1117 pvr_image_view_get_image(job->ds.iview);
1118 uint32_t zls_tile_size_x;
1119 uint32_t zls_tile_size_y;
1120
1121 rogue_get_zls_tile_size_xy(dev_info, &zls_tile_size_x, &zls_tile_size_y);
1122
1123 if (ds_image->physical_extent.width >= zls_tile_size_x &&
1124 ds_image->physical_extent.height >= zls_tile_size_y) {
1125 return false;
1126 }
1127
1128 /* If we have the zls_subtile feature, we can skip the alignment iff:
1129 * - The attachment is not multisampled, and
1130 * - The depth and stencil attachments are the same.
1131 */
1132 if (PVR_HAS_FEATURE(dev_info, zls_subtile) &&
1133 ds_image->vk.samples == VK_SAMPLE_COUNT_1_BIT &&
1134 job->has_stencil_attachment == job->has_depth_attachment) {
1135 return false;
1136 }
1137
1138 /* No ZLS functions enabled; nothing to do. */
1139 if ((!job->has_depth_attachment && !job->has_stencil_attachment) ||
1140 !pvr_ds_attachment_requires_zls(&job->ds)) {
1141 return false;
1142 }
1143
1144 return true;
1145 }
1146
1147 static VkResult
1148 pvr_sub_cmd_gfx_align_ds_subtiles(struct pvr_cmd_buffer *const cmd_buffer,
1149 struct pvr_sub_cmd_gfx *const gfx_sub_cmd)
1150 {
1151 struct pvr_sub_cmd *const prev_sub_cmd =
1152 container_of(gfx_sub_cmd, struct pvr_sub_cmd, gfx);
1153 struct pvr_ds_attachment *const ds = &gfx_sub_cmd->job.ds;
1154 const struct pvr_image *const ds_image = pvr_image_view_get_image(ds->iview);
1155 const VkFormat copy_format = pvr_get_raw_copy_format(ds_image->vk.format);
1156
1157 struct pvr_suballoc_bo *buffer;
1158 uint32_t buffer_layer_size;
1159 VkBufferImageCopy2 region;
1160 VkExtent2D zls_tile_size;
1161 VkExtent2D rounded_size;
1162 uint32_t buffer_size;
1163 VkExtent2D scale;
1164 VkResult result;
1165
1166 /* The operations below assume the last command in the buffer was the target
1167 * gfx subcommand. Assert that this is the case.
1168 */
1169 assert(list_last_entry(&cmd_buffer->sub_cmds, struct pvr_sub_cmd, link) ==
1170 prev_sub_cmd);
1171 assert(prev_sub_cmd == cmd_buffer->state.current_sub_cmd);
1172
1173 if (!pvr_ds_attachment_requires_zls(ds))
1174 return VK_SUCCESS;
1175
1176 rogue_get_zls_tile_size_xy(&cmd_buffer->device->pdevice->dev_info,
1177 &zls_tile_size.width,
1178 &zls_tile_size.height);
1179 rogue_get_isp_scale_xy_from_samples(ds_image->vk.samples,
1180 &scale.width,
1181 &scale.height);
1182
1183 rounded_size = (VkExtent2D){
1184 .width = ALIGN_POT(ds_image->physical_extent.width, zls_tile_size.width),
1185 .height =
1186 ALIGN_POT(ds_image->physical_extent.height, zls_tile_size.height),
1187 };
1188
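/* Size each layer for the tile-aligned (rounded) extent, scaled up by the ISP
 * sample-rate multipliers for multisampled surfaces.
 */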
1189 buffer_layer_size = vk_format_get_blocksize(ds_image->vk.format) *
1190 rounded_size.width * rounded_size.height * scale.width *
1191 scale.height;
1192
1193 if (ds->iview->vk.layer_count > 1)
1194 buffer_layer_size = ALIGN_POT(buffer_layer_size, ds_image->alignment);
1195
1196 buffer_size = buffer_layer_size * ds->iview->vk.layer_count;
1197
1198 result = pvr_cmd_buffer_alloc_mem(cmd_buffer,
1199 cmd_buffer->device->heaps.general_heap,
1200 buffer_size,
1201 &buffer);
1202 if (result != VK_SUCCESS)
1203 return result;
1204
1205 region = (VkBufferImageCopy2){
1206 .sType = VK_STRUCTURE_TYPE_BUFFER_IMAGE_COPY_2,
1207 .pNext = NULL,
1208 .bufferOffset = 0,
1209 .bufferRowLength = rounded_size.width,
1210 .bufferImageHeight = 0,
1211 .imageSubresource = {
1212 .aspectMask = VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT,
1213 .mipLevel = ds->iview->vk.base_mip_level,
1214 .baseArrayLayer = ds->iview->vk.base_array_layer,
1215 .layerCount = ds->iview->vk.layer_count,
1216 },
1217 .imageOffset = { 0 },
1218 .imageExtent = {
1219 .width = ds->iview->vk.extent.width,
1220 .height = ds->iview->vk.extent.height,
1221 .depth = 1,
1222 },
1223 };
1224
1225 if (ds->load.d || ds->load.s) {
1226 struct pvr_sub_cmd *new_sub_cmd;
1227
1228 cmd_buffer->state.current_sub_cmd = NULL;
1229
1230 result =
1231 pvr_cmd_buffer_start_sub_cmd(cmd_buffer, PVR_SUB_CMD_TYPE_TRANSFER);
1232 if (result != VK_SUCCESS)
1233 return result;
1234
1235 new_sub_cmd = cmd_buffer->state.current_sub_cmd;
1236
1237 result = pvr_copy_image_to_buffer_region_format(cmd_buffer,
1238 ds_image,
1239 buffer->dev_addr,
1240 &region,
1241 copy_format,
1242 copy_format);
1243 if (result != VK_SUCCESS)
1244 return result;
1245
1246 new_sub_cmd->transfer.serialize_with_frag = true;
1247
1248 result = pvr_cmd_buffer_end_sub_cmd(cmd_buffer);
1249 if (result != VK_SUCCESS)
1250 return result;
1251
1252 /* Now we have to fiddle with cmd_buffer to place this transfer command
1253 * *before* the target gfx subcommand.
1254 *
1255 * Note the doc for list_move_to() is subtly wrong - item is placed
1256 * directly *after* loc in the list, not "in front of".
1257 */
1258 list_move_to(&new_sub_cmd->link, prev_sub_cmd->link.prev);
1259
1260 cmd_buffer->state.current_sub_cmd = prev_sub_cmd;
1261 }
1262
1263 if (ds->store.d || ds->store.s) {
1264 cmd_buffer->state.current_sub_cmd = NULL;
1265
1266 result =
1267 pvr_cmd_buffer_start_sub_cmd(cmd_buffer, PVR_SUB_CMD_TYPE_TRANSFER);
1268 if (result != VK_SUCCESS)
1269 return result;
1270
1271 result = pvr_copy_buffer_to_image_region_format(cmd_buffer,
1272 buffer->dev_addr,
1273 ds_image,
1274 &region,
1275 copy_format,
1276 copy_format,
1277 0);
1278 if (result != VK_SUCCESS)
1279 return result;
1280
1281 cmd_buffer->state.current_sub_cmd->transfer.serialize_with_frag = true;
1282
1283 result = pvr_cmd_buffer_end_sub_cmd(cmd_buffer);
1284 if (result != VK_SUCCESS)
1285 return result;
1286
1287 cmd_buffer->state.current_sub_cmd = prev_sub_cmd;
1288 }
1289
1290 /* Finally, patch up the target graphics sub_cmd to use the correctly-strided
1291 * buffer.
1292 */
1293 ds->has_alignment_transfers = true;
1294 ds->addr = buffer->dev_addr;
1295 ds->physical_extent = rounded_size;
1296
1297 gfx_sub_cmd->wait_on_previous_transfer = true;
1298
1299 return VK_SUCCESS;
1300 }
1301
1302 struct pvr_emit_state {
1303 uint32_t pbe_cs_words[PVR_MAX_COLOR_ATTACHMENTS]
1304 [ROGUE_NUM_PBESTATE_STATE_WORDS];
1305
1306 uint64_t pbe_reg_words[PVR_MAX_COLOR_ATTACHMENTS]
1307 [ROGUE_NUM_PBESTATE_REG_WORDS];
1308
1309 uint32_t emit_count;
1310 };
1311
1312 static void
1313 pvr_setup_emit_state(const struct pvr_device_info *dev_info,
1314 const struct pvr_renderpass_hwsetup_render *hw_render,
1315 struct pvr_render_pass_info *render_pass_info,
1316 struct pvr_emit_state *emit_state)
1317 {
1318 assert(hw_render->pbe_emits <= PVR_NUM_PBE_EMIT_REGS);
1319
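/* With no EOT surfaces there is nothing to write out, so emit a single PBE
 * state word marked as an empty tile.
 */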
1320 if (hw_render->eot_surface_count == 0) {
1321 emit_state->emit_count = 1;
1322 pvr_csb_pack (&emit_state->pbe_cs_words[0][1],
1323 PBESTATE_STATE_WORD1,
1324 state) {
1325 state.emptytile = true;
1326 }
1327 return;
1328 }
1329
1330 static_assert(USC_MRT_RESOURCE_TYPE_OUTPUT_REG + 1 ==
1331 USC_MRT_RESOURCE_TYPE_MEMORY,
1332 "The loop below needs adjusting.");
1333
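/* Emit PBE state for output-register MRT resources first, then memory-backed
 * ones, matching each EOT surface against the current resource type.
 */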
1334 emit_state->emit_count = 0;
1335 for (uint32_t resource_type = USC_MRT_RESOURCE_TYPE_OUTPUT_REG;
1336 resource_type <= USC_MRT_RESOURCE_TYPE_MEMORY;
1337 resource_type++) {
1338 for (uint32_t i = 0; i < hw_render->eot_surface_count; i++) {
1339 const struct pvr_framebuffer *framebuffer =
1340 render_pass_info->framebuffer;
1341 const struct pvr_renderpass_hwsetup_eot_surface *surface =
1342 &hw_render->eot_surfaces[i];
1343 const struct pvr_image_view *iview =
1344 render_pass_info->attachments[surface->attachment_idx];
1345 const struct usc_mrt_resource *mrt_resource =
1346 &hw_render->eot_setup.mrt_resources[surface->mrt_idx];
1347 uint32_t samples = 1;
1348
1349 if (mrt_resource->type != resource_type)
1350 continue;
1351
1352 if (surface->need_resolve) {
1353 const struct pvr_image_view *resolve_src =
1354 render_pass_info->attachments[surface->src_attachment_idx];
1355
1356 /* Attachments that are the destination of resolve operations must
1357 * be loaded before their next use.
1358 */
1359 render_pass_info->enable_bg_tag = true;
1360 render_pass_info->process_empty_tiles = true;
1361
1362 if (surface->resolve_type != PVR_RESOLVE_TYPE_PBE)
1363 continue;
1364
1365 samples = (uint32_t)resolve_src->vk.image->samples;
1366 }
1367
1368 assert(emit_state->emit_count < ARRAY_SIZE(emit_state->pbe_cs_words));
1369 assert(emit_state->emit_count < ARRAY_SIZE(emit_state->pbe_reg_words));
1370
1371 pvr_setup_pbe_state(dev_info,
1372 framebuffer,
1373 emit_state->emit_count,
1374 mrt_resource,
1375 iview,
1376 &render_pass_info->render_area,
1377 surface->need_resolve,
1378 samples,
1379 emit_state->pbe_cs_words[emit_state->emit_count],
1380 emit_state->pbe_reg_words[emit_state->emit_count]);
1381 emit_state->emit_count += 1;
1382 }
1383 }
1384
1385 assert(emit_state->emit_count == hw_render->pbe_emits);
1386 }
1387
1388 static inline bool
1389 pvr_is_render_area_tile_aligned(const struct pvr_cmd_buffer *cmd_buffer,
1390 const struct pvr_image_view *iview)
1391 {
1392 const VkRect2D *render_area =
1393 &cmd_buffer->state.render_pass_info.render_area;
1394
1395 return render_area->offset.x == 0 && render_area->offset.y == 0 &&
1396 render_area->extent.height == iview->vk.extent.height &&
1397 render_area->extent.width == iview->vk.extent.width;
1398 }
1399
1400 static VkResult pvr_sub_cmd_gfx_job_init(const struct pvr_device_info *dev_info,
1401 struct pvr_cmd_buffer *cmd_buffer,
1402 struct pvr_sub_cmd_gfx *sub_cmd)
1403 {
1404 static const VkClearDepthStencilValue default_ds_clear_value = {
1405 .depth = 1.0f,
1406 .stencil = 0xFFFFFFFF,
1407 };
1408
1409 const struct vk_dynamic_graphics_state *dynamic_state =
1410 &cmd_buffer->vk.dynamic_graphics_state;
1411 struct pvr_render_pass_info *render_pass_info =
1412 &cmd_buffer->state.render_pass_info;
1413 const struct pvr_renderpass_hwsetup_render *hw_render =
1414 &render_pass_info->pass->hw_setup->renders[sub_cmd->hw_render_idx];
1415 struct pvr_render_job *job = &sub_cmd->job;
1416 struct pvr_pds_upload pds_pixel_event_program;
1417 struct pvr_framebuffer *framebuffer = render_pass_info->framebuffer;
1418 struct pvr_spm_bgobj_state *spm_bgobj_state =
1419 &framebuffer->spm_bgobj_state_per_render[sub_cmd->hw_render_idx];
1420 struct pvr_render_target *render_target;
1421 VkResult result;
1422
1423 if (sub_cmd->barrier_store) {
1424 /* There can only ever be one frag job running on the hardware at any one
1425 * time, and a context switch is not allowed mid-tile, so instead of
1426 * allocating a new scratch buffer we can reuse the SPM scratch buffer to
1427 * perform the store.
1428 * So use the SPM EOT program with the SPM PBE reg words in order to store
1429 * the render to the SPM scratch buffer.
1430 */
1431
1432 memcpy(job->pbe_reg_words,
1433 &framebuffer->spm_eot_state_per_render[0].pbe_reg_words,
1434 sizeof(job->pbe_reg_words));
1435 job->pds_pixel_event_data_offset =
1436 framebuffer->spm_eot_state_per_render[0]
1437 .pixel_event_program_data_offset;
1438 } else {
1439 struct pvr_emit_state emit_state = { 0 };
1440
1441 pvr_setup_emit_state(dev_info, hw_render, render_pass_info, &emit_state);
1442
1443 memcpy(job->pbe_reg_words,
1444 emit_state.pbe_reg_words,
1445 sizeof(job->pbe_reg_words));
1446
1447 result = pvr_sub_cmd_gfx_per_job_fragment_programs_create_and_upload(
1448 cmd_buffer,
1449 emit_state.emit_count,
1450 emit_state.pbe_cs_words[0],
1451 &pds_pixel_event_program);
1452 if (result != VK_SUCCESS)
1453 return result;
1454
1455 job->pds_pixel_event_data_offset = pds_pixel_event_program.data_offset;
1456 }
1457
1458 if (sub_cmd->barrier_load) {
1459 job->enable_bg_tag = true;
1460 job->process_empty_tiles = true;
1461
1462 /* Load the previously stored render from the SPM scratch buffer. */
1463
1464 STATIC_ASSERT(ARRAY_SIZE(job->pds_bgnd_reg_values) ==
1465 ARRAY_SIZE(spm_bgobj_state->pds_reg_values));
1466 typed_memcpy(job->pds_bgnd_reg_values,
1467 spm_bgobj_state->pds_reg_values,
1468 ARRAY_SIZE(spm_bgobj_state->pds_reg_values));
1469 } else if (hw_render->load_op) {
1470 const struct pvr_load_op *load_op = hw_render->load_op;
1471 struct pvr_pds_upload load_op_program;
1472
1473 /* Recalculate Background Object(s). */
1474
1475 /* FIXME: Should we free the PDS pixel event data or let it be freed
1476 * when the pool gets emptied?
1477 */
1478 result = pvr_load_op_data_create_and_upload(cmd_buffer,
1479 load_op,
1480 &load_op_program);
1481 if (result != VK_SUCCESS)
1482 return result;
1483
1484 job->enable_bg_tag = render_pass_info->enable_bg_tag;
1485 job->process_empty_tiles = render_pass_info->process_empty_tiles;
1486
1487 pvr_pds_bgnd_pack_state(load_op,
1488 &load_op_program,
1489 job->pds_bgnd_reg_values);
1490 }
1491
1492 /* TODO: In some cases a PR can be removed by storing to the color attachment
1493 * and have the background object load directly from it instead of using the
1494 * scratch buffer. In those cases we can also set this to "false" and avoid
1495 * extra fw overhead.
1496 */
1497 /* The scratch buffer is always needed and allocated to avoid data loss in
1498 * case SPM is hit so set the flag unconditionally.
1499 */
1500 job->requires_spm_scratch_buffer = true;
1501
1502 memcpy(job->pr_pbe_reg_words,
1503 &framebuffer->spm_eot_state_per_render[0].pbe_reg_words,
1504 sizeof(job->pbe_reg_words));
1505 job->pr_pds_pixel_event_data_offset =
1506 framebuffer->spm_eot_state_per_render[0].pixel_event_program_data_offset;
1507
1508 STATIC_ASSERT(ARRAY_SIZE(job->pds_pr_bgnd_reg_values) ==
1509 ARRAY_SIZE(spm_bgobj_state->pds_reg_values));
1510 typed_memcpy(job->pds_pr_bgnd_reg_values,
1511 spm_bgobj_state->pds_reg_values,
1512 ARRAY_SIZE(spm_bgobj_state->pds_reg_values));
1513
1514 render_target = pvr_get_render_target(render_pass_info->pass,
1515 framebuffer,
1516 sub_cmd->hw_render_idx);
1517 job->rt_dataset = render_target->rt_dataset;
1518
1519 job->ctrl_stream_addr = pvr_csb_get_start_address(&sub_cmd->control_stream);
1520
1521 if (sub_cmd->depth_bias_bo)
1522 job->depth_bias_table_addr = sub_cmd->depth_bias_bo->dev_addr;
1523 else
1524 job->depth_bias_table_addr = PVR_DEV_ADDR_INVALID;
1525
1526 if (sub_cmd->scissor_bo)
1527 job->scissor_table_addr = sub_cmd->scissor_bo->dev_addr;
1528 else
1529 job->scissor_table_addr = PVR_DEV_ADDR_INVALID;
1530
1531 job->pixel_output_width =
1532 pvr_pass_get_pixel_output_width(render_pass_info->pass,
1533 sub_cmd->hw_render_idx,
1534 dev_info);
1535
1536 /* Setup depth/stencil job information. */
1537 if (hw_render->ds_attach_idx != VK_ATTACHMENT_UNUSED) {
1538 struct pvr_image_view *ds_iview =
1539 render_pass_info->attachments[hw_render->ds_attach_idx];
1540 const struct pvr_image *ds_image = pvr_image_view_get_image(ds_iview);
1541
1542 job->has_depth_attachment = vk_format_has_depth(ds_image->vk.format);
1543 job->has_stencil_attachment = vk_format_has_stencil(ds_image->vk.format);
1544
1545 if (job->has_depth_attachment || job->has_stencil_attachment) {
1546 uint32_t level_pitch =
1547 ds_image->mip_levels[ds_iview->vk.base_mip_level].pitch;
1548 const bool render_area_is_tile_aligned =
1549 pvr_is_render_area_tile_aligned(cmd_buffer, ds_iview);
1550 bool store_was_optimised_out = false;
1551 bool d_store = false, s_store = false;
1552 bool d_load = false, s_load = false;
1553
1554 job->ds.iview = ds_iview;
1555 job->ds.addr = ds_image->dev_addr;
1556
1557 job->ds.stride =
1558 pvr_stride_from_pitch(level_pitch, ds_iview->vk.format);
1559 job->ds.height = ds_iview->vk.extent.height;
1560 job->ds.physical_extent = (VkExtent2D){
1561 .width = u_minify(ds_image->physical_extent.width,
1562 ds_iview->vk.base_mip_level),
1563 .height = u_minify(ds_image->physical_extent.height,
1564 ds_iview->vk.base_mip_level),
1565 };
1566 job->ds.layer_size = ds_image->layer_size;
1567
1568 job->ds_clear_value = default_ds_clear_value;
1569
1570 if (hw_render->ds_attach_idx < render_pass_info->clear_value_count) {
1571 const VkClearDepthStencilValue *const clear_values =
1572 &render_pass_info->clear_values[hw_render->ds_attach_idx]
1573 .depthStencil;
1574
1575 if (job->has_depth_attachment)
1576 job->ds_clear_value.depth = clear_values->depth;
1577
1578 if (job->has_stencil_attachment)
1579 job->ds_clear_value.stencil = clear_values->stencil;
1580 }
1581
1582 switch (ds_iview->vk.format) {
1583 case VK_FORMAT_D16_UNORM:
1584 job->ds.zls_format = ROGUE_CR_ZLS_FORMAT_TYPE_16BITINT;
1585 break;
1586
1587 case VK_FORMAT_S8_UINT:
1588 case VK_FORMAT_D32_SFLOAT:
1589 job->ds.zls_format = ROGUE_CR_ZLS_FORMAT_TYPE_F32Z;
1590 break;
1591
1592 case VK_FORMAT_D24_UNORM_S8_UINT:
1593 job->ds.zls_format = ROGUE_CR_ZLS_FORMAT_TYPE_24BITINT;
1594 break;
1595
1596 default:
1597 unreachable("Unsupported depth stencil format");
1598 }
1599
1600 job->ds.memlayout = ds_image->memlayout;
1601
1602 if (job->has_depth_attachment) {
1603 if (hw_render->depth_store || sub_cmd->barrier_store) {
1604 const bool depth_init_is_clear = hw_render->depth_init ==
1605 VK_ATTACHMENT_LOAD_OP_CLEAR;
1606
1607 d_store = true;
1608
1609 if (hw_render->depth_store && render_area_is_tile_aligned &&
1610 !(sub_cmd->modifies_depth || depth_init_is_clear)) {
1611 d_store = false;
1612 store_was_optimised_out = true;
1613 }
1614 }
1615
1616 if (d_store && !render_area_is_tile_aligned) {
1617 d_load = true;
1618 } else if (hw_render->depth_init == VK_ATTACHMENT_LOAD_OP_LOAD) {
1619 enum pvr_depth_stencil_usage depth_usage = sub_cmd->depth_usage;
1620
1621 assert(depth_usage != PVR_DEPTH_STENCIL_USAGE_UNDEFINED);
1622 d_load = (depth_usage != PVR_DEPTH_STENCIL_USAGE_NEVER);
1623 } else {
1624 d_load = sub_cmd->barrier_load;
1625 }
1626 }
1627
1628 if (job->has_stencil_attachment) {
1629 if (hw_render->stencil_store || sub_cmd->barrier_store) {
1630 const bool stencil_init_is_clear = hw_render->stencil_init ==
1631 VK_ATTACHMENT_LOAD_OP_CLEAR;
1632
1633 s_store = true;
1634
1635 if (hw_render->stencil_store && render_area_is_tile_aligned &&
1636 !(sub_cmd->modifies_stencil || stencil_init_is_clear)) {
1637 s_store = false;
1638 store_was_optimised_out = true;
1639 }
1640 }
1641
1642 if (s_store && !render_area_is_tile_aligned) {
1643 s_load = true;
1644 } else if (hw_render->stencil_init == VK_ATTACHMENT_LOAD_OP_LOAD) {
1645 enum pvr_depth_stencil_usage stencil_usage =
1646 sub_cmd->stencil_usage;
1647
1648 assert(stencil_usage != PVR_DEPTH_STENCIL_USAGE_UNDEFINED);
1649 s_load = (stencil_usage != PVR_DEPTH_STENCIL_USAGE_NEVER);
1650 } else {
1651 s_load = sub_cmd->barrier_load;
1652 }
1653 }
1654
1655 job->ds.load.d = d_load;
1656 job->ds.load.s = s_load;
1657 job->ds.store.d = d_store;
1658 job->ds.store.s = s_store;
1659
1660 /* ZLS can't do masked writes for packed depth stencil formats so if
1661 * we store anything, we have to store everything.
1662 */
1663 if ((job->ds.store.d || job->ds.store.s) &&
1664 pvr_zls_format_type_is_packed(job->ds.zls_format)) {
1665 job->ds.store.d = true;
1666 job->ds.store.s = true;
1667
1668 /* In case we are only operating on one aspect of the attachment we
1669 * need to load the unused one in order to preserve its contents due
1670 * to the forced store which might otherwise corrupt it.
1671 */
1672 if (hw_render->depth_init != VK_ATTACHMENT_LOAD_OP_CLEAR)
1673 job->ds.load.d = true;
1674
1675 if (hw_render->stencil_init != VK_ATTACHMENT_LOAD_OP_CLEAR)
1676 job->ds.load.s = true;
1677 }
1678
1679 if (pvr_ds_attachment_requires_zls(&job->ds) ||
1680 store_was_optimised_out) {
1681 job->process_empty_tiles = true;
1682 }
1683
1684 if (pvr_sub_cmd_gfx_requires_ds_subtile_alignment(dev_info, job)) {
1685 result = pvr_sub_cmd_gfx_align_ds_subtiles(cmd_buffer, sub_cmd);
1686 if (result != VK_SUCCESS)
1687 return result;
1688 }
1689 }
1690 } else {
1691 job->has_depth_attachment = false;
1692 job->has_stencil_attachment = false;
1693 job->ds_clear_value = default_ds_clear_value;
1694 }
1695
1696 if (hw_render->ds_attach_idx != VK_ATTACHMENT_UNUSED) {
1697 struct pvr_image_view *iview =
1698 render_pass_info->attachments[hw_render->ds_attach_idx];
1699 const struct pvr_image *image = pvr_image_view_get_image(iview);
1700
1701 /* If the HW render pass has a valid depth/stencil surface, determine the
1702 * sample count from the attachment's image.
1703 */
1704 job->samples = image->vk.samples;
1705 } else if (hw_render->output_regs_count) {
1706 /* If the HW render pass has output registers, we have color attachments
1707 * to write to, so determine the sample count from the count specified for
1708 * every color attachment in this render.
1709 */
1710 job->samples = hw_render->sample_count;
1711 } else if (cmd_buffer->state.gfx_pipeline) {
1712 /* If the HW render pass has no color or depth/stencil attachments, we
1713 * determine the sample count from the count given during pipeline
1714 * creation.
1715 */
1716 job->samples = dynamic_state->ms.rasterization_samples;
1717 } else if (render_pass_info->pass->attachment_count > 0) {
1718 /* If we get here, we have a render pass with subpasses containing no
1719 * attachments. The next best thing is the largest of the sample counts
1720 * specified by the render pass attachment descriptions.
1721 */
1722 job->samples = render_pass_info->pass->max_sample_count;
1723 } else {
1724 /* No appropriate framebuffer attachment is available. */
1725 mesa_logw("Defaulting render job sample count to 1.");
1726 job->samples = VK_SAMPLE_COUNT_1_BIT;
1727 }
1728
1729 if (sub_cmd->max_tiles_in_flight ==
1730 PVR_GET_FEATURE_VALUE(dev_info, isp_max_tiles_in_flight, 1U)) {
1731 /* Use the default limit based on the partition store. */
1732 job->max_tiles_in_flight = 0U;
1733 } else {
1734 job->max_tiles_in_flight = sub_cmd->max_tiles_in_flight;
1735 }
1736
1737 job->frag_uses_atomic_ops = sub_cmd->frag_uses_atomic_ops;
1738 job->disable_compute_overlap = false;
1739 job->max_shared_registers = cmd_buffer->state.max_shared_regs;
1740 job->run_frag = true;
1741 job->geometry_terminate = true;
1742
1743 /* TODO: Enable pixel merging when it's safe to do. */
1744 job->disable_pixel_merging = true;
1745
1746 return VK_SUCCESS;
1747 }
1748
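/* Record the number of shared registers this compute sub command needs (at
 * least as many as the device's IDF/WDF kernel uses) and reset the per
 * command buffer tracker ready for the next sub command.
 */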
1749 static void
1750 pvr_sub_cmd_compute_job_init(const struct pvr_physical_device *pdevice,
1751 struct pvr_cmd_buffer *cmd_buffer,
1752 struct pvr_sub_cmd_compute *sub_cmd)
1753 {
1754 sub_cmd->num_shared_regs = MAX2(cmd_buffer->device->idfwdf_state.usc_shareds,
1755 cmd_buffer->state.max_shared_regs);
1756
1757 cmd_buffer->state.max_shared_regs = 0U;
1758 }
1759
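/* Maximum pixel task allocation expressed in USC common store allocation
 * blocks; used below when reserving common store for overlapped pixel tasks.
 */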
1760 #define PIXEL_ALLOCATION_SIZE_MAX_IN_BLOCKS \
1761 (1024 / ROGUE_CDMCTRL_KERNEL0_USC_COMMON_SIZE_UNIT_SIZE)
1762
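/* Returns the maximum number of USC instances (the "flat slot size") a
 * compute slot may contain for a kernel with the given coefficient usage and
 * work-group size, taking barriers and hardware quirks into account.
 */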
1763 static uint32_t
1764 pvr_compute_flat_slot_size(const struct pvr_physical_device *pdevice,
1765 uint32_t coeff_regs_count,
1766 bool use_barrier,
1767 uint32_t total_workitems)
1768 {
1769 const struct pvr_device_runtime_info *dev_runtime_info =
1770 &pdevice->dev_runtime_info;
1771 const struct pvr_device_info *dev_info = &pdevice->dev_info;
1772 uint32_t max_workgroups_per_task = ROGUE_CDM_MAX_PACKED_WORKGROUPS_PER_TASK;
1773 uint32_t max_avail_coeff_regs =
1774 dev_runtime_info->cdm_max_local_mem_size_regs;
1775 uint32_t localstore_chunks_count =
1776 DIV_ROUND_UP(PVR_DW_TO_BYTES(coeff_regs_count),
1777 ROGUE_CDMCTRL_KERNEL0_USC_COMMON_SIZE_UNIT_SIZE);
1778
1779 /* Ensure that we cannot have more workgroups in a slot than the available
1780 * number of coefficients allow us to have.
1781 */
1782 if (coeff_regs_count > 0U) {
1783 /* If the geometry or fragment jobs can overlap with the compute job, or
1784 * if there is a vertex shader already running then we need to consider
1785 * this in calculating max allowed work-groups.
1786 */
1787 if (PVR_HAS_QUIRK(dev_info, 52354) &&
1788 (PVR_HAS_FEATURE(dev_info, compute_overlap) ||
1789 PVR_HAS_FEATURE(dev_info, gs_rta_support))) {
1790 /* Solve for n (number of work-groups per task). All values are in
1791 * size of common store alloc blocks:
1792 *
1793 * n + (2n + 7) * (local_memory_size_max - 1) =
1794 * (coefficient_memory_pool_size) - (7 * pixel_allocation_size_max)
1795 * ==>
1796 * n + 2n * (local_memory_size_max - 1) =
1797 * (coefficient_memory_pool_size) - (7 * pixel_allocation_size_max)
1798 * - (7 * (local_memory_size_max - 1))
1799 * ==>
1800 * n * (1 + 2 * (local_memory_size_max - 1)) =
1801 * (coefficient_memory_pool_size) - (7 * pixel_allocation_size_max)
1802 * - (7 * (local_memory_size_max - 1))
1803 * ==>
1804 * n = ((coefficient_memory_pool_size) -
1805 * (7 * pixel_allocation_size_max) -
1806 *            (7 * (local_memory_size_max - 1))) / (1 +
1807 *            2 * (local_memory_size_max - 1))
1808 */
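      /* Illustrative example (made-up values, not real device limits):
       * with coefficient_memory_pool_size = 512 blocks,
       * pixel_allocation_size_max = 16 blocks and
       * local_memory_size_max = 4 blocks, this gives
       *   n = (512 - 7 * 16 - 7 * 3) / (1 + 2 * 3) = 379 / 7 = 54
       * work-groups per task (integer division).
       */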
1809 uint32_t max_common_store_blocks =
1810 DIV_ROUND_UP(max_avail_coeff_regs * 4U,
1811 ROGUE_CDMCTRL_KERNEL0_USC_COMMON_SIZE_UNIT_SIZE);
1812
1813 /* (coefficient_memory_pool_size) - (7 * pixel_allocation_size_max)
1814 */
1815 max_common_store_blocks -= ROGUE_MAX_OVERLAPPED_PIXEL_TASK_INSTANCES *
1816 PIXEL_ALLOCATION_SIZE_MAX_IN_BLOCKS;
1817
1818 /* - (7 * (local_memory_size_max - 1)) */
1819 max_common_store_blocks -= (ROGUE_MAX_OVERLAPPED_PIXEL_TASK_INSTANCES *
1820 (localstore_chunks_count - 1U));
1821
1822 /* Divide by (1 + 2 * (local_memory_size_max - 1)) */
1823 max_workgroups_per_task = max_common_store_blocks /
1824 (1U + 2U * (localstore_chunks_count - 1U));
1825
1826 max_workgroups_per_task =
1827 MIN2(max_workgroups_per_task,
1828 ROGUE_CDM_MAX_PACKED_WORKGROUPS_PER_TASK);
1829
1830 } else {
1831 max_workgroups_per_task =
1832 MIN2((max_avail_coeff_regs / coeff_regs_count),
1833 max_workgroups_per_task);
1834 }
1835 }
1836
1837 /* max_workgroups_per_task should at least be one. */
1838 assert(max_workgroups_per_task >= 1U);
1839
1840 if (total_workitems >= ROGUE_MAX_INSTANCES_PER_TASK) {
1841 /* In this case, the work group size will have been padded up to the
1842 * next ROGUE_MAX_INSTANCES_PER_TASK so we just set max instances to be
1843 * ROGUE_MAX_INSTANCES_PER_TASK.
1844 */
1845 return ROGUE_MAX_INSTANCES_PER_TASK;
1846 }
1847
1848 /* In this case, the number of instances in the slot must be clamped to
1849 * accommodate whole work-groups only.
1850 */
1851 if (PVR_HAS_QUIRK(dev_info, 49032) || use_barrier) {
1852 max_workgroups_per_task =
1853 MIN2(max_workgroups_per_task,
1854 ROGUE_MAX_INSTANCES_PER_TASK / total_workitems);
1855 return total_workitems * max_workgroups_per_task;
1856 }
1857
1858 return MIN2(total_workitems * max_workgroups_per_task,
1859 ROGUE_MAX_INSTANCES_PER_TASK);
1860 }
1861
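/* Emits the CDMCTRL KERNEL0..KERNEL8 control stream words describing a
 * single compute kernel dispatch, using the indirect address words
 * (KERNEL6/7) when an indirect buffer address is provided and the explicit
 * work-group counts (KERNEL3..5) otherwise.
 */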
1862 static void
1863 pvr_compute_generate_control_stream(struct pvr_csb *csb,
1864 struct pvr_sub_cmd_compute *sub_cmd,
1865 const struct pvr_compute_kernel_info *info)
1866 {
1867 pvr_csb_set_relocation_mark(csb);
1868
1869 /* Compute kernel 0. */
1870 pvr_csb_emit (csb, CDMCTRL_KERNEL0, kernel0) {
1871 kernel0.indirect_present = !!info->indirect_buffer_addr.addr;
1872 kernel0.global_offsets_present = info->global_offsets_present;
1873 kernel0.usc_common_size = info->usc_common_size;
1874 kernel0.usc_unified_size = info->usc_unified_size;
1875 kernel0.pds_temp_size = info->pds_temp_size;
1876 kernel0.pds_data_size = info->pds_data_size;
1877 kernel0.usc_target = info->usc_target;
1878 kernel0.fence = info->is_fence;
1879 }
1880
1881 /* Compute kernel 1. */
1882 pvr_csb_emit (csb, CDMCTRL_KERNEL1, kernel1) {
1883 kernel1.data_addr = PVR_DEV_ADDR(info->pds_data_offset);
1884 kernel1.sd_type = info->sd_type;
1885 kernel1.usc_common_shared = info->usc_common_shared;
1886 }
1887
1888 /* Compute kernel 2. */
1889 pvr_csb_emit (csb, CDMCTRL_KERNEL2, kernel2) {
1890 kernel2.code_addr = PVR_DEV_ADDR(info->pds_code_offset);
1891 }
1892
1893 if (info->indirect_buffer_addr.addr) {
1894 /* Compute kernel 6. */
1895 pvr_csb_emit (csb, CDMCTRL_KERNEL6, kernel6) {
1896 kernel6.indirect_addrmsb = info->indirect_buffer_addr;
1897 }
1898
1899 /* Compute kernel 7. */
1900 pvr_csb_emit (csb, CDMCTRL_KERNEL7, kernel7) {
1901 kernel7.indirect_addrlsb = info->indirect_buffer_addr;
1902 }
1903 } else {
1904 /* Compute kernel 3. */
1905 pvr_csb_emit (csb, CDMCTRL_KERNEL3, kernel3) {
1906 assert(info->global_size[0U] > 0U);
1907 kernel3.workgroup_x = info->global_size[0U] - 1U;
1908 }
1909
1910 /* Compute kernel 4. */
1911 pvr_csb_emit (csb, CDMCTRL_KERNEL4, kernel4) {
1912 assert(info->global_size[1U] > 0U);
1913 kernel4.workgroup_y = info->global_size[1U] - 1U;
1914 }
1915
1916 /* Compute kernel 5. */
1917 pvr_csb_emit (csb, CDMCTRL_KERNEL5, kernel5) {
1918 assert(info->global_size[2U] > 0U);
1919 kernel5.workgroup_z = info->global_size[2U] - 1U;
1920 }
1921 }
1922
1923 /* Compute kernel 8. */
1924 pvr_csb_emit (csb, CDMCTRL_KERNEL8, kernel8) {
1925 if (info->max_instances == ROGUE_MAX_INSTANCES_PER_TASK)
1926 kernel8.max_instances = 0U;
1927 else
1928 kernel8.max_instances = info->max_instances;
1929
1930 assert(info->local_size[0U] > 0U);
1931 kernel8.workgroup_size_x = info->local_size[0U] - 1U;
1932 assert(info->local_size[1U] > 0U);
1933 kernel8.workgroup_size_y = info->local_size[1U] - 1U;
1934 assert(info->local_size[2U] > 0U);
1935 kernel8.workgroup_size_z = info->local_size[2U] - 1U;
1936 }
1937
1938 pvr_csb_clear_relocation_mark(csb);
1939
1940 /* Track the highest amount of shared registers usage in this dispatch.
1941 * This is used by the FW for context switching, so must be large enough
1942 * to contain all the shared registers that might be in use for this compute
1943 * job. Coefficients don't need to be included as the context switch will not
1944 * happen within the execution of a single workgroup, thus nothing needs to
1945 * be preserved.
1946 */
1947 if (info->usc_common_shared) {
1948 sub_cmd->num_shared_regs =
1949 MAX2(sub_cmd->num_shared_regs, info->usc_common_size);
1950 }
1951 }
1952
1953 /* TODO: This can be pre-packed and uploaded directly. Would that provide any
1954 * speed up?
1955 */
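/* Emits a single-instance (1x1x1) dispatch of the device's IDF/WDF program,
 * substituting the SW compute barrier PDS program on devices that need it
 * when it has not yet been cleared.
 */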
1956 static void
1957 pvr_compute_generate_idfwdf(struct pvr_cmd_buffer *cmd_buffer,
1958 struct pvr_sub_cmd_compute *const sub_cmd)
1959 {
1960 struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
1961 bool *const is_sw_barrier_required =
1962 &state->current_sub_cmd->compute.pds_sw_barrier_requires_clearing;
1963 const struct pvr_physical_device *pdevice = cmd_buffer->device->pdevice;
1964 struct pvr_csb *csb = &sub_cmd->control_stream;
1965 const struct pvr_pds_upload *program;
1966
1967 if (PVR_NEED_SW_COMPUTE_PDS_BARRIER(&pdevice->dev_info) &&
1968 *is_sw_barrier_required) {
1969 *is_sw_barrier_required = false;
1970 program = &cmd_buffer->device->idfwdf_state.sw_compute_barrier_pds;
1971 } else {
1972 program = &cmd_buffer->device->idfwdf_state.pds;
1973 }
1974
1975 struct pvr_compute_kernel_info info = {
1976 .indirect_buffer_addr = PVR_DEV_ADDR_INVALID,
1977 .global_offsets_present = false,
1978 .usc_common_size = DIV_ROUND_UP(
1979 PVR_DW_TO_BYTES(cmd_buffer->device->idfwdf_state.usc_shareds),
1980 ROGUE_CDMCTRL_KERNEL0_USC_COMMON_SIZE_UNIT_SIZE),
1981 .usc_unified_size = 0U,
1982 .pds_temp_size = 0U,
1983 .pds_data_size =
1984 DIV_ROUND_UP(PVR_DW_TO_BYTES(program->data_size),
1985 ROGUE_CDMCTRL_KERNEL0_PDS_DATA_SIZE_UNIT_SIZE),
1986 .usc_target = ROGUE_CDMCTRL_USC_TARGET_ALL,
1987 .is_fence = false,
1988 .pds_data_offset = program->data_offset,
1989 .sd_type = ROGUE_CDMCTRL_SD_TYPE_USC,
1990 .usc_common_shared = true,
1991 .pds_code_offset = program->code_offset,
1992 .global_size = { 1U, 1U, 1U },
1993 .local_size = { 1U, 1U, 1U },
1994 };
1995
1996 /* We don't need to pad work-group size for this case. */
1997
1998 info.max_instances =
1999 pvr_compute_flat_slot_size(pdevice,
2000 cmd_buffer->device->idfwdf_state.usc_shareds,
2001 false,
2002 1U);
2003
2004 pvr_compute_generate_control_stream(csb, sub_cmd, &info);
2005 }
2006
2007 void pvr_compute_generate_fence(struct pvr_cmd_buffer *cmd_buffer,
2008 struct pvr_sub_cmd_compute *const sub_cmd,
2009 bool deallocate_shareds)
2010 {
2011 const struct pvr_pds_upload *program =
2012 &cmd_buffer->device->pds_compute_fence_program;
2013 const struct pvr_physical_device *pdevice = cmd_buffer->device->pdevice;
2014 struct pvr_csb *csb = &sub_cmd->control_stream;
2015
2016 struct pvr_compute_kernel_info info = {
2017 .indirect_buffer_addr = PVR_DEV_ADDR_INVALID,
2018 .global_offsets_present = false,
2019 .usc_common_size = 0U,
2020 .usc_unified_size = 0U,
2021 .pds_temp_size = 0U,
2022 .pds_data_size =
2023 DIV_ROUND_UP(PVR_DW_TO_BYTES(program->data_size),
2024 ROGUE_CDMCTRL_KERNEL0_PDS_DATA_SIZE_UNIT_SIZE),
2025 .usc_target = ROGUE_CDMCTRL_USC_TARGET_ANY,
2026 .is_fence = true,
2027 .pds_data_offset = program->data_offset,
2028 .sd_type = ROGUE_CDMCTRL_SD_TYPE_PDS,
2029 .usc_common_shared = deallocate_shareds,
2030 .pds_code_offset = program->code_offset,
2031 .global_size = { 1U, 1U, 1U },
2032 .local_size = { 1U, 1U, 1U },
2033 };
2034
2035 /* We don't need to pad work-group size for this case. */
2036 /* Here we calculate the slot size. This can depend on the use of barriers,
2037 * local memory, BRNs or other factors.
2038 */
2039 info.max_instances = pvr_compute_flat_slot_size(pdevice, 0U, false, 1U);
2040
2041 pvr_compute_generate_control_stream(csb, sub_cmd, &info);
2042 }
2043
2044 static VkResult
2045 pvr_cmd_buffer_process_deferred_clears(struct pvr_cmd_buffer *cmd_buffer)
2046 {
2047 util_dynarray_foreach (&cmd_buffer->deferred_clears,
2048 struct pvr_transfer_cmd,
2049 transfer_cmd) {
2050 VkResult result;
2051
2052 result = pvr_cmd_buffer_add_transfer_cmd(cmd_buffer, transfer_cmd);
2053 if (result != VK_SUCCESS)
2054 return result;
2055
2056 cmd_buffer->state.current_sub_cmd->transfer.serialize_with_frag = true;
2057 }
2058
2059 return VK_SUCCESS;
2060 }
2061
2062 VkResult pvr_cmd_buffer_end_sub_cmd(struct pvr_cmd_buffer *cmd_buffer)
2063 {
2064 struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
2065 struct pvr_sub_cmd *sub_cmd = state->current_sub_cmd;
2066 struct pvr_device *device = cmd_buffer->device;
2067 const struct pvr_query_pool *query_pool = NULL;
2068 struct pvr_suballoc_bo *query_bo = NULL;
2069 size_t query_indices_size = 0;
2070 VkResult result;
2071
2072 /* FIXME: Is this NULL check required because this function is called from
2073 * pvr_resolve_unemitted_resolve_attachments()? See comment about this
2074 * function being called twice in a row in pvr_CmdEndRenderPass().
2075 */
2076 if (!sub_cmd)
2077 return VK_SUCCESS;
2078
2079 if (!sub_cmd->owned) {
2080 state->current_sub_cmd = NULL;
2081 return VK_SUCCESS;
2082 }
2083
2084 switch (sub_cmd->type) {
2085 case PVR_SUB_CMD_TYPE_GRAPHICS: {
2086 struct pvr_sub_cmd_gfx *const gfx_sub_cmd = &sub_cmd->gfx;
2087
2088 query_indices_size =
2089 util_dynarray_num_elements(&state->query_indices, char);
2090
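      /* Occlusion query indices accumulated while recording this sub command
       * are either handed over to the sub command (secondary command buffers
       * continuing a render pass) or uploaded now so that an availability
       * write job can be added after the fragment work further below.
       */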
2091 if (query_indices_size > 0) {
2092 const bool secondary_cont =
2093 cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY &&
2094 cmd_buffer->usage_flags &
2095 VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT;
2096
2097 assert(gfx_sub_cmd->query_pool);
2098
2099 if (secondary_cont) {
2100 util_dynarray_append_dynarray(&gfx_sub_cmd->sec_query_indices,
2101                               &state->query_indices);
2102 } else {
2103 const void *data = util_dynarray_begin(&state->query_indices);
2104
2105 result = pvr_cmd_buffer_upload_general(cmd_buffer,
2106 data,
2107 query_indices_size,
2108 &query_bo);
2109 if (result != VK_SUCCESS)
2110 return pvr_cmd_buffer_set_error_unwarned(cmd_buffer, result);
2111
2112 query_pool = gfx_sub_cmd->query_pool;
2113 }
2114
2115 gfx_sub_cmd->has_occlusion_query = true;
2116
2117 util_dynarray_clear(&state->query_indices);
2118 }
2119
2120 if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY) {
2121 result = pvr_csb_emit_return(&gfx_sub_cmd->control_stream);
2122 if (result != VK_SUCCESS)
2123 return pvr_cmd_buffer_set_error_unwarned(cmd_buffer, result);
2124
2125 break;
2126 }
2127
2128 /* TODO: Check if the sub_cmd can be skipped based on
2129 * sub_cmd->gfx.empty_cmd flag.
2130 */
2131
2132 /* TODO: Set the state in the functions called with the command buffer
2133 * instead of here.
2134 */
2135
2136 result = pvr_cmd_buffer_upload_tables(device, cmd_buffer, gfx_sub_cmd);
2137 if (result != VK_SUCCESS)
2138 return pvr_cmd_buffer_set_error_unwarned(cmd_buffer, result);
2139
2140 result = pvr_cmd_buffer_emit_ppp_state(cmd_buffer,
2141 &gfx_sub_cmd->control_stream);
2142 if (result != VK_SUCCESS)
2143 return pvr_cmd_buffer_set_error_unwarned(cmd_buffer, result);
2144
2145 result = pvr_csb_emit_terminate(&gfx_sub_cmd->control_stream);
2146 if (result != VK_SUCCESS)
2147 return pvr_cmd_buffer_set_error_unwarned(cmd_buffer, result);
2148
2149 result = pvr_sub_cmd_gfx_job_init(&device->pdevice->dev_info,
2150 cmd_buffer,
2151 gfx_sub_cmd);
2152 if (result != VK_SUCCESS)
2153 return pvr_cmd_buffer_set_error_unwarned(cmd_buffer, result);
2154
2155 if (pvr_sub_cmd_gfx_requires_split_submit(gfx_sub_cmd)) {
2156 result = pvr_sub_cmd_gfx_build_terminate_ctrl_stream(device,
2157 cmd_buffer,
2158 gfx_sub_cmd);
2159 if (result != VK_SUCCESS)
2160 return pvr_cmd_buffer_set_error_unwarned(cmd_buffer, result);
2161 }
2162
2163 break;
2164 }
2165
2166 case PVR_SUB_CMD_TYPE_OCCLUSION_QUERY:
2167 case PVR_SUB_CMD_TYPE_COMPUTE: {
2168 struct pvr_sub_cmd_compute *const compute_sub_cmd = &sub_cmd->compute;
2169
2170 pvr_compute_generate_fence(cmd_buffer, compute_sub_cmd, true);
2171
2172 result = pvr_csb_emit_terminate(&compute_sub_cmd->control_stream);
2173 if (result != VK_SUCCESS)
2174 return pvr_cmd_buffer_set_error_unwarned(cmd_buffer, result);
2175
2176 pvr_sub_cmd_compute_job_init(device->pdevice,
2177 cmd_buffer,
2178 compute_sub_cmd);
2179 break;
2180 }
2181
2182 case PVR_SUB_CMD_TYPE_TRANSFER:
2183 break;
2184
2185 case PVR_SUB_CMD_TYPE_EVENT:
2186 break;
2187
2188 default:
2189 unreachable("Unsupported sub-command type");
2190 }
2191
2192 state->current_sub_cmd = NULL;
2193
2194 /* pvr_cmd_buffer_process_deferred_clears() must be called with a NULL
2195 * current_sub_cmd.
2196 *
2197 * We can start a sub_cmd of a different type from the current sub_cmd only
2198 * after having ended the current sub_cmd. However, we can't end the current
2199 * sub_cmd if this depends on starting sub_cmd(s) of a different type. Hence,
2200 * don't try to start transfer sub_cmd(s) with
2201 * pvr_cmd_buffer_process_deferred_clears() until the current one has ended.
2202 * Failing to do so would cause a circular dependency between
2203 * pvr_cmd_buffer_{end,start}_sub_cmd() and blow the stack.
2204 */
2205 if (sub_cmd->type == PVR_SUB_CMD_TYPE_GRAPHICS) {
2206 result = pvr_cmd_buffer_process_deferred_clears(cmd_buffer);
2207 if (result != VK_SUCCESS)
2208 return pvr_cmd_buffer_set_error_unwarned(cmd_buffer, result);
2209 }
2210
2211 if (query_pool) {
2212 struct pvr_query_info query_info;
2213
2214 assert(query_bo);
2215 assert(query_indices_size);
2216
2217 query_info.type = PVR_QUERY_TYPE_AVAILABILITY_WRITE;
2218
2219 /* sizeof(uint32_t) is the size of a single query index. */
2220 query_info.availability_write.num_query_indices =
2221 query_indices_size / sizeof(uint32_t);
2222 query_info.availability_write.index_bo = query_bo;
2223
2224 query_info.availability_write.num_queries = query_pool->query_count;
2225 query_info.availability_write.availability_bo =
2226 query_pool->availability_buffer;
2227
2228 /* Insert a barrier after the graphics sub command and before the
2229 * query sub command so that the availability write program waits for the
2230 * fragment shader to complete.
2231 */
2232
2233 result = pvr_cmd_buffer_start_sub_cmd(cmd_buffer, PVR_SUB_CMD_TYPE_EVENT);
2234 if (result != VK_SUCCESS)
2235 return result;
2236
2237 cmd_buffer->state.current_sub_cmd->event = (struct pvr_sub_cmd_event){
2238 .type = PVR_EVENT_TYPE_BARRIER,
2239 .barrier = {
2240 .wait_for_stage_mask = PVR_PIPELINE_STAGE_FRAG_BIT,
2241 .wait_at_stage_mask = PVR_PIPELINE_STAGE_OCCLUSION_QUERY_BIT,
2242 },
2243 };
2244
2245 return pvr_add_query_program(cmd_buffer, &query_info);
2246 }
2247
2248 return VK_SUCCESS;
2249 }
2250
2251 void pvr_reset_graphics_dirty_state(struct pvr_cmd_buffer *const cmd_buffer,
2252 bool start_geom)
2253 {
2254 struct vk_dynamic_graphics_state *const dynamic_state =
2255 &cmd_buffer->vk.dynamic_graphics_state;
2256
2257 if (start_geom) {
2258 /*
2259 * Initial geometry phase state.
2260 * It's the driver's responsibility to ensure that the state of the
2261 * hardware is correctly initialized at the start of every geometry
2262 * phase. This is required to prevent stale state from a previous
2263 * geometry phase erroneously affecting the next geometry phase.
2264 *
2265 * If a geometry phase does not contain any geometry, this restriction
2266 * can be ignored. If the first draw call in a geometry phase will only
2267 * update the depth or stencil buffers i.e. ISP_TAGWRITEDISABLE is set
2268 * in the ISP State Control Word, the PDS State Pointers
2269 * (TA_PRES_PDSSTATEPTR*) in the first PPP State Update do not need to
2270 * be supplied, since they will never reach the PDS in the fragment
2271 * phase.
2272 */
2273
2274 cmd_buffer->state.emit_header = (struct ROGUE_TA_STATE_HEADER){
2275 .pres_stream_out_size = true,
2276 .pres_ppp_ctrl = true,
2277 .pres_varying_word2 = true,
2278 .pres_varying_word1 = true,
2279 .pres_varying_word0 = true,
2280 .pres_outselects = true,
2281 .pres_wclamp = true,
2282 .pres_viewport = true,
2283 .pres_region_clip = true,
2284 .pres_pds_state_ptr0 = true,
2285 .pres_ispctl_fb = true,
2286 .pres_ispctl = true,
2287 };
2288 } else {
2289 struct ROGUE_TA_STATE_HEADER *const emit_header =
2290 &cmd_buffer->state.emit_header;
2291
2292 emit_header->pres_ppp_ctrl = true;
2293 emit_header->pres_varying_word1 = true;
2294 emit_header->pres_varying_word0 = true;
2295 emit_header->pres_outselects = true;
2296 emit_header->pres_viewport = true;
2297 emit_header->pres_region_clip = true;
2298 emit_header->pres_pds_state_ptr0 = true;
2299 emit_header->pres_ispctl_fb = true;
2300 emit_header->pres_ispctl = true;
2301 }
2302
2303 memset(&cmd_buffer->state.ppp_state,
2304 0U,
2305 sizeof(cmd_buffer->state.ppp_state));
2306
2307 cmd_buffer->state.dirty.vertex_bindings = true;
2308 cmd_buffer->state.dirty.gfx_pipeline_binding = true;
2309
2310 BITSET_SET(dynamic_state->dirty, MESA_VK_DYNAMIC_VP_VIEWPORTS);
2311 BITSET_SET(dynamic_state->dirty, MESA_VK_DYNAMIC_VP_VIEWPORT_COUNT);
2312 }
2313
2314 static inline bool
2315 pvr_cmd_uses_deferred_cs_cmds(const struct pvr_cmd_buffer *const cmd_buffer)
2316 {
2317 const VkCommandBufferUsageFlags deferred_control_stream_flags =
2318 VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT |
2319 VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT;
2320
2321 return cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY &&
2322 (cmd_buffer->usage_flags & deferred_control_stream_flags) ==
2323 deferred_control_stream_flags;
2324 }
2325
2326 VkResult pvr_cmd_buffer_start_sub_cmd(struct pvr_cmd_buffer *cmd_buffer,
2327 enum pvr_sub_cmd_type type)
2328 {
2329 struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
2330 struct pvr_device *device = cmd_buffer->device;
2331 struct pvr_sub_cmd *sub_cmd;
2332 VkResult result;
2333
2334 /* Check the current status of the buffer. */
2335 if (vk_command_buffer_has_error(&cmd_buffer->vk))
2336 return vk_command_buffer_get_record_result(&cmd_buffer->vk);
2337
2338 pvr_cmd_buffer_update_barriers(cmd_buffer, type);
2339
2340 /* TODO: Add proper support for joining consecutive event sub_cmd? */
2341 if (state->current_sub_cmd) {
2342 if (state->current_sub_cmd->type == type) {
2343 /* Continue adding to the current sub command. */
2344 return VK_SUCCESS;
2345 }
2346
2347 /* End the current sub command. */
2348 result = pvr_cmd_buffer_end_sub_cmd(cmd_buffer);
2349 if (result != VK_SUCCESS)
2350 return result;
2351 }
2352
2353 sub_cmd = vk_zalloc(&cmd_buffer->vk.pool->alloc,
2354 sizeof(*sub_cmd),
2355 8,
2356 VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
2357 if (!sub_cmd) {
2358 return vk_command_buffer_set_error(&cmd_buffer->vk,
2359 VK_ERROR_OUT_OF_HOST_MEMORY);
2360 }
2361
2362 sub_cmd->type = type;
2363 sub_cmd->owned = true;
2364
2365 switch (type) {
2366 case PVR_SUB_CMD_TYPE_GRAPHICS:
2367 sub_cmd->gfx.depth_usage = PVR_DEPTH_STENCIL_USAGE_UNDEFINED;
2368 sub_cmd->gfx.stencil_usage = PVR_DEPTH_STENCIL_USAGE_UNDEFINED;
2369 sub_cmd->gfx.modifies_depth = false;
2370 sub_cmd->gfx.modifies_stencil = false;
2371 sub_cmd->gfx.max_tiles_in_flight =
2372 PVR_GET_FEATURE_VALUE(&device->pdevice->dev_info,
2373 isp_max_tiles_in_flight,
2374 1);
2375 sub_cmd->gfx.hw_render_idx = state->render_pass_info.current_hw_subpass;
2376 sub_cmd->gfx.framebuffer = state->render_pass_info.framebuffer;
2377 sub_cmd->gfx.empty_cmd = true;
2378
2379 if (state->vis_test_enabled)
2380 sub_cmd->gfx.query_pool = state->query_pool;
2381
2382 pvr_reset_graphics_dirty_state(cmd_buffer, true);
2383
2384 if (pvr_cmd_uses_deferred_cs_cmds(cmd_buffer)) {
2385 pvr_csb_init(device,
2386 PVR_CMD_STREAM_TYPE_GRAPHICS_DEFERRED,
2387 &sub_cmd->gfx.control_stream);
2388 } else {
2389 pvr_csb_init(device,
2390 PVR_CMD_STREAM_TYPE_GRAPHICS,
2391 &sub_cmd->gfx.control_stream);
2392 }
2393
2394 util_dynarray_init(&sub_cmd->gfx.sec_query_indices, NULL);
2395 break;
2396
2397 case PVR_SUB_CMD_TYPE_OCCLUSION_QUERY:
2398 case PVR_SUB_CMD_TYPE_COMPUTE:
2399 pvr_csb_init(device,
2400 PVR_CMD_STREAM_TYPE_COMPUTE,
2401 &sub_cmd->compute.control_stream);
2402 break;
2403
2404 case PVR_SUB_CMD_TYPE_TRANSFER:
2405 sub_cmd->transfer.transfer_cmds = &sub_cmd->transfer.transfer_cmds_priv;
2406 list_inithead(sub_cmd->transfer.transfer_cmds);
2407 break;
2408
2409 case PVR_SUB_CMD_TYPE_EVENT:
2410 break;
2411
2412 default:
2413 unreachable("Unsupported sub-command type");
2414 }
2415
2416 list_addtail(&sub_cmd->link, &cmd_buffer->sub_cmds);
2417 state->current_sub_cmd = sub_cmd;
2418
2419 return VK_SUCCESS;
2420 }
2421
2422 VkResult pvr_cmd_buffer_alloc_mem(struct pvr_cmd_buffer *cmd_buffer,
2423 struct pvr_winsys_heap *heap,
2424 uint64_t size,
2425 struct pvr_suballoc_bo **const pvr_bo_out)
2426 {
2427 const uint32_t cache_line_size =
2428 rogue_get_slc_cache_line_size(&cmd_buffer->device->pdevice->dev_info);
2429 struct pvr_suballoc_bo *suballoc_bo;
2430 struct pvr_suballocator *allocator;
2431 VkResult result;
2432
2433 if (heap == cmd_buffer->device->heaps.general_heap)
2434 allocator = &cmd_buffer->device->suballoc_general;
2435 else if (heap == cmd_buffer->device->heaps.pds_heap)
2436 allocator = &cmd_buffer->device->suballoc_pds;
2437 else if (heap == cmd_buffer->device->heaps.transfer_frag_heap)
2438 allocator = &cmd_buffer->device->suballoc_transfer;
2439 else if (heap == cmd_buffer->device->heaps.usc_heap)
2440 allocator = &cmd_buffer->device->suballoc_usc;
2441 else
2442 unreachable("Unknown heap type");
2443
2444 result =
2445 pvr_bo_suballoc(allocator, size, cache_line_size, false, &suballoc_bo);
2446 if (result != VK_SUCCESS)
2447 return pvr_cmd_buffer_set_error_unwarned(cmd_buffer, result);
2448
2449 list_add(&suballoc_bo->link, &cmd_buffer->bo_list);
2450
2451 *pvr_bo_out = suballoc_bo;
2452
2453 return VK_SUCCESS;
2454 }
2455
2456 static void pvr_cmd_bind_compute_pipeline(
2457 const struct pvr_compute_pipeline *const compute_pipeline,
2458 struct pvr_cmd_buffer *const cmd_buffer)
2459 {
2460 cmd_buffer->state.compute_pipeline = compute_pipeline;
2461 cmd_buffer->state.dirty.compute_pipeline_binding = true;
2462 }
2463
2464 static void pvr_cmd_bind_graphics_pipeline(
2465 const struct pvr_graphics_pipeline *const gfx_pipeline,
2466 struct pvr_cmd_buffer *const cmd_buffer)
2467 {
2468 cmd_buffer->state.gfx_pipeline = gfx_pipeline;
2469 cmd_buffer->state.dirty.gfx_pipeline_binding = true;
2470
2471 vk_cmd_set_dynamic_graphics_state(&cmd_buffer->vk,
2472 &gfx_pipeline->dynamic_state);
2473 }
2474
2475 void pvr_CmdBindPipeline(VkCommandBuffer commandBuffer,
2476 VkPipelineBindPoint pipelineBindPoint,
2477 VkPipeline _pipeline)
2478 {
2479 PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
2480 PVR_FROM_HANDLE(pvr_pipeline, pipeline, _pipeline);
2481
2482 switch (pipelineBindPoint) {
2483 case VK_PIPELINE_BIND_POINT_COMPUTE:
2484 pvr_cmd_bind_compute_pipeline(to_pvr_compute_pipeline(pipeline),
2485 cmd_buffer);
2486 break;
2487
2488 case VK_PIPELINE_BIND_POINT_GRAPHICS:
2489 pvr_cmd_bind_graphics_pipeline(to_pvr_graphics_pipeline(pipeline),
2490 cmd_buffer);
2491 break;
2492
2493 default:
2494 unreachable("Invalid bind point.");
2495 break;
2496 }
2497 }
2498
2499 #if MESA_DEBUG
2500 static void check_viewport_quirk_70165(const struct pvr_device *device,
2501 const VkViewport *pViewport)
2502 {
2503 const struct pvr_device_info *dev_info = &device->pdevice->dev_info;
2504 float min_vertex_x, max_vertex_x, min_vertex_y, max_vertex_y;
2505 float min_screen_space_value, max_screen_space_value;
2506 float sign_to_unsigned_offset, fixed_point_max;
2507 float guardband_width, guardband_height;
2508
2509 if (PVR_HAS_FEATURE(dev_info, simple_internal_parameter_format)) {
2510 /* Max representable value in 13.4 fixed point format.
2511 * Round-down to avoid precision issues.
2512 * Calculated as (2 ** 13) - 2*(2 ** -4)
2513 */
2514 fixed_point_max = 8192.0f - 2.0f / 16.0f;
2515
2516 if (PVR_HAS_FEATURE(dev_info, screen_size8K)) {
2517 if (pViewport->width <= 4096 && pViewport->height <= 4096) {
2518 guardband_width = pViewport->width / 4.0f;
2519 guardband_height = pViewport->height / 4.0f;
2520
2521 /* 2k of the range is negative */
2522 sign_to_unsigned_offset = 2048.0f;
2523 } else {
2524 guardband_width = 0.0f;
2525 guardband_height = 0.0f;
2526
2527 /* For > 4k renders, the entire range is positive */
2528 sign_to_unsigned_offset = 0.0f;
2529 }
2530 } else {
2531 guardband_width = pViewport->width / 4.0f;
2532 guardband_height = pViewport->height / 4.0f;
2533
2534 /* 2k of the range is negative */
2535 sign_to_unsigned_offset = 2048.0f;
2536 }
2537 } else {
2538 /* Max representable value in 16.8 fixed point format
2539 * Calculated as (2 ** 16) - (2 ** -8)
2540 */
2541 fixed_point_max = 65535.99609375f;
2542 guardband_width = pViewport->width / 4.0f;
2543 guardband_height = pViewport->height / 4.0f;
2544
2545 /* 4k/20k of the range is negative */
2546 sign_to_unsigned_offset = (float)PVR_MAX_NEG_OFFSCREEN_OFFSET;
2547 }
2548
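   /* Worked example (values chosen purely for illustration): with the simple
    * internal parameter format, an 8K-capable core and a 4096x4096 viewport
    * at (0, 0), guardband = 1024 and offset = 2048, so vertices may range
    * from -1024 to 5120 in x/y while the representable screen space is
    * [-2048, 6143.875]; such a viewport is therefore not flagged.
    */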
2549 min_screen_space_value = -sign_to_unsigned_offset;
2550 max_screen_space_value = fixed_point_max - sign_to_unsigned_offset;
2551
2552 min_vertex_x = pViewport->x - guardband_width;
2553 max_vertex_x = pViewport->x + pViewport->width + guardband_width;
2554 min_vertex_y = pViewport->y - guardband_height;
2555 max_vertex_y = pViewport->y + pViewport->height + guardband_height;
2556 if (min_vertex_x < min_screen_space_value ||
2557 max_vertex_x > max_screen_space_value ||
2558 min_vertex_y < min_screen_space_value ||
2559 max_vertex_y > max_screen_space_value) {
2560 mesa_logw("Viewport is affected by BRN70165, geometry outside "
2561 "the viewport could be corrupted");
2562 }
2563 }
2564 #endif
2565
2566 void pvr_CmdSetViewport(VkCommandBuffer commandBuffer,
2567 uint32_t firstViewport,
2568 uint32_t viewportCount,
2569 const VkViewport *pViewports)
2570 {
2571 PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
2572 const uint32_t total_count = firstViewport + viewportCount;
2573
2574 assert(firstViewport < PVR_MAX_VIEWPORTS && viewportCount > 0);
2575 assert(total_count >= 1 && total_count <= PVR_MAX_VIEWPORTS);
2576
2577 PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer);
2578
2579 #if MESA_DEBUG
2580 if (PVR_HAS_QUIRK(&cmd_buffer->device->pdevice->dev_info, 70165)) {
2581 for (uint32_t viewport = 0; viewport < viewportCount; viewport++) {
2582 check_viewport_quirk_70165(cmd_buffer->device, &pViewports[viewport]);
2583 }
2584 }
2585 #endif
2586
2587 vk_common_CmdSetViewport(commandBuffer,
2588 firstViewport,
2589 viewportCount,
2590 pViewports);
2591 }
2592
2593 void pvr_CmdSetDepthBounds(VkCommandBuffer commandBuffer,
2594 float minDepthBounds,
2595 float maxDepthBounds)
2596 {
2597 mesa_logd("No support for depth bounds testing.");
2598 }
2599
2600 void pvr_CmdBindDescriptorSets(VkCommandBuffer commandBuffer,
2601 VkPipelineBindPoint pipelineBindPoint,
2602 VkPipelineLayout _layout,
2603 uint32_t firstSet,
2604 uint32_t descriptorSetCount,
2605 const VkDescriptorSet *pDescriptorSets,
2606 uint32_t dynamicOffsetCount,
2607 const uint32_t *pDynamicOffsets)
2608 {
2609 PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
2610 struct pvr_descriptor_state *descriptor_state;
2611
2612 assert(firstSet + descriptorSetCount <= PVR_MAX_DESCRIPTOR_SETS);
2613
2614 PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer);
2615
2616 switch (pipelineBindPoint) {
2617 case VK_PIPELINE_BIND_POINT_GRAPHICS:
2618 case VK_PIPELINE_BIND_POINT_COMPUTE:
2619 break;
2620
2621 default:
2622 unreachable("Unsupported bind point.");
2623 break;
2624 }
2625
2626 if (pipelineBindPoint == VK_PIPELINE_BIND_POINT_GRAPHICS) {
2627 descriptor_state = &cmd_buffer->state.gfx_desc_state;
2628 cmd_buffer->state.dirty.gfx_desc_dirty = true;
2629 } else {
2630 descriptor_state = &cmd_buffer->state.compute_desc_state;
2631 cmd_buffer->state.dirty.compute_desc_dirty = true;
2632 }
2633
2634 for (uint32_t i = 0; i < descriptorSetCount; i++) {
2635 PVR_FROM_HANDLE(pvr_descriptor_set, set, pDescriptorSets[i]);
2636 uint32_t index = firstSet + i;
2637
2638 if (descriptor_state->descriptor_sets[index] != set) {
2639 descriptor_state->descriptor_sets[index] = set;
2640 descriptor_state->valid_mask |= (1u << index);
2641 }
2642 }
2643
2644 if (dynamicOffsetCount > 0) {
2645 PVR_FROM_HANDLE(pvr_pipeline_layout, pipeline_layout, _layout);
2646 uint32_t set_offset = 0;
2647
2648 for (uint32_t set = 0; set < firstSet; set++)
2649 set_offset += pipeline_layout->set_layout[set]->dynamic_buffer_count;
2650
2651 assert(set_offset + dynamicOffsetCount <=
2652 ARRAY_SIZE(descriptor_state->dynamic_offsets));
2653
2654 /* From the Vulkan 1.3.238 spec. :
2655 *
2656 * "If any of the sets being bound include dynamic uniform or storage
2657 * buffers, then pDynamicOffsets includes one element for each array
2658 * element in each dynamic descriptor type binding in each set."
2659 *
2660 */
2661 for (uint32_t i = 0; i < dynamicOffsetCount; i++)
2662 descriptor_state->dynamic_offsets[set_offset + i] = pDynamicOffsets[i];
2663 }
2664 }
2665
2666 void pvr_CmdBindVertexBuffers(VkCommandBuffer commandBuffer,
2667 uint32_t firstBinding,
2668 uint32_t bindingCount,
2669 const VkBuffer *pBuffers,
2670 const VkDeviceSize *pOffsets)
2671 {
2672 PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
2673 struct pvr_vertex_binding *const vb = cmd_buffer->state.vertex_bindings;
2674
2675 /* We have to defer setting up vertex buffer since we need the buffer
2676 * stride from the pipeline.
2677 */
2678
2679 assert(firstBinding < PVR_MAX_VERTEX_INPUT_BINDINGS &&
2680 bindingCount <= PVR_MAX_VERTEX_INPUT_BINDINGS);
2681
2682 PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer);
2683
2684 for (uint32_t i = 0; i < bindingCount; i++) {
2685 vb[firstBinding + i].buffer = pvr_buffer_from_handle(pBuffers[i]);
2686 vb[firstBinding + i].offset = pOffsets[i];
2687 }
2688
2689 cmd_buffer->state.dirty.vertex_bindings = true;
2690 }
2691
2692 void pvr_CmdBindIndexBuffer(VkCommandBuffer commandBuffer,
2693 VkBuffer buffer,
2694 VkDeviceSize offset,
2695 VkIndexType indexType)
2696 {
2697 PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
2698 PVR_FROM_HANDLE(pvr_buffer, index_buffer, buffer);
2699 struct pvr_cmd_buffer_state *const state = &cmd_buffer->state;
2700
2701 assert(offset < index_buffer->vk.size);
2702 assert(indexType == VK_INDEX_TYPE_UINT32 ||
2703 indexType == VK_INDEX_TYPE_UINT16 ||
2704 indexType == VK_INDEX_TYPE_UINT8_KHR);
2705
2706 PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer);
2707
2708 state->index_buffer_binding.buffer = index_buffer;
2709 state->index_buffer_binding.offset = offset;
2710 state->index_buffer_binding.type = indexType;
2711 state->dirty.index_buffer_binding = true;
2712 }
2713
2714 void pvr_CmdPushConstants(VkCommandBuffer commandBuffer,
2715 VkPipelineLayout layout,
2716 VkShaderStageFlags stageFlags,
2717 uint32_t offset,
2718 uint32_t size,
2719 const void *pValues)
2720 {
2721 #if MESA_DEBUG
2722 const uint64_t ending = (uint64_t)offset + (uint64_t)size;
2723 #endif
2724
2725 PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
2726 struct pvr_cmd_buffer_state *const state = &cmd_buffer->state;
2727
2728 PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer);
2729
2730 pvr_assert(ending <= PVR_MAX_PUSH_CONSTANTS_SIZE);
2731
2732 memcpy(&state->push_constants.data[offset], pValues, size);
2733
2734 state->push_constants.dirty_stages |= stageFlags;
2735 state->push_constants.uploaded = false;
2736 }
2737
2738 static VkResult
2739 pvr_cmd_buffer_setup_attachments(struct pvr_cmd_buffer *cmd_buffer,
2740 const struct pvr_render_pass *pass,
2741 const struct pvr_framebuffer *framebuffer)
2742 {
2743 struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
2744 struct pvr_render_pass_info *info = &state->render_pass_info;
2745
2746 assert(pass->attachment_count == framebuffer->attachment_count);
2747
2748 /* Free any previously allocated attachments. */
2749 vk_free(&cmd_buffer->vk.pool->alloc, state->render_pass_info.attachments);
2750
2751 if (pass->attachment_count == 0) {
2752 info->attachments = NULL;
2753 return VK_SUCCESS;
2754 }
2755
2756 info->attachments =
2757 vk_zalloc(&cmd_buffer->vk.pool->alloc,
2758 pass->attachment_count * sizeof(*info->attachments),
2759 8,
2760 VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
2761 if (!info->attachments) {
2762 return vk_command_buffer_set_error(&cmd_buffer->vk,
2763 VK_ERROR_OUT_OF_HOST_MEMORY);
2764 }
2765
2766 for (uint32_t i = 0; i < pass->attachment_count; i++)
2767 info->attachments[i] = framebuffer->attachments[i];
2768
2769 return VK_SUCCESS;
2770 }
2771
2772 static VkResult pvr_init_render_targets(struct pvr_device *device,
2773 struct pvr_render_pass *pass,
2774 struct pvr_framebuffer *framebuffer)
2775 {
2776 for (uint32_t i = 0; i < pass->hw_setup->render_count; i++) {
2777 struct pvr_render_target *render_target =
2778 pvr_get_render_target(pass, framebuffer, i);
2779
2780 pthread_mutex_lock(&render_target->mutex);
2781
2782 if (!render_target->valid) {
2783 const struct pvr_renderpass_hwsetup_render *hw_render =
2784 &pass->hw_setup->renders[i];
2785 VkResult result;
2786
2787 result = pvr_render_target_dataset_create(device,
2788 framebuffer->width,
2789 framebuffer->height,
2790 hw_render->sample_count,
2791 framebuffer->layers,
2792 &render_target->rt_dataset);
2793 if (result != VK_SUCCESS) {
2794 pthread_mutex_unlock(&render_target->mutex);
2795 return result;
2796 }
2797
2798 render_target->valid = true;
2799 }
2800
2801 pthread_mutex_unlock(&render_target->mutex);
2802 }
2803
2804 return VK_SUCCESS;
2805 }
2806
2807 const struct pvr_renderpass_hwsetup_subpass *
2808 pvr_get_hw_subpass(const struct pvr_render_pass *pass, const uint32_t subpass)
2809 {
2810 const struct pvr_renderpass_hw_map *map =
2811 &pass->hw_setup->subpass_map[subpass];
2812
2813 return &pass->hw_setup->renders[map->render].subpasses[map->subpass];
2814 }
2815
2816 static void pvr_perform_start_of_render_attachment_clear(
2817 struct pvr_cmd_buffer *cmd_buffer,
2818 const struct pvr_framebuffer *framebuffer,
2819 uint32_t index,
2820 bool is_depth_stencil,
2821 uint32_t *index_list_clear_mask)
2822 {
2823 ASSERTED static const VkImageAspectFlags dsc_aspect_flags =
2824 VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT |
2825 VK_IMAGE_ASPECT_COLOR_BIT;
2826 struct pvr_render_pass_info *info = &cmd_buffer->state.render_pass_info;
2827 const struct pvr_render_pass *pass = info->pass;
2828 const struct pvr_renderpass_hwsetup *hw_setup = pass->hw_setup;
2829 const struct pvr_renderpass_hwsetup_render *hw_render =
2830 &hw_setup->renders[hw_setup->subpass_map[info->subpass_idx].render];
2831 VkImageAspectFlags image_aspect;
2832 struct pvr_image_view *iview;
2833 uint32_t view_idx;
2834
2835 if (is_depth_stencil) {
2836 bool stencil_clear;
2837 bool depth_clear;
2838 bool is_stencil;
2839 bool is_depth;
2840
2841 assert(hw_render->ds_attach_idx != VK_ATTACHMENT_UNUSED);
2842 assert(index == 0);
2843
2844 view_idx = hw_render->ds_attach_idx;
2845
2846 is_depth = vk_format_has_depth(pass->attachments[view_idx].vk_format);
2847 is_stencil = vk_format_has_stencil(pass->attachments[view_idx].vk_format);
2848 depth_clear = hw_render->depth_init == VK_ATTACHMENT_LOAD_OP_CLEAR;
2849 stencil_clear = hw_render->stencil_init == VK_ATTACHMENT_LOAD_OP_CLEAR;
2850
2851 /* Attempt to clear the ds attachment. Do not erroneously discard an
2852 * attachment that has no depth clear but does have a stencil clear.
2853 */
2854 /* i.e. skip unless (is_depth ∧ depth_clear) ∨ (is_stencil ∧ stencil_clear) */
2855 if (!((is_depth && depth_clear) || (is_stencil && stencil_clear)))
2856 return;
2857 } else if (hw_render->color_init[index].op != VK_ATTACHMENT_LOAD_OP_CLEAR) {
2858 return;
2859 } else {
2860 view_idx = hw_render->color_init[index].index;
2861 }
2862
2863 iview = info->attachments[view_idx];
2864
2865 /* FIXME: It would be nice if this function and pvr_sub_cmd_gfx_job_init()
2866 * were doing the same check (even if it's just an assert) to determine if a
2867 * clear is needed.
2868 */
2869 /* If this is single-layer fullscreen, we already do the clears in
2870 * pvr_sub_cmd_gfx_job_init().
2871 */
2872 if (pvr_is_render_area_tile_aligned(cmd_buffer, iview) &&
2873 framebuffer->layers == 1) {
2874 return;
2875 }
2876
2877 image_aspect = vk_format_aspects(pass->attachments[view_idx].vk_format);
2878 assert((image_aspect & ~dsc_aspect_flags) == 0);
2879
2880 if (image_aspect & VK_IMAGE_ASPECT_DEPTH_BIT &&
2881 hw_render->depth_init != VK_ATTACHMENT_LOAD_OP_CLEAR) {
2882 image_aspect &= ~VK_IMAGE_ASPECT_DEPTH_BIT;
2883 }
2884
2885 if (image_aspect & VK_IMAGE_ASPECT_STENCIL_BIT &&
2886 hw_render->stencil_init != VK_ATTACHMENT_LOAD_OP_CLEAR) {
2887 image_aspect &= ~VK_IMAGE_ASPECT_STENCIL_BIT;
2888 }
2889
2890 if (image_aspect != VK_IMAGE_ASPECT_NONE) {
2891 VkClearAttachment clear_attachment = {
2892 .aspectMask = image_aspect,
2893 .colorAttachment = index,
2894 .clearValue = info->clear_values[view_idx],
2895 };
2896 VkClearRect rect = {
2897 .rect = info->render_area,
2898 .baseArrayLayer = 0,
2899 .layerCount = info->framebuffer->layers,
2900 };
2901
2902 assert(view_idx < info->clear_value_count);
2903
2904 pvr_clear_attachments_render_init(cmd_buffer, &clear_attachment, &rect);
2905
2906 *index_list_clear_mask |= (1 << index);
2907 }
2908 }
2909
2910 static void
2911 pvr_perform_start_of_render_clears(struct pvr_cmd_buffer *cmd_buffer)
2912 {
2913 struct pvr_render_pass_info *info = &cmd_buffer->state.render_pass_info;
2914 const struct pvr_framebuffer *framebuffer = info->framebuffer;
2915 const struct pvr_render_pass *pass = info->pass;
2916 const struct pvr_renderpass_hwsetup *hw_setup = pass->hw_setup;
2917 const struct pvr_renderpass_hwsetup_render *hw_render =
2918 &hw_setup->renders[hw_setup->subpass_map[info->subpass_idx].render];
2919
2920 /* Mask of attachment clears using index lists instead of background object
2921 * to clear.
2922 */
2923 uint32_t index_list_clear_mask = 0;
2924
2925 for (uint32_t i = 0; i < hw_render->color_init_count; i++) {
2926 pvr_perform_start_of_render_attachment_clear(cmd_buffer,
2927 framebuffer,
2928 i,
2929 false,
2930 &index_list_clear_mask);
2931 }
2932
2933 info->enable_bg_tag = !!hw_render->color_init_count;
2934
2935 /* If we're not using index list for all clears/loads then we need to run
2936 * the background object on empty tiles.
2937 */
2938 if (hw_render->color_init_count &&
2939 index_list_clear_mask != ((1u << hw_render->color_init_count) - 1u)) {
2940 info->process_empty_tiles = true;
2941 } else {
2942 info->process_empty_tiles = false;
2943 }
2944
2945 if (hw_render->ds_attach_idx != VK_ATTACHMENT_UNUSED) {
2946 uint32_t ds_index_list = 0;
2947
2948 pvr_perform_start_of_render_attachment_clear(cmd_buffer,
2949 framebuffer,
2950 0,
2951 true,
2952 &ds_index_list);
2953 }
2954
2955 if (index_list_clear_mask)
2956 pvr_finishme("Add support for generating loadops shaders!");
2957 }
2958
2959 static void pvr_stash_depth_format(struct pvr_cmd_buffer_state *state,
2960 struct pvr_sub_cmd_gfx *const sub_cmd)
2961 {
2962 const struct pvr_render_pass *pass = state->render_pass_info.pass;
2963 const struct pvr_renderpass_hwsetup_render *hw_render =
2964 &pass->hw_setup->renders[sub_cmd->hw_render_idx];
2965
2966 if (hw_render->ds_attach_idx != VK_ATTACHMENT_UNUSED) {
2967 struct pvr_image_view **iviews = state->render_pass_info.attachments;
2968
2969 state->depth_format = iviews[hw_render->ds_attach_idx]->vk.format;
2970 }
2971 }
2972
2973 static bool pvr_loadops_contain_clear(struct pvr_renderpass_hwsetup *hw_setup)
2974 {
2975 for (uint32_t i = 0; i < hw_setup->render_count; i++) {
2976 struct pvr_renderpass_hwsetup_render *hw_render = &hw_setup->renders[i];
2977 uint32_t render_targets_count = hw_render->init_setup.num_render_targets;
2978
2979 for (uint32_t j = 0;
2980 j < (hw_render->color_init_count * render_targets_count);
2981 j += render_targets_count) {
2982 for (uint32_t k = 0; k < hw_render->init_setup.num_render_targets;
2983 k++) {
2984 if (hw_render->color_init[j + k].op ==
2985 VK_ATTACHMENT_LOAD_OP_CLEAR) {
2986 return true;
2987 }
2988 }
2989 }
2990 if (hw_render->depth_init == VK_ATTACHMENT_LOAD_OP_CLEAR ||
2991 hw_render->stencil_init == VK_ATTACHMENT_LOAD_OP_CLEAR) {
2992 return true;
2993 }
2994 }
2995
2996 return false;
2997 }
2998
2999 static VkResult
3000 pvr_cmd_buffer_set_clear_values(struct pvr_cmd_buffer *cmd_buffer,
3001 const VkRenderPassBeginInfo *pRenderPassBegin)
3002 {
3003 struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
3004
3005 /* Free any previously allocated clear values. */
3006 vk_free(&cmd_buffer->vk.pool->alloc, state->render_pass_info.clear_values);
3007
3008 if (pRenderPassBegin->clearValueCount) {
3009 const size_t size = pRenderPassBegin->clearValueCount *
3010 sizeof(*state->render_pass_info.clear_values);
3011
3012 state->render_pass_info.clear_values =
3013 vk_zalloc(&cmd_buffer->vk.pool->alloc,
3014 size,
3015 8,
3016 VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
3017 if (!state->render_pass_info.clear_values) {
3018 return vk_command_buffer_set_error(&cmd_buffer->vk,
3019 VK_ERROR_OUT_OF_HOST_MEMORY);
3020 }
3021
3022 memcpy(state->render_pass_info.clear_values,
3023 pRenderPassBegin->pClearValues,
3024 size);
3025 } else {
3026 state->render_pass_info.clear_values = NULL;
3027 }
3028
3029 state->render_pass_info.clear_value_count =
3030 pRenderPassBegin->clearValueCount;
3031
3032 return VK_SUCCESS;
3033 }
3034
3035 /**
3036 * \brief Indicates whether to use the large or normal clear state words.
3037 *
3038 * If the current render area can fit within a quarter of the max framebuffer
3039 * that the device is capable of, we can use the normal clear state words,
3040 * otherwise the large clear state words are needed.
3041 *
3042 * The requirement of a quarter of the max framebuffer comes from the index
3043 * count used in the normal clear state words and the vertices uploaded at
3044 * device creation.
3045 *
3046 * \param[in] cmd_buffer The command buffer for the clear.
3047 * \return true if large clear state words are required.
3048 */
3049 static bool
3050 pvr_is_large_clear_required(const struct pvr_cmd_buffer *const cmd_buffer)
3051 {
3052 const struct pvr_device_info *const dev_info =
3053 &cmd_buffer->device->pdevice->dev_info;
3054 const VkRect2D render_area = cmd_buffer->state.render_pass_info.render_area;
3055 const uint32_t vf_max_x = rogue_get_param_vf_max_x(dev_info);
3056 const uint32_t vf_max_y = rogue_get_param_vf_max_y(dev_info);
3057
3058 return (render_area.extent.width > (vf_max_x / 2) - 1) ||
3059 (render_area.extent.height > (vf_max_y / 2) - 1);
3060 }
3061
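/* Emits the pre-built VDM control stream words that draw the clear/load
 * geometry, picking the large-clear variant when the render area exceeds
 * what the normal clear vertices cover (see pvr_is_large_clear_required()).
 */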
3062 static void pvr_emit_clear_words(struct pvr_cmd_buffer *const cmd_buffer,
3063 struct pvr_sub_cmd_gfx *const sub_cmd)
3064 {
3065 struct pvr_device *device = cmd_buffer->device;
3066 struct pvr_csb *csb = &sub_cmd->control_stream;
3067 uint32_t vdm_state_size_in_dw;
3068 const uint32_t *vdm_state;
3069 uint32_t *stream;
3070
3071 vdm_state_size_in_dw =
3072 pvr_clear_vdm_state_get_size_in_dw(&device->pdevice->dev_info, 1);
3073
3074 pvr_csb_set_relocation_mark(csb);
3075
3076 stream = pvr_csb_alloc_dwords(csb, vdm_state_size_in_dw);
3077 if (!stream) {
3078 pvr_cmd_buffer_set_error_unwarned(cmd_buffer, csb->status);
3079 return;
3080 }
3081
3082 if (pvr_is_large_clear_required(cmd_buffer))
3083 vdm_state = device->static_clear_state.large_clear_vdm_words;
3084 else
3085 vdm_state = device->static_clear_state.vdm_words;
3086
3087 memcpy(stream, vdm_state, PVR_DW_TO_BYTES(vdm_state_size_in_dw));
3088
3089 pvr_csb_clear_relocation_mark(csb);
3090 }
3091
3092 static VkResult pvr_cs_write_load_op(struct pvr_cmd_buffer *cmd_buffer,
3093 struct pvr_sub_cmd_gfx *sub_cmd,
3094 struct pvr_load_op *load_op,
3095 uint32_t isp_userpass)
3096 {
3097 const struct pvr_device *device = cmd_buffer->device;
3098 struct pvr_static_clear_ppp_template template =
3099 device->static_clear_state.ppp_templates[VK_IMAGE_ASPECT_COLOR_BIT];
3100 uint32_t pds_state[PVR_STATIC_CLEAR_PDS_STATE_COUNT];
3101 struct pvr_pds_upload shareds_update_program;
3102 struct pvr_suballoc_bo *pvr_bo;
3103 VkResult result;
3104
3105 result = pvr_load_op_data_create_and_upload(cmd_buffer,
3106 load_op,
3107 &shareds_update_program);
3108 if (result != VK_SUCCESS)
3109 return result;
3110
3111 template.config.ispctl.upass = isp_userpass;
3112
3113 /* It might look odd that we aren't specifying the code segment's
3114 * address anywhere. This is because the hardware always assumes that the
3115 * data size is 2 128bit words and the code segments starts after that.
3116 */
3117 pvr_csb_pack (&pds_state[PVR_STATIC_CLEAR_PPP_PDS_TYPE_SHADERBASE],
3118 TA_STATE_PDS_SHADERBASE,
3119 shaderbase) {
3120 shaderbase.addr = PVR_DEV_ADDR(load_op->pds_frag_prog.data_offset);
3121 }
3122
3123 pvr_csb_pack (&pds_state[PVR_STATIC_CLEAR_PPP_PDS_TYPE_TEXUNICODEBASE],
3124 TA_STATE_PDS_TEXUNICODEBASE,
3125 texunicodebase) {
3126 texunicodebase.addr =
3127 PVR_DEV_ADDR(load_op->pds_tex_state_prog.code_offset);
3128 }
3129
3130 pvr_csb_pack (&pds_state[PVR_STATIC_CLEAR_PPP_PDS_TYPE_SIZEINFO1],
3131 TA_STATE_PDS_SIZEINFO1,
3132 sizeinfo1) {
3133 /* Dummy coefficient loading program. */
3134 sizeinfo1.pds_varyingsize = 0;
3135
3136 sizeinfo1.pds_texturestatesize = DIV_ROUND_UP(
3137 shareds_update_program.data_size,
3138 ROGUE_TA_STATE_PDS_SIZEINFO1_PDS_TEXTURESTATESIZE_UNIT_SIZE);
3139
3140 sizeinfo1.pds_tempsize =
3141 DIV_ROUND_UP(load_op->temps_count,
3142 ROGUE_TA_STATE_PDS_SIZEINFO1_PDS_TEMPSIZE_UNIT_SIZE);
3143 }
3144
3145 pvr_csb_pack (&pds_state[PVR_STATIC_CLEAR_PPP_PDS_TYPE_SIZEINFO2],
3146 TA_STATE_PDS_SIZEINFO2,
3147 sizeinfo2) {
3148 sizeinfo2.usc_sharedsize =
3149 DIV_ROUND_UP(load_op->const_shareds_count,
3150 ROGUE_TA_STATE_PDS_SIZEINFO2_USC_SHAREDSIZE_UNIT_SIZE);
3151 }
3152
3153 /* Dummy coefficient loading program. */
3154 pds_state[PVR_STATIC_CLEAR_PPP_PDS_TYPE_VARYINGBASE] = 0;
3155
3156 pvr_csb_pack (&pds_state[PVR_STATIC_CLEAR_PPP_PDS_TYPE_TEXTUREDATABASE],
3157 TA_STATE_PDS_TEXTUREDATABASE,
3158 texturedatabase) {
3159 texturedatabase.addr = PVR_DEV_ADDR(shareds_update_program.data_offset);
3160 }
3161
3162 template.config.pds_state = &pds_state;
3163
3164 pvr_emit_ppp_from_template(&sub_cmd->control_stream, &template, &pvr_bo);
3165 list_add(&pvr_bo->link, &cmd_buffer->bo_list);
3166
3167 pvr_emit_clear_words(cmd_buffer, sub_cmd);
3168
3169 pvr_reset_graphics_dirty_state(cmd_buffer, false);
3170
3171 return VK_SUCCESS;
3172 }
3173
3174 void pvr_CmdBeginRenderPass2(VkCommandBuffer commandBuffer,
3175 const VkRenderPassBeginInfo *pRenderPassBeginInfo,
3176 const VkSubpassBeginInfo *pSubpassBeginInfo)
3177 {
3178 PVR_FROM_HANDLE(pvr_framebuffer,
3179 framebuffer,
3180 pRenderPassBeginInfo->framebuffer);
3181 PVR_FROM_HANDLE(pvr_render_pass, pass, pRenderPassBeginInfo->renderPass);
3182 PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
3183 const struct pvr_renderpass_hwsetup_subpass *hw_subpass;
3184 struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
3185 VkResult result;
3186
3187 PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer);
3188
3189 assert(!state->render_pass_info.pass);
3190 assert(cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_PRIMARY);
3191
3192 /* FIXME: Create a separate function for everything using pass->subpasses,
3193 * look at cmd_buffer_begin_subpass() for example. */
3194 state->render_pass_info.pass = pass;
3195 state->render_pass_info.framebuffer = framebuffer;
3196 state->render_pass_info.subpass_idx = 0;
3197 state->render_pass_info.render_area = pRenderPassBeginInfo->renderArea;
3198 state->render_pass_info.current_hw_subpass = 0;
3199 state->render_pass_info.pipeline_bind_point =
3200 pass->subpasses[0].pipeline_bind_point;
3201 state->render_pass_info.isp_userpass = pass->subpasses[0].isp_userpass;
3202 state->dirty.isp_userpass = true;
3203
3204 result = pvr_cmd_buffer_setup_attachments(cmd_buffer, pass, framebuffer);
3205 if (result != VK_SUCCESS)
3206 return;
3207
3208 result = pvr_init_render_targets(cmd_buffer->device, pass, framebuffer);
3209 if (result != VK_SUCCESS) {
3210 pvr_cmd_buffer_set_error_unwarned(cmd_buffer, result);
3211 return;
3212 }
3213
3214 result = pvr_cmd_buffer_set_clear_values(cmd_buffer, pRenderPassBeginInfo);
3215 if (result != VK_SUCCESS)
3216 return;
3217
3218 assert(pass->subpasses[0].pipeline_bind_point ==
3219 VK_PIPELINE_BIND_POINT_GRAPHICS);
3220
3221 result = pvr_cmd_buffer_start_sub_cmd(cmd_buffer, PVR_SUB_CMD_TYPE_GRAPHICS);
3222 if (result != VK_SUCCESS)
3223 return;
3224
3225 /* Run subpass 0 "soft" background object after the actual background
3226 * object.
3227 */
3228 hw_subpass = pvr_get_hw_subpass(pass, 0);
3229 if (hw_subpass->load_op) {
3230 result = pvr_cs_write_load_op(cmd_buffer,
3231 &cmd_buffer->state.current_sub_cmd->gfx,
3232 hw_subpass->load_op,
3233 0);
3234 if (result != VK_SUCCESS)
3235 return;
3236 }
3237
3238 pvr_perform_start_of_render_clears(cmd_buffer);
3239 pvr_stash_depth_format(&cmd_buffer->state,
3240 &cmd_buffer->state.current_sub_cmd->gfx);
3241 }
3242
3243 VkResult pvr_BeginCommandBuffer(VkCommandBuffer commandBuffer,
3244 const VkCommandBufferBeginInfo *pBeginInfo)
3245 {
3246 PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
3247 struct pvr_cmd_buffer_state *state;
3248 VkResult result;
3249
3250 vk_command_buffer_begin(&cmd_buffer->vk, pBeginInfo);
3251
3252 cmd_buffer->usage_flags = pBeginInfo->flags;
3253 state = &cmd_buffer->state;
3254
3255 /* VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT must be ignored for
3256 * primary level command buffers.
3257 *
3258 * From the Vulkan 1.0 spec:
3259 *
3260 * VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT specifies that a
3261 * secondary command buffer is considered to be entirely inside a render
3262 * pass. If this is a primary command buffer, then this bit is ignored.
3263 */
3264 if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_PRIMARY) {
3265 cmd_buffer->usage_flags &=
3266 ~VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT;
3267 }
3268
3269 if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY) {
3270 if (cmd_buffer->usage_flags &
3271 VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT) {
3272 const VkCommandBufferInheritanceInfo *inheritance_info =
3273 pBeginInfo->pInheritanceInfo;
3274 struct pvr_render_pass *pass;
3275
3276 pass = pvr_render_pass_from_handle(inheritance_info->renderPass);
3277 state->render_pass_info.pass = pass;
3278 state->render_pass_info.framebuffer =
3279 pvr_framebuffer_from_handle(inheritance_info->framebuffer);
3280 state->render_pass_info.subpass_idx = inheritance_info->subpass;
3281 state->render_pass_info.isp_userpass =
3282 pass->subpasses[inheritance_info->subpass].isp_userpass;
3283
3284 result =
3285 pvr_cmd_buffer_start_sub_cmd(cmd_buffer, PVR_SUB_CMD_TYPE_GRAPHICS);
3286 if (result != VK_SUCCESS)
3287 return result;
3288
3289 state->vis_test_enabled = inheritance_info->occlusionQueryEnable;
3290 }
3291
3292 state->dirty.isp_userpass = true;
3293 }
3294
3295 util_dynarray_init(&state->query_indices, NULL);
3296
3297 memset(state->barriers_needed,
3298 0xFF,
3299 sizeof(*state->barriers_needed) * ARRAY_SIZE(state->barriers_needed));
3300
3301 return VK_SUCCESS;
3302 }
3303
3304 VkResult pvr_cmd_buffer_add_transfer_cmd(struct pvr_cmd_buffer *cmd_buffer,
3305 struct pvr_transfer_cmd *transfer_cmd)
3306 {
3307 struct pvr_sub_cmd_transfer *sub_cmd;
3308 VkResult result;
3309
3310 result = pvr_cmd_buffer_start_sub_cmd(cmd_buffer, PVR_SUB_CMD_TYPE_TRANSFER);
3311 if (result != VK_SUCCESS)
3312 return result;
3313
3314 sub_cmd = &cmd_buffer->state.current_sub_cmd->transfer;
3315
3316 list_addtail(&transfer_cmd->link, sub_cmd->transfer_cmds);
3317
3318 return VK_SUCCESS;
3319 }
3320
3321 static VkResult
3322 pvr_setup_vertex_buffers(struct pvr_cmd_buffer *cmd_buffer,
3323 const struct pvr_graphics_pipeline *const gfx_pipeline)
3324 {
3325 const struct pvr_vertex_shader_state *const vertex_state =
3326 &gfx_pipeline->shader_state.vertex;
3327 struct pvr_cmd_buffer_state *const state = &cmd_buffer->state;
3328 const struct pvr_pds_info *const pds_info = state->pds_shader.info;
3329 struct pvr_suballoc_bo *pvr_bo;
3330 const uint8_t *entries;
3331 uint32_t *dword_buffer;
3332 uint64_t *qword_buffer;
3333 VkResult result;
3334
3335 result =
3336 pvr_cmd_buffer_alloc_mem(cmd_buffer,
3337 cmd_buffer->device->heaps.pds_heap,
3338 PVR_DW_TO_BYTES(pds_info->data_size_in_dwords),
3339 &pvr_bo);
3340 if (result != VK_SUCCESS)
3341 return result;
3342
3343 dword_buffer = (uint32_t *)pvr_bo_suballoc_get_map_addr(pvr_bo);
3344 qword_buffer = (uint64_t *)pvr_bo_suballoc_get_map_addr(pvr_bo);
3345
3346 entries = (uint8_t *)pds_info->entries;
3347
3348 for (uint32_t i = 0; i < pds_info->entry_count; i++) {
3349 const struct pvr_const_map_entry *const entry_header =
3350 (struct pvr_const_map_entry *)entries;
3351
3352 switch (entry_header->type) {
3353 case PVR_PDS_CONST_MAP_ENTRY_TYPE_LITERAL32: {
3354 const struct pvr_const_map_entry_literal32 *const literal =
3355 (struct pvr_const_map_entry_literal32 *)entries;
3356
3357 PVR_WRITE(dword_buffer,
3358 literal->literal_value,
3359 literal->const_offset,
3360 pds_info->data_size_in_dwords);
3361
3362 entries += sizeof(*literal);
3363 break;
3364 }
3365
3366 case PVR_PDS_CONST_MAP_ENTRY_TYPE_DOUTU_ADDRESS: {
3367 const struct pvr_const_map_entry_doutu_address *const doutu_addr =
3368 (struct pvr_const_map_entry_doutu_address *)entries;
3369
3370 const pco_data *const vs_data = &state->gfx_pipeline->vs_data;
3371 const pvr_dev_addr_t exec_addr =
3372 PVR_DEV_ADDR_OFFSET(vertex_state->bo->dev_addr,
3373 vs_data->common.entry_offset);
3374 uint64_t addr = 0ULL;
3375
3376 pvr_set_usc_execution_address64(&addr, exec_addr.addr);
3377
3378 PVR_WRITE(qword_buffer,
3379 addr | doutu_addr->doutu_control,
3380 doutu_addr->const_offset,
3381 pds_info->data_size_in_dwords);
3382
3383 entries += sizeof(*doutu_addr);
3384 break;
3385 }
3386
3387 case PVR_PDS_CONST_MAP_ENTRY_TYPE_BASE_INSTANCE: {
3388 const struct pvr_const_map_entry_base_instance *const base_instance =
3389 (struct pvr_const_map_entry_base_instance *)entries;
3390
3391 PVR_WRITE(dword_buffer,
3392 state->draw_state.base_instance,
3393 base_instance->const_offset,
3394 pds_info->data_size_in_dwords);
3395
3396 entries += sizeof(*base_instance);
3397 break;
3398 }
3399
3400 case PVR_PDS_CONST_MAP_ENTRY_TYPE_BASE_VERTEX: {
3401 const struct pvr_const_map_entry_base_instance *const base_instance =
3402 (struct pvr_const_map_entry_base_instance *)entries;
3403
3404 PVR_WRITE(dword_buffer,
3405 state->draw_state.base_vertex,
3406 base_instance->const_offset,
3407 pds_info->data_size_in_dwords);
3408
3409 entries += sizeof(*base_instance);
3410 break;
3411 }
3412
3413 case PVR_PDS_CONST_MAP_ENTRY_TYPE_VERTEX_ATTRIBUTE_ADDRESS: {
3414 const struct pvr_const_map_entry_vertex_attribute_address
3415 *const attribute =
3416 (struct pvr_const_map_entry_vertex_attribute_address *)entries;
3417 const struct pvr_vertex_binding *const binding =
3418 &state->vertex_bindings[attribute->binding_index];
3419 /* In relation to the Vulkan spec, section 22.4 "Vertex Input Address
3420 * Calculation":
3421 * Adding binding->offset corresponds to calculating the
3422 * `bufferBindingAddress`. Adding attribute->offset corresponds to
3423 * adding the `attribDesc.offset`. The `effectiveVertexOffset` is
3424 * taken care of by the PDS program itself with a DDMAD, which will
3425 * multiply the vertex/instance idx with the binding's stride and
3426 * add that to the address provided here.
3427 */
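/* I.e. the address written below is
 *    buffer_base + binding->offset + attribute->offset
 * and the PDS DDMAD adds (vertex/instance index * stride) at runtime.
 */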
3428 const pvr_dev_addr_t addr =
3429 PVR_DEV_ADDR_OFFSET(binding->buffer->dev_addr,
3430 binding->offset + attribute->offset);
3431
3432 PVR_WRITE(qword_buffer,
3433 addr.addr,
3434 attribute->const_offset,
3435 pds_info->data_size_in_dwords);
3436
3437 entries += sizeof(*attribute);
3438 break;
3439 }
3440
3441 case PVR_PDS_CONST_MAP_ENTRY_TYPE_ROBUST_VERTEX_ATTRIBUTE_ADDRESS: {
3442 const struct pvr_const_map_entry_robust_vertex_attribute_address
3443 *const attribute =
3444 (struct pvr_const_map_entry_robust_vertex_attribute_address *)
3445 entries;
3446 const struct pvr_vertex_binding *const binding =
3447 &state->vertex_bindings[attribute->binding_index];
3448 pvr_dev_addr_t addr;
3449
3450 if (binding->buffer->vk.size <
3451 (attribute->offset + attribute->component_size_in_bytes)) {
3452 /* Replace with a load from the robustness buffer when no attribute
3453 * fits within the bound buffer.
3454 */
3455 addr = PVR_DEV_ADDR_OFFSET(
3456 cmd_buffer->device->robustness_buffer->vma->dev_addr,
3457 attribute->robustness_buffer_offset);
3458 } else {
3459 addr = PVR_DEV_ADDR_OFFSET(binding->buffer->dev_addr,
3460 binding->offset + attribute->offset);
3461 }
3462
3463 PVR_WRITE(qword_buffer,
3464 addr.addr,
3465 attribute->const_offset,
3466 pds_info->data_size_in_dwords);
3467
3468 entries += sizeof(*attribute);
3469 break;
3470 }
3471
3472 case PVR_PDS_CONST_MAP_ENTRY_TYPE_VERTEX_ATTRIBUTE_MAX_INDEX: {
3473 const struct pvr_const_map_entry_vertex_attribute_max_index *attribute =
3474 (struct pvr_const_map_entry_vertex_attribute_max_index *)entries;
3475 const struct pvr_vertex_binding *const binding =
3476 &state->vertex_bindings[attribute->binding_index];
3477 const uint64_t bound_size = binding->buffer->vk.size - binding->offset;
3478 const uint32_t attribute_end =
3479 attribute->offset + attribute->component_size_in_bytes;
3480 uint32_t max_index;
3481
3482 if (PVR_HAS_FEATURE(&cmd_buffer->device->pdevice->dev_info,
3483 pds_ddmadt)) {
3484 /* TODO: PVR_PDS_CONST_MAP_ENTRY_TYPE_VERTEX_ATTRIBUTE_MAX_INDEX
3485 * has the same define value as
3486 * PVR_PDS_CONST_MAP_ENTRY_TYPE_VERTEX_ATTR_DDMADT_OOB_BUFFER_SIZE
3487 * so maybe we want to remove one of the defines or change the
3488 * values.
3489 */
3490 pvr_finishme("Unimplemented robust buffer access with DDMADT");
3491 assert(false);
3492 }
3493
3494 /* If the stride is 0 then all attributes use the same single element
3495 * from the binding, so the maximum index is 0.
3496 */
3497 if (bound_size < attribute_end || attribute->stride == 0) {
3498 max_index = 0;
3499 } else {
3500 max_index = (uint32_t)(bound_size / attribute->stride) - 1;
3501
3502 /* There's one last attribute that can fit in. */
3503 if (bound_size % attribute->stride >= attribute_end)
3504 max_index++;
3505 }
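/* Worked example (hypothetical numbers): with bound_size = 100, stride = 16
 * and attribute_end = 8, 100 / 16 - 1 = 5 and 100 % 16 = 4 < 8, so
 * max_index = 5; with attribute_end = 4 the remainder fits one more
 * attribute and max_index becomes 6.
 */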
3506
3507 PVR_WRITE(dword_buffer,
3508 max_index,
3509 attribute->const_offset,
3510 pds_info->data_size_in_dwords);
3511
3512 entries += sizeof(*attribute);
3513 break;
3514 }
3515
3516 default:
3517 unreachable("Unsupported data section map");
3518 break;
3519 }
3520 }
3521
3522 state->pds_vertex_attrib_offset =
3523 pvr_bo->dev_addr.addr -
3524 cmd_buffer->device->heaps.pds_heap->base_addr.addr;
3525
3526 return VK_SUCCESS;
3527 }
3528
3529 static VkResult pvr_setup_descriptor_mappings_old(
3530 struct pvr_cmd_buffer *const cmd_buffer,
3531 enum pvr_stage_allocation stage,
3532 const struct pvr_stage_allocation_descriptor_state *descriptor_state,
3533 const pvr_dev_addr_t *const num_worgroups_buff_addr,
3534 uint32_t *const descriptor_data_offset_out)
3535 {
3536 const struct pvr_pds_info *const pds_info = &descriptor_state->pds_info;
3537 const struct pvr_descriptor_state *desc_state;
3538 struct pvr_suballoc_bo *pvr_bo;
3539 const uint8_t *entries;
3540 uint32_t *dword_buffer;
3541 uint64_t *qword_buffer;
3542 VkResult result;
3543
3544 if (!pds_info->data_size_in_dwords)
3545 return VK_SUCCESS;
3546
3547 result =
3548 pvr_cmd_buffer_alloc_mem(cmd_buffer,
3549 cmd_buffer->device->heaps.pds_heap,
3550 PVR_DW_TO_BYTES(pds_info->data_size_in_dwords),
3551 &pvr_bo);
3552 if (result != VK_SUCCESS)
3553 return result;
3554
3555 dword_buffer = (uint32_t *)pvr_bo_suballoc_get_map_addr(pvr_bo);
3556 qword_buffer = (uint64_t *)pvr_bo_suballoc_get_map_addr(pvr_bo);
3557
3558 entries = (uint8_t *)pds_info->entries;
3559
3560 switch (stage) {
3561 case PVR_STAGE_ALLOCATION_VERTEX_GEOMETRY:
3562 case PVR_STAGE_ALLOCATION_FRAGMENT:
3563 desc_state = &cmd_buffer->state.gfx_desc_state;
3564 break;
3565
3566 case PVR_STAGE_ALLOCATION_COMPUTE:
3567 desc_state = &cmd_buffer->state.compute_desc_state;
3568 break;
3569
3570 default:
3571 unreachable("Unsupported stage.");
3572 break;
3573 }
3574
3575 for (uint32_t i = 0; i < pds_info->entry_count; i++) {
3576 const struct pvr_const_map_entry *const entry_header =
3577 (struct pvr_const_map_entry *)entries;
3578
3579 switch (entry_header->type) {
3580 case PVR_PDS_CONST_MAP_ENTRY_TYPE_LITERAL32: {
3581 const struct pvr_const_map_entry_literal32 *const literal =
3582 (struct pvr_const_map_entry_literal32 *)entries;
3583
3584 PVR_WRITE(dword_buffer,
3585 literal->literal_value,
3586 literal->const_offset,
3587 pds_info->data_size_in_dwords);
3588
3589 entries += sizeof(*literal);
3590 break;
3591 }
3592
3593 case PVR_PDS_CONST_MAP_ENTRY_TYPE_CONSTANT_BUFFER: {
3594 const struct pvr_const_map_entry_constant_buffer *const_buffer_entry =
3595 (struct pvr_const_map_entry_constant_buffer *)entries;
3596 const uint32_t desc_set = const_buffer_entry->desc_set;
3597 const uint32_t binding = const_buffer_entry->binding;
3598 const struct pvr_descriptor_set *descriptor_set;
3599 const struct pvr_descriptor *descriptor;
3600 pvr_dev_addr_t buffer_addr;
3601
3602 assert(desc_set < PVR_MAX_DESCRIPTOR_SETS);
3603 descriptor_set = desc_state->descriptor_sets[desc_set];
3604
3605 /* TODO: Handle dynamic buffers. */
3606 descriptor = &descriptor_set->descriptors[binding];
3607 assert(descriptor->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER);
3608
3609 assert(descriptor->buffer_desc_range ==
3610 PVR_DW_TO_BYTES(const_buffer_entry->size_in_dwords));
3611 assert(descriptor->buffer_whole_range ==
3612 PVR_DW_TO_BYTES(const_buffer_entry->size_in_dwords));
3613
3614 buffer_addr =
3615 PVR_DEV_ADDR_OFFSET(descriptor->buffer_dev_addr,
3616 const_buffer_entry->offset * sizeof(uint32_t));
3617
3618 PVR_WRITE(qword_buffer,
3619 buffer_addr.addr,
3620 const_buffer_entry->const_offset,
3621 pds_info->data_size_in_dwords);
3622
3623 entries += sizeof(*const_buffer_entry);
3624 break;
3625 }
3626
3627 case PVR_PDS_CONST_MAP_ENTRY_TYPE_DESCRIPTOR_SET: {
3628 const struct pvr_const_map_entry_descriptor_set *desc_set_entry =
3629 (struct pvr_const_map_entry_descriptor_set *)entries;
3630 const uint32_t desc_set_num = desc_set_entry->descriptor_set;
3631 const struct pvr_descriptor_set *descriptor_set;
3632 pvr_dev_addr_t desc_set_addr;
3633 uint64_t desc_portion_offset;
3634
3635 assert(desc_set_num < PVR_MAX_DESCRIPTOR_SETS);
3636
3637 /* TODO: Remove this when the compiler provides us with usage info?
3638 */
3639 /* We skip DMAing unbound descriptor sets. */
3640 if (!(desc_state->valid_mask & BITFIELD_BIT(desc_set_num))) {
3641 const struct pvr_const_map_entry_literal32 *literal;
3642 uint32_t zero_literal_value;
3643
3644 /* The code segment contains a DOUT instruction, so in the data
3645 * section we have to write a DOUTD_SRC0 and a DOUTD_SRC1.
3646 * We'll write 0 for DOUTD_SRC0 since we don't have a buffer to DMA.
3647 * We're expecting a LITERAL32 entry containing the value for
3648 * DOUTD_SRC1 next, so let's make sure we get it and write it
3649 * with BSIZE set to 0, disabling the DMA operation.
3650 * We don't want the LITERAL32 to be processed as normal, otherwise
3651 * we'd be DMAing from an address of 0.
3652 */
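/* The two writes below do exactly that: a zero qword at the descriptor set
 * entry's offset, then the literal value with its BSIZE field masked off.
 * The extra i++ accounts for the LITERAL32 entry consumed here.
 */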
3653
3654 entries += sizeof(*desc_set_entry);
3655 literal = (struct pvr_const_map_entry_literal32 *)entries;
3656
3657 assert(literal->type == PVR_PDS_CONST_MAP_ENTRY_TYPE_LITERAL32);
3658
3659 zero_literal_value =
3660 literal->literal_value &
3661 PVR_ROGUE_PDSINST_DOUT_FIELDS_DOUTD_SRC1_BSIZE_CLRMSK;
3662
3663 PVR_WRITE(qword_buffer,
3664 UINT64_C(0),
3665 desc_set_entry->const_offset,
3666 pds_info->data_size_in_dwords);
3667
3668 PVR_WRITE(dword_buffer,
3669 zero_literal_value,
3670 desc_set_entry->const_offset,
3671 pds_info->data_size_in_dwords);
3672
3673 entries += sizeof(*literal);
3674 i++;
3675 continue;
3676 }
3677
3678 descriptor_set = desc_state->descriptor_sets[desc_set_num];
3679
3680 desc_set_addr = descriptor_set->pvr_bo->dev_addr;
3681
3682 if (desc_set_entry->primary) {
3683 desc_portion_offset =
3684 descriptor_set->layout->memory_layout_in_dwords_per_stage[stage]
3685 .primary_offset;
3686 } else {
3687 desc_portion_offset =
3688 descriptor_set->layout->memory_layout_in_dwords_per_stage[stage]
3689 .secondary_offset;
3690 }
3691 desc_portion_offset = PVR_DW_TO_BYTES(desc_portion_offset);
3692
3693 desc_set_addr =
3694 PVR_DEV_ADDR_OFFSET(desc_set_addr, desc_portion_offset);
3695
3696 desc_set_addr = PVR_DEV_ADDR_OFFSET(
3697 desc_set_addr,
3698 PVR_DW_TO_BYTES((uint64_t)desc_set_entry->offset_in_dwords));
3699
3700 PVR_WRITE(qword_buffer,
3701 desc_set_addr.addr,
3702 desc_set_entry->const_offset,
3703 pds_info->data_size_in_dwords);
3704
3705 entries += sizeof(*desc_set_entry);
3706 break;
3707 }
3708
3709 case PVR_PDS_CONST_MAP_ENTRY_TYPE_SPECIAL_BUFFER: {
3710 const struct pvr_const_map_entry_special_buffer *special_buff_entry =
3711 (struct pvr_const_map_entry_special_buffer *)entries;
3712
3713 switch (special_buff_entry->buffer_type) {
3714 case PVR_BUFFER_TYPE_COMPILE_TIME: {
3715 uint64_t addr = descriptor_state->static_consts->dev_addr.addr;
3716
3717 PVR_WRITE(qword_buffer,
3718 addr,
3719 special_buff_entry->const_offset,
3720 pds_info->data_size_in_dwords);
3721 break;
3722 }
3723
3724 case PVR_BUFFER_TYPE_BLEND_CONSTS:
3725 /* TODO: See if instead of reusing the blend constant buffer type
3726 * entry, we can setup a new buffer type specifically for
3727 * num_workgroups or other built-in variables. The mappings are
3728 * setup at pipeline creation when creating the descriptor program.
3729 */
3730 if (stage == PVR_STAGE_ALLOCATION_COMPUTE) {
3731 assert(num_worgroups_buff_addr->addr);
3732
3733 /* TODO: Check if we need to offset this (e.g. for just y and z),
3734 * or cope with any reordering?
3735 */
3736 PVR_WRITE(qword_buffer,
3737 num_worgroups_buff_addr->addr,
3738 special_buff_entry->const_offset,
3739 pds_info->data_size_in_dwords);
3740 } else {
3741 pvr_finishme("Add blend constants support.");
3742 }
3743 break;
3744
3745 default:
3746 unreachable("Unsupported special buffer type.");
3747 }
3748
3749 entries += sizeof(*special_buff_entry);
3750 break;
3751 }
3752
3753 default:
3754 unreachable("Unsupported map entry type.");
3755 }
3756 }
3757
3758 *descriptor_data_offset_out =
3759 pvr_bo->dev_addr.addr -
3760 cmd_buffer->device->heaps.pds_heap->base_addr.addr;
3761
3762 return VK_SUCCESS;
3763 }
3764
3765 /* Note that the descriptor set doesn't have any space for dynamic buffer
3766 * descriptors so this works on the assumption that you have a buffer with space
3767 * for them at the end.
3768 */
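/* Sketch of the buffer layout implied by the two offset helpers below
 * (offsets in dwords, relative to the start of the buffer):
 *
 *    0 ............................. copy of the descriptor set data
 *    total_size_in_dwords .......... dynamic primary data (all stages)
 *    + primary_dynamic_size ........ dynamic secondary data
 */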
3769 static uint16_t pvr_get_dynamic_descriptor_primary_offset(
3770 const struct pvr_device *device,
3771 const struct pvr_descriptor_set_layout *layout,
3772 const struct pvr_descriptor_set_layout_binding *binding,
3773 const uint32_t stage,
3774 const uint32_t desc_idx)
3775 {
3776 struct pvr_descriptor_size_info size_info;
3777 uint32_t offset;
3778
3779 assert(vk_descriptor_type_is_dynamic(binding->type));
3780 assert(desc_idx < binding->descriptor_count);
3781
3782 pvr_descriptor_size_info_init(device, binding->type, &size_info);
3783
3784 offset = layout->total_size_in_dwords;
3785 offset += binding->per_stage_offset_in_dwords[stage].primary;
3786 offset += (desc_idx * size_info.primary);
3787
3788 /* Offset must fit in 16 bits. */
3789 assert(offset < UINT16_MAX);
3790
3791 return (uint16_t)offset;
3792 }
3793
3794 /* Note that the descriptor set doesn't have any space for dynamic buffer
3795 * descriptors so this works on the assumption that you have a buffer with space
3796 * for them at the end.
3797 */
3798 static uint16_t pvr_get_dynamic_descriptor_secondary_offset(
3799 const struct pvr_device *device,
3800 const struct pvr_descriptor_set_layout *layout,
3801 const struct pvr_descriptor_set_layout_binding *binding,
3802 const uint32_t stage,
3803 const uint32_t desc_idx)
3804 {
3805 struct pvr_descriptor_size_info size_info;
3806 uint32_t offset;
3807
3808 assert(vk_descriptor_type_is_dynamic(binding->type));
3809 assert(desc_idx < binding->descriptor_count);
3810
3811 pvr_descriptor_size_info_init(device, binding->type, &size_info);
3812
3813 offset = layout->total_size_in_dwords;
3814 offset +=
3815 layout->memory_layout_in_dwords_per_stage[stage].primary_dynamic_size;
3816 offset += binding->per_stage_offset_in_dwords[stage].secondary;
3817 offset += (desc_idx * size_info.secondary);
3818
3819 /* Offset must fit in 16 bits. */
3820 assert(offset < UINT16_MAX);
3821
3822 return (uint16_t)offset;
3823 }
3824
3825 /**
3826 * \brief Upload a copy of the descriptor set with dynamic buffer offsets
3827 * applied.
3828 */
3829 /* TODO: We should probably make the compiler aware of the dynamic descriptors.
3830 * We could use push constants like Anv seems to do. This would avoid having to
3831 * duplicate all sets containing dynamic descriptors each time the offsets are
3832 * updated.
3833 */
3834 static VkResult pvr_cmd_buffer_upload_patched_desc_set(
3835 struct pvr_cmd_buffer *cmd_buffer,
3836 const struct pvr_descriptor_set *desc_set,
3837 const uint32_t *dynamic_offsets,
3838 struct pvr_suballoc_bo **const bo_out)
3839 {
3840 const struct pvr_descriptor_set_layout *layout = desc_set->layout;
3841 const uint64_t normal_desc_set_size =
3842 PVR_DW_TO_BYTES(layout->total_size_in_dwords);
3843 const uint64_t dynamic_descs_size =
3844 PVR_DW_TO_BYTES(layout->total_dynamic_size_in_dwords);
3845 struct pvr_descriptor_size_info dynamic_uniform_buffer_size_info;
3846 struct pvr_descriptor_size_info dynamic_storage_buffer_size_info;
3847 struct pvr_device *device = cmd_buffer->device;
3848 struct pvr_suballoc_bo *patched_desc_set_bo;
3849 uint32_t *src_mem_ptr, *dst_mem_ptr;
3850 uint32_t desc_idx_offset = 0;
3851 VkResult result;
3852
3853 assert(desc_set->layout->dynamic_buffer_count > 0);
3854
3855 pvr_descriptor_size_info_init(device,
3856 VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC,
3857 &dynamic_uniform_buffer_size_info);
3858 pvr_descriptor_size_info_init(device,
3859 VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC,
3860 &dynamic_storage_buffer_size_info);
3861
3862 /* TODO: In the descriptor set we don't account for dynamic buffer
3863 * descriptors and take care of them in the pipeline layout. The pipeline
3864 * layout allocates them at the beginning but let's put them at the end just
3865 * because it makes things a bit easier. Ideally we should be using the
3866 * pipeline layout and use the offsets from the pipeline layout to patch
3867 * descriptors.
3868 */
3869 result = pvr_cmd_buffer_alloc_mem(cmd_buffer,
3870 cmd_buffer->device->heaps.general_heap,
3871 normal_desc_set_size + dynamic_descs_size,
3872 &patched_desc_set_bo);
3873 if (result != VK_SUCCESS)
3874 return result;
3875
3876 src_mem_ptr = (uint32_t *)pvr_bo_suballoc_get_map_addr(desc_set->pvr_bo);
3877 dst_mem_ptr = (uint32_t *)pvr_bo_suballoc_get_map_addr(patched_desc_set_bo);
3878
3879 memcpy(dst_mem_ptr, src_mem_ptr, normal_desc_set_size);
3880
3881 for (uint32_t i = 0; i < desc_set->layout->binding_count; i++) {
3882 const struct pvr_descriptor_set_layout_binding *binding =
3883 &desc_set->layout->bindings[i];
3884 const struct pvr_descriptor *descriptors =
3885 &desc_set->descriptors[binding->descriptor_index];
3886 const struct pvr_descriptor_size_info *size_info;
3887
3888 if (binding->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC)
3889 size_info = &dynamic_uniform_buffer_size_info;
3890 else if (binding->type == VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC)
3891 size_info = &dynamic_storage_buffer_size_info;
3892 else
3893 continue;
3894
3895 for (uint32_t stage = 0; stage < PVR_STAGE_ALLOCATION_COUNT; stage++) {
3896 uint32_t primary_offset;
3897 uint32_t secondary_offset;
3898
3899 if (!(binding->shader_stage_mask & BITFIELD_BIT(stage)))
3900 continue;
3901
3902 /* Get the offsets for the first dynamic descriptor in the current
3903 * binding.
3904 */
3905 primary_offset =
3906 pvr_get_dynamic_descriptor_primary_offset(device,
3907 desc_set->layout,
3908 binding,
3909 stage,
3910 0);
3911 secondary_offset =
3912 pvr_get_dynamic_descriptor_secondary_offset(device,
3913 desc_set->layout,
3914 binding,
3915 stage,
3916 0);
3917
3918 /* clang-format off */
3919 for (uint32_t desc_idx = 0;
3920 desc_idx < binding->descriptor_count;
3921 desc_idx++) {
3922 /* clang-format on */
3923 const pvr_dev_addr_t addr =
3924 PVR_DEV_ADDR_OFFSET(descriptors[desc_idx].buffer_dev_addr,
3925 dynamic_offsets[desc_idx + desc_idx_offset]);
3926 const VkDeviceSize range =
3927 MIN2(descriptors[desc_idx].buffer_desc_range,
3928 descriptors[desc_idx].buffer_whole_range -
3929 dynamic_offsets[desc_idx]);
3930
3931 #if MESA_DEBUG
3932 uint32_t desc_primary_offset;
3933 uint32_t desc_secondary_offset;
3934
3935 desc_primary_offset =
3936 pvr_get_dynamic_descriptor_primary_offset(device,
3937 desc_set->layout,
3938 binding,
3939 stage,
3940 desc_idx);
3941 desc_secondary_offset =
3942 pvr_get_dynamic_descriptor_secondary_offset(device,
3943 desc_set->layout,
3944 binding,
3945 stage,
3946 desc_idx);
3947
3948 /* Check the assumption that the descriptors within a binding, for
3949 * a particular stage, are allocated consecutively.
3950 */
3951 assert(desc_primary_offset ==
3952 primary_offset + size_info->primary * desc_idx);
3953 assert(desc_secondary_offset ==
3954 secondary_offset + size_info->secondary * desc_idx);
3955 #endif
3956
3957 assert(descriptors[desc_idx].type == binding->type);
3958
3959 memcpy(dst_mem_ptr + primary_offset + size_info->primary * desc_idx,
3960 &addr.addr,
3961 PVR_DW_TO_BYTES(size_info->primary));
3962 memcpy(dst_mem_ptr + secondary_offset +
3963 size_info->secondary * desc_idx,
3964 &range,
3965 PVR_DW_TO_BYTES(size_info->secondary));
3966 }
3967 }
3968
3969 desc_idx_offset += binding->descriptor_count;
3970 }
3971
3972 *bo_out = patched_desc_set_bo;
3973
3974 return VK_SUCCESS;
3975 }
3976
3977 #define PVR_SELECT(_geom, _frag, _compute) \
3978 (stage == PVR_STAGE_ALLOCATION_VERTEX_GEOMETRY) \
3979 ? (_geom) \
3980 : (stage == PVR_STAGE_ALLOCATION_FRAGMENT) ? (_frag) : (_compute)
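/* For example,
 *    desc_state = PVR_SELECT(&a, &b, &c);
 * expands to
 *    desc_state = (stage == PVR_STAGE_ALLOCATION_VERTEX_GEOMETRY)
 *                    ? (&a)
 *                    : (stage == PVR_STAGE_ALLOCATION_FRAGMENT) ? (&b) : (&c);
 * i.e. the macro relies on a local variable named `stage` being in scope.
 */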
3981
3982 static VkResult
3983 pvr_cmd_buffer_upload_desc_set_table(struct pvr_cmd_buffer *const cmd_buffer,
3984 enum pvr_stage_allocation stage,
3985 pvr_dev_addr_t *addr_out)
3986 {
3987 uint64_t bound_desc_sets[PVR_MAX_DESCRIPTOR_SETS];
3988 const struct pvr_descriptor_state *desc_state;
3989 struct pvr_suballoc_bo *suballoc_bo;
3990 uint32_t dynamic_offset_idx = 0;
3991 VkResult result;
3992
3993 switch (stage) {
3994 case PVR_STAGE_ALLOCATION_VERTEX_GEOMETRY:
3995 case PVR_STAGE_ALLOCATION_FRAGMENT:
3996 case PVR_STAGE_ALLOCATION_COMPUTE:
3997 break;
3998
3999 default:
4000 unreachable("Unsupported stage.");
4001 break;
4002 }
4003
4004 desc_state = PVR_SELECT(&cmd_buffer->state.gfx_desc_state,
4005 &cmd_buffer->state.gfx_desc_state,
4006 &cmd_buffer->state.compute_desc_state);
4007
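/* Initialise every table slot to all-ones; slots for unbound or empty sets
 * keep this marker value.
 */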
4008 for (uint32_t set = 0; set < ARRAY_SIZE(bound_desc_sets); set++)
4009 bound_desc_sets[set] = ~0;
4010
4011 assert(util_last_bit(desc_state->valid_mask) <= ARRAY_SIZE(bound_desc_sets));
4012 for (uint32_t set = 0; set < util_last_bit(desc_state->valid_mask); set++) {
4013 const struct pvr_descriptor_set *desc_set;
4014
4015 if (!(desc_state->valid_mask & BITFIELD_BIT(set))) {
4016 const struct pvr_pipeline_layout *pipeline_layout =
4017 PVR_SELECT(cmd_buffer->state.gfx_pipeline->base.layout,
4018 cmd_buffer->state.gfx_pipeline->base.layout,
4019 cmd_buffer->state.compute_pipeline->base.layout);
4020 const struct pvr_descriptor_set_layout *set_layout;
4021
4022 assert(set <= pipeline_layout->set_count);
4023
4024 set_layout = pipeline_layout->set_layout[set];
4025 dynamic_offset_idx += set_layout->dynamic_buffer_count;
4026
4027 continue;
4028 }
4029
4030 desc_set = desc_state->descriptor_sets[set];
4031
4032 /* TODO: Is it better if we don't set the valid_mask for empty sets? */
4033 if (desc_set->layout->descriptor_count == 0)
4034 continue;
4035
4036 if (desc_set->layout->dynamic_buffer_count > 0) {
4037 struct pvr_suballoc_bo *new_desc_set_bo;
4038
4039 assert(dynamic_offset_idx + desc_set->layout->dynamic_buffer_count <=
4040 ARRAY_SIZE(desc_state->dynamic_offsets));
4041
4042 result = pvr_cmd_buffer_upload_patched_desc_set(
4043 cmd_buffer,
4044 desc_set,
4045 &desc_state->dynamic_offsets[dynamic_offset_idx],
4046 &new_desc_set_bo);
4047 if (result != VK_SUCCESS)
4048 return result;
4049
4050 dynamic_offset_idx += desc_set->layout->dynamic_buffer_count;
4051
4052 bound_desc_sets[set] = new_desc_set_bo->dev_addr.addr;
4053 } else {
4054 bound_desc_sets[set] = desc_set->pvr_bo->dev_addr.addr;
4055 }
4056 }
4057
4058 result = pvr_cmd_buffer_upload_general(cmd_buffer,
4059 bound_desc_sets,
4060 sizeof(bound_desc_sets),
4061 &suballoc_bo);
4062 if (result != VK_SUCCESS)
4063 return result;
4064
4065 *addr_out = suballoc_bo->dev_addr;
4066 return VK_SUCCESS;
4067 }
4068
4069 static VkResult
4070 pvr_process_addr_literal(struct pvr_cmd_buffer *cmd_buffer,
4071 enum pvr_pds_addr_literal_type addr_literal_type,
4072 enum pvr_stage_allocation stage,
4073 pvr_dev_addr_t *addr_out)
4074 {
4075 VkResult result;
4076
4077 switch (addr_literal_type) {
4078 case PVR_PDS_ADDR_LITERAL_DESC_SET_ADDRS_TABLE: {
4079 /* TODO: Maybe we want to free pvr_bo, and only link all BOs to the
4080 * command buffer once the data section has been written
4081 * successfully?
4082 */
4083 result =
4084 pvr_cmd_buffer_upload_desc_set_table(cmd_buffer, stage, addr_out);
4085 if (result != VK_SUCCESS)
4086 return result;
4087
4088 break;
4089 }
4090
4091 case PVR_PDS_ADDR_LITERAL_PUSH_CONSTS: {
4092 const struct pvr_pipeline_layout *layout =
4093 PVR_SELECT(cmd_buffer->state.gfx_pipeline->base.layout,
4094 cmd_buffer->state.gfx_pipeline->base.layout,
4095 cmd_buffer->state.compute_pipeline->base.layout);
4096 const uint32_t push_constants_offset =
4097 PVR_SELECT(layout->vert_push_constants_offset,
4098 layout->frag_push_constants_offset,
4099 layout->compute_push_constants_offset);
4100
4101 *addr_out = PVR_DEV_ADDR_OFFSET(cmd_buffer->state.push_constants.dev_addr,
4102 push_constants_offset);
4103 break;
4104 }
4105
4106 case PVR_PDS_ADDR_LITERAL_BLEND_CONSTANTS: {
4107 float *blend_consts =
4108 cmd_buffer->vk.dynamic_graphics_state.cb.blend_constants;
4109 size_t size =
4110 sizeof(cmd_buffer->vk.dynamic_graphics_state.cb.blend_constants);
4111 struct pvr_suballoc_bo *blend_consts_bo;
4112
4113 result = pvr_cmd_buffer_upload_general(cmd_buffer,
4114 blend_consts,
4115 size,
4116 &blend_consts_bo);
4117 if (result != VK_SUCCESS)
4118 return result;
4119
4120 *addr_out = blend_consts_bo->dev_addr;
4121
4122 break;
4123 }
4124
4125 default:
4126 unreachable("Invalid add literal type.");
4127 }
4128
4129 return VK_SUCCESS;
4130 }
4131
4132 #undef PVR_SELECT
4133
4134 static VkResult pvr_setup_descriptor_mappings_new(
4135 struct pvr_cmd_buffer *const cmd_buffer,
4136 enum pvr_stage_allocation stage,
4137 const struct pvr_stage_allocation_descriptor_state *descriptor_state,
4138 uint32_t *const descriptor_data_offset_out)
4139 {
4140 const struct pvr_pds_info *const pds_info = &descriptor_state->pds_info;
4141 struct pvr_suballoc_bo *pvr_bo;
4142 const uint8_t *entries;
4143 uint32_t *dword_buffer;
4144 uint64_t *qword_buffer;
4145 VkResult result;
4146
4147 if (!pds_info->data_size_in_dwords)
4148 return VK_SUCCESS;
4149
4150 result =
4151 pvr_cmd_buffer_alloc_mem(cmd_buffer,
4152 cmd_buffer->device->heaps.pds_heap,
4153 PVR_DW_TO_BYTES(pds_info->data_size_in_dwords),
4154 &pvr_bo);
4155 if (result != VK_SUCCESS)
4156 return result;
4157
4158 dword_buffer = (uint32_t *)pvr_bo_suballoc_get_map_addr(pvr_bo);
4159 qword_buffer = (uint64_t *)pvr_bo_suballoc_get_map_addr(pvr_bo);
4160
4161 entries = (uint8_t *)pds_info->entries;
4162
4163 switch (stage) {
4164 case PVR_STAGE_ALLOCATION_VERTEX_GEOMETRY:
4165 case PVR_STAGE_ALLOCATION_FRAGMENT:
4166 case PVR_STAGE_ALLOCATION_COMPUTE:
4167 break;
4168
4169 default:
4170 unreachable("Unsupported stage.");
4171 break;
4172 }
4173
4174 for (uint32_t i = 0; i < pds_info->entry_count; i++) {
4175 const struct pvr_const_map_entry *const entry_header =
4176 (struct pvr_const_map_entry *)entries;
4177
4178 switch (entry_header->type) {
4179 case PVR_PDS_CONST_MAP_ENTRY_TYPE_LITERAL32: {
4180 const struct pvr_const_map_entry_literal32 *const literal =
4181 (struct pvr_const_map_entry_literal32 *)entries;
4182
4183 PVR_WRITE(dword_buffer,
4184 literal->literal_value,
4185 literal->const_offset,
4186 pds_info->data_size_in_dwords);
4187
4188 entries += sizeof(*literal);
4189 break;
4190 }
4191
4192 case PVR_PDS_CONST_MAP_ENTRY_TYPE_ADDR_LITERAL_BUFFER: {
4193 const struct pvr_pds_const_map_entry_addr_literal_buffer
4194 *const addr_literal_buffer_entry =
4195 (struct pvr_pds_const_map_entry_addr_literal_buffer *)entries;
4196 struct pvr_device *device = cmd_buffer->device;
4197 struct pvr_suballoc_bo *addr_literal_buffer_bo;
4198 uint32_t addr_literal_count = 0;
4199 uint64_t *addr_literal_buffer;
4200
4201 result = pvr_cmd_buffer_alloc_mem(cmd_buffer,
4202 device->heaps.general_heap,
4203 addr_literal_buffer_entry->size,
4204 &addr_literal_buffer_bo);
4205 if (result != VK_SUCCESS)
4206 return result;
4207
4208 addr_literal_buffer =
4209 (uint64_t *)pvr_bo_suballoc_get_map_addr(addr_literal_buffer_bo);
4210
4211 entries += sizeof(*addr_literal_buffer_entry);
4212
4213 PVR_WRITE(qword_buffer,
4214 addr_literal_buffer_bo->dev_addr.addr,
4215 addr_literal_buffer_entry->const_offset,
4216 pds_info->data_size_in_dwords);
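/* An ADDR_LITERAL_BUFFER entry is followed by a run of ADDR_LITERAL
 * entries. The loop below resolves each one to a device address, packs
 * them back to back into the buffer just allocated, and then advances
 * `i` past the consumed entries.
 */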
4217
4218 for (uint32_t j = i + 1; j < pds_info->entry_count; j++) {
4219 const struct pvr_const_map_entry *const entry_header =
4220 (struct pvr_const_map_entry *)entries;
4221 const struct pvr_pds_const_map_entry_addr_literal *addr_literal;
4222 pvr_dev_addr_t dev_addr;
4223
4224 if (entry_header->type != PVR_PDS_CONST_MAP_ENTRY_TYPE_ADDR_LITERAL)
4225 break;
4226
4227 addr_literal =
4228 (struct pvr_pds_const_map_entry_addr_literal *)entries;
4229
4230 result = pvr_process_addr_literal(cmd_buffer,
4231 addr_literal->addr_type,
4232 stage,
4233 &dev_addr);
4234 if (result != VK_SUCCESS)
4235 return result;
4236
4237 addr_literal_buffer[addr_literal_count++] = dev_addr.addr;
4238
4239 entries += sizeof(*addr_literal);
4240 }
4241
4242 assert(addr_literal_count * sizeof(uint64_t) ==
4243 addr_literal_buffer_entry->size);
4244
4245 i += addr_literal_count;
4246
4247 break;
4248 }
4249
4250 default:
4251 unreachable("Unsupported map entry type.");
4252 }
4253 }
4254
4255 *descriptor_data_offset_out =
4256 pvr_bo->dev_addr.addr -
4257 cmd_buffer->device->heaps.pds_heap->base_addr.addr;
4258
4259 return VK_SUCCESS;
4260 }
4261
4262 static VkResult pvr_setup_descriptor_mappings(
4263 struct pvr_cmd_buffer *const cmd_buffer,
4264 enum pvr_stage_allocation stage,
4265 const struct pvr_stage_allocation_descriptor_state *descriptor_state,
4266 const pvr_dev_addr_t *const num_worgroups_buff_addr,
4267 uint32_t *const descriptor_data_offset_out)
4268 {
4269 const bool old_path =
4270 pvr_has_hard_coded_shaders(&cmd_buffer->device->pdevice->dev_info);
4271
4272 if (old_path) {
4273 return pvr_setup_descriptor_mappings_old(cmd_buffer,
4274 stage,
4275 descriptor_state,
4276 num_worgroups_buff_addr,
4277 descriptor_data_offset_out);
4278 }
4279
4280 return pvr_setup_descriptor_mappings_new(cmd_buffer,
4281 stage,
4282 descriptor_state,
4283 descriptor_data_offset_out);
4284 }
4285
4286 static void pvr_compute_update_shared(struct pvr_cmd_buffer *cmd_buffer,
4287 struct pvr_sub_cmd_compute *const sub_cmd)
4288 {
4289 const struct pvr_device *device = cmd_buffer->device;
4290 const struct pvr_physical_device *pdevice = device->pdevice;
4291 struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
4292 struct pvr_csb *csb = &sub_cmd->control_stream;
4293 const struct pvr_compute_pipeline *pipeline = state->compute_pipeline;
4294 const uint32_t const_shared_regs =
4295 pipeline->shader_state.const_shared_reg_count;
4296 struct pvr_compute_kernel_info info;
4297
4298 /* No shared regs, no need to use an allocation kernel. */
4299 if (!const_shared_regs)
4300 return;
4301
4302 /* Accumulate the MAX number of shared registers across the kernels in this
4303 * dispatch. This is used by the FW for context switching, so must be large
4304 * enough to contain all the shared registers that might be in use for this
4305 * compute job. Coefficients don't need to be included as the context switch
4306 * will not happen within the execution of a single workgroup, thus nothing
4307 * needs to be preserved.
4308 */
4309 state->max_shared_regs = MAX2(state->max_shared_regs, const_shared_regs);
4310
4311 info = (struct pvr_compute_kernel_info){
4312 .indirect_buffer_addr = PVR_DEV_ADDR_INVALID,
4313 .sd_type = ROGUE_CDMCTRL_SD_TYPE_NONE,
4314
4315 .usc_target = ROGUE_CDMCTRL_USC_TARGET_ALL,
4316 .usc_common_shared = true,
4317 .usc_common_size =
4318 DIV_ROUND_UP(const_shared_regs,
4319 ROGUE_CDMCTRL_KERNEL0_USC_COMMON_SIZE_UNIT_SIZE),
4320
4321 .global_size = { 1, 1, 1 },
4322 .local_size = { 1, 1, 1 },
4323 };
4324
4325 /* Sometimes we don't have a secondary program if there were no constants to
4326 * write, but we still need to run a PDS program to accomplish the
4327 * allocation of the local/common store shared registers. Use the
4328 * pre-uploaded empty PDS program in this instance.
4329 */
4330 if (pipeline->descriptor_state.pds_info.code_size_in_dwords) {
4331 uint32_t pds_data_size_in_dwords =
4332 pipeline->descriptor_state.pds_info.data_size_in_dwords;
4333
4334 info.pds_data_offset = state->pds_compute_descriptor_data_offset;
4335 info.pds_data_size =
4336 DIV_ROUND_UP(PVR_DW_TO_BYTES(pds_data_size_in_dwords),
4337 ROGUE_CDMCTRL_KERNEL0_PDS_DATA_SIZE_UNIT_SIZE);
4338
4339 /* Check that we have uploaded the code section. */
4340 assert(pipeline->descriptor_state.pds_code.code_size);
4341 info.pds_code_offset = pipeline->descriptor_state.pds_code.code_offset;
4342 } else {
4343 const struct pvr_pds_upload *program = &device->pds_compute_empty_program;
4344
4345 info.pds_data_offset = program->data_offset;
4346 info.pds_data_size =
4347 DIV_ROUND_UP(PVR_DW_TO_BYTES(program->data_size),
4348 ROGUE_CDMCTRL_KERNEL0_PDS_DATA_SIZE_UNIT_SIZE);
4349 info.pds_code_offset = program->code_offset;
4350 }
4351
4352 /* We don't need to pad the workgroup size. */
4353
4354 info.max_instances =
4355 pvr_compute_flat_slot_size(pdevice, const_shared_regs, false, 1U);
4356
4357 pvr_compute_generate_control_stream(csb, sub_cmd, &info);
4358 }
4359
4360 void pvr_compute_update_shared_private(
4361 struct pvr_cmd_buffer *cmd_buffer,
4362 struct pvr_sub_cmd_compute *const sub_cmd,
4363 struct pvr_private_compute_pipeline *pipeline)
4364 {
4365 const struct pvr_physical_device *pdevice = cmd_buffer->device->pdevice;
4366 struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
4367 const uint32_t const_shared_regs = pipeline->const_shared_regs_count;
4368 struct pvr_csb *csb = &sub_cmd->control_stream;
4369 struct pvr_compute_kernel_info info;
4370
4371 /* No shared regs, no need to use an allocation kernel. */
4372 if (!const_shared_regs)
4373 return;
4374
4375 /* See comment in pvr_compute_update_shared() for details on this. */
4376 state->max_shared_regs = MAX2(state->max_shared_regs, const_shared_regs);
4377
4378 info = (struct pvr_compute_kernel_info){
4379 .indirect_buffer_addr = PVR_DEV_ADDR_INVALID,
4380 .usc_common_size =
4381 DIV_ROUND_UP(const_shared_regs,
4382 ROGUE_CDMCTRL_KERNEL0_USC_COMMON_SIZE_UNIT_SIZE),
4383 .pds_data_size =
4384 DIV_ROUND_UP(PVR_DW_TO_BYTES(pipeline->pds_shared_update_data_size_dw),
4385 ROGUE_CDMCTRL_KERNEL0_PDS_DATA_SIZE_UNIT_SIZE),
4386 .usc_target = ROGUE_CDMCTRL_USC_TARGET_ALL,
4387 .pds_data_offset = pipeline->pds_shared_update_data_offset,
4388 .pds_code_offset = pipeline->pds_shared_update_code_offset,
4389 .sd_type = ROGUE_CDMCTRL_SD_TYPE_NONE,
4390 .usc_common_shared = true,
4391 .global_size = { 1, 1, 1 },
4392 .local_size = { 1, 1, 1 },
4393 };
4394
4395 /* We don't need to pad the workgroup size. */
4396
4397 info.max_instances =
4398 pvr_compute_flat_slot_size(pdevice, const_shared_regs, false, 1U);
4399
4400 pvr_compute_generate_control_stream(csb, sub_cmd, &info);
4401 }
4402
4403 static uint32_t
4404 pvr_compute_flat_pad_workgroup_size(const struct pvr_physical_device *pdevice,
4405 uint32_t workgroup_size,
4406 uint32_t coeff_regs_count)
4407 {
4408 const struct pvr_device_runtime_info *dev_runtime_info =
4409 &pdevice->dev_runtime_info;
4410 const struct pvr_device_info *dev_info = &pdevice->dev_info;
4411 uint32_t max_avail_coeff_regs =
4412 dev_runtime_info->cdm_max_local_mem_size_regs;
4413 uint32_t coeff_regs_count_aligned =
4414 ALIGN_POT(coeff_regs_count,
4415 ROGUE_CDMCTRL_KERNEL0_USC_COMMON_SIZE_UNIT_SIZE >> 2U);
4416
4417 /* If the workgroup size is greater than ROGUE_MAX_INSTANCES_PER_TASK we
4418 * always pad the workgroup size up to the next multiple of
4419 * ROGUE_MAX_INSTANCES_PER_TASK.
4420 *
4421 * We do the same if we use more than 1/8th of the maximum available
4422 * coefficient registers.
4423 */
4424 /* TODO: See if this can be optimized. */
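/* Worked example, assuming ROGUE_MAX_INSTANCES_PER_TASK is 32 (see
 * rogue_hw_defs.h for the real value): a workgroup size of 48 would be
 * padded to ALIGN_POT(48, 32) = 64.
 */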
4425 if (workgroup_size > ROGUE_MAX_INSTANCES_PER_TASK ||
4426 coeff_regs_count_aligned > (max_avail_coeff_regs / 8)) {
4427 assert(workgroup_size < rogue_get_compute_max_work_group_size(dev_info));
4428
4429 return ALIGN_POT(workgroup_size, ROGUE_MAX_INSTANCES_PER_TASK);
4430 }
4431
4432 return workgroup_size;
4433 }
4434
4435 void pvr_compute_update_kernel_private(
4436 struct pvr_cmd_buffer *cmd_buffer,
4437 struct pvr_sub_cmd_compute *const sub_cmd,
4438 struct pvr_private_compute_pipeline *pipeline,
4439 const uint32_t global_workgroup_size[static const PVR_WORKGROUP_DIMENSIONS])
4440 {
4441 const struct pvr_physical_device *pdevice = cmd_buffer->device->pdevice;
4442 const struct pvr_device_runtime_info *dev_runtime_info =
4443 &pdevice->dev_runtime_info;
4444 struct pvr_csb *csb = &sub_cmd->control_stream;
4445
4446 struct pvr_compute_kernel_info info = {
4447 .indirect_buffer_addr = PVR_DEV_ADDR_INVALID,
4448 .usc_target = ROGUE_CDMCTRL_USC_TARGET_ANY,
4449 .pds_temp_size =
4450 DIV_ROUND_UP(pipeline->pds_temps_used << 2U,
4451 ROGUE_CDMCTRL_KERNEL0_PDS_TEMP_SIZE_UNIT_SIZE),
4452
4453 .pds_data_size =
4454 DIV_ROUND_UP(PVR_DW_TO_BYTES(pipeline->pds_data_size_dw),
4455 ROGUE_CDMCTRL_KERNEL0_PDS_DATA_SIZE_UNIT_SIZE),
4456 .pds_data_offset = pipeline->pds_data_offset,
4457 .pds_code_offset = pipeline->pds_code_offset,
4458
4459 .sd_type = ROGUE_CDMCTRL_SD_TYPE_NONE,
4460
4461 .usc_unified_size =
4462 DIV_ROUND_UP(pipeline->unified_store_regs_count << 2U,
4463 ROGUE_CDMCTRL_KERNEL0_USC_UNIFIED_SIZE_UNIT_SIZE),
4464
4465 /* clang-format off */
4466 .global_size = {
4467 global_workgroup_size[0],
4468 global_workgroup_size[1],
4469 global_workgroup_size[2]
4470 },
4471 /* clang-format on */
4472 };
4473
4474 uint32_t work_size = pipeline->workgroup_size.width *
4475 pipeline->workgroup_size.height *
4476 pipeline->workgroup_size.depth;
4477 uint32_t coeff_regs;
4478
4479 if (work_size > ROGUE_MAX_INSTANCES_PER_TASK) {
4480 /* Enforce a single workgroup per cluster through allocation starvation.
4481 */
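/* Requesting the entire per-cluster coefficient store means no other
 * workgroup can be resident at the same time, which is what enforces the
 * single-workgroup-per-cluster behaviour described above.
 */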
4482 coeff_regs = dev_runtime_info->cdm_max_local_mem_size_regs;
4483 } else {
4484 coeff_regs = pipeline->coeff_regs_count;
4485 }
4486
4487 info.usc_common_size =
4488 DIV_ROUND_UP(PVR_DW_TO_BYTES(coeff_regs),
4489 ROGUE_CDMCTRL_KERNEL0_USC_COMMON_SIZE_UNIT_SIZE);
4490
4491 /* Use a whole slot per workgroup. */
4492 work_size = MAX2(work_size, ROGUE_MAX_INSTANCES_PER_TASK);
4493
4494 coeff_regs += pipeline->const_shared_regs_count;
4495
4496 if (pipeline->const_shared_regs_count > 0)
4497 info.sd_type = ROGUE_CDMCTRL_SD_TYPE_USC;
4498
4499 work_size =
4500 pvr_compute_flat_pad_workgroup_size(pdevice, work_size, coeff_regs);
4501
4502 info.local_size[0] = work_size;
4503 info.local_size[1] = 1U;
4504 info.local_size[2] = 1U;
4505
4506 info.max_instances =
4507 pvr_compute_flat_slot_size(pdevice, coeff_regs, false, work_size);
4508
4509 pvr_compute_generate_control_stream(csb, sub_cmd, &info);
4510 }
4511
4512 /* TODO: Wire up the base_workgroup variant program when implementing
4513 * VK_KHR_device_group. The values will also need patching into the program.
4514 */
4515 static void pvr_compute_update_kernel(
4516 struct pvr_cmd_buffer *cmd_buffer,
4517 struct pvr_sub_cmd_compute *const sub_cmd,
4518 pvr_dev_addr_t indirect_addr,
4519 const uint32_t global_workgroup_size[static const PVR_WORKGROUP_DIMENSIONS])
4520 {
4521 const struct pvr_physical_device *pdevice = cmd_buffer->device->pdevice;
4522 const struct pvr_device_runtime_info *dev_runtime_info =
4523 &pdevice->dev_runtime_info;
4524 struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
4525 struct pvr_csb *csb = &sub_cmd->control_stream;
4526 const struct pvr_compute_pipeline *pipeline = state->compute_pipeline;
4527 const struct pvr_compute_shader_state *shader_state =
4528 &pipeline->shader_state;
4529 const struct pvr_pds_info *program_info = &pipeline->primary_program_info;
4530
4531 struct pvr_compute_kernel_info info = {
4532 .indirect_buffer_addr = indirect_addr,
4533 .usc_target = ROGUE_CDMCTRL_USC_TARGET_ANY,
4534 .pds_temp_size =
4535 DIV_ROUND_UP(program_info->temps_required << 2U,
4536 ROGUE_CDMCTRL_KERNEL0_PDS_TEMP_SIZE_UNIT_SIZE),
4537
4538 .pds_data_size =
4539 DIV_ROUND_UP(PVR_DW_TO_BYTES(program_info->data_size_in_dwords),
4540 ROGUE_CDMCTRL_KERNEL0_PDS_DATA_SIZE_UNIT_SIZE),
4541 .pds_data_offset = pipeline->primary_program.data_offset,
4542 .pds_code_offset = pipeline->primary_program.code_offset,
4543
4544 .sd_type = ROGUE_CDMCTRL_SD_TYPE_NONE,
4545
4546 .usc_unified_size =
4547 DIV_ROUND_UP(shader_state->input_register_count << 2U,
4548 ROGUE_CDMCTRL_KERNEL0_USC_UNIFIED_SIZE_UNIT_SIZE),
4549
4550 /* clang-format off */
4551 .global_size = {
4552 global_workgroup_size[0],
4553 global_workgroup_size[1],
4554 global_workgroup_size[2]
4555 },
4556 /* clang-format on */
4557 };
4558
4559 uint32_t work_size = shader_state->work_size;
4560 uint32_t coeff_regs;
4561
4562 if (work_size > ROGUE_MAX_INSTANCES_PER_TASK) {
4563 /* Enforce a single workgroup per cluster through allocation starvation.
4564 */
4565 coeff_regs = dev_runtime_info->cdm_max_local_mem_size_regs;
4566 } else {
4567 coeff_regs = shader_state->coefficient_register_count;
4568 }
4569
4570 info.usc_common_size =
4571 DIV_ROUND_UP(PVR_DW_TO_BYTES(coeff_regs),
4572 ROGUE_CDMCTRL_KERNEL0_USC_COMMON_SIZE_UNIT_SIZE);
4573
4574 /* Use a whole slot per workgroup. */
4575 work_size = MAX2(work_size, ROGUE_MAX_INSTANCES_PER_TASK);
4576
4577 coeff_regs += shader_state->const_shared_reg_count;
4578
4579 if (shader_state->const_shared_reg_count > 0)
4580 info.sd_type = ROGUE_CDMCTRL_SD_TYPE_USC;
4581
4582 work_size =
4583 pvr_compute_flat_pad_workgroup_size(pdevice, work_size, coeff_regs);
4584
4585 info.local_size[0] = work_size;
4586 info.local_size[1] = 1U;
4587 info.local_size[2] = 1U;
4588
4589 info.max_instances =
4590 pvr_compute_flat_slot_size(pdevice, coeff_regs, false, work_size);
4591
4592 pvr_compute_generate_control_stream(csb, sub_cmd, &info);
4593 }
4594
4595 static VkResult pvr_cmd_upload_push_consts(struct pvr_cmd_buffer *cmd_buffer)
4596 {
4597 struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
4598 struct pvr_suballoc_bo *suballoc_bo;
4599 VkResult result;
4600
4601 /* TODO: Here are some possible optimizations/things to consider:
4602 *
4603 * - Currently we upload maxPushConstantsSize. The application might only
4604 * be using a portion of that, so we might end up with unused memory.
4605 * Should we be smarter about this? If we intend to upload the push
4606 * consts into shareds, we definitely want to avoid reserving unused
4607 * regs.
4608 *
4609 * - For now we have to upload to a new buffer each time since the shaders
4610 * access the push constants from memory. If we were to reuse the same
4611 * buffer we might update the contents out of sync with job submission,
4612 * and the shaders would see the updated contents while the command
4613 * buffer is still being recorded and not yet submitted.
4614 * If we were to upload the push constants directly to shared regs we
4615 * could reuse the same buffer (avoiding extra allocation overhead)
4616 * since the contents will be DMAed only on job submission, when the
4617 * control stream is processed and the PDS program is executed. This
4618 * approach would also allow us to avoid regenerating the PDS data
4619 * section in some cases since the buffer address would be constant.
4620 */
4621
4622 if (cmd_buffer->state.push_constants.uploaded)
4623 return VK_SUCCESS;
4624
4625 result = pvr_cmd_buffer_upload_general(cmd_buffer,
4626 state->push_constants.data,
4627 sizeof(state->push_constants.data),
4628 &suballoc_bo);
4629 if (result != VK_SUCCESS)
4630 return result;
4631
4632 cmd_buffer->state.push_constants.dev_addr = suballoc_bo->dev_addr;
4633 cmd_buffer->state.push_constants.uploaded = true;
4634
4635 return VK_SUCCESS;
4636 }
4637
4638 static void pvr_cmd_dispatch(
4639 struct pvr_cmd_buffer *const cmd_buffer,
4640 const pvr_dev_addr_t indirect_addr,
4641 const uint32_t workgroup_size[static const PVR_WORKGROUP_DIMENSIONS])
4642 {
4643 struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
4644 const struct pvr_compute_pipeline *compute_pipeline =
4645 state->compute_pipeline;
4646 struct pvr_sub_cmd_compute *sub_cmd;
4647 VkResult result;
4648
4649 result = pvr_cmd_buffer_start_sub_cmd(cmd_buffer, PVR_SUB_CMD_TYPE_COMPUTE);
if (result != VK_SUCCESS)
   return;
4650
4651 sub_cmd = &state->current_sub_cmd->compute;
4652 sub_cmd->uses_atomic_ops |= compute_pipeline->shader_state.uses_atomic_ops;
4653 sub_cmd->uses_barrier |= compute_pipeline->shader_state.uses_barrier;
4654
4655 if (state->push_constants.dirty_stages & VK_SHADER_STAGE_COMPUTE_BIT) {
4656 result = pvr_cmd_upload_push_consts(cmd_buffer);
4657 if (result != VK_SUCCESS)
4658 return;
4659
4660 /* Regenerate the PDS program to use the new push consts buffer. */
4661 state->dirty.compute_desc_dirty = true;
4662
4663 state->push_constants.dirty_stages &= ~VK_SHADER_STAGE_COMPUTE_BIT;
4664 }
4665
4666 if (compute_pipeline->shader_state.uses_num_workgroups) {
4667 pvr_dev_addr_t descriptor_data_offset_out;
4668
4669 if (indirect_addr.addr) {
4670 descriptor_data_offset_out = indirect_addr;
4671 } else {
4672 struct pvr_suballoc_bo *num_workgroups_bo;
4673
4674 result = pvr_cmd_buffer_upload_general(cmd_buffer,
4675 workgroup_size,
4676 sizeof(*workgroup_size) *
4677 PVR_WORKGROUP_DIMENSIONS,
4678 &num_workgroups_bo);
4679 if (result != VK_SUCCESS)
4680 return;
4681
4682 descriptor_data_offset_out = num_workgroups_bo->dev_addr;
4683 }
4684
4685 result = pvr_setup_descriptor_mappings(
4686 cmd_buffer,
4687 PVR_STAGE_ALLOCATION_COMPUTE,
4688 &compute_pipeline->descriptor_state,
4689 &descriptor_data_offset_out,
4690 &state->pds_compute_descriptor_data_offset);
4691 if (result != VK_SUCCESS)
4692 return;
4693 } else if ((compute_pipeline->base.layout
4694 ->per_stage_descriptor_masks[PVR_STAGE_ALLOCATION_COMPUTE] &&
4695 state->dirty.compute_desc_dirty) ||
4696 state->dirty.compute_pipeline_binding) {
4697 result = pvr_setup_descriptor_mappings(
4698 cmd_buffer,
4699 PVR_STAGE_ALLOCATION_COMPUTE,
4700 &compute_pipeline->descriptor_state,
4701 NULL,
4702 &state->pds_compute_descriptor_data_offset);
4703 if (result != VK_SUCCESS)
4704 return;
4705 }
4706
4707 pvr_compute_update_shared(cmd_buffer, sub_cmd);
4708 pvr_compute_update_kernel(cmd_buffer, sub_cmd, indirect_addr, workgroup_size);
4709 }
4710
4711 void pvr_CmdDispatch(VkCommandBuffer commandBuffer,
4712 uint32_t groupCountX,
4713 uint32_t groupCountY,
4714 uint32_t groupCountZ)
4715 {
4716 PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
4717
4718 PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer);
4719
4720 if (!groupCountX || !groupCountY || !groupCountZ)
4721 return;
4722
4723 pvr_cmd_dispatch(cmd_buffer,
4724 PVR_DEV_ADDR_INVALID,
4725 (uint32_t[]){ groupCountX, groupCountY, groupCountZ });
4726 }
4727
4728 void pvr_CmdDispatchIndirect(VkCommandBuffer commandBuffer,
4729 VkBuffer _buffer,
4730 VkDeviceSize offset)
4731 {
4732 PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
4733 PVR_FROM_HANDLE(pvr_buffer, buffer, _buffer);
4734
4735 PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer);
4736
4737 pvr_cmd_dispatch(cmd_buffer,
4738 PVR_DEV_ADDR_OFFSET(buffer->dev_addr, offset),
4739 (uint32_t[]){ 1, 1, 1 });
4740 }
4741
4742 static void
4743 pvr_update_draw_state(struct pvr_cmd_buffer_state *const state,
4744 const struct pvr_cmd_buffer_draw_state *const draw_state)
4745 {
4746    /* We don't have a piece of state to tell us whether base_instance is being
4747     * used, so the value itself acts as a boolean - 0 means we'll use a PDS
4748     * program that skips the base instance addition. If base_instance becomes
4749     * non-zero (and the last draw's base_instance was 0) then we switch to the
4750     * BASE_INSTANCE attrib program.
4751 *
4752 * If base_instance changes then we only need to update the data section.
4753 *
4754 * The only draw call state that doesn't really matter is the start vertex
4755 * as that is handled properly in the VDM state in all cases.
4756 */
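   /* Illustrative sketch (not from the original source) of the transitions the
    * checks below are meant to catch, assuming nothing else changed between
    * draws:
    *
    *   base_instance 0 -> 5 : switch to the BASE_INSTANCE attrib program
    *                          (dirty.draw_variant).
    *   base_instance 5 -> 7 : same program, new data section
    *                          (dirty.draw_base_instance).
    *   base_instance 5 -> 0 : data section update only; we keep using the
    *                          BASE_INSTANCE program even though a base of 0 no
    *                          longer strictly needs it.
    */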
4757 if ((state->draw_state.draw_indexed != draw_state->draw_indexed) ||
4758 (state->draw_state.draw_indirect != draw_state->draw_indirect) ||
4759 (state->draw_state.base_instance == 0 &&
4760 draw_state->base_instance != 0)) {
4761 state->dirty.draw_variant = true;
4762 } else if (state->draw_state.base_instance != draw_state->base_instance) {
4763 state->dirty.draw_base_instance = true;
4764 }
4765
4766 state->draw_state = *draw_state;
4767 }
4768
4769 static uint32_t pvr_calc_shared_regs_count(
4770 const struct pvr_graphics_pipeline *const gfx_pipeline)
4771 {
4772 uint32_t shared_regs = gfx_pipeline->vs_data.common.shareds;
4773
4774 if (gfx_pipeline->shader_state.fragment.bo) {
4775 uint32_t fragment_regs = gfx_pipeline->fs_data.common.shareds;
4776 shared_regs = MAX2(shared_regs, fragment_regs);
4777 }
4778
4779 return shared_regs;
4780 }
4781
4782 static void
4783 pvr_emit_dirty_pds_state(const struct pvr_cmd_buffer *const cmd_buffer,
4784 struct pvr_sub_cmd_gfx *const sub_cmd,
4785 const uint32_t pds_vertex_descriptor_data_offset)
4786 {
4787 const struct pvr_cmd_buffer_state *const state = &cmd_buffer->state;
4788 const struct pvr_stage_allocation_descriptor_state
4789 *const vertex_descriptor_state =
4790 &state->gfx_pipeline->shader_state.vertex.descriptor_state;
4791 const pco_data *const vs_data = &state->gfx_pipeline->vs_data;
4792 struct pvr_csb *const csb = &sub_cmd->control_stream;
4793
4794 if (!vertex_descriptor_state->pds_info.code_size_in_dwords)
4795 return;
4796
4797 pvr_csb_set_relocation_mark(csb);
4798
4799 pvr_csb_emit (csb, VDMCTRL_PDS_STATE0, state0) {
4800 state0.usc_target = ROGUE_VDMCTRL_USC_TARGET_ALL;
4801
4802 state0.usc_common_size =
4803 DIV_ROUND_UP(vs_data->common.shareds,
4804 ROGUE_VDMCTRL_PDS_STATE0_USC_COMMON_SIZE_UNIT_SIZE);
4805
4806 state0.pds_data_size = DIV_ROUND_UP(
4807 PVR_DW_TO_BYTES(vertex_descriptor_state->pds_info.data_size_in_dwords),
4808 ROGUE_VDMCTRL_PDS_STATE0_PDS_DATA_SIZE_UNIT_SIZE);
4809 }
4810
4811 pvr_csb_emit (csb, VDMCTRL_PDS_STATE1, state1) {
4812 state1.pds_data_addr = PVR_DEV_ADDR(pds_vertex_descriptor_data_offset);
4813 state1.sd_type = ROGUE_VDMCTRL_SD_TYPE_NONE;
4814 }
4815
4816 pvr_csb_emit (csb, VDMCTRL_PDS_STATE2, state2) {
4817 state2.pds_code_addr =
4818 PVR_DEV_ADDR(vertex_descriptor_state->pds_code.code_offset);
4819 }
4820
4821 pvr_csb_clear_relocation_mark(csb);
4822 }
4823
4824 static void pvr_setup_output_select(struct pvr_cmd_buffer *const cmd_buffer)
4825 {
4826 const struct pvr_graphics_pipeline *const gfx_pipeline =
4827 cmd_buffer->state.gfx_pipeline;
4828 struct vk_dynamic_graphics_state *const dynamic_state =
4829 &cmd_buffer->vk.dynamic_graphics_state;
4830 struct ROGUE_TA_STATE_HEADER *const header = &cmd_buffer->state.emit_header;
4831 struct pvr_ppp_state *const ppp_state = &cmd_buffer->state.ppp_state;
4832 const pco_data *const vs_data = &gfx_pipeline->vs_data;
4833 const pco_data *const fs_data = &gfx_pipeline->fs_data;
4834 uint32_t output_selects;
4835 uint32_t varying[2];
4836
4837 const pco_range *varyings = vs_data->vs.varyings;
4838
4839 const bool has_point_size = dynamic_state->ia.primitive_topology ==
4840 VK_PRIMITIVE_TOPOLOGY_POINT_LIST &&
4841 varyings[VARYING_SLOT_PSIZ].count > 0;
4842
4843 const bool has_viewport = varyings[VARYING_SLOT_VIEWPORT].count > 0;
4844
4845 const bool has_layer = varyings[VARYING_SLOT_LAYER].count > 0;
4846
4847 pvr_csb_pack (&output_selects, TA_OUTPUT_SEL, state) {
4848 state.rhw_pres = fs_data->fs.uses.w;
4849 state.tsp_unclamped_z_pres = fs_data->fs.uses.z;
4850
4851 state.vtxsize = vs_data->vs.vtxouts;
4852 state.psprite_size_pres = has_point_size;
4853 state.vpt_tgt_pres = has_viewport;
4854 state.render_tgt_pres = has_layer;
4855 }
4856
4857 if (ppp_state->output_selects != output_selects) {
4858 ppp_state->output_selects = output_selects;
4859 header->pres_outselects = true;
4860 }
4861
4862 pvr_csb_pack (&varying[0], TA_STATE_VARYING0, varying0) {
4863 varying0.f32_linear = vs_data->vs.f32_smooth;
4864 varying0.f32_flat = vs_data->vs.f32_flat;
4865 varying0.f32_npc = vs_data->vs.f32_npc;
4866 }
4867
4868 if (ppp_state->varying_word[0] != varying[0]) {
4869 ppp_state->varying_word[0] = varying[0];
4870 header->pres_varying_word0 = true;
4871 }
4872
4873 pvr_csb_pack (&varying[1], TA_STATE_VARYING1, varying1) {
4874 varying1.f16_linear = vs_data->vs.f16_smooth;
4875 varying1.f16_flat = vs_data->vs.f16_flat;
4876 varying1.f16_npc = vs_data->vs.f16_npc;
4877 }
4878
4879 if (ppp_state->varying_word[1] != varying[1]) {
4880 ppp_state->varying_word[1] = varying[1];
4881 header->pres_varying_word1 = true;
4882 }
4883 }
4884
4885 static void
4886 pvr_setup_isp_faces_and_control(struct pvr_cmd_buffer *const cmd_buffer,
4887 struct ROGUE_TA_STATE_ISPA *const ispa_out)
4888 {
4889 struct ROGUE_TA_STATE_HEADER *const header = &cmd_buffer->state.emit_header;
4890 const struct pvr_fragment_shader_state *const fragment_shader_state =
4891 &cmd_buffer->state.gfx_pipeline->shader_state.fragment;
4892 const struct pvr_render_pass_info *const pass_info =
4893 &cmd_buffer->state.render_pass_info;
4894 struct vk_dynamic_graphics_state *dynamic_state =
4895 &cmd_buffer->vk.dynamic_graphics_state;
4896 struct pvr_ppp_state *const ppp_state = &cmd_buffer->state.ppp_state;
4897
4898 const bool rasterizer_discard = dynamic_state->rs.rasterizer_discard_enable;
4899 const uint32_t subpass_idx = pass_info->subpass_idx;
4900 const uint32_t depth_stencil_attachment_idx =
4901 pass_info->pass->subpasses[subpass_idx].depth_stencil_attachment;
4902 const struct pvr_render_pass_attachment *const attachment =
4903 depth_stencil_attachment_idx != VK_ATTACHMENT_UNUSED
4904 ? &pass_info->pass->attachments[depth_stencil_attachment_idx]
4905 : NULL;
4906
4907 const enum ROGUE_TA_OBJTYPE obj_type =
4908 pvr_ta_objtype(dynamic_state->ia.primitive_topology);
4909
4910 const VkImageAspectFlags ds_aspects =
4911 (!rasterizer_discard && attachment)
4912 ? vk_format_aspects(attachment->vk_format) &
4913 (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT)
4914 : VK_IMAGE_ASPECT_NONE;
4915
4916 /* This is deliberately a full copy rather than a pointer because
4917 * vk_optimize_depth_stencil_state() can only be run once against any given
4918 * instance of vk_depth_stencil_state.
4919 */
4920 struct vk_depth_stencil_state ds_state = dynamic_state->ds;
4921
4922 uint32_t ispb_stencil_off;
4923 bool is_two_sided = false;
4924 uint32_t isp_control;
4925
4926 uint32_t line_width;
4927 uint32_t common_a;
4928 uint32_t front_a;
4929 uint32_t front_b;
4930 uint32_t back_a;
4931 uint32_t back_b;
4932
4933 vk_optimize_depth_stencil_state(&ds_state, ds_aspects, true);
4934
4935 /* Convert to 4.4 fixed point format. */
4936 line_width = util_unsigned_fixed(dynamic_state->rs.line.width, 4);
4937
4938 /* Subtract 1 to shift values from range [0=0,256=16] to [0=1/16,255=16].
4939 * If 0 it stays at 0, otherwise we subtract 1.
4940 */
4941 line_width = (!!line_width) * (line_width - 1);
4942
4943 line_width = MIN2(line_width, ROGUE_TA_STATE_ISPA_POINTLINEWIDTH_SIZE_MAX);
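   /* Worked example (illustrative only): a lineWidth of 1.0f becomes
    * util_unsigned_fixed(1.0f, 4) == 16 in 4.4 fixed point, which the
    * subtraction above maps to 15, i.e. 1.0 pixels in the encoding where
    * 0 = 1/16th of a pixel and 255 = 16 pixels. A lineWidth of 0.0f stays 0.
    */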
4944
4945 /* TODO: Part of the logic in this function is duplicated in another part
4946 * of the code. E.g. the dcmpmode, and sop1/2/3. Could we do this earlier?
4947 */
4948
4949 pvr_csb_pack (&common_a, TA_STATE_ISPA, ispa) {
4950 ispa.pointlinewidth = line_width;
4951
4952 ispa.dcmpmode = pvr_ta_cmpmode(ds_state.depth.compare_op);
4953 ispa.dwritedisable = !ds_state.depth.write_enable;
4954
4955 ispa.passtype = fragment_shader_state->pass_type;
4956
4957 ispa.objtype = obj_type;
4958
4959 /* Return unpacked ispa structure. dcmpmode, dwritedisable, passtype and
4960 * objtype are needed by pvr_setup_triangle_merging_flag.
4961 */
4962 if (ispa_out)
4963 *ispa_out = ispa;
4964 }
4965
4966 /* TODO: Does this actually represent the ispb control word on stencil off?
4967 * If not, rename the variable.
4968 */
4969 pvr_csb_pack (&ispb_stencil_off, TA_STATE_ISPB, ispb) {
4970 ispb.sop3 = ROGUE_TA_ISPB_STENCILOP_KEEP;
4971 ispb.sop2 = ROGUE_TA_ISPB_STENCILOP_KEEP;
4972 ispb.sop1 = ROGUE_TA_ISPB_STENCILOP_KEEP;
4973 ispb.scmpmode = ROGUE_TA_CMPMODE_ALWAYS;
4974 }
4975
4976 /* FIXME: This logic should be redone and improved. Can we also get rid of
4977 * the front and back variants?
4978 */
4979
4980 front_a = common_a;
4981 back_a = common_a;
4982
4983 if (ds_state.stencil.test_enable) {
4984 uint32_t front_a_sref;
4985 uint32_t back_a_sref;
4986
4987 pvr_csb_pack (&front_a_sref, TA_STATE_ISPA, ispa) {
4988 ispa.sref = ds_state.stencil.front.reference;
4989 }
4990 front_a |= front_a_sref;
4991
4992 pvr_csb_pack (&back_a_sref, TA_STATE_ISPA, ispa) {
4993 ispa.sref = ds_state.stencil.back.reference;
4994 }
4995 back_a |= back_a_sref;
4996
4997 pvr_csb_pack (&front_b, TA_STATE_ISPB, ispb) {
4998 const struct vk_stencil_test_face_state *const front =
4999 &ds_state.stencil.front;
5000
5001 if (ds_state.stencil.write_enable)
5002 ispb.swmask = front->write_mask;
5003
5004 ispb.scmpmask = front->compare_mask;
5005
5006 ispb.sop3 = pvr_ta_stencilop(front->op.pass);
5007 ispb.sop2 = pvr_ta_stencilop(front->op.depth_fail);
5008 ispb.sop1 = pvr_ta_stencilop(front->op.fail);
5009 ispb.scmpmode = pvr_ta_cmpmode(front->op.compare);
5010 }
5011
5012 pvr_csb_pack (&back_b, TA_STATE_ISPB, ispb) {
5013 const struct vk_stencil_test_face_state *const back =
5014 &ds_state.stencil.back;
5015
5016 if (ds_state.stencil.write_enable)
5017 ispb.swmask = back->write_mask;
5018
5019 ispb.scmpmask = back->compare_mask;
5020
5021 ispb.sop3 = pvr_ta_stencilop(back->op.pass);
5022 ispb.sop2 = pvr_ta_stencilop(back->op.depth_fail);
5023 ispb.sop1 = pvr_ta_stencilop(back->op.fail);
5024 ispb.scmpmode = pvr_ta_cmpmode(back->op.compare);
5025 }
5026 } else {
5027 front_b = ispb_stencil_off;
5028 back_b = ispb_stencil_off;
5029 }
5030
5031 if (front_a != back_a || front_b != back_b) {
5032 if (dynamic_state->rs.cull_mode & VK_CULL_MODE_BACK_BIT) {
5033 /* Single face, using front state. */
5034 } else if (dynamic_state->rs.cull_mode & VK_CULL_MODE_FRONT_BIT) {
5035 /* Single face, using back state. */
5036
5037 front_a = back_a;
5038 front_b = back_b;
5039 } else {
5040 /* Both faces. */
5041
5042 header->pres_ispctl_ba = is_two_sided = true;
5043
5044 if (dynamic_state->rs.front_face == VK_FRONT_FACE_COUNTER_CLOCKWISE) {
5045 uint32_t tmp = front_a;
5046
5047 front_a = back_a;
5048 back_a = tmp;
5049
5050 tmp = front_b;
5051 front_b = back_b;
5052 back_b = tmp;
5053 }
5054
5055 /* HW defaults to stencil off. */
5056 if (back_b != ispb_stencil_off) {
5057 header->pres_ispctl_fb = true;
5058 header->pres_ispctl_bb = true;
5059 }
5060 }
5061 }
5062
5063 if (ds_state.stencil.test_enable && front_b != ispb_stencil_off)
5064 header->pres_ispctl_fb = true;
5065
5066 pvr_csb_pack (&isp_control, TA_STATE_ISPCTL, ispctl) {
5067 ispctl.upass = pass_info->isp_userpass;
5068
5069 /* TODO: is bo ever NULL? Figure out what to do. */
5070 ispctl.tagwritedisable = rasterizer_discard || !fragment_shader_state->bo;
5071
5072 ispctl.two_sided = is_two_sided;
5073 ispctl.bpres = header->pres_ispctl_fb || header->pres_ispctl_bb;
5074
5075 ispctl.dbenable = !rasterizer_discard &&
5076 dynamic_state->rs.depth_bias.enable &&
5077 obj_type == ROGUE_TA_OBJTYPE_TRIANGLE;
5078 if (!rasterizer_discard && cmd_buffer->state.vis_test_enabled) {
5079 ispctl.vistest = true;
5080 ispctl.visreg = cmd_buffer->state.vis_reg;
5081 }
5082
5083 ispctl.scenable = !rasterizer_discard;
5084
5085 ppp_state->isp.control_struct = ispctl;
5086 }
5087
5088 header->pres_ispctl = true;
5089
5090 ppp_state->isp.control = isp_control;
5091 ppp_state->isp.front_a = front_a;
5092 ppp_state->isp.front_b = front_b;
5093 ppp_state->isp.back_a = back_a;
5094 ppp_state->isp.back_b = back_b;
5095 }
5096
5097 static float
5098 pvr_calculate_final_depth_bias_contant_factor(struct pvr_device_info *dev_info,
5099 VkFormat format,
5100 float depth_bias)
5101 {
5102 /* Information for future modifiers of these depth bias calculations.
5103 * ==================================================================
5104 * Specified depth bias equations scale the specified constant factor by a
5105 * value 'r' that is guaranteed to cause a resolvable difference in depth
5106 * across the entire range of depth values.
5107 * For floating point depth formats 'r' is calculated by taking the maximum
5108 * exponent across the triangle.
5109 * For UNORM formats 'r' is constant.
5110 * Here 'n' is the number of mantissa bits stored in the floating point
5111 * representation (23 for F32).
5112 *
5113 * UNORM Format -> z += dbcf * r + slope
5114 * FLOAT Format -> z += dbcf * 2^(e-n) + slope
5115 *
5116 * HW Variations.
5117 * ==============
5118 * The HW either always performs the F32 depth bias equation (exponent based
5119 * r), or in the case of HW that correctly supports the integer depth bias
5120 * equation for UNORM depth formats, we can select between both equations
5121 * using the ROGUE_CR_ISP_CTL.dbias_is_int flag - this is required to
5122 * correctly perform Vulkan UNORM depth bias (constant r).
5123 *
5124 * if ern42307:
5125 * if DBIAS_IS_INT_EN:
5126 * z += dbcf + slope
5127 * else:
5128 * z += dbcf * 2^(e-n) + slope
5129 * else:
5130 * z += dbcf * 2^(e-n) + slope
5131 *
5132 */
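   /* Worked example (illustrative, not from the original source): with an F32
    * depth buffer and a triangle whose largest depth value is around 0.5
    * (exponent e = -1), r = 2^(e - n) = 2^(-1 - 23) = 2^-24, i.e. one F32 ULP
    * at that depth, so a constant factor of 1.0 biases z by roughly one
    * representable step. For UNORM formats 'r' does not depend on the triangle
    * at all, which is what the dbias_is_int path handled below relies on.
    */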
5133
5134 float nudge_factor;
5135
5136 if (PVR_HAS_ERN(dev_info, 42307)) {
5137 switch (format) {
5138 case VK_FORMAT_D16_UNORM:
5139 return depth_bias / (1 << 15);
5140
5141 case VK_FORMAT_D24_UNORM_S8_UINT:
5142 case VK_FORMAT_X8_D24_UNORM_PACK32:
5143 return depth_bias / (1 << 23);
5144
5145 default:
5146 return depth_bias;
5147 }
5148 }
5149
5150    /* We clamp/nudge the value here because UNORM depth formats can have
5151     * higher precision than our underlying D32F representation for some depth
5152     * ranges.
5153 *
5154     * When the HW scales the depth bias value by 2^(e-n) [the 'r' term], a depth
5155 * bias of 1 can result in a value smaller than one F32 ULP, which will get
5156 * quantized to 0 - resulting in no bias.
5157 *
5158 * Biasing small values away from zero will ensure that small depth biases of
5159 * 1 still yield a result and overcome Z-fighting.
5160 */
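   /* Worked example (illustrative only): with VK_FORMAT_D16_UNORM a constant
    * factor of 0.001f becomes 0.512f after the * 512.0f scale below; as that
    * lies inside (0, 1.0f) it is nudged up to 1.512f so it cannot be quantized
    * away to zero, whereas a factor of 1.0f becomes 512.0f and is left alone.
    */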
5161 switch (format) {
5162 case VK_FORMAT_D16_UNORM:
5163 depth_bias *= 512.0f;
5164 nudge_factor = 1.0f;
5165 break;
5166
5167 case VK_FORMAT_D24_UNORM_S8_UINT:
5168 case VK_FORMAT_X8_D24_UNORM_PACK32:
5169 depth_bias *= 2.0f;
5170 nudge_factor = 2.0f;
5171 break;
5172
5173 default:
5174 nudge_factor = 0.0f;
5175 break;
5176 }
5177
5178 if (nudge_factor != 0.0f) {
5179 if (depth_bias < 0.0f && depth_bias > -nudge_factor)
5180 depth_bias -= nudge_factor;
5181 else if (depth_bias > 0.0f && depth_bias < nudge_factor)
5182 depth_bias += nudge_factor;
5183 }
5184
5185 return depth_bias;
5186 }
5187
5188 static void pvr_get_viewport_scissor_overlap(const VkViewport *const viewport,
5189 const VkRect2D *const scissor,
5190 VkRect2D *const rect_out)
5191 {
5192 /* TODO: See if we can remove this struct. */
5193 struct pvr_rect {
5194 int32_t x0, y0;
5195 int32_t x1, y1;
5196 };
5197
5198 /* TODO: Worry about overflow? */
5199 const struct pvr_rect scissor_rect = {
5200 .x0 = scissor->offset.x,
5201 .y0 = scissor->offset.y,
5202 .x1 = scissor->offset.x + scissor->extent.width,
5203 .y1 = scissor->offset.y + scissor->extent.height
5204 };
5205 struct pvr_rect viewport_rect = { 0 };
5206
5207 assert(viewport->width >= 0.0f);
5208 assert(scissor_rect.x0 >= 0);
5209 assert(scissor_rect.y0 >= 0);
5210
5211 if (scissor->extent.width == 0 || scissor->extent.height == 0) {
5212 *rect_out = (VkRect2D){ 0 };
5213 return;
5214 }
5215
5216 viewport_rect.x0 = (int32_t)viewport->x;
5217 viewport_rect.x1 = (int32_t)viewport->x + (int32_t)viewport->width;
5218
5219 /* TODO: Is there a mathematical way of doing all this and then clamp at
5220 * the end?
5221 */
5222 /* We flip the y0 and y1 when height is negative. */
5223 viewport_rect.y0 = (int32_t)viewport->y + MIN2(0, (int32_t)viewport->height);
5224 viewport_rect.y1 = (int32_t)viewport->y + MAX2(0, (int32_t)viewport->height);
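   /* Worked example (illustrative only): a negative-height viewport with
    * y = 100.0f and height = -50.0f gives
    *    y0 = 100 + MIN2(0, -50) = 50
    *    y1 = 100 + MAX2(0, -50) = 100
    * i.e. the same rectangle as y = 50.0f, height = 50.0f.
    */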
5225
5226 if (scissor_rect.x1 <= viewport_rect.x0 ||
5227 scissor_rect.y1 <= viewport_rect.y0 ||
5228 scissor_rect.x0 >= viewport_rect.x1 ||
5229 scissor_rect.y0 >= viewport_rect.y1) {
5230 *rect_out = (VkRect2D){ 0 };
5231 return;
5232 }
5233
5234 /* Determine the overlapping rectangle. */
5235 viewport_rect.x0 = MAX2(viewport_rect.x0, scissor_rect.x0);
5236 viewport_rect.y0 = MAX2(viewport_rect.y0, scissor_rect.y0);
5237 viewport_rect.x1 = MIN2(viewport_rect.x1, scissor_rect.x1);
5238 viewport_rect.y1 = MIN2(viewport_rect.y1, scissor_rect.y1);
5239
5240 /* TODO: Is this conversion safe? Is this logic right? */
5241 rect_out->offset.x = (uint32_t)viewport_rect.x0;
5242 rect_out->offset.y = (uint32_t)viewport_rect.y0;
5243 rect_out->extent.height = (uint32_t)(viewport_rect.y1 - viewport_rect.y0);
5244 rect_out->extent.width = (uint32_t)(viewport_rect.x1 - viewport_rect.x0);
5245 }
5246
5247 static inline uint32_t
5248 pvr_get_geom_region_clip_align_size(struct pvr_device_info *const dev_info)
5249 {
5250 /* TODO: This should come from rogue_ppp.xml. */
5251 return 16U + 16U * (!PVR_HAS_FEATURE(dev_info, tile_size_16x16));
5252 }
5253
5254 static void
5255 pvr_setup_isp_depth_bias_scissor_state(struct pvr_cmd_buffer *const cmd_buffer)
5256 {
5257 struct ROGUE_TA_STATE_HEADER *const header = &cmd_buffer->state.emit_header;
5258 struct pvr_ppp_state *const ppp_state = &cmd_buffer->state.ppp_state;
5259 struct vk_dynamic_graphics_state *const dynamic_state =
5260 &cmd_buffer->vk.dynamic_graphics_state;
5261 const struct ROGUE_TA_STATE_ISPCTL *const ispctl =
5262 &ppp_state->isp.control_struct;
5263 struct pvr_device_info *const dev_info =
5264 &cmd_buffer->device->pdevice->dev_info;
5265
5266 if (ispctl->dbenable &&
5267 (BITSET_TEST(dynamic_state->dirty,
5268 MESA_VK_DYNAMIC_RS_DEPTH_BIAS_FACTORS) ||
5269 cmd_buffer->depth_bias_array.size == 0)) {
5270 struct pvr_depth_bias_state depth_bias = {
5271 .constant_factor = pvr_calculate_final_depth_bias_contant_factor(
5272 dev_info,
5273 cmd_buffer->state.depth_format,
5274 dynamic_state->rs.depth_bias.constant_factor),
5275 .slope_factor = dynamic_state->rs.depth_bias.slope_factor,
5276 .clamp = dynamic_state->rs.depth_bias.clamp,
5277 };
5278
5279 ppp_state->depthbias_scissor_indices.depthbias_index =
5280 util_dynarray_num_elements(&cmd_buffer->depth_bias_array,
5281 __typeof__(depth_bias));
5282
5283 util_dynarray_append(&cmd_buffer->depth_bias_array,
5284 __typeof__(depth_bias),
5285 depth_bias);
5286
5287 header->pres_ispctl_dbsc = true;
5288 }
5289
5290 if (ispctl->scenable) {
5291 const uint32_t region_clip_align_size =
5292 pvr_get_geom_region_clip_align_size(dev_info);
5293 const VkViewport *const viewport = &dynamic_state->vp.viewports[0];
5294 const VkRect2D *const scissor = &dynamic_state->vp.scissors[0];
5295 struct pvr_scissor_words scissor_words;
5296 VkRect2D overlap_rect;
5297 uint32_t height;
5298 uint32_t width;
5299 uint32_t x;
5300 uint32_t y;
5301
5302 /* For region clip. */
5303 uint32_t bottom;
5304 uint32_t right;
5305 uint32_t left;
5306 uint32_t top;
5307
5308 /* We don't support multiple viewport calculations. */
5309 assert(dynamic_state->vp.viewport_count == 1);
5310 /* We don't support multiple scissor calculations. */
5311 assert(dynamic_state->vp.scissor_count == 1);
5312
5313 pvr_get_viewport_scissor_overlap(viewport, scissor, &overlap_rect);
5314
5315 x = overlap_rect.offset.x;
5316 y = overlap_rect.offset.y;
5317 width = overlap_rect.extent.width;
5318 height = overlap_rect.extent.height;
5319
5320 pvr_csb_pack (&scissor_words.w0, IPF_SCISSOR_WORD_0, word0) {
5321 word0.scw0_xmax = x + width;
5322 word0.scw0_xmin = x;
5323 }
5324
5325 pvr_csb_pack (&scissor_words.w1, IPF_SCISSOR_WORD_1, word1) {
5326 word1.scw1_ymax = y + height;
5327 word1.scw1_ymin = y;
5328 }
5329
5330 if (cmd_buffer->scissor_array.size &&
5331 cmd_buffer->scissor_words.w0 == scissor_words.w0 &&
5332 cmd_buffer->scissor_words.w1 == scissor_words.w1) {
5333 return;
5334 }
5335
5336 cmd_buffer->scissor_words = scissor_words;
5337
5338 /* Calculate region clip. */
5339
5340 left = x / region_clip_align_size;
5341 top = y / region_clip_align_size;
5342
5343       /* We prevent right=-1 by checking (x + width) for zero first. */
5344       /* TODO: Is there a better way of doing this? */
5345 if ((x + width) != 0U)
5346 right = DIV_ROUND_UP(x + width, region_clip_align_size) - 1;
5347 else
5348 right = 0;
5349
5350 if ((y + height) != 0U)
5351 bottom = DIV_ROUND_UP(y + height, region_clip_align_size) - 1;
5352 else
5353 bottom = 0U;
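      /* Worked example (illustrative only): with a 32 pixel region clip
       * alignment (i.e. no tile_size_16x16 feature), x = 20 and width = 100
       * give
       *    left  = 20 / 32                   = 0
       *    right = DIV_ROUND_UP(120, 32) - 1 = 3
       * so regions 0..3 survive and everything outside them is clipped by the
       * OUTSIDE mode programmed below.
       */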
5354
5355 /* Setup region clip to clip everything outside what was calculated. */
5356
5357 /* FIXME: Should we mask to prevent writing over other words? */
5358 pvr_csb_pack (&ppp_state->region_clipping.word0, TA_REGION_CLIP0, word0) {
5359 word0.right = right;
5360 word0.left = left;
5361 word0.mode = ROGUE_TA_REGION_CLIP_MODE_OUTSIDE;
5362 }
5363
5364 pvr_csb_pack (&ppp_state->region_clipping.word1, TA_REGION_CLIP1, word1) {
5365 word1.bottom = bottom;
5366 word1.top = top;
5367 }
5368
5369 ppp_state->depthbias_scissor_indices.scissor_index =
5370 util_dynarray_num_elements(&cmd_buffer->scissor_array,
5371 struct pvr_scissor_words);
5372
5373 util_dynarray_append(&cmd_buffer->scissor_array,
5374 struct pvr_scissor_words,
5375 cmd_buffer->scissor_words);
5376
5377 header->pres_ispctl_dbsc = true;
5378 header->pres_region_clip = true;
5379 }
5380 }
5381
5382 static void
5383 pvr_setup_triangle_merging_flag(struct pvr_cmd_buffer *const cmd_buffer,
5384 struct ROGUE_TA_STATE_ISPA *ispa)
5385 {
5386 struct ROGUE_TA_STATE_HEADER *const header = &cmd_buffer->state.emit_header;
5387 struct pvr_ppp_state *const ppp_state = &cmd_buffer->state.ppp_state;
5388 uint32_t merge_word;
5389 uint32_t mask;
5390
5391 pvr_csb_pack (&merge_word, TA_STATE_PDS_SIZEINFO2, size_info) {
5392       /* Disable for lines, for punch-through passes, or when depth writes
5393        * are disabled (DWD) and the depth compare mode is ALWAYS.
5394        */
5395 if (ispa->objtype == ROGUE_TA_OBJTYPE_LINE ||
5396 ispa->passtype == ROGUE_TA_PASSTYPE_PUNCH_THROUGH ||
5397 (ispa->dwritedisable && ispa->dcmpmode == ROGUE_TA_CMPMODE_ALWAYS)) {
5398 size_info.pds_tri_merge_disable = true;
5399 }
5400 }
5401
5402 pvr_csb_pack (&mask, TA_STATE_PDS_SIZEINFO2, size_info) {
5403 size_info.pds_tri_merge_disable = true;
5404 }
5405
5406 merge_word |= ppp_state->pds.size_info2 & ~mask;
5407
5408 if (merge_word != ppp_state->pds.size_info2) {
5409 ppp_state->pds.size_info2 = merge_word;
5410 header->pres_pds_state_ptr0 = true;
5411 }
5412 }
5413
5414 static void
5415 pvr_setup_fragment_state_pointers(struct pvr_cmd_buffer *const cmd_buffer,
5416 struct pvr_sub_cmd_gfx *const sub_cmd)
5417 {
5418 struct pvr_cmd_buffer_state *const state = &cmd_buffer->state;
5419 const pco_data *const fs_data = &state->gfx_pipeline->fs_data;
5420
5421 const struct pvr_fragment_shader_state *const fragment_shader_state =
5422 &state->gfx_pipeline->shader_state.fragment;
5423 const struct pvr_stage_allocation_descriptor_state *descriptor_shader_state =
5424 &fragment_shader_state->descriptor_state;
5425 const struct pvr_pipeline_stage_state *fragment_state =
5426 &fragment_shader_state->stage_state;
5427 const struct pvr_pds_upload *pds_coeff_program =
5428 &fragment_shader_state->pds_coeff_program;
5429
5430 const struct pvr_physical_device *pdevice = cmd_buffer->device->pdevice;
5431 struct ROGUE_TA_STATE_HEADER *const header = &state->emit_header;
5432 struct pvr_ppp_state *const ppp_state = &state->ppp_state;
5433
5434 const uint32_t pds_uniform_size =
5435 DIV_ROUND_UP(descriptor_shader_state->pds_info.data_size_in_dwords,
5436 ROGUE_TA_STATE_PDS_SIZEINFO1_PDS_UNIFORMSIZE_UNIT_SIZE);
5437
5438 const uint32_t pds_varying_state_size =
5439 DIV_ROUND_UP(pds_coeff_program->data_size,
5440 ROGUE_TA_STATE_PDS_SIZEINFO1_PDS_VARYINGSIZE_UNIT_SIZE);
5441
5442 const uint32_t usc_varying_size =
5443 DIV_ROUND_UP(fs_data->common.coeffs,
5444 ROGUE_TA_STATE_PDS_SIZEINFO1_USC_VARYINGSIZE_UNIT_SIZE);
5445
5446 const uint32_t pds_temp_size =
5447 DIV_ROUND_UP(fragment_state->pds_temps_count,
5448 ROGUE_TA_STATE_PDS_SIZEINFO1_PDS_TEMPSIZE_UNIT_SIZE);
5449
5450 const uint32_t usc_shared_size =
5451 DIV_ROUND_UP(fs_data->common.shareds,
5452 ROGUE_TA_STATE_PDS_SIZEINFO2_USC_SHAREDSIZE_UNIT_SIZE);
5453
5454 const uint32_t max_tiles_in_flight =
5455 pvr_calc_fscommon_size_and_tiles_in_flight(
5456 &pdevice->dev_info,
5457 &pdevice->dev_runtime_info,
5458 usc_shared_size *
5459 ROGUE_TA_STATE_PDS_SIZEINFO2_USC_SHAREDSIZE_UNIT_SIZE,
5460 1);
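   /* Illustrative sketch (the unit size value here is assumed, not taken from
    * the hardware headers): DIV_ROUND_UP() converts the raw shared register
    * count into SIZEINFO2 units, and multiplying back by the unit size yields
    * the rounded-up register count for the tiles-in-flight calculation. E.g.
    * with a unit size of 16, shareds = 36 packs as usc_shared_size = 3 and 48
    * registers are passed to pvr_calc_fscommon_size_and_tiles_in_flight().
    */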
5461 uint32_t size_info_mask;
5462 uint32_t size_info2;
5463
5464 if (max_tiles_in_flight < sub_cmd->max_tiles_in_flight)
5465 sub_cmd->max_tiles_in_flight = max_tiles_in_flight;
5466
5467 pvr_csb_pack (&ppp_state->pds.pixel_shader_base,
5468 TA_STATE_PDS_SHADERBASE,
5469 shader_base) {
5470 const struct pvr_pds_upload *const pds_upload =
5471 &fragment_shader_state->pds_fragment_program;
5472
5473 shader_base.addr = PVR_DEV_ADDR(pds_upload->data_offset);
5474 }
5475
5476 if (descriptor_shader_state->pds_code.pvr_bo) {
5477 pvr_csb_pack (&ppp_state->pds.texture_uniform_code_base,
5478 TA_STATE_PDS_TEXUNICODEBASE,
5479 tex_base) {
5480 tex_base.addr =
5481 PVR_DEV_ADDR(descriptor_shader_state->pds_code.code_offset);
5482 }
5483 } else {
5484 ppp_state->pds.texture_uniform_code_base = 0U;
5485 }
5486
5487 pvr_csb_pack (&ppp_state->pds.size_info1, TA_STATE_PDS_SIZEINFO1, info1) {
5488 info1.pds_uniformsize = pds_uniform_size;
5489 info1.pds_texturestatesize = 0U;
5490 info1.pds_varyingsize = pds_varying_state_size;
5491 info1.usc_varyingsize = usc_varying_size;
5492 info1.pds_tempsize = pds_temp_size;
5493 }
5494
5495 pvr_csb_pack (&size_info_mask, TA_STATE_PDS_SIZEINFO2, mask) {
5496 mask.pds_tri_merge_disable = true;
5497 }
5498
5499 ppp_state->pds.size_info2 &= size_info_mask;
5500
5501 pvr_csb_pack (&size_info2, TA_STATE_PDS_SIZEINFO2, info2) {
5502 info2.usc_sharedsize = usc_shared_size;
5503 }
5504
5505 ppp_state->pds.size_info2 |= size_info2;
5506
5507 if (pds_coeff_program->pvr_bo) {
5508 header->pres_pds_state_ptr1 = true;
5509
5510 pvr_csb_pack (&ppp_state->pds.varying_base,
5511 TA_STATE_PDS_VARYINGBASE,
5512 base) {
5513 base.addr = PVR_DEV_ADDR(pds_coeff_program->data_offset);
5514 }
5515 } else {
5516 ppp_state->pds.varying_base = 0U;
5517 }
5518
5519 pvr_csb_pack (&ppp_state->pds.uniform_state_data_base,
5520 TA_STATE_PDS_UNIFORMDATABASE,
5521 base) {
5522 base.addr = PVR_DEV_ADDR(state->pds_fragment_descriptor_data_offset);
5523 }
5524
5525 header->pres_pds_state_ptr0 = true;
5526 header->pres_pds_state_ptr3 = true;
5527 }
5528
5529 static void pvr_setup_viewport(struct pvr_cmd_buffer *const cmd_buffer)
5530 {
5531 struct pvr_cmd_buffer_state *const state = &cmd_buffer->state;
5532 struct ROGUE_TA_STATE_HEADER *const header = &state->emit_header;
5533 struct vk_dynamic_graphics_state *const dynamic_state =
5534 &cmd_buffer->vk.dynamic_graphics_state;
5535 struct pvr_ppp_state *const ppp_state = &state->ppp_state;
5536
5537 if (ppp_state->viewport_count != dynamic_state->vp.viewport_count) {
5538 ppp_state->viewport_count = dynamic_state->vp.viewport_count;
5539 header->pres_viewport = true;
5540 }
5541
5542 if (dynamic_state->rs.rasterizer_discard_enable) {
5543       /* We don't want to emit any viewport data as it'll just get thrown
5544        * away. This check comes after the previous condition because we still
5545        * want to stash the viewport_count, as it's our trigger for when
5546        * rasterizer discard gets disabled.
5547 */
5548 header->pres_viewport = false;
5549 return;
5550 }
5551
5552 for (uint32_t i = 0; i < ppp_state->viewport_count; i++) {
5553 VkViewport *viewport = &dynamic_state->vp.viewports[i];
5554 uint32_t x_scale = fui(viewport->width * 0.5f);
5555 uint32_t y_scale = fui(viewport->height * 0.5f);
5556 uint32_t z_scale = fui(viewport->maxDepth - viewport->minDepth);
5557 uint32_t x_center = fui(viewport->x + viewport->width * 0.5f);
5558 uint32_t y_center = fui(viewport->y + viewport->height * 0.5f);
5559 uint32_t z_center = fui(viewport->minDepth);
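      /* Worked example (illustrative only): a 1920x1080 viewport at the origin
       * with minDepth = 0.0f and maxDepth = 1.0f packs as
       *    a0 = 960, m0 = 960   (x_ndc in [-1, 1] -> 0..1920)
       *    a1 = 540, m1 = 540   (y_ndc in [-1, 1] -> 0..1080)
       *    a2 = 0,   m2 = 1     (z_ndc in [ 0, 1] -> 0..1)
       * i.e. screen = center + scale * ndc per axis.
       */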
5560
5561 if (ppp_state->viewports[i].a0 != x_center ||
5562 ppp_state->viewports[i].m0 != x_scale ||
5563 ppp_state->viewports[i].a1 != y_center ||
5564 ppp_state->viewports[i].m1 != y_scale ||
5565 ppp_state->viewports[i].a2 != z_center ||
5566 ppp_state->viewports[i].m2 != z_scale) {
5567 ppp_state->viewports[i].a0 = x_center;
5568 ppp_state->viewports[i].m0 = x_scale;
5569 ppp_state->viewports[i].a1 = y_center;
5570 ppp_state->viewports[i].m1 = y_scale;
5571 ppp_state->viewports[i].a2 = z_center;
5572 ppp_state->viewports[i].m2 = z_scale;
5573
5574 header->pres_viewport = true;
5575 }
5576 }
5577 }
5578
5579 static void pvr_setup_ppp_control(struct pvr_cmd_buffer *const cmd_buffer)
5580 {
5581 struct vk_dynamic_graphics_state *const dynamic_state =
5582 &cmd_buffer->vk.dynamic_graphics_state;
5583 const VkPrimitiveTopology topology = dynamic_state->ia.primitive_topology;
5584 struct pvr_cmd_buffer_state *const state = &cmd_buffer->state;
5585 struct ROGUE_TA_STATE_HEADER *const header = &state->emit_header;
5586 struct pvr_ppp_state *const ppp_state = &state->ppp_state;
5587 uint32_t ppp_control;
5588
5589 pvr_csb_pack (&ppp_control, TA_STATE_PPP_CTRL, control) {
5590 control.drawclippededges = true;
5591 control.wclampen = true;
5592
5593 if (topology == VK_PRIMITIVE_TOPOLOGY_TRIANGLE_FAN)
5594 control.flatshade_vtx = ROGUE_TA_FLATSHADE_VTX_VERTEX_1;
5595 else
5596 control.flatshade_vtx = ROGUE_TA_FLATSHADE_VTX_VERTEX_0;
5597
5598 if (dynamic_state->rs.depth_clamp_enable)
5599 control.clip_mode = ROGUE_TA_CLIP_MODE_NO_FRONT_OR_REAR;
5600 else
5601 control.clip_mode = ROGUE_TA_CLIP_MODE_FRONT_REAR;
5602
5603 /* +--- FrontIsCCW?
5604 * | +--- Cull Front?
5605 * v v
5606 * 0|0 CULLMODE_CULL_CCW,
5607 * 0|1 CULLMODE_CULL_CW,
5608 * 1|0 CULLMODE_CULL_CW,
5609 * 1|1 CULLMODE_CULL_CCW,
5610 */
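      /* Illustrative check of the XOR below (not from the original source):
       *    front face CCW, cull BACK  : true  ^ false -> CULL_CW  (row 1|0)
       *    front face CW,  cull BACK  : false ^ false -> CULL_CCW (row 0|0)
       *    front face CCW, cull FRONT : true  ^ true  -> CULL_CCW (row 1|1)
       */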
5611 switch (dynamic_state->rs.cull_mode) {
5612 case VK_CULL_MODE_BACK_BIT:
5613 case VK_CULL_MODE_FRONT_BIT:
5614 if ((dynamic_state->rs.front_face == VK_FRONT_FACE_COUNTER_CLOCKWISE) ^
5615 (dynamic_state->rs.cull_mode == VK_CULL_MODE_FRONT_BIT)) {
5616 control.cullmode = ROGUE_TA_CULLMODE_CULL_CW;
5617 } else {
5618 control.cullmode = ROGUE_TA_CULLMODE_CULL_CCW;
5619 }
5620
5621 break;
5622
5623 case VK_CULL_MODE_FRONT_AND_BACK:
5624 case VK_CULL_MODE_NONE:
5625 control.cullmode = ROGUE_TA_CULLMODE_NO_CULLING;
5626 break;
5627
5628 default:
5629 unreachable("Unsupported cull mode!");
5630 }
5631 }
5632
5633 if (ppp_control != ppp_state->ppp_control) {
5634 ppp_state->ppp_control = ppp_control;
5635 header->pres_ppp_ctrl = true;
5636 }
5637 }
5638
5639 /* Largest valid PPP State update in words = 31
5640 * 1 - Header
5641 * 3 - Stream Out Config words 0, 1 and 2
5642 * 1 - PPP Control word
5643 * 3 - Varying Config words 0, 1 and 2
5644 * 1 - Output Select
5645 * 1 - WClamp
5646 * 6 - Viewport Transform words
5647 * 2 - Region Clip words
5648 * 3 - PDS State for fragment phase (PDSSTATEPTR 1-3)
5649 * 4 - PDS State for fragment phase (PDSSTATEPTR0)
5650 * 6 - ISP Control Words
5651 */
5652 #define PVR_MAX_PPP_STATE_DWORDS 31
5653
5654 static VkResult pvr_emit_ppp_state(struct pvr_cmd_buffer *const cmd_buffer,
5655 struct pvr_sub_cmd_gfx *const sub_cmd)
5656 {
5657 const bool deferred_secondary = pvr_cmd_uses_deferred_cs_cmds(cmd_buffer);
5658 struct pvr_cmd_buffer_state *const state = &cmd_buffer->state;
5659 struct ROGUE_TA_STATE_HEADER *const header = &state->emit_header;
5660 struct pvr_csb *const control_stream = &sub_cmd->control_stream;
5661 struct pvr_ppp_state *const ppp_state = &state->ppp_state;
5662 uint32_t ppp_state_words[PVR_MAX_PPP_STATE_DWORDS];
5663 const bool emit_dbsc = header->pres_ispctl_dbsc;
5664 uint32_t *buffer_ptr = ppp_state_words;
5665 uint32_t dbsc_patching_offset = 0;
5666 uint32_t ppp_state_words_count;
5667 struct pvr_suballoc_bo *pvr_bo;
5668 VkResult result;
5669
5670 #if !defined(NDEBUG)
5671 struct ROGUE_TA_STATE_HEADER emit_mask = *header;
5672 uint32_t packed_emit_mask;
5673
5674 static_assert(pvr_cmd_length(TA_STATE_HEADER) == 1,
5675 "EMIT_MASK_IS_CLEAR assumes 1 dword sized header.");
5676
5677 # define EMIT_MASK_GET(field) (emit_mask.field)
5678 # define EMIT_MASK_SET(field, value) (emit_mask.field = (value))
5679 # define EMIT_MASK_IS_CLEAR \
5680 (pvr_cmd_pack(TA_STATE_HEADER)(&packed_emit_mask, &emit_mask), \
5681 packed_emit_mask == 0)
5682 #else
5683 # define EMIT_MASK_GET(field)
5684 # define EMIT_MASK_SET(field, value)
5685 #endif
5686
5687 header->view_port_count =
5688 (ppp_state->viewport_count == 0) ? 0U : (ppp_state->viewport_count - 1);
5689 header->pres_ispctl_fa = header->pres_ispctl;
5690
5691 /* If deferred_secondary is true then we do a separate state update
5692 * which gets patched in vkCmdExecuteCommands().
5693 */
5694 header->pres_ispctl_dbsc &= !deferred_secondary;
5695
5696 pvr_csb_write_struct(buffer_ptr, TA_STATE_HEADER, header);
5697
5698 static_assert(pvr_cmd_length(TA_STATE_HEADER) == 1,
5699 "Following header check assumes 1 dword sized header.");
5700 /* If the header is empty we exit early and prevent a bo alloc of 0 size. */
5701 if (ppp_state_words[0] == 0)
5702 return VK_SUCCESS;
5703
5704 if (header->pres_ispctl) {
5705 pvr_csb_write_value(buffer_ptr, TA_STATE_ISPCTL, ppp_state->isp.control);
5706
5707 assert(header->pres_ispctl_fa);
5708 /* This is not a mistake. FA, BA have the ISPA format, and FB, BB have the
5709 * ISPB format.
5710 */
5711 pvr_csb_write_value(buffer_ptr, TA_STATE_ISPA, ppp_state->isp.front_a);
5712 EMIT_MASK_SET(pres_ispctl_fa, false);
5713
5714 if (header->pres_ispctl_fb) {
5715 pvr_csb_write_value(buffer_ptr, TA_STATE_ISPB, ppp_state->isp.front_b);
5716 EMIT_MASK_SET(pres_ispctl_fb, false);
5717 }
5718
5719 if (header->pres_ispctl_ba) {
5720 pvr_csb_write_value(buffer_ptr, TA_STATE_ISPA, ppp_state->isp.back_a);
5721 EMIT_MASK_SET(pres_ispctl_ba, false);
5722 }
5723
5724 if (header->pres_ispctl_bb) {
5725 pvr_csb_write_value(buffer_ptr, TA_STATE_ISPB, ppp_state->isp.back_b);
5726 EMIT_MASK_SET(pres_ispctl_bb, false);
5727 }
5728
5729 EMIT_MASK_SET(pres_ispctl, false);
5730 }
5731
5732 if (header->pres_ispctl_dbsc) {
5733 assert(!deferred_secondary);
5734
5735 dbsc_patching_offset = buffer_ptr - ppp_state_words;
5736
5737 pvr_csb_pack (buffer_ptr, TA_STATE_ISPDBSC, ispdbsc) {
5738 ispdbsc.dbindex = ppp_state->depthbias_scissor_indices.depthbias_index;
5739 ispdbsc.scindex = ppp_state->depthbias_scissor_indices.scissor_index;
5740 }
5741 buffer_ptr += pvr_cmd_length(TA_STATE_ISPDBSC);
5742
5743 EMIT_MASK_SET(pres_ispctl_dbsc, false);
5744 }
5745
5746 if (header->pres_pds_state_ptr0) {
5747 pvr_csb_write_value(buffer_ptr,
5748 TA_STATE_PDS_SHADERBASE,
5749 ppp_state->pds.pixel_shader_base);
5750
5751 pvr_csb_write_value(buffer_ptr,
5752 TA_STATE_PDS_TEXUNICODEBASE,
5753 ppp_state->pds.texture_uniform_code_base);
5754
5755 pvr_csb_write_value(buffer_ptr,
5756 TA_STATE_PDS_SIZEINFO1,
5757 ppp_state->pds.size_info1);
5758 pvr_csb_write_value(buffer_ptr,
5759 TA_STATE_PDS_SIZEINFO2,
5760 ppp_state->pds.size_info2);
5761
5762 EMIT_MASK_SET(pres_pds_state_ptr0, false);
5763 }
5764
5765 if (header->pres_pds_state_ptr1) {
5766 pvr_csb_write_value(buffer_ptr,
5767 TA_STATE_PDS_VARYINGBASE,
5768 ppp_state->pds.varying_base);
5769 EMIT_MASK_SET(pres_pds_state_ptr1, false);
5770 }
5771
5772    /* We don't use the pds_state_ptr2 (texture state programs) control word,
5773     * but we don't need to zero it either. This is because the hardware
5774 * runs the texture state program only when
5775 * ROGUE_TA_STATE_PDS_SIZEINFO1.pds_texturestatesize is non-zero.
5776 */
5777 assert(pvr_csb_unpack(&ppp_state->pds.size_info1, TA_STATE_PDS_SIZEINFO1)
5778 .pds_texturestatesize == 0);
5779
5780 if (header->pres_pds_state_ptr3) {
5781 pvr_csb_write_value(buffer_ptr,
5782 TA_STATE_PDS_UNIFORMDATABASE,
5783 ppp_state->pds.uniform_state_data_base);
5784 EMIT_MASK_SET(pres_pds_state_ptr3, false);
5785 }
5786
5787 if (header->pres_region_clip) {
5788 pvr_csb_write_value(buffer_ptr,
5789 TA_REGION_CLIP0,
5790 ppp_state->region_clipping.word0);
5791 pvr_csb_write_value(buffer_ptr,
5792 TA_REGION_CLIP1,
5793 ppp_state->region_clipping.word1);
5794
5795 EMIT_MASK_SET(pres_region_clip, false);
5796 }
5797
5798 if (header->pres_viewport) {
5799 const uint32_t viewports = MAX2(1, ppp_state->viewport_count);
5800 EMIT_MASK_SET(view_port_count, viewports);
5801
5802 for (uint32_t i = 0; i < viewports; i++) {
5803 /* These don't have any definitions in the csbgen xml files and none
5804 * will be added.
5805 */
5806 *buffer_ptr++ = ppp_state->viewports[i].a0;
5807 *buffer_ptr++ = ppp_state->viewports[i].m0;
5808 *buffer_ptr++ = ppp_state->viewports[i].a1;
5809 *buffer_ptr++ = ppp_state->viewports[i].m1;
5810 *buffer_ptr++ = ppp_state->viewports[i].a2;
5811 *buffer_ptr++ = ppp_state->viewports[i].m2;
5812
5813 EMIT_MASK_SET(view_port_count, EMIT_MASK_GET(view_port_count) - 1);
5814 }
5815
5816 EMIT_MASK_SET(pres_viewport, false);
5817 }
5818
5819 if (header->pres_wclamp) {
5820 pvr_csb_pack (buffer_ptr, TA_WCLAMP, wclamp) {
5821 wclamp.val = fui(0.00001f);
5822 }
5823 buffer_ptr += pvr_cmd_length(TA_WCLAMP);
5824 EMIT_MASK_SET(pres_wclamp, false);
5825 }
5826
5827 if (header->pres_outselects) {
5828 pvr_csb_write_value(buffer_ptr, TA_OUTPUT_SEL, ppp_state->output_selects);
5829 EMIT_MASK_SET(pres_outselects, false);
5830 }
5831
5832 if (header->pres_varying_word0) {
5833 pvr_csb_write_value(buffer_ptr,
5834 TA_STATE_VARYING0,
5835 ppp_state->varying_word[0]);
5836 EMIT_MASK_SET(pres_varying_word0, false);
5837 }
5838
5839 if (header->pres_varying_word1) {
5840 pvr_csb_write_value(buffer_ptr,
5841 TA_STATE_VARYING1,
5842 ppp_state->varying_word[1]);
5843 EMIT_MASK_SET(pres_varying_word1, false);
5844 }
5845
5846 /* We only emit this on the first draw of a render job to prevent us from
5847 * inheriting a non-zero value set elsewhere.
5848 */
5849 if (header->pres_varying_word2) {
5850 pvr_csb_write_value(buffer_ptr, TA_STATE_VARYING2, 0);
5851 EMIT_MASK_SET(pres_varying_word2, false);
5852 }
5853
5854 if (header->pres_ppp_ctrl) {
5855 pvr_csb_write_value(buffer_ptr,
5856 TA_STATE_PPP_CTRL,
5857 ppp_state->ppp_control);
5858 EMIT_MASK_SET(pres_ppp_ctrl, false);
5859 }
5860
5861 /* We only emit this on the first draw of a render job to prevent us from
5862 * inheriting a non-zero value set elsewhere.
5863 */
5864 if (header->pres_stream_out_size) {
5865 pvr_csb_write_value(buffer_ptr, TA_STATE_STREAM_OUT0, 0);
5866 EMIT_MASK_SET(pres_stream_out_size, false);
5867 }
5868
5869 assert(EMIT_MASK_IS_CLEAR);
5870
5871 #undef EMIT_MASK_GET
5872 #undef EMIT_MASK_SET
5873 #if !defined(NDEBUG)
5874 # undef EMIT_MASK_IS_CLEAR
5875 #endif
5876
5877 ppp_state_words_count = buffer_ptr - ppp_state_words;
5878 assert(ppp_state_words_count <= PVR_MAX_PPP_STATE_DWORDS);
5879
5880 result = pvr_cmd_buffer_alloc_mem(cmd_buffer,
5881 cmd_buffer->device->heaps.general_heap,
5882 PVR_DW_TO_BYTES(ppp_state_words_count),
5883 &pvr_bo);
5884 if (result != VK_SUCCESS)
5885 return result;
5886
5887 memcpy(pvr_bo_suballoc_get_map_addr(pvr_bo),
5888 ppp_state_words,
5889 PVR_DW_TO_BYTES(ppp_state_words_count));
5890
5891 pvr_csb_set_relocation_mark(control_stream);
5892
5893 /* Write the VDM state update into the VDM control stream. */
5894 pvr_csb_emit (control_stream, VDMCTRL_PPP_STATE0, state0) {
5895 state0.word_count = ppp_state_words_count;
5896 state0.addrmsb = pvr_bo->dev_addr;
5897 }
5898
5899 pvr_csb_emit (control_stream, VDMCTRL_PPP_STATE1, state1) {
5900 state1.addrlsb = pvr_bo->dev_addr;
5901 }
5902
5903 pvr_csb_clear_relocation_mark(control_stream);
5904
5905 if (emit_dbsc && cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY) {
5906 struct pvr_deferred_cs_command cmd;
5907
5908 if (deferred_secondary) {
5909 const uint32_t num_dwords = pvr_cmd_length(VDMCTRL_PPP_STATE0) +
5910 pvr_cmd_length(VDMCTRL_PPP_STATE1);
5911 uint32_t *vdm_state;
5912
5913 pvr_csb_set_relocation_mark(control_stream);
5914
5915 vdm_state = pvr_csb_alloc_dwords(control_stream, num_dwords);
5916 if (!vdm_state) {
5917 result = pvr_csb_get_status(control_stream);
5918 return pvr_cmd_buffer_set_error_unwarned(cmd_buffer, result);
5919 }
5920
5921 pvr_csb_clear_relocation_mark(control_stream);
5922
5923 cmd = (struct pvr_deferred_cs_command){
5924 .type = PVR_DEFERRED_CS_COMMAND_TYPE_DBSC,
5925 .dbsc = {
5926 .state = ppp_state->depthbias_scissor_indices,
5927 .vdm_state = vdm_state,
5928 },
5929 };
5930 } else {
5931 cmd = (struct pvr_deferred_cs_command){
5932 .type = PVR_DEFERRED_CS_COMMAND_TYPE_DBSC2,
5933 .dbsc2 = {
5934 .state = ppp_state->depthbias_scissor_indices,
5935 .ppp_cs_bo = pvr_bo,
5936 .patch_offset = dbsc_patching_offset,
5937 },
5938 };
5939 }
5940
5941 util_dynarray_append(&cmd_buffer->deferred_csb_commands,
5942 struct pvr_deferred_cs_command,
5943 cmd);
5944 }
5945
5946 state->emit_header = (struct ROGUE_TA_STATE_HEADER){ 0 };
5947
5948 return VK_SUCCESS;
5949 }
5950
5951 static inline bool
5952 pvr_ppp_state_update_required(const struct pvr_cmd_buffer *cmd_buffer)
5953 {
5954 const BITSET_WORD *const dynamic_dirty =
5955 cmd_buffer->vk.dynamic_graphics_state.dirty;
5956 const struct pvr_cmd_buffer_state *const state = &cmd_buffer->state;
5957 const struct ROGUE_TA_STATE_HEADER *const header = &state->emit_header;
5958
5959 /* For push constants we only need to worry if they are updated for the
5960 * fragment stage since we're only updating the pds programs used in the
5961 * fragment stage.
5962 */
5963
5964 return header->pres_ppp_ctrl || header->pres_ispctl ||
5965 header->pres_ispctl_fb || header->pres_ispctl_ba ||
5966 header->pres_ispctl_bb || header->pres_ispctl_dbsc ||
5967 header->pres_pds_state_ptr0 || header->pres_pds_state_ptr1 ||
5968 header->pres_pds_state_ptr2 || header->pres_pds_state_ptr3 ||
5969 header->pres_region_clip || header->pres_viewport ||
5970 header->pres_wclamp || header->pres_outselects ||
5971 header->pres_varying_word0 || header->pres_varying_word1 ||
5972 header->pres_varying_word2 || header->pres_stream_out_program ||
5973 state->dirty.fragment_descriptors || state->dirty.vis_test ||
5974 state->dirty.gfx_pipeline_binding || state->dirty.isp_userpass ||
5975 state->push_constants.dirty_stages & VK_SHADER_STAGE_FRAGMENT_BIT ||
5976 BITSET_TEST(dynamic_dirty, MESA_VK_DYNAMIC_DS_STENCIL_COMPARE_MASK) ||
5977 BITSET_TEST(dynamic_dirty, MESA_VK_DYNAMIC_DS_STENCIL_WRITE_MASK) ||
5978 BITSET_TEST(dynamic_dirty, MESA_VK_DYNAMIC_DS_STENCIL_REFERENCE) ||
5979 BITSET_TEST(dynamic_dirty, MESA_VK_DYNAMIC_RS_DEPTH_BIAS_ENABLE) ||
5980 BITSET_TEST(dynamic_dirty, MESA_VK_DYNAMIC_RS_DEPTH_BIAS_FACTORS) ||
5981 BITSET_TEST(dynamic_dirty, MESA_VK_DYNAMIC_RS_LINE_WIDTH) ||
5982 BITSET_TEST(dynamic_dirty, MESA_VK_DYNAMIC_VP_SCISSORS) ||
5983 BITSET_TEST(dynamic_dirty, MESA_VK_DYNAMIC_VP_SCISSOR_COUNT) ||
5984 BITSET_TEST(dynamic_dirty, MESA_VK_DYNAMIC_VP_VIEWPORTS) ||
5985 BITSET_TEST(dynamic_dirty, MESA_VK_DYNAMIC_VP_VIEWPORT_COUNT);
5986 }
5987
5988 static VkResult
5989 pvr_emit_dirty_ppp_state(struct pvr_cmd_buffer *const cmd_buffer,
5990 struct pvr_sub_cmd_gfx *const sub_cmd)
5991 {
5992 struct pvr_cmd_buffer_state *const state = &cmd_buffer->state;
5993 struct vk_dynamic_graphics_state *const dynamic_state =
5994 &cmd_buffer->vk.dynamic_graphics_state;
5995 VkResult result;
5996
5997 /* TODO: The emit_header will be dirty only if
5998 * pvr_reset_graphics_dirty_state() was called before this (so when command
5999 * buffer begins recording or when it's reset). Otherwise it will have been
6000 * zeroed out by the previous pvr_emit_ppp_state(). We can probably set a
6001 * flag in there and check it here instead of checking the header.
6002 * Check if this is true and implement the flag.
6003 */
6004 if (!pvr_ppp_state_update_required(cmd_buffer))
6005 return VK_SUCCESS;
6006
6007 if (state->dirty.gfx_pipeline_binding) {
6008 struct ROGUE_TA_STATE_ISPA ispa;
6009
6010 pvr_setup_output_select(cmd_buffer);
6011 pvr_setup_isp_faces_and_control(cmd_buffer, &ispa);
6012 pvr_setup_triangle_merging_flag(cmd_buffer, &ispa);
6013 } else if (BITSET_TEST(dynamic_state->dirty,
6014 MESA_VK_DYNAMIC_DS_STENCIL_COMPARE_MASK) ||
6015 BITSET_TEST(dynamic_state->dirty,
6016 MESA_VK_DYNAMIC_DS_STENCIL_REFERENCE) ||
6017 BITSET_TEST(dynamic_state->dirty,
6018 MESA_VK_DYNAMIC_DS_STENCIL_WRITE_MASK) ||
6019 BITSET_TEST(dynamic_state->dirty,
6020 MESA_VK_DYNAMIC_RS_LINE_WIDTH) ||
6021 state->dirty.isp_userpass || state->dirty.vis_test) {
6022 pvr_setup_isp_faces_and_control(cmd_buffer, NULL);
6023 }
6024
6025 if (!dynamic_state->rs.rasterizer_discard_enable &&
6026 state->dirty.fragment_descriptors &&
6027 state->gfx_pipeline->shader_state.fragment.bo &&
6028 !state->gfx_pipeline->fs_data.common.uses.empty) {
6029 pvr_setup_fragment_state_pointers(cmd_buffer, sub_cmd);
6030 }
6031
6032 pvr_setup_isp_depth_bias_scissor_state(cmd_buffer);
6033
6034 if (BITSET_TEST(dynamic_state->dirty, MESA_VK_DYNAMIC_VP_VIEWPORTS) ||
6035 BITSET_TEST(dynamic_state->dirty, MESA_VK_DYNAMIC_VP_VIEWPORT_COUNT))
6036 pvr_setup_viewport(cmd_buffer);
6037
6038 pvr_setup_ppp_control(cmd_buffer);
6039
6040 /* The hardware doesn't have an explicit mode for this so we use a
6041 * negative viewport to make sure all objects are culled out early.
6042 */
6043 if (dynamic_state->rs.cull_mode == VK_CULL_MODE_FRONT_AND_BACK) {
6044       /* Shift the viewport out of the guard-band, culling everything. */
6045 const uint32_t negative_vp_val = fui(-2.0f);
6046
6047 state->ppp_state.viewports[0].a0 = negative_vp_val;
6048 state->ppp_state.viewports[0].m0 = 0;
6049 state->ppp_state.viewports[0].a1 = negative_vp_val;
6050 state->ppp_state.viewports[0].m1 = 0;
6051 state->ppp_state.viewports[0].a2 = negative_vp_val;
6052 state->ppp_state.viewports[0].m2 = 0;
6053
6054 state->ppp_state.viewport_count = 1;
6055
6056 state->emit_header.pres_viewport = true;
6057 }
6058
6059 result = pvr_emit_ppp_state(cmd_buffer, sub_cmd);
6060 if (result != VK_SUCCESS)
6061 return result;
6062
6063 return VK_SUCCESS;
6064 }
6065
6066 void pvr_calculate_vertex_cam_size(const struct pvr_device_info *dev_info,
6067 const uint32_t vs_output_size,
6068 const bool raster_enable,
6069 uint32_t *const cam_size_out,
6070 uint32_t *const vs_max_instances_out)
6071 {
6072 /* First work out the size of a vertex in the UVS and multiply by 4 for
6073 * column ordering.
6074 */
6075 const uint32_t uvs_vertex_vector_size_in_dwords =
6076 (vs_output_size + 1U + raster_enable * 4U) * 4U;
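   /* Worked example (illustrative only): vs_output_size = 8 with raster_enable
    * set gives (8 + 1 + 4) * 4 = 52 dwords, which on a
    * simple_internal_parameter_format core (vdm_cam_size < 96) falls into the
    * first, < 14 * 4, bucket below.
    */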
6077 const uint32_t vdm_cam_size =
6078 PVR_GET_FEATURE_VALUE(dev_info, vdm_cam_size, 32U);
6079
6080 /* This is a proxy for 8XE. */
6081 if (PVR_HAS_FEATURE(dev_info, simple_internal_parameter_format) &&
6082 vdm_cam_size < 96U) {
6083 /* Comparisons are based on size including scratch per vertex vector. */
6084 if (uvs_vertex_vector_size_in_dwords < (14U * 4U)) {
6085 *cam_size_out = MIN2(31U, vdm_cam_size - 1U);
6086 *vs_max_instances_out = 16U;
6087 } else if (uvs_vertex_vector_size_in_dwords < (20U * 4U)) {
6088 *cam_size_out = 15U;
6089 *vs_max_instances_out = 16U;
6090 } else if (uvs_vertex_vector_size_in_dwords < (28U * 4U)) {
6091 *cam_size_out = 11U;
6092 *vs_max_instances_out = 12U;
6093 } else if (uvs_vertex_vector_size_in_dwords < (44U * 4U)) {
6094 *cam_size_out = 7U;
6095 *vs_max_instances_out = 8U;
6096 } else if (PVR_HAS_FEATURE(dev_info,
6097 simple_internal_parameter_format_v2) ||
6098 uvs_vertex_vector_size_in_dwords < (64U * 4U)) {
6099 *cam_size_out = 7U;
6100 *vs_max_instances_out = 4U;
6101 } else {
6102 *cam_size_out = 3U;
6103 *vs_max_instances_out = 2U;
6104 }
6105 } else {
6106 /* Comparisons are based on size including scratch per vertex vector. */
6107 if (uvs_vertex_vector_size_in_dwords <= (32U * 4U)) {
6108 /* output size <= 27 + 5 scratch. */
6109 *cam_size_out = MIN2(95U, vdm_cam_size - 1U);
6110 *vs_max_instances_out = 0U;
6111 } else if (uvs_vertex_vector_size_in_dwords <= 48U * 4U) {
6112 /* output size <= 43 + 5 scratch */
6113 *cam_size_out = 63U;
6114 if (PVR_GET_FEATURE_VALUE(dev_info, uvs_vtx_entries, 144U) < 288U)
6115 *vs_max_instances_out = 16U;
6116 else
6117 *vs_max_instances_out = 0U;
6118 } else if (uvs_vertex_vector_size_in_dwords <= 64U * 4U) {
6119 /* output size <= 59 + 5 scratch. */
6120 *cam_size_out = 31U;
6121 if (PVR_GET_FEATURE_VALUE(dev_info, uvs_vtx_entries, 144U) < 288U)
6122 *vs_max_instances_out = 16U;
6123 else
6124 *vs_max_instances_out = 0U;
6125 } else {
6126 *cam_size_out = 15U;
6127 *vs_max_instances_out = 16U;
6128 }
6129 }
6130 }
6131
6132 static void pvr_emit_dirty_vdm_state(struct pvr_cmd_buffer *const cmd_buffer,
6133 struct pvr_sub_cmd_gfx *const sub_cmd)
6134 {
6135 /* FIXME: Assume all state is dirty for the moment. */
6136 struct pvr_device_info *const dev_info =
6137 &cmd_buffer->device->pdevice->dev_info;
6138 ASSERTED const uint32_t max_user_vertex_output_components =
6139 pvr_get_max_user_vertex_output_components(dev_info);
6140 struct ROGUE_VDMCTRL_VDM_STATE0 header = { pvr_cmd_header(
6141 VDMCTRL_VDM_STATE0) };
6142 struct vk_dynamic_graphics_state *const dynamic_state =
6143 &cmd_buffer->vk.dynamic_graphics_state;
6144 const struct pvr_cmd_buffer_state *const state = &cmd_buffer->state;
6145 const pco_data *const vs_data = &state->gfx_pipeline->vs_data;
6146 struct pvr_csb *const csb = &sub_cmd->control_stream;
6147 uint32_t max_instances;
6148 uint32_t cam_size;
6149
6150 /* CAM Calculations and HW state take vertex size aligned to DWORDS. */
6151 assert(vs_data->vs.vtxouts <= max_user_vertex_output_components);
6152
6153 pvr_calculate_vertex_cam_size(dev_info,
6154 vs_data->vs.vtxouts,
6155 true,
6156 &cam_size,
6157 &max_instances);
6158
6159 pvr_csb_set_relocation_mark(csb);
6160
6161 pvr_csb_emit (csb, VDMCTRL_VDM_STATE0, state0) {
6162 state0.cam_size = cam_size;
6163
6164 if (dynamic_state->ia.primitive_restart_enable) {
6165 state0.cut_index_enable = true;
6166 state0.cut_index_present = true;
6167 }
6168
6169 switch (dynamic_state->ia.primitive_topology) {
6170 case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_FAN:
6171 state0.flatshade_control = ROGUE_VDMCTRL_FLATSHADE_CONTROL_VERTEX_1;
6172 break;
6173
6174 default:
6175 state0.flatshade_control = ROGUE_VDMCTRL_FLATSHADE_CONTROL_VERTEX_0;
6176 break;
6177 }
6178
6179 /* If we've bound a different vertex buffer, or this draw-call requires
6180 * a different PDS attrib data-section from the last draw call (changed
6181 * base_instance) then we need to specify a new data section. This is
6182 * also the case if we've switched pipeline or attrib program as the
6183 * data-section layout will be different.
6184 */
6185 state0.vs_data_addr_present =
6186 state->dirty.gfx_pipeline_binding || state->dirty.vertex_bindings ||
6187 state->dirty.draw_base_instance || state->dirty.draw_variant;
6188
6189 /* Need to specify new PDS Attrib program if we've bound a different
6190 * pipeline or we needed a different PDS Attrib variant for this
6191 * draw-call.
6192 */
6193 state0.vs_other_present = state->dirty.gfx_pipeline_binding ||
6194 state->dirty.draw_variant;
6195
6196 /* UVB_SCRATCH_SELECT_ONE with no rasterization is only valid when
6197 * stream output is enabled. We use UVB_SCRATCH_SELECT_FIVE because
6198 * Vulkan doesn't support stream output and the vertex position is
6199 * always emitted to the UVB.
6200 */
6201 state0.uvs_scratch_size_select =
6202 ROGUE_VDMCTRL_UVS_SCRATCH_SIZE_SELECT_FIVE;
6203
6204 header = state0;
6205 }
6206
6207 if (header.cut_index_present) {
6208 pvr_csb_emit (csb, VDMCTRL_VDM_STATE1, state1) {
6209 state1.cut_index =
6210 vk_index_to_restart(state->index_buffer_binding.type);
6211 }
6212 }
6213
6214 if (header.vs_data_addr_present) {
6215 pvr_csb_emit (csb, VDMCTRL_VDM_STATE2, state2) {
6216 state2.vs_pds_data_base_addr =
6217 PVR_DEV_ADDR(state->pds_vertex_attrib_offset);
6218 }
6219 }
6220
6221 if (header.vs_other_present) {
6222 const uint32_t usc_unified_store_size_in_bytes = vs_data->common.vtxins
6223 << 2;
6224
6225 pvr_csb_emit (csb, VDMCTRL_VDM_STATE3, state3) {
6226 state3.vs_pds_code_base_addr =
6227 PVR_DEV_ADDR(state->pds_shader.code_offset);
6228 }
6229
6230 pvr_csb_emit (csb, VDMCTRL_VDM_STATE4, state4) {
6231 state4.vs_output_size = vs_data->vs.vtxouts;
6232 }
6233
6234 pvr_csb_emit (csb, VDMCTRL_VDM_STATE5, state5) {
6235 state5.vs_max_instances = max_instances;
6236 state5.vs_usc_common_size = 0U;
6237 state5.vs_usc_unified_size = DIV_ROUND_UP(
6238 usc_unified_store_size_in_bytes,
6239 ROGUE_VDMCTRL_VDM_STATE5_VS_USC_UNIFIED_SIZE_UNIT_SIZE);
6240 state5.vs_pds_temp_size =
6241 DIV_ROUND_UP(state->pds_shader.info->temps_required << 2,
6242 ROGUE_VDMCTRL_VDM_STATE5_VS_PDS_TEMP_SIZE_UNIT_SIZE);
6243 state5.vs_pds_data_size = DIV_ROUND_UP(
6244 PVR_DW_TO_BYTES(state->pds_shader.info->data_size_in_dwords),
6245 ROGUE_VDMCTRL_VDM_STATE5_VS_PDS_DATA_SIZE_UNIT_SIZE);
6246 }
6247 }
6248
6249 pvr_csb_clear_relocation_mark(csb);
6250 }
6251
6252 static VkResult pvr_validate_draw_state(struct pvr_cmd_buffer *cmd_buffer)
6253 {
6254 struct pvr_cmd_buffer_state *const state = &cmd_buffer->state;
6255 struct vk_dynamic_graphics_state *const dynamic_state =
6256 &cmd_buffer->vk.dynamic_graphics_state;
6257 const struct pvr_graphics_pipeline *const gfx_pipeline = state->gfx_pipeline;
6258 const pco_data *const fs_data = &gfx_pipeline->fs_data;
6259 struct pvr_sub_cmd_gfx *sub_cmd;
6260 bool fstencil_writemask_zero;
6261 bool bstencil_writemask_zero;
6262 bool fstencil_keep;
6263 bool bstencil_keep;
6264 VkResult result;
6265
6266 pvr_cmd_buffer_start_sub_cmd(cmd_buffer, PVR_SUB_CMD_TYPE_GRAPHICS);
6267
6268 sub_cmd = &state->current_sub_cmd->gfx;
6269 sub_cmd->empty_cmd = false;
6270
6271 /* Determine pipeline depth/stencil usage. If a pipeline uses depth or
6272 * stencil testing, those attachments are using their loaded values, and
6273 * the loadOps cannot be optimized out.
6274 */
6275 /* Pipeline uses depth testing. */
6276 if (sub_cmd->depth_usage == PVR_DEPTH_STENCIL_USAGE_UNDEFINED &&
6277 dynamic_state->ds.depth.compare_op != VK_COMPARE_OP_ALWAYS) {
6278 sub_cmd->depth_usage = PVR_DEPTH_STENCIL_USAGE_NEEDED;
6279 }
6280
6281 /* Pipeline uses stencil testing. */
6282 if (sub_cmd->stencil_usage == PVR_DEPTH_STENCIL_USAGE_UNDEFINED &&
6283 (dynamic_state->ds.stencil.front.op.compare != VK_COMPARE_OP_ALWAYS ||
6284 dynamic_state->ds.stencil.back.op.compare != VK_COMPARE_OP_ALWAYS)) {
6285 sub_cmd->stencil_usage = PVR_DEPTH_STENCIL_USAGE_NEEDED;
6286 }
6287
6288 if (PVR_HAS_FEATURE(&cmd_buffer->device->pdevice->dev_info,
6289 compute_overlap)) {
6290 uint32_t coefficient_size =
6291 DIV_ROUND_UP(fs_data->common.coeffs,
6292 ROGUE_TA_STATE_PDS_SIZEINFO1_USC_VARYINGSIZE_UNIT_SIZE);
6293
6294 if (coefficient_size >
6295 ROGUE_TA_STATE_PDS_SIZEINFO1_USC_VARYINGSIZE_MAX_SIZE)
6296 sub_cmd->disable_compute_overlap = true;
6297 }
6298
6299 sub_cmd->frag_uses_atomic_ops |= fs_data->common.uses.atomics;
6300 sub_cmd->frag_has_side_effects |= fs_data->common.uses.side_effects;
6301 sub_cmd->frag_uses_texture_rw |= false;
6302 sub_cmd->vertex_uses_texture_rw |= false;
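   /* The two "|= false" assignments above are effectively no-ops; presumably
    * placeholders until fragment/vertex texture read-write usage is actually
    * tracked.
    */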
6303
6304 sub_cmd->job.get_vis_results = state->vis_test_enabled;
6305
6306 fstencil_keep =
6307 (dynamic_state->ds.stencil.front.op.fail == VK_STENCIL_OP_KEEP) &&
6308 (dynamic_state->ds.stencil.front.op.pass == VK_STENCIL_OP_KEEP);
6309 bstencil_keep =
6310 (dynamic_state->ds.stencil.back.op.fail == VK_STENCIL_OP_KEEP) &&
6311 (dynamic_state->ds.stencil.back.op.pass == VK_STENCIL_OP_KEEP);
6312 fstencil_writemask_zero = (dynamic_state->ds.stencil.front.write_mask == 0);
6313 bstencil_writemask_zero = (dynamic_state->ds.stencil.back.write_mask == 0);
6314
6315    /* Set the stencil modified flag if:
6316     * - the front and back-facing stencil ops are not both KEEP, and
6317     * - the front and back-facing stencil write masks are not both zero.
6318     */
6319 if (!(fstencil_keep && bstencil_keep) &&
6320 !(fstencil_writemask_zero && bstencil_writemask_zero)) {
6321 sub_cmd->modifies_stencil = true;
6322 }
6323
6324 /* Set depth modified flag if depth write is enabled. */
6325 if (dynamic_state->ds.depth.write_enable)
6326 sub_cmd->modifies_depth = true;
6327
6328 /* If either the data or code changes for pds vertex attribs, regenerate the
6329 * data segment.
6330 */
6331 if (state->dirty.vertex_bindings || state->dirty.gfx_pipeline_binding ||
6332 state->dirty.draw_variant || state->dirty.draw_base_instance) {
6333 enum pvr_pds_vertex_attrib_program_type prog_type;
6334 const struct pvr_pds_attrib_program *program;
6335
6336 if (state->draw_state.draw_indirect)
6337 prog_type = PVR_PDS_VERTEX_ATTRIB_PROGRAM_DRAW_INDIRECT;
6338 else if (state->draw_state.base_instance)
6339 prog_type = PVR_PDS_VERTEX_ATTRIB_PROGRAM_BASE_INSTANCE;
6340 else
6341 prog_type = PVR_PDS_VERTEX_ATTRIB_PROGRAM_BASIC;
6342
6343 program =
6344 &gfx_pipeline->shader_state.vertex.pds_attrib_programs[prog_type];
6345 state->pds_shader.info = &program->info;
6346 state->pds_shader.code_offset = program->program.code_offset;
6347
6348 state->max_shared_regs =
6349 MAX2(state->max_shared_regs, pvr_calc_shared_regs_count(gfx_pipeline));
6350
6351 pvr_setup_vertex_buffers(cmd_buffer, gfx_pipeline);
6352 }
6353
6354 if (state->push_constants.dirty_stages & VK_SHADER_STAGE_ALL_GRAPHICS) {
6355 result = pvr_cmd_upload_push_consts(cmd_buffer);
6356 if (result != VK_SUCCESS)
6357 return result;
6358 }
6359
6360 state->dirty.vertex_descriptors = state->dirty.gfx_pipeline_binding;
6361 state->dirty.fragment_descriptors = state->dirty.vertex_descriptors;
6362
6363 /* Account for dirty descriptor set. */
6364    /* TODO: It could be the case that there are no descriptors for a specific
6365     * stage, or that the updated descriptors aren't used by a particular
6366     * stage. In such cases we could avoid regenerating the descriptor PDS
6367     * program.
6368     */
6369 state->dirty.vertex_descriptors |= state->dirty.gfx_desc_dirty;
6370 state->dirty.fragment_descriptors |= state->dirty.gfx_desc_dirty;
6371
6372 if (BITSET_TEST(dynamic_state->dirty, MESA_VK_DYNAMIC_CB_BLEND_CONSTANTS))
6373 state->dirty.fragment_descriptors = true;
6374
6375 state->dirty.vertex_descriptors |=
6376 state->push_constants.dirty_stages &
6377 (VK_SHADER_STAGE_ALL_GRAPHICS & ~VK_SHADER_STAGE_FRAGMENT_BIT);
6378 state->dirty.fragment_descriptors |= state->push_constants.dirty_stages &
6379 VK_SHADER_STAGE_FRAGMENT_BIT;
6380
6381 if (state->dirty.fragment_descriptors) {
6382 result = pvr_setup_descriptor_mappings(
6383 cmd_buffer,
6384 PVR_STAGE_ALLOCATION_FRAGMENT,
6385 &state->gfx_pipeline->shader_state.fragment.descriptor_state,
6386 NULL,
6387 &state->pds_fragment_descriptor_data_offset);
6388 if (result != VK_SUCCESS) {
6389 mesa_loge("Could not setup fragment descriptor mappings.");
6390 return result;
6391 }
6392 }
6393
6394 if (state->dirty.vertex_descriptors) {
6395 uint32_t pds_vertex_descriptor_data_offset;
6396
6397 result = pvr_setup_descriptor_mappings(
6398 cmd_buffer,
6399 PVR_STAGE_ALLOCATION_VERTEX_GEOMETRY,
6400 &state->gfx_pipeline->shader_state.vertex.descriptor_state,
6401 NULL,
6402 &pds_vertex_descriptor_data_offset);
6403 if (result != VK_SUCCESS) {
6404 mesa_loge("Could not setup vertex descriptor mappings.");
6405 return result;
6406 }
6407
6408 pvr_emit_dirty_pds_state(cmd_buffer,
6409 sub_cmd,
6410 pds_vertex_descriptor_data_offset);
6411 }
6412
6413 pvr_emit_dirty_ppp_state(cmd_buffer, sub_cmd);
6414 pvr_emit_dirty_vdm_state(cmd_buffer, sub_cmd);
6415
6416 vk_dynamic_graphics_state_clear_dirty(dynamic_state);
6417 state->dirty.gfx_desc_dirty = false;
6418 state->dirty.draw_base_instance = false;
6419 state->dirty.draw_variant = false;
6420 state->dirty.fragment_descriptors = false;
6421 state->dirty.gfx_pipeline_binding = false;
6422 state->dirty.isp_userpass = false;
6423 state->dirty.vertex_bindings = false;
6424 state->dirty.vis_test = false;
6425
6426 state->push_constants.dirty_stages &= ~VK_SHADER_STAGE_ALL_GRAPHICS;
6427
6428 return VK_SUCCESS;
6429 }
6430
6431 static uint32_t pvr_get_hw_primitive_topology(VkPrimitiveTopology topology)
6432 {
6433 switch (topology) {
6434 case VK_PRIMITIVE_TOPOLOGY_POINT_LIST:
6435 return ROGUE_VDMCTRL_PRIMITIVE_TOPOLOGY_POINT_LIST;
6436 case VK_PRIMITIVE_TOPOLOGY_LINE_LIST:
6437 return ROGUE_VDMCTRL_PRIMITIVE_TOPOLOGY_LINE_LIST;
6438 case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP:
6439 return ROGUE_VDMCTRL_PRIMITIVE_TOPOLOGY_LINE_STRIP;
6440 case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST:
6441 return ROGUE_VDMCTRL_PRIMITIVE_TOPOLOGY_TRI_LIST;
6442 case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP:
6443 return ROGUE_VDMCTRL_PRIMITIVE_TOPOLOGY_TRI_STRIP;
6444 case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_FAN:
6445 return ROGUE_VDMCTRL_PRIMITIVE_TOPOLOGY_TRI_FAN;
6446 case VK_PRIMITIVE_TOPOLOGY_LINE_LIST_WITH_ADJACENCY:
6447 return ROGUE_VDMCTRL_PRIMITIVE_TOPOLOGY_LINE_LIST_ADJ;
6448 case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP_WITH_ADJACENCY:
6449 return ROGUE_VDMCTRL_PRIMITIVE_TOPOLOGY_LINE_STRIP_ADJ;
6450 case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST_WITH_ADJACENCY:
6451 return ROGUE_VDMCTRL_PRIMITIVE_TOPOLOGY_TRI_LIST_ADJ;
6452 case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP_WITH_ADJACENCY:
6453 return ROGUE_VDMCTRL_PRIMITIVE_TOPOLOGY_TRI_STRIP_ADJ;
6454 case VK_PRIMITIVE_TOPOLOGY_PATCH_LIST:
6455 return ROGUE_VDMCTRL_PRIMITIVE_TOPOLOGY_PATCH_LIST;
6456 default:
6457 unreachable("Undefined primitive topology");
6458 }
6459 }
6460
6461 /* TODO: Rewrite this in terms of ALIGN_POT() and pvr_cmd_length(). */
6462 /* Aligned to 128 bit for PDS loads / stores */
6463 #define DUMMY_VDM_CONTROL_STREAM_BLOCK_SIZE 8
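/* Each indirect draw gets a small "dummy" VDM control stream block of this
 * size. The host pre-fills the static words and a STREAM_RETURN, the per-draw
 * PDS program is expected to patch in the draw-dependent words, and the main
 * stream branches to the block via VDMCTRL_STREAM_LINK0/1.
 */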
6464
6465 static VkResult
6466 pvr_write_draw_indirect_vdm_stream(struct pvr_cmd_buffer *cmd_buffer,
6467 struct pvr_csb *const csb,
6468 pvr_dev_addr_t idx_buffer_addr,
6469 uint32_t idx_stride,
6470 struct ROGUE_VDMCTRL_INDEX_LIST0 *list_hdr,
6471 struct pvr_buffer *buffer,
6472 VkDeviceSize offset,
6473 uint32_t count,
6474 uint32_t stride)
6475 {
6476 struct pvr_pds_drawindirect_program pds_prog = { 0 };
6477 uint32_t word0;
6478
6479 /* Draw indirect always has index offset and instance count. */
6480 list_hdr->index_offset_present = true;
6481 list_hdr->index_instance_count_present = true;
6482
6483 pvr_cmd_pack(VDMCTRL_INDEX_LIST0)(&word0, list_hdr);
6484
6485 pds_prog.support_base_instance = true;
6486 pds_prog.arg_buffer = buffer->dev_addr.addr + offset;
6487 pds_prog.index_buffer = idx_buffer_addr.addr;
6488 pds_prog.index_block_header = word0;
6489 pds_prog.index_stride = idx_stride;
6490 pds_prog.num_views = 1U;
6491
6492 /* TODO: See if we can pre-upload the code section of all the pds programs
6493 * and reuse them here.
6494 */
6495 /* Generate and upload the PDS programs (code + data). */
6496 for (uint32_t i = 0U; i < count; i++) {
6497 const struct pvr_device_info *dev_info =
6498 &cmd_buffer->device->pdevice->dev_info;
6499 struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
6500 struct pvr_suballoc_bo *dummy_bo;
6501 struct pvr_suballoc_bo *pds_bo;
6502 uint32_t *dummy_stream;
6503 uint32_t *pds_base;
6504 uint32_t pds_size;
6505 VkResult result;
6506
6507 /* TODO: Move this outside the loop and allocate all of them in one go? */
6508 result = pvr_cmd_buffer_alloc_mem(cmd_buffer,
6509 cmd_buffer->device->heaps.general_heap,
6510 DUMMY_VDM_CONTROL_STREAM_BLOCK_SIZE,
6511 &dummy_bo);
6512 if (result != VK_SUCCESS)
6513 return result;
6514
6515 pds_prog.increment_draw_id = (i != 0);
6516 pds_prog.index_list_addr_buffer = dummy_bo->dev_addr.addr;
6517
6518 if (state->draw_state.draw_indexed) {
6519 pvr_pds_generate_draw_elements_indirect(&pds_prog,
6520 0,
6521 PDS_GENERATE_SIZES,
6522 dev_info);
6523 } else {
6524 pvr_pds_generate_draw_arrays_indirect(&pds_prog,
6525 0,
6526 PDS_GENERATE_SIZES,
6527 dev_info);
6528 }
6529
6530 pds_size = PVR_DW_TO_BYTES(pds_prog.program.data_size_aligned +
6531 pds_prog.program.code_size_aligned);
6532
6533 result = pvr_cmd_buffer_alloc_mem(cmd_buffer,
6534 cmd_buffer->device->heaps.pds_heap,
6535 pds_size,
6536 &pds_bo);
6537 if (result != VK_SUCCESS)
6538 return result;
6539
6540 pds_base = pvr_bo_suballoc_get_map_addr(pds_bo);
6541 memcpy(pds_base,
6542 pds_prog.program.code,
6543 PVR_DW_TO_BYTES(pds_prog.program.code_size_aligned));
6544
6545 if (state->draw_state.draw_indexed) {
6546 pvr_pds_generate_draw_elements_indirect(
6547 &pds_prog,
6548 pds_base + pds_prog.program.code_size_aligned,
6549 PDS_GENERATE_DATA_SEGMENT,
6550 dev_info);
6551 } else {
6552 pvr_pds_generate_draw_arrays_indirect(
6553 &pds_prog,
6554 pds_base + pds_prog.program.code_size_aligned,
6555 PDS_GENERATE_DATA_SEGMENT,
6556 dev_info);
6557 }
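      /* Layout within pds_bo: the code segment is copied to the start and the
       * data segment is generated immediately after it. The PDS_STATE1/STATE2
       * address calculations below rely on this code-then-data ordering.
       */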
6558
6559 pvr_csb_set_relocation_mark(csb);
6560
6561 pvr_csb_emit (csb, VDMCTRL_PDS_STATE0, state0) {
6562 state0.usc_target = ROGUE_VDMCTRL_USC_TARGET_ANY;
6563
6564 state0.pds_temp_size =
6565 DIV_ROUND_UP(PVR_DW_TO_BYTES(pds_prog.program.temp_size_aligned),
6566 ROGUE_VDMCTRL_PDS_STATE0_PDS_TEMP_SIZE_UNIT_SIZE);
6567
6568 state0.pds_data_size =
6569 DIV_ROUND_UP(PVR_DW_TO_BYTES(pds_prog.program.data_size_aligned),
6570 ROGUE_VDMCTRL_PDS_STATE0_PDS_DATA_SIZE_UNIT_SIZE);
6571 }
6572
6573 pvr_csb_emit (csb, VDMCTRL_PDS_STATE1, state1) {
6574 const uint32_t data_offset =
6575 pds_bo->dev_addr.addr +
6576 PVR_DW_TO_BYTES(pds_prog.program.code_size_aligned) -
6577 cmd_buffer->device->heaps.pds_heap->base_addr.addr;
6578
6579 state1.pds_data_addr = PVR_DEV_ADDR(data_offset);
6580 state1.sd_type = ROGUE_VDMCTRL_SD_TYPE_PDS;
6581 state1.sd_next_type = ROGUE_VDMCTRL_SD_TYPE_NONE;
6582 }
6583
6584 pvr_csb_emit (csb, VDMCTRL_PDS_STATE2, state2) {
6585 const uint32_t code_offset =
6586 pds_bo->dev_addr.addr -
6587 cmd_buffer->device->heaps.pds_heap->base_addr.addr;
6588
6589 state2.pds_code_addr = PVR_DEV_ADDR(code_offset);
6590 }
6591
6592 pvr_csb_clear_relocation_mark(csb);
6593
6594 /* We don't really need to set the relocation mark since the following
6595 * state update is just one emit but let's be nice and use it.
6596 */
6597 pvr_csb_set_relocation_mark(csb);
6598
6599 /* Sync task to ensure the VDM doesn't start reading the dummy blocks
6600 * before they are ready.
6601 */
6602 pvr_csb_emit (csb, VDMCTRL_INDEX_LIST0, list0) {
6603 list0.primitive_topology = ROGUE_VDMCTRL_PRIMITIVE_TOPOLOGY_TRI_LIST;
6604 }
6605
6606 pvr_csb_clear_relocation_mark(csb);
6607
6608 dummy_stream = pvr_bo_suballoc_get_map_addr(dummy_bo);
6609
6610       /* For non-indexed draws fill in the dummy block's header here, as it
6611        * won't change based on the indirect args; for indexed draws the PDS
6612        * program is expected to write it (it receives word0 via
6613        * index_block_header). Then skip past the block's in-use size. */
6614 if (!state->draw_state.draw_indexed) {
6615 dummy_stream[0] = word0;
6616 dummy_stream += 4;
6617 } else {
6618 dummy_stream += 5;
6619 }
6620
6621 /* clang-format off */
6622 pvr_csb_pack (dummy_stream, VDMCTRL_STREAM_RETURN, word);
6623 /* clang-format on */
6624
6625 pvr_csb_set_relocation_mark(csb);
6626
6627 /* Stream link to the first dummy which forces the VDM to discard any
6628 * prefetched (dummy) control stream.
6629 */
6630 pvr_csb_emit (csb, VDMCTRL_STREAM_LINK0, link) {
6631 link.with_return = true;
6632 link.link_addrmsb = dummy_bo->dev_addr;
6633 }
6634
6635 pvr_csb_emit (csb, VDMCTRL_STREAM_LINK1, link) {
6636 link.link_addrlsb = dummy_bo->dev_addr;
6637 }
6638
6639 pvr_csb_clear_relocation_mark(csb);
6640
6641 /* Point the pds program to the next argument buffer and the next VDM
6642 * dummy buffer.
6643 */
6644 pds_prog.arg_buffer += stride;
6645 }
6646
6647 return VK_SUCCESS;
6648 }
6649
6650 #undef DUMMY_VDM_CONTROL_STREAM_BLOCK_SIZE
6651
6652 static void pvr_emit_vdm_index_list(struct pvr_cmd_buffer *cmd_buffer,
6653 struct pvr_sub_cmd_gfx *const sub_cmd,
6654 VkPrimitiveTopology topology,
6655 uint32_t index_offset,
6656 uint32_t first_index,
6657 uint32_t index_count,
6658 uint32_t instance_count,
6659 struct pvr_buffer *buffer,
6660 VkDeviceSize offset,
6661 uint32_t count,
6662 uint32_t stride)
6663 {
6664 struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
6665
6666 const pco_data *const vs_data = &state->gfx_pipeline->vs_data;
6667 struct ROGUE_VDMCTRL_INDEX_LIST0 list_hdr = { pvr_cmd_header(
6668 VDMCTRL_INDEX_LIST0) };
6669 pvr_dev_addr_t index_buffer_addr = PVR_DEV_ADDR_INVALID;
6670 struct pvr_csb *const csb = &sub_cmd->control_stream;
6671 unsigned int index_stride = 0;
6672
6673 list_hdr.primitive_topology = pvr_get_hw_primitive_topology(topology);
6674
6675 /* firstInstance is not handled here in the VDM state, it's implemented as
6676 * an addition in the PDS vertex fetch using
6677 * PVR_PDS_CONST_MAP_ENTRY_TYPE_BASE_INSTANCE entry type.
6678 */
6679
6680 list_hdr.index_count_present = true;
6681
6682 if (instance_count > 1)
6683 list_hdr.index_instance_count_present = true;
6684
6685 if (index_offset)
6686 list_hdr.index_offset_present = true;
6687
6688 if (state->draw_state.draw_indexed) {
6689 list_hdr.index_size =
6690 pvr_vdmctrl_index_size_from_type(state->index_buffer_binding.type);
6691 index_stride = vk_index_type_to_bytes(state->index_buffer_binding.type);
6692
6693 index_buffer_addr = PVR_DEV_ADDR_OFFSET(
6694 state->index_buffer_binding.buffer->dev_addr,
6695 state->index_buffer_binding.offset + first_index * index_stride);
6696
6697 list_hdr.index_addr_present = true;
6698 list_hdr.index_base_addrmsb = index_buffer_addr;
6699 }
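   /* The index buffer address is split across the header words: the MSBs live
    * in INDEX_LIST0 (set above), and the LSBs go in INDEX_LIST1, which is only
    * emitted below when index_addr_present is set.
    */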
6700
6701 list_hdr.degen_cull_enable =
6702 PVR_HAS_FEATURE(&cmd_buffer->device->pdevice->dev_info,
6703 vdm_degenerate_culling) &&
6704 !vs_data->common.uses.side_effects;
6705
6706 if (state->draw_state.draw_indirect) {
6707 assert(buffer);
6708 pvr_write_draw_indirect_vdm_stream(cmd_buffer,
6709 csb,
6710 index_buffer_addr,
6711 index_stride,
6712 &list_hdr,
6713 buffer,
6714 offset,
6715 count,
6716 stride);
6717 return;
6718 }
6719
6720 pvr_csb_set_relocation_mark(csb);
6721
6722 pvr_csb_emit (csb, VDMCTRL_INDEX_LIST0, list0) {
6723 list0 = list_hdr;
6724 }
6725
6726 if (list_hdr.index_addr_present) {
6727 pvr_csb_emit (csb, VDMCTRL_INDEX_LIST1, list1) {
6728 list1.index_base_addrlsb = index_buffer_addr;
6729 }
6730 }
6731
6732 if (list_hdr.index_count_present) {
6733 pvr_csb_emit (csb, VDMCTRL_INDEX_LIST2, list2) {
6734 list2.index_count = index_count;
6735 }
6736 }
6737
6738 if (list_hdr.index_instance_count_present) {
6739 pvr_csb_emit (csb, VDMCTRL_INDEX_LIST3, list3) {
6740 list3.instance_count = instance_count - 1;
6741 }
6742 }
6743
6744 if (list_hdr.index_offset_present) {
6745 pvr_csb_emit (csb, VDMCTRL_INDEX_LIST4, list4) {
6746 list4.index_offset = index_offset;
6747 }
6748 }
6749
6750 pvr_csb_clear_relocation_mark(csb);
6751 }
6752
6753 void pvr_CmdDraw(VkCommandBuffer commandBuffer,
6754 uint32_t vertexCount,
6755 uint32_t instanceCount,
6756 uint32_t firstVertex,
6757 uint32_t firstInstance)
6758 {
6759 const struct pvr_cmd_buffer_draw_state draw_state = {
6760 .base_vertex = firstVertex,
6761 .base_instance = firstInstance,
6762 };
6763
6764 PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
6765 struct vk_dynamic_graphics_state *const dynamic_state =
6766 &cmd_buffer->vk.dynamic_graphics_state;
6767 struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
6768 VkResult result;
6769
6770 PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer);
6771
6772 pvr_update_draw_state(state, &draw_state);
6773
6774 result = pvr_validate_draw_state(cmd_buffer);
6775 if (result != VK_SUCCESS)
6776 return;
6777
6778 /* Write the VDM control stream for the primitive. */
6779 pvr_emit_vdm_index_list(cmd_buffer,
6780 &state->current_sub_cmd->gfx,
6781 dynamic_state->ia.primitive_topology,
6782 firstVertex,
6783 0U,
6784 vertexCount,
6785 instanceCount,
6786 NULL,
6787 0U,
6788 0U,
6789 0U);
6790 }
6791
6792 void pvr_CmdDrawIndexed(VkCommandBuffer commandBuffer,
6793 uint32_t indexCount,
6794 uint32_t instanceCount,
6795 uint32_t firstIndex,
6796 int32_t vertexOffset,
6797 uint32_t firstInstance)
6798 {
6799 const struct pvr_cmd_buffer_draw_state draw_state = {
6800 .base_vertex = vertexOffset,
6801 .base_instance = firstInstance,
6802 .draw_indexed = true,
6803 };
6804
6805 PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
6806 struct vk_dynamic_graphics_state *const dynamic_state =
6807 &cmd_buffer->vk.dynamic_graphics_state;
6808 struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
6809 VkResult result;
6810
6811 PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer);
6812
6813 pvr_update_draw_state(state, &draw_state);
6814
6815 result = pvr_validate_draw_state(cmd_buffer);
6816 if (result != VK_SUCCESS)
6817 return;
6818
6819 /* Write the VDM control stream for the primitive. */
6820 pvr_emit_vdm_index_list(cmd_buffer,
6821 &state->current_sub_cmd->gfx,
6822 dynamic_state->ia.primitive_topology,
6823 vertexOffset,
6824 firstIndex,
6825 indexCount,
6826 instanceCount,
6827 NULL,
6828 0U,
6829 0U,
6830 0U);
6831 }
6832
6833 void pvr_CmdDrawIndexedIndirect(VkCommandBuffer commandBuffer,
6834 VkBuffer _buffer,
6835 VkDeviceSize offset,
6836 uint32_t drawCount,
6837 uint32_t stride)
6838 {
6839 const struct pvr_cmd_buffer_draw_state draw_state = {
6840 .draw_indirect = true,
6841 .draw_indexed = true,
6842 };
6843
6844 PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
6845 struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
6846 struct vk_dynamic_graphics_state *const dynamic_state =
6847 &cmd_buffer->vk.dynamic_graphics_state;
6848 PVR_FROM_HANDLE(pvr_buffer, buffer, _buffer);
6849 VkResult result;
6850
6851 PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer);
6852
6853 pvr_update_draw_state(state, &draw_state);
6854
6855 result = pvr_validate_draw_state(cmd_buffer);
6856 if (result != VK_SUCCESS)
6857 return;
6858
6859 /* Write the VDM control stream for the primitive. */
6860 pvr_emit_vdm_index_list(cmd_buffer,
6861 &state->current_sub_cmd->gfx,
6862 dynamic_state->ia.primitive_topology,
6863 0U,
6864 0U,
6865 0U,
6866 0U,
6867 buffer,
6868 offset,
6869 drawCount,
6870 stride);
6871 }
6872
6873 void pvr_CmdDrawIndirect(VkCommandBuffer commandBuffer,
6874 VkBuffer _buffer,
6875 VkDeviceSize offset,
6876 uint32_t drawCount,
6877 uint32_t stride)
6878 {
6879 const struct pvr_cmd_buffer_draw_state draw_state = {
6880 .draw_indirect = true,
6881 };
6882
6883 PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
6884 struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
6885 PVR_FROM_HANDLE(pvr_buffer, buffer, _buffer);
6886 struct vk_dynamic_graphics_state *const dynamic_state =
6887 &cmd_buffer->vk.dynamic_graphics_state;
6888 VkResult result;
6889
6890 PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer);
6891
6892 pvr_update_draw_state(state, &draw_state);
6893
6894 result = pvr_validate_draw_state(cmd_buffer);
6895 if (result != VK_SUCCESS)
6896 return;
6897
6898 /* Write the VDM control stream for the primitive. */
6899 pvr_emit_vdm_index_list(cmd_buffer,
6900 &state->current_sub_cmd->gfx,
6901 dynamic_state->ia.primitive_topology,
6902 0U,
6903 0U,
6904 0U,
6905 0U,
6906 buffer,
6907 offset,
6908 drawCount,
6909 stride);
6910 }
6911
6912 static VkResult
6913 pvr_resolve_unemitted_resolve_attachments(struct pvr_cmd_buffer *cmd_buffer,
6914 struct pvr_render_pass_info *info)
6915 {
6916 struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
6917 const struct pvr_renderpass_hwsetup_render *hw_render =
6918 &state->render_pass_info.pass->hw_setup->renders[info->current_hw_subpass];
6919
6920 for (uint32_t i = 0U; i < hw_render->eot_surface_count; i++) {
6921 const struct pvr_renderpass_hwsetup_eot_surface *surface =
6922 &hw_render->eot_surfaces[i];
6923 const uint32_t color_attach_idx = surface->src_attachment_idx;
6924 const uint32_t resolve_attach_idx = surface->attachment_idx;
6925 VkImageSubresourceLayers src_subresource;
6926 VkImageSubresourceLayers dst_subresource;
6927 struct pvr_image_view *dst_view;
6928 struct pvr_image_view *src_view;
6929 VkFormat src_format;
6930 VkFormat dst_format;
6931 VkImageCopy2 region;
6932 VkResult result;
6933
6934 if (!surface->need_resolve ||
6935 surface->resolve_type != PVR_RESOLVE_TYPE_TRANSFER)
6936 continue;
6937
6938 dst_view = info->attachments[resolve_attach_idx];
6939 src_view = info->attachments[color_attach_idx];
6940
6941 src_subresource.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT;
6942 src_subresource.mipLevel = src_view->vk.base_mip_level;
6943 src_subresource.baseArrayLayer = src_view->vk.base_array_layer;
6944 src_subresource.layerCount = src_view->vk.layer_count;
6945
6946 dst_subresource.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT;
6947 dst_subresource.mipLevel = dst_view->vk.base_mip_level;
6948 dst_subresource.baseArrayLayer = dst_view->vk.base_array_layer;
6949 dst_subresource.layerCount = dst_view->vk.layer_count;
6950
6951 region.srcOffset = (VkOffset3D){ info->render_area.offset.x,
6952 info->render_area.offset.y,
6953 0 };
6954 region.dstOffset = (VkOffset3D){ info->render_area.offset.x,
6955 info->render_area.offset.y,
6956 0 };
6957 region.extent = (VkExtent3D){ info->render_area.extent.width,
6958 info->render_area.extent.height,
6959 1 };
6960
6961 region.srcSubresource = src_subresource;
6962 region.dstSubresource = dst_subresource;
6963
6964 /* TODO: if ERN_46863 is supported, Depth and stencil are sampled
6965 * separately from images with combined depth+stencil. Add logic here to
6966 * handle it using appropriate format from image view.
6967 */
6968 src_format = src_view->vk.image->format;
6969 dst_format = dst_view->vk.image->format;
6970 src_view->vk.image->format = src_view->vk.format;
6971 dst_view->vk.image->format = dst_view->vk.format;
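      /* Temporarily override the image formats with the view formats so the
       * copy/resolve below operates on the formats the render pass actually
       * used; the original formats are restored afterwards.
       */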
6972
6973 result = pvr_copy_or_resolve_color_image_region(
6974 cmd_buffer,
6975 vk_to_pvr_image(src_view->vk.image),
6976 vk_to_pvr_image(dst_view->vk.image),
6977       &region);
6978
6979 src_view->vk.image->format = src_format;
6980 dst_view->vk.image->format = dst_format;
6981
6982 state->current_sub_cmd->transfer.serialize_with_frag = true;
6983
6984 if (result != VK_SUCCESS)
6985 return result;
6986 }
6987
6988 return pvr_cmd_buffer_end_sub_cmd(cmd_buffer);
6989 }
6990
6991 void pvr_CmdEndRenderPass2(VkCommandBuffer commandBuffer,
6992 const VkSubpassEndInfo *pSubpassEndInfo)
6993 {
6994 PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
6995 struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
6996 struct pvr_image_view **attachments;
6997 VkClearValue *clear_values;
6998 VkResult result;
6999
7000 PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer);
7001
7002 assert(state->render_pass_info.pass);
7003 assert(state->render_pass_info.framebuffer);
7004
7005 result = pvr_cmd_buffer_end_sub_cmd(cmd_buffer);
7006 if (result != VK_SUCCESS)
7007 return;
7008
7009 result = pvr_resolve_unemitted_resolve_attachments(cmd_buffer,
7010 &state->render_pass_info);
7011 if (result != VK_SUCCESS)
7012 return;
7013
7014 /* Save the required fields before clearing render_pass_info struct. */
7015 attachments = state->render_pass_info.attachments;
7016 clear_values = state->render_pass_info.clear_values;
7017
7018 memset(&state->render_pass_info, 0, sizeof(state->render_pass_info));
7019
7020 state->render_pass_info.attachments = attachments;
7021 state->render_pass_info.clear_values = clear_values;
7022 }
7023
7024 static VkResult
7025 pvr_execute_deferred_cmd_buffer(struct pvr_cmd_buffer *cmd_buffer,
7026 const struct pvr_cmd_buffer *sec_cmd_buffer)
7027 {
7028 struct vk_dynamic_graphics_state *const dynamic_state =
7029 &cmd_buffer->vk.dynamic_graphics_state;
7030 const uint32_t prim_db_elems =
7031 util_dynarray_num_elements(&cmd_buffer->depth_bias_array,
7032 struct pvr_depth_bias_state);
7033 const uint32_t prim_scissor_elems =
7034 util_dynarray_num_elements(&cmd_buffer->scissor_array,
7035 struct pvr_scissor_words);
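   /* The secondary's depth bias and scissor indices are relative to its own
    * arrays, so they are rebased onto the primary's current element counts
    * here; the secondary's arrays are appended to the primary's at the end of
    * this function.
    */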
7036
7037 util_dynarray_foreach (&sec_cmd_buffer->deferred_csb_commands,
7038 struct pvr_deferred_cs_command,
7039 cmd) {
7040 switch (cmd->type) {
7041 case PVR_DEFERRED_CS_COMMAND_TYPE_DBSC: {
7042 const uint32_t scissor_idx =
7043 prim_scissor_elems + cmd->dbsc.state.scissor_index;
7044 const uint32_t db_idx =
7045 prim_db_elems + cmd->dbsc.state.depthbias_index;
7046 const uint32_t num_dwords =
7047 pvr_cmd_length(TA_STATE_HEADER) + pvr_cmd_length(TA_STATE_ISPDBSC);
7048 struct pvr_suballoc_bo *suballoc_bo;
7049 uint32_t ppp_state[num_dwords];
7050 VkResult result;
7051
7052 pvr_csb_pack (&ppp_state[0], TA_STATE_HEADER, header) {
7053 header.pres_ispctl_dbsc = true;
7054 }
7055
7056 pvr_csb_pack (&ppp_state[1], TA_STATE_ISPDBSC, ispdbsc) {
7057 ispdbsc.dbindex = db_idx;
7058 ispdbsc.scindex = scissor_idx;
7059 }
7060
7061 result = pvr_cmd_buffer_upload_general(cmd_buffer,
7062 &ppp_state[0],
7063 sizeof(ppp_state),
7064 &suballoc_bo);
7065 if (result != VK_SUCCESS)
7066 return result;
7067
7068 pvr_csb_pack (&cmd->dbsc.vdm_state[0], VDMCTRL_PPP_STATE0, state) {
7069 state.word_count = num_dwords;
7070 state.addrmsb = suballoc_bo->dev_addr;
7071 }
7072
7073 pvr_csb_pack (&cmd->dbsc.vdm_state[1], VDMCTRL_PPP_STATE1, state) {
7074 state.addrlsb = suballoc_bo->dev_addr;
7075 }
7076
7077 break;
7078 }
7079
7080 case PVR_DEFERRED_CS_COMMAND_TYPE_DBSC2: {
7081 const uint32_t scissor_idx =
7082 prim_scissor_elems + cmd->dbsc2.state.scissor_index;
7083 const uint32_t db_idx =
7084 prim_db_elems + cmd->dbsc2.state.depthbias_index;
7085
7086 uint32_t *const addr =
7087 (uint32_t *)pvr_bo_suballoc_get_map_addr(cmd->dbsc2.ppp_cs_bo) +
7088 cmd->dbsc2.patch_offset;
7089
7090 assert(pvr_bo_suballoc_get_map_addr(cmd->dbsc2.ppp_cs_bo));
7091
7092 pvr_csb_pack (addr, TA_STATE_ISPDBSC, ispdbsc) {
7093 ispdbsc.dbindex = db_idx;
7094 ispdbsc.scindex = scissor_idx;
7095 }
7096
7097 break;
7098 }
7099
7100 default:
7101 unreachable("Invalid deferred control stream command type.");
7102 break;
7103 }
7104 }
7105
7106 util_dynarray_append_dynarray(&cmd_buffer->depth_bias_array,
7107 &sec_cmd_buffer->depth_bias_array);
7108
7109 util_dynarray_append_dynarray(&cmd_buffer->scissor_array,
7110 &sec_cmd_buffer->scissor_array);
7111
7112 BITSET_SET(dynamic_state->dirty, MESA_VK_DYNAMIC_RS_DEPTH_BIAS_FACTORS);
7113 cmd_buffer->scissor_words = (struct pvr_scissor_words){ 0 };
7114
7115 return VK_SUCCESS;
7116 }
7117
7118 /* Caller needs to make sure that it ends the current sub_cmd. This function
7119 * only creates a copy of sec_sub_cmd and links it to the cmd_buffer's
7120 * sub_cmd list.
7121 */
7122 static VkResult pvr_execute_sub_cmd(struct pvr_cmd_buffer *cmd_buffer,
7123 struct pvr_sub_cmd *sec_sub_cmd)
7124 {
7125 struct pvr_sub_cmd *primary_sub_cmd =
7126 vk_zalloc(&cmd_buffer->vk.pool->alloc,
7127 sizeof(*primary_sub_cmd),
7128 8,
7129 VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
7130 if (!primary_sub_cmd) {
7131 return vk_command_buffer_set_error(&cmd_buffer->vk,
7132 VK_ERROR_OUT_OF_HOST_MEMORY);
7133 }
7134
7135 primary_sub_cmd->type = sec_sub_cmd->type;
7136 primary_sub_cmd->owned = false;
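   /* Mark the copy as not owned, presumably so that freeing the primary's
    * sub-command list does not destroy resources that still belong to the
    * secondary command buffer.
    */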
7137
7138 list_addtail(&primary_sub_cmd->link, &cmd_buffer->sub_cmds);
7139
7140 switch (sec_sub_cmd->type) {
7141 case PVR_SUB_CMD_TYPE_GRAPHICS:
7142 primary_sub_cmd->gfx = sec_sub_cmd->gfx;
7143 break;
7144
7145 case PVR_SUB_CMD_TYPE_OCCLUSION_QUERY:
7146 case PVR_SUB_CMD_TYPE_COMPUTE:
7147 primary_sub_cmd->compute = sec_sub_cmd->compute;
7148 break;
7149
7150 case PVR_SUB_CMD_TYPE_TRANSFER:
7151 primary_sub_cmd->transfer = sec_sub_cmd->transfer;
7152 break;
7153
7154 case PVR_SUB_CMD_TYPE_EVENT:
7155 primary_sub_cmd->event = sec_sub_cmd->event;
7156 break;
7157
7158 default:
7159 unreachable("Unsupported sub-command type");
7160 }
7161
7162 return VK_SUCCESS;
7163 }
7164
7165 static VkResult
7166 pvr_execute_graphics_cmd_buffer(struct pvr_cmd_buffer *cmd_buffer,
7167 const struct pvr_cmd_buffer *sec_cmd_buffer)
7168 {
7169 const struct pvr_device_info *dev_info =
7170 &cmd_buffer->device->pdevice->dev_info;
7171 struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
7172 struct pvr_sub_cmd *primary_sub_cmd = state->current_sub_cmd;
7173 struct pvr_sub_cmd *first_sec_cmd;
7174 VkResult result;
7175
7176 /* Inherited queries are not supported. */
7177 assert(!state->vis_test_enabled);
7178
7179 if (list_is_empty(&sec_cmd_buffer->sub_cmds))
7180 return VK_SUCCESS;
7181
7182 first_sec_cmd =
7183 list_first_entry(&sec_cmd_buffer->sub_cmds, struct pvr_sub_cmd, link);
7184
7185    /* Kick a render if the query pool has changed, i.e. we have a new base address. */
7186 if (primary_sub_cmd->gfx.query_pool && first_sec_cmd->gfx.query_pool &&
7187 primary_sub_cmd->gfx.query_pool != first_sec_cmd->gfx.query_pool) {
7188 state->current_sub_cmd->gfx.barrier_store = true;
7189
7190 result = pvr_cmd_buffer_end_sub_cmd(cmd_buffer);
7191 if (result != VK_SUCCESS)
7192 return result;
7193
7194 result =
7195 pvr_cmd_buffer_start_sub_cmd(cmd_buffer, PVR_SUB_CMD_TYPE_GRAPHICS);
7196 if (result != VK_SUCCESS)
7197 return result;
7198
7199 primary_sub_cmd = state->current_sub_cmd;
7200
7201 /* Use existing render setup, but load color attachments from HW
7202 * Background object.
7203 */
7204 primary_sub_cmd->gfx.barrier_load = true;
7205 primary_sub_cmd->gfx.barrier_store = false;
7206 }
7207
7208 list_for_each_entry (struct pvr_sub_cmd,
7209 sec_sub_cmd,
7210 &sec_cmd_buffer->sub_cmds,
7211 link) {
7212 /* Only graphics secondary execution supported within a renderpass. */
7213 assert(sec_sub_cmd->type == PVR_SUB_CMD_TYPE_GRAPHICS);
7214
7215 if (!sec_sub_cmd->gfx.empty_cmd)
7216 primary_sub_cmd->gfx.empty_cmd = false;
7217
7218 if (sec_sub_cmd->gfx.query_pool) {
7219 primary_sub_cmd->gfx.query_pool = sec_sub_cmd->gfx.query_pool;
7220
7221 util_dynarray_append_dynarray(&state->query_indices,
7222 &sec_sub_cmd->gfx.sec_query_indices);
7223 }
7224
7225 if (pvr_cmd_uses_deferred_cs_cmds(sec_cmd_buffer)) {
7226       /* TODO: If the secondary command buffer was created with
7227        * VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT, we patch the stream
7228        * and copy it to the primary stream using pvr_csb_copy() below.
7229        * This will need locking if the same secondary command buffer is
7230        * executed in multiple primary buffers at the same time.
7231        */
7232 result = pvr_execute_deferred_cmd_buffer(cmd_buffer, sec_cmd_buffer);
7233 if (result != VK_SUCCESS)
7234 return result;
7235
7236 result = pvr_csb_copy(&primary_sub_cmd->gfx.control_stream,
7237 &sec_sub_cmd->gfx.control_stream);
7238 if (result != VK_SUCCESS)
7239 return pvr_cmd_buffer_set_error_unwarned(cmd_buffer, result);
7240 } else {
7241 result = pvr_execute_deferred_cmd_buffer(cmd_buffer, sec_cmd_buffer);
7242 if (result != VK_SUCCESS)
7243 return result;
7244
7245 pvr_csb_emit_link(
7246 &primary_sub_cmd->gfx.control_stream,
7247 pvr_csb_get_start_address(&sec_sub_cmd->gfx.control_stream),
7248 true);
7249 }
7250
7251 if (PVR_HAS_FEATURE(&cmd_buffer->device->pdevice->dev_info,
7252 compute_overlap)) {
7253 primary_sub_cmd->gfx.job.disable_compute_overlap |=
7254 sec_sub_cmd->gfx.job.disable_compute_overlap;
7255 }
7256
7257 primary_sub_cmd->gfx.max_tiles_in_flight =
7258 MIN2(primary_sub_cmd->gfx.max_tiles_in_flight,
7259 sec_sub_cmd->gfx.max_tiles_in_flight);
7260
7261 /* Pass loaded depth/stencil usage from secondary command buffer. */
7262 if (sec_sub_cmd->gfx.depth_usage == PVR_DEPTH_STENCIL_USAGE_NEEDED)
7263 primary_sub_cmd->gfx.depth_usage = PVR_DEPTH_STENCIL_USAGE_NEEDED;
7264
7265 if (sec_sub_cmd->gfx.stencil_usage == PVR_DEPTH_STENCIL_USAGE_NEEDED)
7266 primary_sub_cmd->gfx.stencil_usage = PVR_DEPTH_STENCIL_USAGE_NEEDED;
7267
7268 /* Pass depth/stencil modification state from secondary command buffer. */
7269 if (sec_sub_cmd->gfx.modifies_depth)
7270 primary_sub_cmd->gfx.modifies_depth = true;
7271
7272 if (sec_sub_cmd->gfx.modifies_stencil)
7273 primary_sub_cmd->gfx.modifies_stencil = true;
7274
7275 if (sec_sub_cmd->gfx.barrier_store) {
7276 struct pvr_sub_cmd *sec_next =
7277 list_entry(sec_sub_cmd->link.next, struct pvr_sub_cmd, link);
7278
7279 /* This shouldn't be the last sub cmd. There should be a barrier load
7280 * subsequent to the barrier store.
7281 */
7282 assert(list_last_entry(&sec_cmd_buffer->sub_cmds,
7283 struct pvr_sub_cmd,
7284 link) != sec_sub_cmd);
7285
7286 /* Kick render to store stencil. */
7287 state->current_sub_cmd->gfx.barrier_store = true;
7288 state->current_sub_cmd->gfx.empty_cmd = false;
7289
7290 result = pvr_cmd_buffer_end_sub_cmd(cmd_buffer);
7291 if (result != VK_SUCCESS)
7292 return result;
7293
7294 result =
7295 pvr_cmd_buffer_start_sub_cmd(cmd_buffer, PVR_SUB_CMD_TYPE_GRAPHICS);
7296 if (result != VK_SUCCESS)
7297 return result;
7298
7299 primary_sub_cmd = state->current_sub_cmd;
7300
7301 /* Use existing render setup, but load color attachments from HW
7302 * Background object.
7303 */
7304 primary_sub_cmd->gfx.barrier_load = sec_next->gfx.barrier_load;
7305 primary_sub_cmd->gfx.barrier_store = sec_next->gfx.barrier_store;
7306 primary_sub_cmd->gfx.empty_cmd = false;
7307 }
7308
7309 if (!PVR_HAS_FEATURE(dev_info, gs_rta_support)) {
7310 util_dynarray_append_dynarray(&cmd_buffer->deferred_clears,
7311 &sec_cmd_buffer->deferred_clears);
7312 }
7313 }
7314
7315 return VK_SUCCESS;
7316 }
7317
7318 void pvr_CmdExecuteCommands(VkCommandBuffer commandBuffer,
7319 uint32_t commandBufferCount,
7320 const VkCommandBuffer *pCommandBuffers)
7321 {
7322 PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
7323 struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
7324 struct pvr_cmd_buffer *last_cmd_buffer;
7325 VkResult result;
7326
7327 PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer);
7328
7329 assert(cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_PRIMARY);
7330
7331 /* Reset the CPU copy of the most recent PPP state of the primary command
7332 * buffer.
7333 *
7334 * The next draw call in the primary after CmdExecuteCommands may send
7335 * redundant state, if it all goes in the same geom job.
7336 *
7337 * Can't just copy state from the secondary because the recording state of
7338 * the secondary command buffers would have been deleted at this point.
7339 */
7340 pvr_reset_graphics_dirty_state(cmd_buffer, false);
7341
7342 if (state->current_sub_cmd &&
7343 state->current_sub_cmd->type == PVR_SUB_CMD_TYPE_GRAPHICS) {
7344 for (uint32_t i = 0; i < commandBufferCount; i++) {
7345 PVR_FROM_HANDLE(pvr_cmd_buffer, sec_cmd_buffer, pCommandBuffers[i]);
7346
7347 assert(sec_cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY);
7348
7349 result = pvr_execute_graphics_cmd_buffer(cmd_buffer, sec_cmd_buffer);
7350 if (result != VK_SUCCESS)
7351 return;
7352 }
7353
7354 last_cmd_buffer =
7355 pvr_cmd_buffer_from_handle(pCommandBuffers[commandBufferCount - 1]);
7356
7357       /* Set barriers from the final secondary command buffer. */
7358 for (uint32_t i = 0; i != PVR_NUM_SYNC_PIPELINE_STAGES; i++) {
7359 state->barriers_needed[i] |=
7360 last_cmd_buffer->state.barriers_needed[i] &
7361 PVR_PIPELINE_STAGE_ALL_GRAPHICS_BITS;
7362 }
7363 } else {
7364 for (uint32_t i = 0; i < commandBufferCount; i++) {
7365 PVR_FROM_HANDLE(pvr_cmd_buffer, sec_cmd_buffer, pCommandBuffers[i]);
7366
7367 assert(sec_cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY);
7368
7369 result = pvr_cmd_buffer_end_sub_cmd(cmd_buffer);
7370 if (result != VK_SUCCESS)
7371 return;
7372
7373 list_for_each_entry_safe (struct pvr_sub_cmd,
7374 sec_sub_cmd,
7375 &sec_cmd_buffer->sub_cmds,
7376 link) {
7377 result = pvr_execute_sub_cmd(cmd_buffer, sec_sub_cmd);
7378 if (result != VK_SUCCESS)
7379 return;
7380 }
7381 }
7382
7383 last_cmd_buffer =
7384 pvr_cmd_buffer_from_handle(pCommandBuffers[commandBufferCount - 1]);
7385
7386 memcpy(state->barriers_needed,
7387 last_cmd_buffer->state.barriers_needed,
7388 sizeof(state->barriers_needed));
7389 }
7390 }
7391
7392 static void pvr_insert_transparent_obj(struct pvr_cmd_buffer *const cmd_buffer,
7393 struct pvr_sub_cmd_gfx *const sub_cmd)
7394 {
7395 struct pvr_device *const device = cmd_buffer->device;
7396 /* Yes we want a copy. The user could be recording multiple command buffers
7397 * in parallel so writing the template in place could cause problems.
7398 */
7399 struct pvr_static_clear_ppp_template clear =
7400 device->static_clear_state.ppp_templates[VK_IMAGE_ASPECT_COLOR_BIT];
7401 uint32_t pds_state[PVR_STATIC_CLEAR_PDS_STATE_COUNT] = { 0 };
7402 struct pvr_csb *csb = &sub_cmd->control_stream;
7403 struct pvr_suballoc_bo *ppp_bo;
7404
7405 assert(clear.requires_pds_state);
7406
7407 /* Patch the template. */
7408
7409 pvr_csb_pack (&pds_state[PVR_STATIC_CLEAR_PPP_PDS_TYPE_SHADERBASE],
7410 TA_STATE_PDS_SHADERBASE,
7411 shaderbase) {
7412 shaderbase.addr = PVR_DEV_ADDR(device->nop_program.pds.data_offset);
7413 }
7414
7415 clear.config.pds_state = &pds_state;
7416
7417 clear.config.ispctl.upass = cmd_buffer->state.render_pass_info.isp_userpass;
7418
7419 /* Emit PPP state from template. */
7420
7421 pvr_emit_ppp_from_template(csb, &clear, &ppp_bo);
7422 list_add(&ppp_bo->link, &cmd_buffer->bo_list);
7423
7424 /* Emit VDM state. */
7425
7426 pvr_emit_clear_words(cmd_buffer, sub_cmd);
7427
7428 /* Reset graphics state. */
7429 pvr_reset_graphics_dirty_state(cmd_buffer, false);
7430 }
7431
7432 static inline struct pvr_render_subpass *
7433 pvr_get_current_subpass(const struct pvr_cmd_buffer_state *const state)
7434 {
7435 const uint32_t subpass_idx = state->render_pass_info.subpass_idx;
7436
7437 return &state->render_pass_info.pass->subpasses[subpass_idx];
7438 }
7439
7440 void pvr_CmdNextSubpass2(VkCommandBuffer commandBuffer,
7441 const VkSubpassBeginInfo *pSubpassBeginInfo,
7442 const VkSubpassEndInfo *pSubpassEndInfo)
7443 {
7444 PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
7445 struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
7446 struct pvr_render_pass_info *rp_info = &state->render_pass_info;
7447 const struct pvr_renderpass_hwsetup_subpass *hw_subpass;
7448 struct pvr_renderpass_hwsetup_render *next_hw_render;
7449 const struct pvr_render_pass *pass = rp_info->pass;
7450 const struct pvr_renderpass_hw_map *current_map;
7451 const struct pvr_renderpass_hw_map *next_map;
7452 struct pvr_load_op *hw_subpass_load_op;
7453 VkResult result;
7454
7455 PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer);
7456
7457 current_map = &pass->hw_setup->subpass_map[rp_info->subpass_idx];
7458 next_map = &pass->hw_setup->subpass_map[rp_info->subpass_idx + 1];
7459 next_hw_render = &pass->hw_setup->renders[next_map->render];
7460
7461 if (current_map->render != next_map->render) {
7462 result = pvr_cmd_buffer_end_sub_cmd(cmd_buffer);
7463 if (result != VK_SUCCESS)
7464 return;
7465
7466 result = pvr_resolve_unemitted_resolve_attachments(cmd_buffer, rp_info);
7467 if (result != VK_SUCCESS)
7468 return;
7469
7470 rp_info->current_hw_subpass = next_map->render;
7471
7472 result =
7473 pvr_cmd_buffer_start_sub_cmd(cmd_buffer, PVR_SUB_CMD_TYPE_GRAPHICS);
7474 if (result != VK_SUCCESS)
7475 return;
7476
7477 rp_info->enable_bg_tag = false;
7478 rp_info->process_empty_tiles = false;
7479
7480 /* If this subpass contains any load ops the HW Background Object must be
7481 * run to do the clears/loads.
7482 */
7483 if (next_hw_render->color_init_count > 0) {
7484 rp_info->enable_bg_tag = true;
7485
7486 for (uint32_t i = 0; i < next_hw_render->color_init_count; i++) {
7487 /* Empty tiles need to be cleared too. */
7488 if (next_hw_render->color_init[i].op ==
7489 VK_ATTACHMENT_LOAD_OP_CLEAR) {
7490 rp_info->process_empty_tiles = true;
7491 break;
7492 }
7493 }
7494 }
7495
7496 /* Set isp_userpass to zero for new hw_render. This will be used to set
7497 * ROGUE_CR_ISP_CTL::upass_start.
7498 */
7499 rp_info->isp_userpass = 0;
7500 }
7501
7502 hw_subpass = &next_hw_render->subpasses[next_map->subpass];
7503 hw_subpass_load_op = hw_subpass->load_op;
7504
7505 if (hw_subpass_load_op) {
7506 result = pvr_cs_write_load_op(cmd_buffer,
7507 &state->current_sub_cmd->gfx,
7508 hw_subpass_load_op,
7509 rp_info->isp_userpass);
7510 }
7511
7512 /* Pipelines are created for a particular subpass so unbind but leave the
7513 * vertex and descriptor bindings intact as they are orthogonal to the
7514 * subpass.
7515 */
7516 state->gfx_pipeline = NULL;
7517
7518    /* User-pass spawn is 4 bits, so if the driver has to wrap it, it emits a
7519     * full-screen transparent object to flush all tags up to this point. The
7520     * user-pass spawn value then implicitly resets to 0 because
7521     * pvr_render_subpass::isp_userpass values are stored ANDed with
7522     * ROGUE_CR_ISP_CTL_UPASS_START_SIZE_MAX.
7523     */
7524 /* If hw_subpass_load_op is valid then pvr_write_load_op_control_stream
7525 * has already done a full-screen transparent object.
7526 */
7527 if (rp_info->isp_userpass == ROGUE_CR_ISP_CTL_UPASS_START_SIZE_MAX &&
7528 !hw_subpass_load_op) {
7529 pvr_insert_transparent_obj(cmd_buffer, &state->current_sub_cmd->gfx);
7530 }
7531
7532 rp_info->subpass_idx++;
7533
7534 rp_info->isp_userpass = pass->subpasses[rp_info->subpass_idx].isp_userpass;
7535 state->dirty.isp_userpass = true;
7536
7537 rp_info->pipeline_bind_point =
7538 pass->subpasses[rp_info->subpass_idx].pipeline_bind_point;
7539
7540 pvr_stash_depth_format(state, &state->current_sub_cmd->gfx);
7541 }
7542
7543 static bool
7544 pvr_stencil_has_self_dependency(const struct pvr_cmd_buffer_state *const state)
7545 {
7546 const struct pvr_render_subpass *const current_subpass =
7547 pvr_get_current_subpass(state);
7548 const uint32_t *const input_attachments = current_subpass->input_attachments;
7549
7550 if (current_subpass->depth_stencil_attachment == VK_ATTACHMENT_UNUSED)
7551 return false;
7552
7553 /* We only need to check the current software subpass as we don't support
7554 * merging to/from a subpass with self-dep stencil.
7555 */
7556
7557 for (uint32_t i = 0; i < current_subpass->input_count; i++) {
7558 if (input_attachments[i] == current_subpass->depth_stencil_attachment)
7559 return true;
7560 }
7561
7562 return false;
7563 }
7564
7565 static bool pvr_is_stencil_store_load_needed(
7566 const struct pvr_cmd_buffer *const cmd_buffer,
7567 VkPipelineStageFlags2 vk_src_stage_mask,
7568 VkPipelineStageFlags2 vk_dst_stage_mask,
7569 uint32_t memory_barrier_count,
7570 const VkMemoryBarrier2 *const memory_barriers,
7571 uint32_t image_barrier_count,
7572 const VkImageMemoryBarrier2 *const image_barriers)
7573 {
7574 const struct pvr_cmd_buffer_state *const state = &cmd_buffer->state;
7575 const uint32_t fragment_test_stages =
7576 VK_PIPELINE_STAGE_EARLY_FRAGMENT_TESTS_BIT |
7577 VK_PIPELINE_STAGE_LATE_FRAGMENT_TESTS_BIT;
7578 const struct pvr_render_pass *const pass = state->render_pass_info.pass;
7579 const struct pvr_renderpass_hwsetup_render *hw_render;
7580 struct pvr_image_view **const attachments =
7581 state->render_pass_info.attachments;
7582 const struct pvr_image_view *attachment;
7583 uint32_t hw_render_idx;
7584
7585 if (!pass)
7586 return false;
7587
7588 hw_render_idx = state->current_sub_cmd->gfx.hw_render_idx;
7589 hw_render = &pass->hw_setup->renders[hw_render_idx];
7590
7591 if (hw_render->ds_attach_idx == VK_ATTACHMENT_UNUSED)
7592 return false;
7593
7594 if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_PRIMARY) {
7595 attachment = attachments[hw_render->ds_attach_idx];
7596 } else {
7597 assert(!attachments);
7598 attachment = NULL;
7599 }
7600
7601 if (!(vk_src_stage_mask & fragment_test_stages) &&
7602 vk_dst_stage_mask & VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT)
7603 return false;
7604
7605 for (uint32_t i = 0; i < memory_barrier_count; i++) {
7606 const uint32_t stencil_write_bit =
7607 VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT;
7608 const uint32_t input_attachment_read_bit =
7609 VK_ACCESS_INPUT_ATTACHMENT_READ_BIT;
7610
7611 if (!(memory_barriers[i].srcAccessMask & stencil_write_bit))
7612 continue;
7613
7614 if (!(memory_barriers[i].dstAccessMask & input_attachment_read_bit))
7615 continue;
7616
7617 return pvr_stencil_has_self_dependency(state);
7618 }
7619
7620 for (uint32_t i = 0; i < image_barrier_count; i++) {
7621 PVR_FROM_HANDLE(pvr_image, image, image_barriers[i].image);
7622 const uint32_t stencil_bit = VK_IMAGE_ASPECT_STENCIL_BIT;
7623
7624 if (!(image_barriers[i].subresourceRange.aspectMask & stencil_bit))
7625 continue;
7626
7627 if (attachment && image != vk_to_pvr_image(attachment->vk.image))
7628 continue;
7629
7630 if (!vk_format_has_stencil(image->vk.format))
7631 continue;
7632
7633 return pvr_stencil_has_self_dependency(state);
7634 }
7635
7636 return false;
7637 }
7638
7639 static VkResult
7640 pvr_cmd_buffer_insert_mid_frag_barrier_event(struct pvr_cmd_buffer *cmd_buffer,
7641 uint32_t src_stage_mask,
7642 uint32_t dst_stage_mask)
7643 {
7644 VkResult result;
7645
7646 assert(cmd_buffer->state.current_sub_cmd->type == PVR_SUB_CMD_TYPE_GRAPHICS);
7647
7648 cmd_buffer->state.current_sub_cmd->gfx.empty_cmd = false;
7649
7650 /* Submit graphics job to store stencil. */
7651 cmd_buffer->state.current_sub_cmd->gfx.barrier_store = true;
7652
7653 pvr_cmd_buffer_end_sub_cmd(cmd_buffer);
7654 result = pvr_cmd_buffer_start_sub_cmd(cmd_buffer, PVR_SUB_CMD_TYPE_EVENT);
7655 if (result != VK_SUCCESS)
7656 return result;
7657
7658 cmd_buffer->state.current_sub_cmd->event = (struct pvr_sub_cmd_event){
7659 .type = PVR_EVENT_TYPE_BARRIER,
7660 .barrier = {
7661 .wait_for_stage_mask = src_stage_mask,
7662 .wait_at_stage_mask = dst_stage_mask,
7663 },
7664 };
7665
7666 pvr_cmd_buffer_end_sub_cmd(cmd_buffer);
7667 pvr_cmd_buffer_start_sub_cmd(cmd_buffer, PVR_SUB_CMD_TYPE_GRAPHICS);
7668
7669 /* Use existing render setup, but load color attachments from HW BGOBJ */
7670 cmd_buffer->state.current_sub_cmd->gfx.barrier_load = true;
7671 cmd_buffer->state.current_sub_cmd->gfx.barrier_store = false;
7672
7673 return VK_SUCCESS;
7674 }
7675
7676 static VkResult
7677 pvr_cmd_buffer_insert_barrier_event(struct pvr_cmd_buffer *cmd_buffer,
7678 uint32_t src_stage_mask,
7679 uint32_t dst_stage_mask)
7680 {
7681 VkResult result;
7682
7683 result = pvr_cmd_buffer_start_sub_cmd(cmd_buffer, PVR_SUB_CMD_TYPE_EVENT);
7684 if (result != VK_SUCCESS)
7685 return result;
7686
7687 cmd_buffer->state.current_sub_cmd->event = (struct pvr_sub_cmd_event){
7688 .type = PVR_EVENT_TYPE_BARRIER,
7689 .barrier = {
7690 .wait_for_stage_mask = src_stage_mask,
7691 .wait_at_stage_mask = dst_stage_mask,
7692 },
7693 };
7694
7695 return pvr_cmd_buffer_end_sub_cmd(cmd_buffer);
7696 }
7697
7698 /* This is just enough to handle vkCmdPipelineBarrier().
7699 * TODO: Complete?
7700 */
7701 void pvr_CmdPipelineBarrier2(VkCommandBuffer commandBuffer,
7702 const VkDependencyInfo *pDependencyInfo)
7703 {
7704 PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
7705 struct pvr_cmd_buffer_state *const state = &cmd_buffer->state;
7706 const struct pvr_render_pass *const render_pass =
7707 state->render_pass_info.pass;
7708 VkPipelineStageFlags vk_src_stage_mask = 0U;
7709 VkPipelineStageFlags vk_dst_stage_mask = 0U;
7710 bool is_stencil_store_load_needed;
7711 uint32_t required_stage_mask = 0U;
7712 uint32_t src_stage_mask;
7713 uint32_t dst_stage_mask;
7714 bool is_barrier_needed;
7715 VkResult result;
7716
7717 PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer);
7718
7719 for (uint32_t i = 0; i < pDependencyInfo->memoryBarrierCount; i++) {
7720 vk_src_stage_mask |= pDependencyInfo->pMemoryBarriers[i].srcStageMask;
7721 vk_dst_stage_mask |= pDependencyInfo->pMemoryBarriers[i].dstStageMask;
7722 }
7723
7724 for (uint32_t i = 0; i < pDependencyInfo->bufferMemoryBarrierCount; i++) {
7725 vk_src_stage_mask |=
7726 pDependencyInfo->pBufferMemoryBarriers[i].srcStageMask;
7727 vk_dst_stage_mask |=
7728 pDependencyInfo->pBufferMemoryBarriers[i].dstStageMask;
7729 }
7730
7731 for (uint32_t i = 0; i < pDependencyInfo->imageMemoryBarrierCount; i++) {
7732 vk_src_stage_mask |=
7733 pDependencyInfo->pImageMemoryBarriers[i].srcStageMask;
7734 vk_dst_stage_mask |=
7735 pDependencyInfo->pImageMemoryBarriers[i].dstStageMask;
7736 }
7737
7738 src_stage_mask = pvr_stage_mask_src(vk_src_stage_mask);
7739 dst_stage_mask = pvr_stage_mask_dst(vk_dst_stage_mask);
7740
7741 for (uint32_t stage = 0U; stage != PVR_NUM_SYNC_PIPELINE_STAGES; stage++) {
7742 if (!(dst_stage_mask & BITFIELD_BIT(stage)))
7743 continue;
7744
7745 required_stage_mask |= state->barriers_needed[stage];
7746 }
7747
7748 src_stage_mask &= required_stage_mask;
7749 for (uint32_t stage = 0U; stage != PVR_NUM_SYNC_PIPELINE_STAGES; stage++) {
7750 if (!(dst_stage_mask & BITFIELD_BIT(stage)))
7751 continue;
7752
7753 state->barriers_needed[stage] &= ~src_stage_mask;
7754 }
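   /* At this point src_stage_mask has been narrowed to the source stages that
    * still have outstanding work the destination stages depend on, and those
    * bits have been cleared from barriers_needed since this barrier satisfies
    * them.
    */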
7755
7756 if (src_stage_mask == 0 || dst_stage_mask == 0) {
7757 is_barrier_needed = false;
7758 } else if (src_stage_mask == PVR_PIPELINE_STAGE_GEOM_BIT &&
7759 dst_stage_mask == PVR_PIPELINE_STAGE_FRAG_BIT) {
7760 /* This is implicit so no need to barrier. */
7761 is_barrier_needed = false;
7762 } else if (src_stage_mask == dst_stage_mask &&
7763 util_bitcount(src_stage_mask) == 1) {
7764 struct pvr_sub_cmd *const current_sub_cmd = state->current_sub_cmd;
7765
7766 switch (src_stage_mask) {
7767 case PVR_PIPELINE_STAGE_FRAG_BIT:
7768 is_barrier_needed = false;
7769
7770 if (!render_pass)
7771 break;
7772
7773 assert(current_sub_cmd->type == PVR_SUB_CMD_TYPE_GRAPHICS);
7774
7775 /* Flush all fragment work up to this point. */
7776       pvr_insert_transparent_obj(cmd_buffer, &current_sub_cmd->gfx);
7777 break;
7778
7779 case PVR_PIPELINE_STAGE_COMPUTE_BIT:
7780 is_barrier_needed = false;
7781
7782 if (!current_sub_cmd ||
7783 current_sub_cmd->type != PVR_SUB_CMD_TYPE_COMPUTE) {
7784 break;
7785 }
7786
7787          /* Multiple dispatches can be merged into a single job. When
7788           * back-to-back dispatches have a sequential dependency (a
7789           * compute -> compute pipeline barrier) we need to do the following:
7790           * - Dispatch a kernel which fences all previous memory writes and
7791           * flushes the MADD cache.
7792           * - Issue a compute fence which ensures all previous tasks emitted
7793           * by the compute data master are completed before starting
7794           * anything new.
7795           */
7796
7797 /* Issue Data Fence, Wait for Data Fence (IDFWDF) makes the PDS wait
7798 * for data.
7799 */
7800       pvr_compute_generate_idfwdf(cmd_buffer, &current_sub_cmd->compute);
7801
7802 pvr_compute_generate_fence(cmd_buffer,
7803                                  &current_sub_cmd->compute,
7804 false);
7805 break;
7806
7807 default:
7808 is_barrier_needed = false;
7809 break;
7810 };
7811 } else {
7812 is_barrier_needed = true;
7813 }
7814
7815 is_stencil_store_load_needed =
7816 pvr_is_stencil_store_load_needed(cmd_buffer,
7817 vk_src_stage_mask,
7818 vk_dst_stage_mask,
7819 pDependencyInfo->memoryBarrierCount,
7820 pDependencyInfo->pMemoryBarriers,
7821 pDependencyInfo->imageMemoryBarrierCount,
7822 pDependencyInfo->pImageMemoryBarriers);
7823
7824 if (is_stencil_store_load_needed) {
7825 assert(render_pass);
7826 result = pvr_cmd_buffer_insert_mid_frag_barrier_event(cmd_buffer,
7827 src_stage_mask,
7828 dst_stage_mask);
7829 if (result != VK_SUCCESS)
7830 mesa_loge("Failed to insert mid frag barrier event.");
7831 } else if (is_barrier_needed) {
7832 result = pvr_cmd_buffer_insert_barrier_event(cmd_buffer,
7833 src_stage_mask,
7834 dst_stage_mask);
7835 if (result != VK_SUCCESS)
7836 mesa_loge("Failed to insert pipeline barrier event.");
7837 }
7838 }
7839
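/* Event reset is recorded as an event-type sub-command. The Vulkan stage
 * mask is converted to the driver's sync stage mask so that, at submission
 * time, the reset can be ordered against previously recorded work.
 */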
7840 void pvr_CmdResetEvent2(VkCommandBuffer commandBuffer,
7841 VkEvent _event,
7842 VkPipelineStageFlags2 stageMask)
7843 {
7844 PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
7845 PVR_FROM_HANDLE(pvr_event, event, _event);
7846 VkResult result;
7847
7848 PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer);
7849
7850 result = pvr_cmd_buffer_start_sub_cmd(cmd_buffer, PVR_SUB_CMD_TYPE_EVENT);
7851 if (result != VK_SUCCESS)
7852 return;
7853
7854 cmd_buffer->state.current_sub_cmd->event = (struct pvr_sub_cmd_event){
7855 .type = PVR_EVENT_TYPE_RESET,
7856 .set_reset = {
7857 .event = event,
7858 .wait_for_stage_mask = pvr_stage_mask_src(stageMask),
7859 },
7860 };
7861
7862 pvr_cmd_buffer_end_sub_cmd(cmd_buffer);
7863 }
7864
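/* Event set mirrors the reset path above: the union of srcStageMask values
 * from every barrier in the dependency info defines the work the event set
 * must wait on before signalling.
 */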
7865 void pvr_CmdSetEvent2(VkCommandBuffer commandBuffer,
7866 VkEvent _event,
7867 const VkDependencyInfo *pDependencyInfo)
7868 {
7869 PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
7870 PVR_FROM_HANDLE(pvr_event, event, _event);
7871 VkPipelineStageFlags2 stage_mask = 0;
7872 VkResult result;
7873
7874 PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer);
7875
7876 result = pvr_cmd_buffer_start_sub_cmd(cmd_buffer, PVR_SUB_CMD_TYPE_EVENT);
7877 if (result != VK_SUCCESS)
7878 return;
7879
7880 for (uint32_t i = 0; i < pDependencyInfo->memoryBarrierCount; i++)
7881 stage_mask |= pDependencyInfo->pMemoryBarriers[i].srcStageMask;
7882
7883 for (uint32_t i = 0; i < pDependencyInfo->bufferMemoryBarrierCount; i++)
7884 stage_mask |= pDependencyInfo->pBufferMemoryBarriers[i].srcStageMask;
7885
7886 for (uint32_t i = 0; i < pDependencyInfo->imageMemoryBarrierCount; i++)
7887 stage_mask |= pDependencyInfo->pImageMemoryBarriers[i].srcStageMask;
7888
7889 cmd_buffer->state.current_sub_cmd->event = (struct pvr_sub_cmd_event){
7890 .type = PVR_EVENT_TYPE_SET,
7891 .set_reset = {
7892 .event = event,
7893 .wait_for_stage_mask = pvr_stage_mask_dst(stage_mask),
7894 },
7895 };
7896
7897 pvr_cmd_buffer_end_sub_cmd(cmd_buffer);
7898 }
7899
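/* Event waits carry per-event destination stage masks. A single allocation
 * holds both the event pointer array and the stage mask array; ownership is
 * handed to the wait sub-command, which is expected to release it when the
 * sub-command is freed.
 */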
7900 void pvr_CmdWaitEvents2(VkCommandBuffer commandBuffer,
7901 uint32_t eventCount,
7902 const VkEvent *pEvents,
7903 const VkDependencyInfo *pDependencyInfos)
7904 {
7905 PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
7906 struct pvr_event **events_array;
7907 uint32_t *stage_masks;
7908 VkResult result;
7909
7910 PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer);
7911
7912 VK_MULTIALLOC(ma);
7913 vk_multialloc_add(&ma, &events_array, __typeof__(*events_array), eventCount);
7914 vk_multialloc_add(&ma, &stage_masks, __typeof__(*stage_masks), eventCount);
7915
7916 if (!vk_multialloc_alloc(&ma,
7917 &cmd_buffer->vk.pool->alloc,
7918 VK_SYSTEM_ALLOCATION_SCOPE_OBJECT)) {
7919 vk_command_buffer_set_error(&cmd_buffer->vk, VK_ERROR_OUT_OF_HOST_MEMORY);
7920 return;
7921 }
7922
7923 result = pvr_cmd_buffer_start_sub_cmd(cmd_buffer, PVR_SUB_CMD_TYPE_EVENT);
7924 if (result != VK_SUCCESS) {
7925 vk_free(&cmd_buffer->vk.pool->alloc, events_array);
7926 return;
7927 }
7928
7929 memcpy(events_array, pEvents, sizeof(*events_array) * eventCount);
7930
7931 for (uint32_t i = 0; i < eventCount; i++) {
7932 const VkDependencyInfo *info = &pDependencyInfos[i];
7933 VkPipelineStageFlags2 mask = 0;
7934
7935 for (uint32_t j = 0; j < info->memoryBarrierCount; j++)
7936 mask |= info->pMemoryBarriers[j].dstStageMask;
7937
7938 for (uint32_t j = 0; j < info->bufferMemoryBarrierCount; j++)
7939 mask |= info->pBufferMemoryBarriers[j].dstStageMask;
7940
7941 for (uint32_t j = 0; j < info->imageMemoryBarrierCount; j++)
7942 mask |= info->pImageMemoryBarriers[j].dstStageMask;
7943
7944 stage_masks[i] = pvr_stage_mask_dst(mask);
7945 }
7946
7947 cmd_buffer->state.current_sub_cmd->event = (struct pvr_sub_cmd_event){
7948 .type = PVR_EVENT_TYPE_WAIT,
7949 .wait = {
7950 .count = eventCount,
7951 .events = events_array,
7952 .wait_at_stage_masks = stage_masks,
7953 },
7954 };
7955
7956 pvr_cmd_buffer_end_sub_cmd(cmd_buffer);
7957 }
7958
7959 void pvr_CmdWriteTimestamp2(VkCommandBuffer commandBuffer,
7960 VkPipelineStageFlags2 stage,
7961 VkQueryPool queryPool,
7962 uint32_t query)
7963 {
7964 unreachable("Timestamp queries are not supported.");
7965 }
7966
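/* Finish recording: if an error was already flagged the command buffer is
 * ended as-is, otherwise the query index scratch array is released and the
 * currently open sub-command, if any, is ended.
 */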
7967 VkResult pvr_EndCommandBuffer(VkCommandBuffer commandBuffer)
7968 {
7969 PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
7970 struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
7971 VkResult result;
7972
7973 if (vk_command_buffer_has_error(&cmd_buffer->vk))
7974 return vk_command_buffer_end(&cmd_buffer->vk);
7975
7976 /* TODO: We should be freeing all the resources allocated for recording
7977 * here.
7978 */
7979 util_dynarray_fini(&state->query_indices);
7980
7981 result = pvr_cmd_buffer_end_sub_cmd(cmd_buffer);
7982 if (result != VK_SUCCESS)
7983 pvr_cmd_buffer_set_error_unwarned(cmd_buffer, result);
7984
7985 return vk_command_buffer_end(&cmd_buffer->vk);
7986 }
7987