1 /*
2 * Copyright © 2022 Imagination Technologies Ltd.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a copy
5 * of this software and associated documentation files (the "Software"), to deal
6 * in the Software without restriction, including without limitation the rights
7 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
8 * copies of the Software, and to permit persons to whom the Software is
9 * furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 * SOFTWARE.
22 */
23
24 #include <assert.h>
25 #include <limits.h>
26 #include <stdbool.h>
27 #include <stddef.h>
28 #include <stdint.h>
29 #include <string.h>
30 #include <vulkan/vulkan.h>
31
32 #include "hwdef/rogue_hw_defs.h"
33 #include "hwdef/rogue_hw_utils.h"
34 #include "pvr_bo.h"
35 #include "pvr_csb.h"
36 #include "pvr_csb_enum_helpers.h"
37 #include "pvr_device_info.h"
38 #include "pvr_end_of_tile.h"
39 #include "pvr_formats.h"
40 #include "pvr_hw_pass.h"
41 #include "pvr_job_common.h"
42 #include "pvr_job_render.h"
43 #include "pvr_limits.h"
44 #include "pvr_pds.h"
45 #include "pvr_private.h"
46 #include "pvr_types.h"
47 #include "pvr_winsys.h"
48 #include "util/bitscan.h"
49 #include "util/compiler.h"
50 #include "util/list.h"
51 #include "util/macros.h"
52 #include "util/u_dynarray.h"
53 #include "util/u_pack_color.h"
54 #include "vk_alloc.h"
55 #include "vk_command_buffer.h"
56 #include "vk_command_pool.h"
57 #include "vk_format.h"
58 #include "vk_log.h"
59 #include "vk_object.h"
60 #include "vk_util.h"
61
62 /* Structure used to pass data into the pvr_compute_generate_control_stream()
63 * function.
64 */
65 struct pvr_compute_kernel_info {
66 pvr_dev_addr_t indirect_buffer_addr;
67 bool global_offsets_present;
68 uint32_t usc_common_size;
69 uint32_t usc_unified_size;
70 uint32_t pds_temp_size;
71 uint32_t pds_data_size;
72 enum PVRX(CDMCTRL_USC_TARGET) usc_target;
73 bool is_fence;
74 uint32_t pds_data_offset;
75 uint32_t pds_code_offset;
76 enum PVRX(CDMCTRL_SD_TYPE) sd_type;
77 bool usc_common_shared;
78 uint32_t local_size[PVR_WORKGROUP_DIMENSIONS];
79 uint32_t global_size[PVR_WORKGROUP_DIMENSIONS];
80 uint32_t max_instances;
81 };
82
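/* Frees the resources owned by a single sub-command (its control stream,
 * table BOs or pending transfer commands, depending on the type), unlinks it
 * from the command buffer's sub-command list and frees the sub-command.
 */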
83 static void pvr_cmd_buffer_free_sub_cmd(struct pvr_cmd_buffer *cmd_buffer,
84 struct pvr_sub_cmd *sub_cmd)
85 {
86 switch (sub_cmd->type) {
87 case PVR_SUB_CMD_TYPE_GRAPHICS:
88 pvr_csb_finish(&sub_cmd->gfx.control_stream);
89 pvr_bo_free(cmd_buffer->device, sub_cmd->gfx.depth_bias_bo);
90 pvr_bo_free(cmd_buffer->device, sub_cmd->gfx.scissor_bo);
91 break;
92
93 case PVR_SUB_CMD_TYPE_COMPUTE:
94 pvr_csb_finish(&sub_cmd->compute.control_stream);
95 break;
96
97 case PVR_SUB_CMD_TYPE_TRANSFER:
98 list_for_each_entry_safe (struct pvr_transfer_cmd,
99 transfer_cmd,
100 &sub_cmd->transfer.transfer_cmds,
101 link) {
102 list_del(&transfer_cmd->link);
103 vk_free(&cmd_buffer->vk.pool->alloc, transfer_cmd);
104 }
105 break;
106
107 default:
108 pvr_finishme("Unsupported sub-command type %d", sub_cmd->type);
109 break;
110 }
111
112 list_del(&sub_cmd->link);
113 vk_free(&cmd_buffer->vk.pool->alloc, sub_cmd);
114 }
115
116 static void pvr_cmd_buffer_free_sub_cmds(struct pvr_cmd_buffer *cmd_buffer)
117 {
118 list_for_each_entry_safe (struct pvr_sub_cmd,
119 sub_cmd,
120 &cmd_buffer->sub_cmds,
121 link) {
122 pvr_cmd_buffer_free_sub_cmd(cmd_buffer, sub_cmd);
123 }
124 }
125
126 static void pvr_cmd_buffer_destroy(struct vk_command_buffer *vk_cmd_buffer)
127 {
128 struct pvr_cmd_buffer *cmd_buffer =
129 container_of(vk_cmd_buffer, struct pvr_cmd_buffer, vk);
130
131 vk_free(&cmd_buffer->vk.pool->alloc,
132 cmd_buffer->state.render_pass_info.attachments);
133 vk_free(&cmd_buffer->vk.pool->alloc,
134 cmd_buffer->state.render_pass_info.clear_values);
135
136 pvr_cmd_buffer_free_sub_cmds(cmd_buffer);
137
138 list_for_each_entry_safe (struct pvr_bo, bo, &cmd_buffer->bo_list, link) {
139 list_del(&bo->link);
140 pvr_bo_free(cmd_buffer->device, bo);
141 }
142
143 util_dynarray_fini(&cmd_buffer->scissor_array);
144 util_dynarray_fini(&cmd_buffer->depth_bias_array);
145
146 vk_command_buffer_finish(&cmd_buffer->vk);
147 vk_free(&cmd_buffer->vk.pool->alloc, cmd_buffer);
148 }
149
150 static VkResult pvr_cmd_buffer_create(struct pvr_device *device,
151 struct vk_command_pool *pool,
152 VkCommandBufferLevel level,
153 VkCommandBuffer *pCommandBuffer)
154 {
155 struct pvr_cmd_buffer *cmd_buffer;
156 VkResult result;
157
158 cmd_buffer = vk_zalloc(&pool->alloc,
159 sizeof(*cmd_buffer),
160 8U,
161 VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
162 if (!cmd_buffer)
163 return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
164
165 result = vk_command_buffer_init(&cmd_buffer->vk, pool, level);
166 if (result != VK_SUCCESS) {
167 vk_free(&pool->alloc, cmd_buffer);
168 return result;
169 }
170
171 cmd_buffer->vk.destroy = pvr_cmd_buffer_destroy;
172 cmd_buffer->device = device;
173
174 util_dynarray_init(&cmd_buffer->depth_bias_array, NULL);
175 util_dynarray_init(&cmd_buffer->scissor_array, NULL);
176
177 cmd_buffer->state.status = VK_SUCCESS;
178 cmd_buffer->status = PVR_CMD_BUFFER_STATUS_INITIAL;
179
180 list_inithead(&cmd_buffer->sub_cmds);
181 list_inithead(&cmd_buffer->bo_list);
182
183 *pCommandBuffer = pvr_cmd_buffer_to_handle(cmd_buffer);
184
185 return VK_SUCCESS;
186 }
187
188 VkResult
189 pvr_AllocateCommandBuffers(VkDevice _device,
190 const VkCommandBufferAllocateInfo *pAllocateInfo,
191 VkCommandBuffer *pCommandBuffers)
192 {
193 VK_FROM_HANDLE(vk_command_pool, pool, pAllocateInfo->commandPool);
194 PVR_FROM_HANDLE(pvr_device, device, _device);
195 VkResult result = VK_SUCCESS;
196 uint32_t i;
197
198 for (i = 0; i < pAllocateInfo->commandBufferCount; i++) {
199 result = pvr_cmd_buffer_create(device,
200 pool,
201 pAllocateInfo->level,
202 &pCommandBuffers[i]);
203 if (result != VK_SUCCESS)
204 break;
205 }
206
207 if (result != VK_SUCCESS) {
208 while (i--) {
209 VK_FROM_HANDLE(vk_command_buffer, cmd_buffer, pCommandBuffers[i]);
210 pvr_cmd_buffer_destroy(cmd_buffer);
211 }
212
213 for (i = 0; i < pAllocateInfo->commandBufferCount; i++)
214 pCommandBuffers[i] = VK_NULL_HANDLE;
215 }
216
217 return result;
218 }
219
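/* ORs the pipeline stages used by the given sub-command type into each entry
 * of state->barriers_needed.
 */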
220 static void pvr_cmd_buffer_update_barriers(struct pvr_cmd_buffer *cmd_buffer,
221 enum pvr_sub_cmd_type type)
222 {
223 struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
224 uint32_t barriers;
225
226 switch (type) {
227 case PVR_SUB_CMD_TYPE_GRAPHICS:
228 barriers = PVR_PIPELINE_STAGE_GEOM_BIT | PVR_PIPELINE_STAGE_FRAG_BIT;
229 break;
230
231 case PVR_SUB_CMD_TYPE_COMPUTE:
232 barriers = PVR_PIPELINE_STAGE_COMPUTE_BIT;
233 break;
234
235 case PVR_SUB_CMD_TYPE_TRANSFER:
236 barriers = PVR_PIPELINE_STAGE_TRANSFER_BIT;
237 break;
238
239 default:
240 barriers = 0;
241 pvr_finishme("Unsupported sub-command type %d", type);
242 break;
243 }
244
245 for (uint32_t i = 0; i < ARRAY_SIZE(state->barriers_needed); i++)
246 state->barriers_needed[i] |= barriers;
247 }
248
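/* Uploads the accumulated depth bias and scissor arrays into the general heap
 * for this graphics sub-command, then clears the host-side arrays.
 */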
249 static VkResult
250 pvr_cmd_buffer_upload_tables(struct pvr_device *device,
251 struct pvr_cmd_buffer *cmd_buffer,
252 struct pvr_sub_cmd_gfx *const sub_cmd)
253 {
254 const uint32_t cache_line_size =
255 rogue_get_slc_cache_line_size(&device->pdevice->dev_info);
256 VkResult result;
257
258 assert(!sub_cmd->depth_bias_bo && !sub_cmd->scissor_bo);
259
260 if (cmd_buffer->depth_bias_array.size > 0) {
261 result =
262 pvr_gpu_upload(device,
263 device->heaps.general_heap,
264 util_dynarray_begin(&cmd_buffer->depth_bias_array),
265 cmd_buffer->depth_bias_array.size,
266 cache_line_size,
267 &sub_cmd->depth_bias_bo);
268 if (result != VK_SUCCESS)
269 return result;
270 }
271
272 if (cmd_buffer->scissor_array.size > 0) {
273 result = pvr_gpu_upload(device,
274 device->heaps.general_heap,
275 util_dynarray_begin(&cmd_buffer->scissor_array),
276 cmd_buffer->scissor_array.size,
277 cache_line_size,
278 &sub_cmd->scissor_bo);
279 if (result != VK_SUCCESS)
280 goto err_free_depth_bias_bo;
281 }
282
283 util_dynarray_clear(&cmd_buffer->depth_bias_array);
284 util_dynarray_clear(&cmd_buffer->scissor_array);
285
286 return VK_SUCCESS;
287
288 err_free_depth_bias_bo:
289 pvr_bo_free(device, sub_cmd->depth_bias_bo);
290 sub_cmd->depth_bias_bo = NULL;
291
292 return result;
293 }
294
295 static VkResult
296 pvr_cmd_buffer_emit_ppp_state(struct pvr_cmd_buffer *cmd_buffer,
297 struct pvr_sub_cmd_gfx *const sub_cmd)
298 {
299 struct pvr_framebuffer *framebuffer =
300 cmd_buffer->state.render_pass_info.framebuffer;
301
302 pvr_csb_emit (&sub_cmd->control_stream, VDMCTRL_PPP_STATE0, state0) {
303 state0.addrmsb = framebuffer->ppp_state_bo->vma->dev_addr;
304 state0.word_count = framebuffer->ppp_state_size;
305 }
306
307 pvr_csb_emit (&sub_cmd->control_stream, VDMCTRL_PPP_STATE1, state1) {
308 state1.addrlsb = framebuffer->ppp_state_bo->vma->dev_addr;
309 }
310
311 return VK_SUCCESS;
312 }
313
314 static VkResult
315 pvr_cmd_buffer_upload_general(struct pvr_cmd_buffer *const cmd_buffer,
316 const void *const data,
317 const size_t size,
318 struct pvr_bo **const pvr_bo_out)
319 {
320 struct pvr_device *const device = cmd_buffer->device;
321 const uint32_t cache_line_size =
322 rogue_get_slc_cache_line_size(&device->pdevice->dev_info);
323 struct pvr_bo *pvr_bo;
324 VkResult result;
325
326 result = pvr_gpu_upload(device,
327 device->heaps.general_heap,
328 data,
329 size,
330 cache_line_size,
331 &pvr_bo);
332 if (result != VK_SUCCESS)
333 return result;
334
335 list_add(&pvr_bo->link, &cmd_buffer->bo_list);
336
337 *pvr_bo_out = pvr_bo;
338
339 return VK_SUCCESS;
340 }
341
342 static VkResult
343 pvr_cmd_buffer_upload_usc(struct pvr_cmd_buffer *const cmd_buffer,
344 const void *const code,
345 const size_t code_size,
346 uint64_t code_alignment,
347 struct pvr_bo **const pvr_bo_out)
348 {
349 struct pvr_device *const device = cmd_buffer->device;
350 const uint32_t cache_line_size =
351 rogue_get_slc_cache_line_size(&device->pdevice->dev_info);
352 struct pvr_bo *pvr_bo;
353 VkResult result;
354
355 code_alignment = MAX2(code_alignment, cache_line_size);
356
357 result =
358 pvr_gpu_upload_usc(device, code, code_size, code_alignment, &pvr_bo);
359 if (result != VK_SUCCESS)
360 return result;
361
362 list_add(&pvr_bo->link, &cmd_buffer->bo_list);
363
364 *pvr_bo_out = pvr_bo;
365
366 return VK_SUCCESS;
367 }
368
369 static VkResult
370 pvr_cmd_buffer_upload_pds(struct pvr_cmd_buffer *const cmd_buffer,
371 const uint32_t *data,
372 uint32_t data_size_dwords,
373 uint32_t data_alignment,
374 const uint32_t *code,
375 uint32_t code_size_dwords,
376 uint32_t code_alignment,
377 uint64_t min_alignment,
378 struct pvr_pds_upload *const pds_upload_out)
379 {
380 struct pvr_device *const device = cmd_buffer->device;
381 VkResult result;
382
383 result = pvr_gpu_upload_pds(device,
384 data,
385 data_size_dwords,
386 data_alignment,
387 code,
388 code_size_dwords,
389 code_alignment,
390 min_alignment,
391 pds_upload_out);
392 if (result != VK_SUCCESS)
393 return result;
394
395 list_add(&pds_upload_out->pvr_bo->link, &cmd_buffer->bo_list);
396
397 return VK_SUCCESS;
398 }
399
400 static inline VkResult
401 pvr_cmd_buffer_upload_pds_data(struct pvr_cmd_buffer *const cmd_buffer,
402 const uint32_t *data,
403 uint32_t data_size_dwords,
404 uint32_t data_alignment,
405 struct pvr_pds_upload *const pds_upload_out)
406 {
407 return pvr_cmd_buffer_upload_pds(cmd_buffer,
408 data,
409 data_size_dwords,
410 data_alignment,
411 NULL,
412 0,
413 0,
414 data_alignment,
415 pds_upload_out);
416 }
417
418 static VkResult pvr_sub_cmd_gfx_per_job_fragment_programs_create_and_upload(
419 struct pvr_cmd_buffer *const cmd_buffer,
420 const uint32_t pbe_cs_words[static const ROGUE_NUM_PBESTATE_STATE_WORDS],
421 struct pvr_pds_upload *const pds_upload_out)
422 {
423 struct pvr_pds_event_program pixel_event_program = {
424 /* No data to DMA, just a DOUTU needed. */
425 .num_emit_word_pairs = 0,
426 };
427 const uint32_t staging_buffer_size =
428 cmd_buffer->device->pixel_event_data_size_in_dwords * sizeof(uint32_t);
429 const VkAllocationCallbacks *const allocator = &cmd_buffer->vk.pool->alloc;
430 struct pvr_device *const device = cmd_buffer->device;
431 /* FIXME: This should come from the compiler for the USC pixel program. */
432 const uint32_t usc_temp_count = 0;
433 struct pvr_bo *usc_eot_program;
434 uint8_t *usc_eot_program_ptr;
435 uint32_t *staging_buffer;
436 VkResult result;
437
438 result = pvr_cmd_buffer_upload_usc(cmd_buffer,
439 pvr_end_of_tile_program,
440 sizeof(pvr_end_of_tile_program),
441 4,
442 &usc_eot_program);
443 if (result != VK_SUCCESS)
444 return result;
445
446 assert((pbe_cs_words[1] & 0x3F) == 0x20);
447
448 /* FIXME: Stop patching the framebuffer address (this will require the
449 * end-of-tile program to be generated at run-time).
450 */
451 pvr_bo_cpu_map(device, usc_eot_program);
452 usc_eot_program_ptr = usc_eot_program->bo->map;
453 usc_eot_program_ptr[6] = (pbe_cs_words[0] >> 0) & 0xFF;
454 usc_eot_program_ptr[7] = (pbe_cs_words[0] >> 8) & 0xFF;
455 usc_eot_program_ptr[8] = (pbe_cs_words[0] >> 16) & 0xFF;
456 usc_eot_program_ptr[9] = (pbe_cs_words[0] >> 24) & 0xFF;
457 pvr_bo_cpu_unmap(device, usc_eot_program);
458
459 pvr_pds_setup_doutu(&pixel_event_program.task_control,
460 usc_eot_program->vma->dev_addr.addr,
461 usc_temp_count,
462 PVRX(PDSINST_DOUTU_SAMPLE_RATE_INSTANCE),
463 false);
464
465 /* TODO: We could skip allocating this and generate directly into the device
466 * buffer, thus removing one allocation and memcpy() per job. Would this
467 * speed things up in a noticeable way?
468 */
469 staging_buffer = vk_alloc(allocator,
470 staging_buffer_size,
471 8,
472 VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
473 if (!staging_buffer) {
474 result = vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
475 goto err_free_usc_pixel_program;
476 }
477
478 /* Generate the data segment. The code segment was uploaded earlier when
479 * setting up the PDS static heap data.
480 */
481 pvr_pds_generate_pixel_event_data_segment(&pixel_event_program,
482 staging_buffer,
483 &device->pdevice->dev_info);
484
485 result = pvr_cmd_buffer_upload_pds_data(
486 cmd_buffer,
487 staging_buffer,
488 cmd_buffer->device->pixel_event_data_size_in_dwords,
489 4,
490 pds_upload_out);
491 if (result != VK_SUCCESS)
492 goto err_free_pixel_event_staging_buffer;
493
494 vk_free(allocator, staging_buffer);
495
496 return VK_SUCCESS;
497
498 err_free_pixel_event_staging_buffer:
499 vk_free(allocator, staging_buffer);
500
501 err_free_usc_pixel_program:
502 list_del(&usc_eot_program->link);
503 pvr_bo_free(device, usc_eot_program);
504
505 return result;
506 }
507
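/* Packs a Vulkan clear value into the 32-bit value expected by the hardware.
 * Only VK_FORMAT_B8G8R8A8_UNORM is currently handled.
 */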
508 static uint32_t pvr_get_hw_clear_color(VkFormat vk_format,
509 const VkClearValue *clear_value)
510 {
511 union util_color uc = { .ui = 0 };
512
513 switch (vk_format) {
514 case VK_FORMAT_B8G8R8A8_UNORM:
515 util_pack_color(clear_value->color.float32,
516 PIPE_FORMAT_R8G8B8A8_UNORM,
517 &uc);
518 break;
519
520 default:
521 assert(!"Unsupported format");
522 uc.ui[0] = 0;
523 break;
524 }
525
526 return uc.ui[0];
527 }
528
529 static VkResult
530 pvr_load_op_constants_create_and_upload(struct pvr_cmd_buffer *cmd_buffer,
531 uint32_t idx,
532 pvr_dev_addr_t *const addr_out)
533 {
534 const struct pvr_render_pass_info *render_pass_info =
535 &cmd_buffer->state.render_pass_info;
536 const struct pvr_render_pass *pass = render_pass_info->pass;
537 const struct pvr_renderpass_hwsetup_render *hw_render =
538 &pass->hw_setup->renders[idx];
539 ASSERTED const struct pvr_load_op *load_op = hw_render->client_data;
540 const struct pvr_renderpass_colorinit *color_init =
541 &hw_render->color_init[0];
542 const struct pvr_render_pass_attachment *attachment =
543 &pass->attachments[color_init->driver_id];
544 const VkClearValue *clear_value =
545 &render_pass_info->clear_values[color_init->driver_id];
546 uint32_t hw_clear_value;
547 struct pvr_bo *clear_bo;
548 VkResult result;
549
550 pvr_finishme("Add missing load op data support");
551
552 assert(load_op->is_hw_object);
553 assert(hw_render->color_init_count == 1);
554
555 /* FIXME: add support for RENDERPASS_SURFACE_INITOP_LOAD. */
556 assert(color_init->op == RENDERPASS_SURFACE_INITOP_CLEAR);
557
558 /* FIXME: do this at the point we store the clear values? */
559 hw_clear_value = pvr_get_hw_clear_color(attachment->vk_format, clear_value);
560
561 result = pvr_cmd_buffer_upload_general(cmd_buffer,
562 &hw_clear_value,
563 sizeof(hw_clear_value),
564 &clear_bo);
565 if (result != VK_SUCCESS)
566 return result;
567
568 *addr_out = clear_bo->vma->dev_addr;
569
570 return VK_SUCCESS;
571 }
572
573 static VkResult pvr_load_op_pds_data_create_and_upload(
574 struct pvr_cmd_buffer *cmd_buffer,
575 uint32_t idx,
576 pvr_dev_addr_t constants_addr,
577 struct pvr_pds_upload *const pds_upload_out)
578 {
579 const struct pvr_render_pass_info *render_pass_info =
580 &cmd_buffer->state.render_pass_info;
581 const struct pvr_load_op *load_op =
582 render_pass_info->pass->hw_setup->renders[idx].client_data;
583 struct pvr_device *device = cmd_buffer->device;
584 const struct pvr_device_info *dev_info = &device->pdevice->dev_info;
585 struct pvr_pds_pixel_shader_sa_program program = { 0 };
586 uint32_t staging_buffer_size;
587 uint32_t *staging_buffer;
588 VkResult result;
589
590 program.num_texture_dma_kicks = 1;
591
592 pvr_csb_pack (&program.texture_dma_address[0],
593 PDSINST_DOUT_FIELDS_DOUTD_SRC0,
594 value) {
595 value.sbase = constants_addr;
596 }
597
598 pvr_csb_pack (&program.texture_dma_control[0],
599 PDSINST_DOUT_FIELDS_DOUTD_SRC1,
600 value) {
601 value.dest = PVRX(PDSINST_DOUTD_DEST_COMMON_STORE);
602 value.a0 = load_op->shareds_dest_offset;
603 value.bsize = load_op->shareds_count;
604 }
605
606 pvr_pds_set_sizes_pixel_shader_sa_texture_data(&program, dev_info);
607
608 staging_buffer_size = program.data_size * sizeof(*staging_buffer);
609
610 staging_buffer = vk_alloc(&cmd_buffer->vk.pool->alloc,
611 staging_buffer_size,
612 8,
613 VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
614 if (!staging_buffer)
615 return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
616
617 pvr_pds_generate_pixel_shader_sa_texture_state_data(&program,
618 staging_buffer,
619 dev_info);
620
621 result = pvr_cmd_buffer_upload_pds_data(cmd_buffer,
622 staging_buffer,
623 program.data_size,
624 1,
625 pds_upload_out);
626 if (result != VK_SUCCESS) {
627 vk_free(&cmd_buffer->vk.pool->alloc, staging_buffer);
628 return result;
629 }
630
631 vk_free(&cmd_buffer->vk.pool->alloc, staging_buffer);
632
633 return VK_SUCCESS;
634 }
635
636 /* FIXME: Should this function be specific to the HW background object, in
637 * which case its name should be changed, or should it have the load op
638 * structure passed in?
639 */
640 static VkResult
641 pvr_load_op_data_create_and_upload(struct pvr_cmd_buffer *cmd_buffer,
642 uint32_t idx,
643 struct pvr_pds_upload *const pds_upload_out)
644 {
645 pvr_dev_addr_t constants_addr;
646 VkResult result;
647
648 result =
649 pvr_load_op_constants_create_and_upload(cmd_buffer, idx, &constants_addr);
650 if (result != VK_SUCCESS)
651 return result;
652
653 return pvr_load_op_pds_data_create_and_upload(cmd_buffer,
654 idx,
655 constants_addr,
656 pds_upload_out);
657 }
658
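/* Packs the background object PDS control register values (program base
 * addresses and size information) from the load op and its uploaded PDS
 * texture state data.
 */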
659 static void pvr_pds_bgnd_pack_state(
660 const struct pvr_load_op *load_op,
661 const struct pvr_pds_upload *load_op_program,
662 uint64_t pds_reg_values[static const ROGUE_NUM_CR_PDS_BGRND_WORDS])
663 {
664 pvr_csb_pack (&pds_reg_values[0], CR_PDS_BGRND0_BASE, value) {
665 value.shader_addr = PVR_DEV_ADDR(load_op->pds_frag_prog.data_offset);
666 value.texunicode_addr =
667 PVR_DEV_ADDR(load_op->pds_tex_state_prog.code_offset);
668 }
669
670 pvr_csb_pack (&pds_reg_values[1], CR_PDS_BGRND1_BASE, value) {
671 value.texturedata_addr = PVR_DEV_ADDR(load_op_program->data_offset);
672 }
673
674 pvr_csb_pack (&pds_reg_values[2], CR_PDS_BGRND3_SIZEINFO, value) {
675 value.usc_sharedsize =
676 DIV_ROUND_UP(load_op->const_shareds_count,
677 PVRX(CR_PDS_BGRND3_SIZEINFO_USC_SHAREDSIZE_UNIT_SIZE));
678 value.pds_texturestatesize = DIV_ROUND_UP(
679 load_op_program->data_size,
680 PVRX(CR_PDS_BGRND3_SIZEINFO_PDS_TEXTURESTATESIZE_UNIT_SIZE));
681 value.pds_tempsize =
682 DIV_ROUND_UP(load_op->temps_count,
683 PVRX(CR_PDS_BGRND3_SIZEINFO_PDS_TEMPSIZE_UNIT_SIZE));
684 }
685 }
686
687 /**
688 * \brief Calculates the stride in pixels based on the pitch in bytes and pixel
689 * format.
690 *
691 * \param[in] pitch Width pitch in bytes.
692 * \param[in] vk_format Vulkan image format.
693 * \return Stride in pixels.
694 */
695 static inline uint32_t pvr_stride_from_pitch(uint32_t pitch, VkFormat vk_format)
696 {
697 const unsigned int cpp = vk_format_get_blocksize(vk_format);
698
699 assert(pitch % cpp == 0);
700
701 return pitch / cpp;
702 }
703
704 static void pvr_setup_pbe_state(
705 const struct pvr_device_info *dev_info,
706 struct pvr_framebuffer *framebuffer,
707 uint32_t mrt_index,
708 const struct usc_mrt_resource *mrt_resource,
709 const struct pvr_image_view *const iview,
710 const VkRect2D *render_area,
711 const bool down_scale,
712 const uint32_t samples,
713 uint32_t pbe_cs_words[static const ROGUE_NUM_PBESTATE_STATE_WORDS],
714 uint64_t pbe_reg_words[static const ROGUE_NUM_PBESTATE_REG_WORDS])
715 {
716 const struct pvr_image *image = iview->image;
717 uint32_t level_pitch = image->mip_levels[iview->vk.base_mip_level].pitch;
718
719 struct pvr_pbe_surf_params surface_params;
720 struct pvr_pbe_render_params render_params;
721 bool with_packed_usc_channel;
722 const uint8_t *swizzle;
723 uint32_t position;
724
725 /* down_scale should be true when performing a resolve, in which case there
726 * should be more than one sample.
727 */
728 assert((down_scale && samples > 1U) || (!down_scale && samples == 1U));
729
730 /* Setup surface parameters. */
731
732 if (PVR_HAS_FEATURE(dev_info, usc_f16sop_u8)) {
733 switch (iview->vk.format) {
734 case VK_FORMAT_B8G8R8A8_UNORM:
735 with_packed_usc_channel = true;
736 break;
737 case VK_FORMAT_D32_SFLOAT:
738 with_packed_usc_channel = false;
739 break;
740 default:
741 unreachable("Unsupported Vulkan image format");
742 }
743 } else {
744 with_packed_usc_channel = false;
745 }
746
747 swizzle = pvr_get_format_swizzle(iview->vk.format);
748 memcpy(surface_params.swizzle, swizzle, sizeof(surface_params.swizzle));
749
750 pvr_pbe_get_src_format_and_gamma(iview->vk.format,
751 PVR_PBE_GAMMA_NONE,
752 with_packed_usc_channel,
753 &surface_params.source_format,
754 &surface_params.gamma);
755
756 surface_params.is_normalized = vk_format_is_normalized(iview->vk.format);
757 surface_params.pbe_packmode = pvr_get_pbe_packmode(iview->vk.format);
758 surface_params.nr_components = vk_format_get_nr_components(iview->vk.format);
759
760 /* FIXME: Should we have an inline function to return the address of a mip
761 * level?
762 */
763 surface_params.addr =
764 PVR_DEV_ADDR_OFFSET(image->vma->dev_addr,
765 image->mip_levels[iview->vk.base_mip_level].offset);
766
767 surface_params.mem_layout = image->memlayout;
768 surface_params.stride = pvr_stride_from_pitch(level_pitch, iview->vk.format);
769 surface_params.depth = iview->vk.extent.depth;
770 surface_params.width = iview->vk.extent.width;
771 surface_params.height = iview->vk.extent.height;
772 surface_params.z_only_render = false;
773 surface_params.down_scale = down_scale;
774 surface_params.msaa_mode = samples;
775
776 /* Setup render parameters. */
777
778 if (mrt_resource->type == USC_MRT_RESOURCE_TYPE_MEMORY) {
779 position = mrt_resource->u.mem.offset_in_dwords;
780 } else {
781 assert(mrt_resource->type == USC_MRT_RESOURCE_TYPE_OUTPUT_REGISTER);
782 assert(mrt_resource->u.reg.offset == 0);
783
784 position = mrt_resource->u.reg.out_reg;
785 }
786
787 assert(position <= 3 || PVR_HAS_FEATURE(dev_info, eight_output_registers));
788
789 switch (position) {
790 case 0:
791 case 4:
792 render_params.source_start = PVR_PBE_STARTPOS_BIT0;
793 break;
794 case 1:
795 case 5:
796 render_params.source_start = PVR_PBE_STARTPOS_BIT32;
797 break;
798 case 2:
799 case 6:
800 render_params.source_start = PVR_PBE_STARTPOS_BIT64;
801 break;
802 case 3:
803 case 7:
804 render_params.source_start = PVR_PBE_STARTPOS_BIT96;
805 break;
806 default:
807 assert(!"Invalid output register");
808 break;
809 }
810
811 render_params.min_x_clip = MAX2(0, render_area->offset.x);
812 render_params.min_y_clip = MAX2(0, render_area->offset.y);
813 render_params.max_x_clip =
814 MIN2(framebuffer->width,
815 render_area->offset.x + render_area->extent.width) -
816 1;
817 render_params.max_y_clip =
818 MIN2(framebuffer->height,
819 render_area->offset.y + render_area->extent.height) -
820 1;
821
822 render_params.slice = 0;
823 render_params.mrt_index = mrt_index;
824
825 pvr_pbe_pack_state(dev_info,
826 &surface_params,
827 &render_params,
828 pbe_cs_words,
829 pbe_reg_words);
830 }
831
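/* Returns the framebuffer render target matching the sample count of the
 * given HW render; render targets are indexed by log2(sample count).
 */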
832 static struct pvr_render_target *
833 pvr_get_render_target(const struct pvr_render_pass *pass,
834 const struct pvr_framebuffer *framebuffer,
835 uint32_t idx)
836 {
837 const struct pvr_renderpass_hwsetup_render *hw_render =
838 &pass->hw_setup->renders[idx];
839 uint32_t rt_idx = 0;
840
841 switch (hw_render->sample_count) {
842 case 1:
843 case 2:
844 case 4:
845 case 8:
846 rt_idx = util_logbase2(hw_render->sample_count);
847 break;
848
849 default:
850 unreachable("Unsupported sample count");
851 break;
852 }
853
854 return &framebuffer->render_targets[rt_idx];
855 }
856
857 static uint32_t
858 pvr_pass_get_pixel_output_width(const struct pvr_render_pass *pass,
859 uint32_t idx,
860 const struct pvr_device_info *dev_info)
861 {
862 const struct pvr_renderpass_hwsetup_render *hw_render =
863 &pass->hw_setup->renders[idx];
864 /* Default value based on the maximum value found in all existing cores. The
865 * maximum is used as this is being treated as a lower bound, making it a
866 * "safer" choice than the minimum value found in all existing cores.
867 */
868 const uint32_t min_output_regs =
869 PVR_GET_FEATURE_VALUE(dev_info, usc_min_output_registers_per_pix, 2U);
870 const uint32_t width = MAX2(hw_render->output_regs_count, min_output_regs);
871
872 return util_next_power_of_two(width);
873 }
874
875 static VkResult pvr_sub_cmd_gfx_job_init(const struct pvr_device_info *dev_info,
876 struct pvr_cmd_buffer *cmd_buffer,
877 struct pvr_sub_cmd_gfx *sub_cmd)
878 {
879 struct pvr_render_pass_info *render_pass_info =
880 &cmd_buffer->state.render_pass_info;
881 const struct pvr_renderpass_hwsetup_render *hw_render =
882 &render_pass_info->pass->hw_setup->renders[sub_cmd->hw_render_idx];
883 struct pvr_render_job *job = &sub_cmd->job;
884 struct pvr_pds_upload pds_pixel_event_program;
885
886 uint32_t pbe_cs_words[PVR_MAX_COLOR_ATTACHMENTS]
887 [ROGUE_NUM_PBESTATE_STATE_WORDS];
888 struct pvr_render_target *render_target;
889 VkResult result;
890
891 assert(hw_render->eot_surface_count < ARRAY_SIZE(pbe_cs_words));
892
893 for (uint32_t i = 0; i < hw_render->eot_surface_count; i++) {
894 const struct pvr_renderpass_hwsetup_eot_surface *surface =
895 &hw_render->eot_surfaces[i];
896 const struct pvr_image_view *iview =
897 render_pass_info->attachments[surface->attachment_index];
898 const struct usc_mrt_resource *mrt_resource =
899 &hw_render->eot_setup.mrt_resources[surface->mrt_index];
900 uint32_t samples = 1;
901
902 if (surface->need_resolve)
903 pvr_finishme("Set up job resolve information.");
904
905 pvr_setup_pbe_state(dev_info,
906 render_pass_info->framebuffer,
907 surface->mrt_index,
908 mrt_resource,
909 iview,
910 &render_pass_info->render_area,
911 surface->need_resolve,
912 samples,
913 pbe_cs_words[i],
914 job->pbe_reg_words[i]);
915 }
916
917 /* FIXME: The fragment program only supports a single surface at present. */
918 assert(hw_render->eot_surface_count == 1);
919 result = pvr_sub_cmd_gfx_per_job_fragment_programs_create_and_upload(
920 cmd_buffer,
921 pbe_cs_words[0],
922 &pds_pixel_event_program);
923 if (result != VK_SUCCESS)
924 return result;
925
926 job->pds_pixel_event_data_offset = pds_pixel_event_program.data_offset;
927
928 /* FIXME: Don't do this if there is a barrier load. */
929 if (render_pass_info->enable_bg_tag) {
930 const struct pvr_load_op *load_op = hw_render->client_data;
931 struct pvr_pds_upload load_op_program;
932
933 /* FIXME: Should we free the PDS pixel event data or let it be freed
934 * when the pool gets emptied?
935 */
936 result = pvr_load_op_data_create_and_upload(cmd_buffer,
937 sub_cmd->hw_render_idx,
938 &load_op_program);
939 if (result != VK_SUCCESS)
940 return result;
941
942 pvr_pds_bgnd_pack_state(load_op,
943 &load_op_program,
944 job->pds_bgnd_reg_values);
945 }
946
947 job->enable_bg_tag = render_pass_info->enable_bg_tag;
948 job->process_empty_tiles = render_pass_info->process_empty_tiles;
949
950 render_target = pvr_get_render_target(render_pass_info->pass,
951 render_pass_info->framebuffer,
952 sub_cmd->hw_render_idx);
953 job->rt_dataset = render_target->rt_dataset;
954
955 job->ctrl_stream_addr = pvr_csb_get_start_address(&sub_cmd->control_stream);
956
957 /* FIXME: Need to set up the border color table at device creation
958 * time. Set to invalid for the time being.
959 */
960 job->border_colour_table_addr = PVR_DEV_ADDR_INVALID;
961
962 if (sub_cmd->depth_bias_bo)
963 job->depth_bias_table_addr = sub_cmd->depth_bias_bo->vma->dev_addr;
964 else
965 job->depth_bias_table_addr = PVR_DEV_ADDR_INVALID;
966
967 if (sub_cmd->scissor_bo)
968 job->scissor_table_addr = sub_cmd->scissor_bo->vma->dev_addr;
969 else
970 job->scissor_table_addr = PVR_DEV_ADDR_INVALID;
971
972 job->pixel_output_width =
973 pvr_pass_get_pixel_output_width(render_pass_info->pass,
974 sub_cmd->hw_render_idx,
975 dev_info);
976
977 if (hw_render->ds_surface_id != -1) {
978 struct pvr_image_view *iview =
979 render_pass_info->attachments[hw_render->ds_surface_id];
980 const struct pvr_image *image = iview->image;
981
982 if (vk_format_has_depth(image->vk.format)) {
983 uint32_t level_pitch =
984 image->mip_levels[iview->vk.base_mip_level].pitch;
985
986 /* FIXME: Is this sufficient for depth buffers? */
987 job->depth_addr = image->dev_addr;
988
989 job->depth_stride =
990 pvr_stride_from_pitch(level_pitch, iview->vk.format);
991 job->depth_height = iview->vk.extent.height;
992 job->depth_physical_width =
993 u_minify(image->physical_extent.width, iview->vk.base_mip_level);
994 job->depth_physical_height =
995 u_minify(image->physical_extent.height, iview->vk.base_mip_level);
996 job->depth_layer_size = image->layer_size;
997
998 if (hw_render->ds_surface_id < render_pass_info->clear_value_count) {
999 VkClearValue *clear_values =
1000 &render_pass_info->clear_values[hw_render->ds_surface_id];
1001
1002 job->depth_clear_value = clear_values->depthStencil.depth;
1003 } else {
1004 job->depth_clear_value = 1.0f;
1005 }
1006
1007 job->depth_vk_format = iview->vk.format;
1008
1009 job->depth_memlayout = image->memlayout;
1010 } else {
1011 job->depth_addr = PVR_DEV_ADDR_INVALID;
1012 job->depth_stride = 0;
1013 job->depth_height = 0;
1014 job->depth_physical_width = 0;
1015 job->depth_physical_height = 0;
1016 job->depth_layer_size = 0;
1017 job->depth_clear_value = 1.0f;
1018 job->depth_vk_format = VK_FORMAT_UNDEFINED;
1019 job->depth_memlayout = PVR_MEMLAYOUT_LINEAR;
1020 }
1021
1022 if (vk_format_has_stencil(image->vk.format)) {
1023 /* FIXME: Is this sufficient for stencil buffers? */
1024 job->stencil_addr = image->dev_addr;
1025 } else {
1026 job->stencil_addr = PVR_DEV_ADDR_INVALID;
1027 }
1028
1029 job->samples = image->vk.samples;
1030 } else {
1031 pvr_finishme("Set up correct number of samples for render job");
1032
1033 job->depth_addr = PVR_DEV_ADDR_INVALID;
1034 job->depth_stride = 0;
1035 job->depth_height = 0;
1036 job->depth_physical_width = 0;
1037 job->depth_physical_height = 0;
1038 job->depth_layer_size = 0;
1039 job->depth_clear_value = 1.0f;
1040 job->depth_vk_format = VK_FORMAT_UNDEFINED;
1041 job->depth_memlayout = PVR_MEMLAYOUT_LINEAR;
1042
1043 job->stencil_addr = PVR_DEV_ADDR_INVALID;
1044
1045 job->samples = 1;
1046 }
1047
1048 if (sub_cmd->max_tiles_in_flight ==
1049 PVR_GET_FEATURE_VALUE(dev_info, isp_max_tiles_in_flight, 1U)) {
1050 /* Use the default limit based on the partition store. */
1051 job->max_tiles_in_flight = 0U;
1052 } else {
1053 job->max_tiles_in_flight = sub_cmd->max_tiles_in_flight;
1054 }
1055
1056 job->frag_uses_atomic_ops = sub_cmd->frag_uses_atomic_ops;
1057 job->disable_compute_overlap = false;
1058 job->max_shared_registers = cmd_buffer->state.max_shared_regs;
1059 job->run_frag = true;
1060 job->geometry_terminate = true;
1061
1062 return VK_SUCCESS;
1063 }
1064
1065 /* Number of shareds used in the Issue Data Fence (IDF)/Wait Data Fence (WDF)
1066 * kernel.
1067 */
1068 #define PVR_IDF_WDF_IN_REGISTER_CONST_COUNT 12U
1069
1070 static void
1071 pvr_sub_cmd_compute_job_init(const struct pvr_physical_device *pdevice,
1072 struct pvr_cmd_buffer *cmd_buffer,
1073 struct pvr_sub_cmd_compute *sub_cmd)
1074 {
1075 const struct pvr_device_runtime_info *dev_runtime_info =
1076 &pdevice->dev_runtime_info;
1077 const struct pvr_device_info *dev_info = &pdevice->dev_info;
1078
1079 if (sub_cmd->uses_barrier)
1080 sub_cmd->submit_info.flags |= PVR_WINSYS_COMPUTE_FLAG_PREVENT_ALL_OVERLAP;
1081
1082 pvr_csb_pack (&sub_cmd->submit_info.regs.cdm_ctrl_stream_base,
1083 CR_CDM_CTRL_STREAM_BASE,
1084 value) {
1085 value.addr = pvr_csb_get_start_address(&sub_cmd->control_stream);
1086 }
1087
1088 /* FIXME: Need to set up the border color table at device creation
1089 * time. Set to invalid for the time being.
1090 */
1091 pvr_csb_pack (&sub_cmd->submit_info.regs.tpu_border_colour_table,
1092 CR_TPU_BORDER_COLOUR_TABLE_CDM,
1093 value) {
1094 value.border_colour_table_address = PVR_DEV_ADDR_INVALID;
1095 }
1096
1097 sub_cmd->num_shared_regs = MAX2(cmd_buffer->device->idfwdf_state.usc_shareds,
1098 cmd_buffer->state.max_shared_regs);
1099
1100 cmd_buffer->state.max_shared_regs = 0U;
1101
1102 if (PVR_HAS_FEATURE(dev_info, compute_morton_capable))
1103 sub_cmd->submit_info.regs.cdm_item = 0;
1104
1105 pvr_csb_pack (&sub_cmd->submit_info.regs.tpu, CR_TPU, value) {
1106 value.tag_cem_4k_face_packing = true;
1107 }
1108
1109 if (PVR_HAS_FEATURE(dev_info, cluster_grouping) &&
1110 PVR_HAS_FEATURE(dev_info, slc_mcu_cache_controls) &&
1111 dev_runtime_info->num_phantoms > 1 && sub_cmd->uses_atomic_ops) {
1112 /* Each phantom has its own MCU, so atomicity can only be guaranteed
1113 * when all work items are processed on the same phantom. This means we
1114 * need to disable all USCs other than those of the first phantom, which
1115 * has 4 clusters.
1116 */
1117 pvr_csb_pack (&sub_cmd->submit_info.regs.compute_cluster,
1118 CR_COMPUTE_CLUSTER,
1119 value) {
1120 value.mask = 0xFU;
1121 }
1122 } else {
1123 pvr_csb_pack (&sub_cmd->submit_info.regs.compute_cluster,
1124 CR_COMPUTE_CLUSTER,
1125 value) {
1126 value.mask = 0U;
1127 }
1128 }
1129
1130 if (PVR_HAS_FEATURE(dev_info, gpu_multicore_support) &&
1131 sub_cmd->uses_atomic_ops) {
1132 sub_cmd->submit_info.flags |= PVR_WINSYS_COMPUTE_FLAG_SINGLE_CORE;
1133 }
1134 }
1135
1136 #define PIXEL_ALLOCATION_SIZE_MAX_IN_BLOCKS \
1137 (1024 / PVRX(CDMCTRL_KERNEL0_USC_COMMON_SIZE_UNIT_SIZE))
1138
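/* Returns the maximum number of work-item instances that fit in a single
 * compute slot, given the coefficient register usage, barrier usage and the
 * total number of work-items per work-group.
 */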
1139 static uint32_t
1140 pvr_compute_flat_slot_size(const struct pvr_physical_device *pdevice,
1141 uint32_t coeff_regs_count,
1142 bool use_barrier,
1143 uint32_t total_workitems)
1144 {
1145 const struct pvr_device_runtime_info *dev_runtime_info =
1146 &pdevice->dev_runtime_info;
1147 const struct pvr_device_info *dev_info = &pdevice->dev_info;
1148 uint32_t max_workgroups_per_task = ROGUE_CDM_MAX_PACKED_WORKGROUPS_PER_TASK;
1149 uint32_t max_avail_coeff_regs =
1150 dev_runtime_info->cdm_max_local_mem_size_regs;
1151 uint32_t localstore_chunks_count =
1152 DIV_ROUND_UP(coeff_regs_count << 2,
1153 PVRX(CDMCTRL_KERNEL0_USC_COMMON_SIZE_UNIT_SIZE));
1154
1155 /* Ensure that we cannot have more workgroups in a slot than the available
1156 * number of coefficients allow us to have.
1157 */
1158 if (coeff_regs_count > 0U) {
1159 /* If TA or 3D can overlap with CDM, or if the TA is running a geometry
1160 * shader then we need to consider this in calculating max allowed
1161 * work-groups.
1162 */
1163 if (PVR_HAS_QUIRK(dev_info, 52354) &&
1164 (PVR_HAS_FEATURE(dev_info, compute_overlap) ||
1165 PVR_HAS_FEATURE(dev_info, gs_rta_support))) {
1166 /* Solve for n (number of work-groups per task). All values are in
1167 * size of common store alloc blocks:
1168 *
1169 * n + (2n + 7) * (local_memory_size_max - 1) =
1170 * (coefficient_memory_pool_size) - (7 * pixel_allocation_size_max)
1171 * ==>
1172 * n + 2n * (local_memory_size_max - 1) =
1173 * (coefficient_memory_pool_size) - (7 * pixel_allocation_size_max)
1174 * - (7 * (local_memory_size_max - 1))
1175 * ==>
1176 * n * (1 + 2 * (local_memory_size_max - 1)) =
1177 * (coefficient_memory_pool_size) - (7 * pixel_allocation_size_max)
1178 * - (7 * (local_memory_size_max - 1))
1179 * ==>
1180 * n = ((coefficient_memory_pool_size) -
1181 * (7 * pixel_allocation_size_max) -
1182 * (7 * (local_memory_size_max - 1)) / (1 +
1183 * 2 * (local_memory_size_max - 1)))
1184 */
1185 uint32_t max_common_store_blocks =
1186 DIV_ROUND_UP(max_avail_coeff_regs * 4U,
1187 PVRX(CDMCTRL_KERNEL0_USC_COMMON_SIZE_UNIT_SIZE));
1188
1189 /* (coefficient_memory_pool_size) - (7 * pixel_allocation_size_max)
1190 */
1191 max_common_store_blocks -= ROGUE_MAX_OVERLAPPED_PIXEL_TASK_INSTANCES *
1192 PIXEL_ALLOCATION_SIZE_MAX_IN_BLOCKS;
1193
1194 /* - (7 * (local_memory_size_max - 1)) */
1195 max_common_store_blocks -= (ROGUE_MAX_OVERLAPPED_PIXEL_TASK_INSTANCES *
1196 (localstore_chunks_count - 1U));
1197
1198 /* Divide by (1 + 2 * (local_memory_size_max - 1)) */
1199 max_workgroups_per_task = max_common_store_blocks /
1200 (1U + 2U * (localstore_chunks_count - 1U));
1201
1202 max_workgroups_per_task =
1203 MIN2(max_workgroups_per_task,
1204 ROGUE_CDM_MAX_PACKED_WORKGROUPS_PER_TASK);
1205
1206 } else {
1207 max_workgroups_per_task =
1208 MIN2((max_avail_coeff_regs / coeff_regs_count),
1209 max_workgroups_per_task);
1210 }
1211 }
1212
1213 /* max_workgroups_per_task should at least be one. */
1214 assert(max_workgroups_per_task >= 1U);
1215
1216 if (total_workitems >= ROGUE_MAX_INSTANCES_PER_TASK) {
1217 /* In this case, the work group size will have been padded up to the
1218 * next ROGUE_MAX_INSTANCES_PER_TASK so we just set max instances to be
1219 * ROGUE_MAX_INSTANCES_PER_TASK.
1220 */
1221 return ROGUE_MAX_INSTANCES_PER_TASK;
1222 }
1223
1224 /* In this case, the number of instances in the slot must be clamped to
1225 * accommodate whole work-groups only.
1226 */
1227 if (PVR_HAS_QUIRK(dev_info, 49032) || use_barrier) {
1228 max_workgroups_per_task =
1229 MIN2(max_workgroups_per_task,
1230 ROGUE_MAX_INSTANCES_PER_TASK / total_workitems);
1231 return total_workitems * max_workgroups_per_task;
1232 }
1233
1234 return MIN2(total_workitems * max_workgroups_per_task,
1235 ROGUE_MAX_INSTANCES_PER_TASK);
1236 }
1237
1238 static void
1239 pvr_compute_generate_control_stream(struct pvr_csb *csb,
1240 struct pvr_sub_cmd_compute *sub_cmd,
1241 const struct pvr_compute_kernel_info *info)
1242 {
1243 /* Compute kernel 0. */
1244 pvr_csb_emit (csb, CDMCTRL_KERNEL0, kernel0) {
1245 kernel0.indirect_present = !!info->indirect_buffer_addr.addr;
1246 kernel0.global_offsets_present = info->global_offsets_present;
1247 kernel0.usc_common_size = info->usc_common_size;
1248 kernel0.usc_unified_size = info->usc_unified_size;
1249 kernel0.pds_temp_size = info->pds_temp_size;
1250 kernel0.pds_data_size = info->pds_data_size;
1251 kernel0.usc_target = info->usc_target;
1252 kernel0.fence = info->is_fence;
1253 }
1254
1255 /* Compute kernel 1. */
1256 pvr_csb_emit (csb, CDMCTRL_KERNEL1, kernel1) {
1257 kernel1.data_addr = PVR_DEV_ADDR(info->pds_data_offset);
1258 kernel1.sd_type = info->sd_type;
1259 kernel1.usc_common_shared = info->usc_common_shared;
1260 }
1261
1262 /* Compute kernel 2. */
1263 pvr_csb_emit (csb, CDMCTRL_KERNEL2, kernel2) {
1264 kernel2.code_addr = PVR_DEV_ADDR(info->pds_code_offset);
1265 }
1266
1267 if (info->indirect_buffer_addr.addr) {
1268 /* Compute kernel 6. */
1269 pvr_csb_emit (csb, CDMCTRL_KERNEL6, kernel6) {
1270 kernel6.indirect_addrmsb = info->indirect_buffer_addr;
1271 }
1272
1273 /* Compute kernel 7. */
1274 pvr_csb_emit (csb, CDMCTRL_KERNEL7, kernel7) {
1275 kernel7.indirect_addrlsb = info->indirect_buffer_addr;
1276 }
1277 } else {
1278 /* Compute kernel 3. */
1279 pvr_csb_emit (csb, CDMCTRL_KERNEL3, kernel3) {
1280 assert(info->global_size[0U] > 0U);
1281 kernel3.workgroup_x = info->global_size[0U] - 1U;
1282 }
1283
1284 /* Compute kernel 4. */
1285 pvr_csb_emit (csb, CDMCTRL_KERNEL4, kernel4) {
1286 assert(info->global_size[1U] > 0U);
1287 kernel4.workgroup_y = info->global_size[1U] - 1U;
1288 }
1289
1290 /* Compute kernel 5. */
1291 pvr_csb_emit (csb, CDMCTRL_KERNEL5, kernel5) {
1292 assert(info->global_size[2U] > 0U);
1293 kernel5.workgroup_z = info->global_size[2U] - 1U;
1294 }
1295 }
1296
1297 /* Compute kernel 8. */
1298 pvr_csb_emit (csb, CDMCTRL_KERNEL8, kernel8) {
1299 if (info->max_instances == ROGUE_MAX_INSTANCES_PER_TASK)
1300 kernel8.max_instances = 0U;
1301 else
1302 kernel8.max_instances = info->max_instances;
1303
1304 assert(info->local_size[0U] > 0U);
1305 kernel8.workgroup_size_x = info->local_size[0U] - 1U;
1306 assert(info->local_size[1U] > 0U);
1307 kernel8.workgroup_size_y = info->local_size[1U] - 1U;
1308 assert(info->local_size[2U] > 0U);
1309 kernel8.workgroup_size_z = info->local_size[2U] - 1U;
1310 }
1311
1312 /* Track the highest amount of shared registers usage in this dispatch.
1313 * This is used by the FW for context switching, so must be large enough
1314 * to contain all the shared registers that might be in use for this compute
1315 * job. Coefficients don't need to be included as the context switch will not
1316 * happen within the execution of a single workgroup, thus nothing needs to
1317 * be preserved.
1318 */
1319 if (info->usc_common_shared) {
1320 sub_cmd->num_shared_regs =
1321 MAX2(sub_cmd->num_shared_regs, info->usc_common_size);
1322 }
1323 }
1324
1325 /* TODO: This can be pre-packed and uploaded directly. Would that provide any
1326 * speed up?
1327 */
1328 static void
1329 pvr_compute_generate_idfwdf(struct pvr_cmd_buffer *cmd_buffer,
1330 struct pvr_sub_cmd_compute *const sub_cmd)
1331 {
1332 struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
1333 bool *const is_sw_barrier_required =
1334 &state->current_sub_cmd->compute.pds_sw_barrier_requires_clearing;
1335 const struct pvr_physical_device *pdevice = cmd_buffer->device->pdevice;
1336 struct pvr_csb *csb = &sub_cmd->control_stream;
1337 const struct pvr_pds_upload *program;
1338
1339 if (PVR_NEED_SW_COMPUTE_PDS_BARRIER(&pdevice->dev_info) &&
1340 *is_sw_barrier_required) {
1341 *is_sw_barrier_required = false;
1342 program = &cmd_buffer->device->idfwdf_state.sw_compute_barrier_pds;
1343 } else {
1344 program = &cmd_buffer->device->idfwdf_state.pds;
1345 }
1346
1347 struct pvr_compute_kernel_info info = {
1348 .indirect_buffer_addr = PVR_DEV_ADDR_INVALID,
1349 .global_offsets_present = false,
1350 .usc_common_size =
1351 DIV_ROUND_UP(cmd_buffer->device->idfwdf_state.usc_shareds << 2,
1352 PVRX(CDMCTRL_KERNEL0_USC_COMMON_SIZE_UNIT_SIZE)),
1353 .usc_unified_size = 0U,
1354 .pds_temp_size = 0U,
1355 .pds_data_size =
1356 DIV_ROUND_UP(program->data_size << 2,
1357 PVRX(CDMCTRL_KERNEL0_PDS_DATA_SIZE_UNIT_SIZE)),
1358 .usc_target = PVRX(CDMCTRL_USC_TARGET_ALL),
1359 .is_fence = false,
1360 .pds_data_offset = program->data_offset,
1361 .sd_type = PVRX(CDMCTRL_SD_TYPE_USC),
1362 .usc_common_shared = true,
1363 .pds_code_offset = program->code_offset,
1364 .global_size = { 1U, 1U, 1U },
1365 .local_size = { 1U, 1U, 1U },
1366 };
1367
1368 /* We don't need to pad work-group size for this case. */
1369
1370 info.max_instances =
1371 pvr_compute_flat_slot_size(pdevice,
1372 cmd_buffer->device->idfwdf_state.usc_shareds,
1373 false,
1374 1U);
1375
1376 pvr_compute_generate_control_stream(csb, sub_cmd, &info);
1377 }
1378
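/* Emits a fence kernel into the compute sub-command's control stream,
 * optionally marking the shared registers for deallocation.
 */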
1379 static void
1380 pvr_compute_generate_fence(struct pvr_cmd_buffer *cmd_buffer,
1381 struct pvr_sub_cmd_compute *const sub_cmd,
1382 bool deallocate_shareds)
1383 {
1384 const struct pvr_pds_upload *program =
1385 &cmd_buffer->device->pds_compute_fence_program;
1386 const struct pvr_physical_device *pdevice = cmd_buffer->device->pdevice;
1387 struct pvr_csb *csb = &sub_cmd->control_stream;
1388
1389 struct pvr_compute_kernel_info info = {
1390 .indirect_buffer_addr = PVR_DEV_ADDR_INVALID,
1391 .global_offsets_present = false,
1392 .usc_common_size = 0U,
1393 .usc_unified_size = 0U,
1394 .pds_temp_size = 0U,
1395 .pds_data_size =
1396 DIV_ROUND_UP(program->data_size << 2,
1397 PVRX(CDMCTRL_KERNEL0_PDS_DATA_SIZE_UNIT_SIZE)),
1398 .usc_target = PVRX(CDMCTRL_USC_TARGET_ANY),
1399 .is_fence = true,
1400 .pds_data_offset = program->data_offset,
1401 .sd_type = PVRX(CDMCTRL_SD_TYPE_PDS),
1402 .usc_common_shared = deallocate_shareds,
1403 .pds_code_offset = program->code_offset,
1404 .global_size = { 1U, 1U, 1U },
1405 .local_size = { 1U, 1U, 1U },
1406 };
1407
1408 /* We don't need to pad work-group size for this case. */
1409 /* Here we calculate the slot size. This can depend on the use of barriers,
1410 * local memory, BRN's or other factors.
1411 */
1412 info.max_instances = pvr_compute_flat_slot_size(pdevice, 0U, false, 1U);
1413
1414 pvr_compute_generate_control_stream(csb, sub_cmd, &info);
1415 }
1416
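/* Ends the sub-command currently being recorded, performing its type-specific
 * finalization (control stream termination, table uploads and job setup)
 * before clearing state->current_sub_cmd.
 */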
1417 static VkResult pvr_cmd_buffer_end_sub_cmd(struct pvr_cmd_buffer *cmd_buffer)
1418 {
1419 struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
1420 struct pvr_sub_cmd *sub_cmd = state->current_sub_cmd;
1421 struct pvr_device *device = cmd_buffer->device;
1422 VkResult result;
1423
1424 /* FIXME: Is this NULL check required because this function is called from
1425 * pvr_resolve_unemitted_resolve_attachments()? See comment about this
1426 * function being called twice in a row in pvr_CmdEndRenderPass().
1427 */
1428 if (!sub_cmd)
1429 return VK_SUCCESS;
1430
1431 switch (sub_cmd->type) {
1432 case PVR_SUB_CMD_TYPE_GRAPHICS: {
1433 struct pvr_sub_cmd_gfx *const gfx_sub_cmd = &sub_cmd->gfx;
1434
1435 if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY) {
1436 result = pvr_csb_emit_return(&gfx_sub_cmd->control_stream);
1437 if (result != VK_SUCCESS) {
1438 state->status = result;
1439 return result;
1440 }
1441
1442 break;
1443 }
1444
1445 /* TODO: Check if the sub_cmd can be skipped based on
1446 * sub_cmd->gfx.empty_cmd flag.
1447 */
1448
1449 result = pvr_cmd_buffer_upload_tables(device, cmd_buffer, gfx_sub_cmd);
1450 if (result != VK_SUCCESS) {
1451 state->status = result;
1452 return result;
1453 }
1454
1455 result = pvr_cmd_buffer_emit_ppp_state(cmd_buffer, gfx_sub_cmd);
1456 if (result != VK_SUCCESS) {
1457 state->status = result;
1458 return result;
1459 }
1460
1461 result = pvr_csb_emit_terminate(&gfx_sub_cmd->control_stream);
1462 if (result != VK_SUCCESS) {
1463 state->status = result;
1464 return result;
1465 }
1466
1467 result = pvr_sub_cmd_gfx_job_init(&device->pdevice->dev_info,
1468 cmd_buffer,
1469 gfx_sub_cmd);
1470 if (result != VK_SUCCESS) {
1471 state->status = result;
1472 return result;
1473 }
1474
1475 break;
1476 }
1477
1478 case PVR_SUB_CMD_TYPE_COMPUTE: {
1479 struct pvr_sub_cmd_compute *const compute_sub_cmd = &sub_cmd->compute;
1480
1481 pvr_compute_generate_fence(cmd_buffer, compute_sub_cmd, true);
1482
1483 result = pvr_csb_emit_terminate(&compute_sub_cmd->control_stream);
1484 if (result != VK_SUCCESS) {
1485 state->status = result;
1486 return result;
1487 }
1488
1489 pvr_sub_cmd_compute_job_init(device->pdevice,
1490 cmd_buffer,
1491 compute_sub_cmd);
1492 break;
1493 }
1494
1495 case PVR_SUB_CMD_TYPE_TRANSFER:
1496 break;
1497
1498 default:
1499 pvr_finishme("Unsupported sub-command type %d", sub_cmd->type);
1500 break;
1501 }
1502
1503 state->current_sub_cmd = NULL;
1504
1505 return VK_SUCCESS;
1506 }
1507
1508 static void pvr_reset_graphics_dirty_state(struct pvr_cmd_buffer_state *state,
1509 bool start_geom)
1510 {
1511 if (start_geom) {
1512 /*
1513 * Initial geometry phase State.
1514 * It's the driver's responsibility to ensure that the state of the
1515 * hardware is correctly initialized at the start of every geometry
1516 * phase. This is required to prevent stale state from a previous
1517 * geometry phase erroneously affecting the next geometry phase. The
1518 * following fields in PPP State Header, and their corresponding state
1519 * words, must be supplied in the first PPP State Update of a geometry
1520 * phase that contains any geometry (draw calls). Any field not listed
1521 * below is safe to ignore.
1522 *
1523 * TA_PRES_STREAM_OUT_SIZE
1524 * TA_PRES_PPPCTRL
1525 * TA_PRES_VARYING_WORD2
1526 * TA_PRES_VARYING_WORD1
1527 * TA_PRES_VARYING_WORD0
1528 * TA_PRES_OUTSELECTS
1529 * TA_PRES_WCLAMP
1530 * TA_VIEWPORT_COUNT
1531 * TA_PRES_VIEWPORT
1532 * TA_PRES_REGION_CLIP
1533 * TA_PRES_PDSSTATEPTR0
1534 * TA_PRES_ISPCTLFB
1535 * TA_PRES_ISPCTLFA
1536 * TA_PRES_ISPCTL
1537 *
1538 * If a geometry phase does not contain any geometry, this restriction
1539 * can be ignored. If the first draw call in a geometry phase will only
1540 * update the depth or stencil buffers i.e. ISP_TAGWRITEDISABLE is set
1541 * in the ISP State Control Word, the PDS State Pointers
1542 * (TA_PRES_PDSSTATEPTR*) in the first PPP State Update do not need to
1543 * be supplied, since they will never reach the PDS in the fragment
1544 * phase.
1545 */
1546
1547 state->emit_state_bits = 0;
1548
1549 state->emit_state.stream_out = true;
1550 state->emit_state.ppp_control = true;
1551 state->emit_state.varying_word2 = true;
1552 state->emit_state.varying_word1 = true;
1553 state->emit_state.varying_word0 = true;
1554 state->emit_state.output_selects = true;
1555 state->emit_state.wclamp = true;
1556 state->emit_state.viewport = true;
1557 state->emit_state.region_clip = true;
1558 state->emit_state.pds_fragment_stateptr0 = true;
1559 state->emit_state.isp_fb = true;
1560 state->emit_state.isp = true;
1561 } else {
1562 state->emit_state.ppp_control = true;
1563 state->emit_state.varying_word1 = true;
1564 state->emit_state.varying_word0 = true;
1565 state->emit_state.output_selects = true;
1566 state->emit_state.viewport = true;
1567 state->emit_state.region_clip = true;
1568 state->emit_state.pds_fragment_stateptr0 = true;
1569 state->emit_state.isp_fb = true;
1570 state->emit_state.isp = true;
1571 }
1572
1573 memset(&state->ppp_state, 0U, sizeof(state->ppp_state));
1574
1575 state->dirty.vertex_bindings = true;
1576 state->dirty.gfx_pipeline_binding = true;
1577 state->dirty.viewport = true;
1578 }
1579
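/* Starts recording a sub-command of the given type. If a sub-command of the
 * same type is already being recorded it is reused; otherwise the current
 * sub-command is ended and a new one is allocated and initialized.
 */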
1580 static VkResult pvr_cmd_buffer_start_sub_cmd(struct pvr_cmd_buffer *cmd_buffer,
1581 enum pvr_sub_cmd_type type)
1582 {
1583 struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
1584 struct pvr_device *device = cmd_buffer->device;
1585 struct pvr_sub_cmd *sub_cmd;
1586 VkResult result;
1587
1588 /* Check the current status of the buffer. */
1589 if (state->status != VK_SUCCESS)
1590 return state->status;
1591
1592 pvr_cmd_buffer_update_barriers(cmd_buffer, type);
1593
1594 if (state->current_sub_cmd) {
1595 if (state->current_sub_cmd->type == type) {
1596 /* Continue adding to the current sub command. */
1597 return VK_SUCCESS;
1598 }
1599
1600 /* End the current sub command. */
1601 result = pvr_cmd_buffer_end_sub_cmd(cmd_buffer);
1602 if (result != VK_SUCCESS)
1603 return result;
1604 }
1605
1606 sub_cmd = vk_zalloc(&cmd_buffer->vk.pool->alloc,
1607 sizeof(*sub_cmd),
1608 8,
1609 VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
1610 if (!sub_cmd) {
1611 state->status = vk_error(cmd_buffer, VK_ERROR_OUT_OF_HOST_MEMORY);
1612 return state->status;
1613 }
1614
1615 sub_cmd->type = type;
1616
1617 switch (type) {
1618 case PVR_SUB_CMD_TYPE_GRAPHICS:
1619
1620 sub_cmd->gfx.depth_usage = PVR_DEPTH_STENCIL_USAGE_UNDEFINED;
1621 sub_cmd->gfx.stencil_usage = PVR_DEPTH_STENCIL_USAGE_UNDEFINED;
1622 sub_cmd->gfx.modifies_depth = false;
1623 sub_cmd->gfx.modifies_stencil = false;
1624 sub_cmd->gfx.max_tiles_in_flight =
1625 PVR_GET_FEATURE_VALUE(&device->pdevice->dev_info,
1626 isp_max_tiles_in_flight,
1627 1);
1628 sub_cmd->gfx.hw_render_idx = state->render_pass_info.current_hw_subpass;
1629 sub_cmd->gfx.framebuffer = state->render_pass_info.framebuffer;
1630 sub_cmd->gfx.empty_cmd = true;
1631
1632 pvr_reset_graphics_dirty_state(state, true);
1633 pvr_csb_init(device,
1634 PVR_CMD_STREAM_TYPE_GRAPHICS,
1635 &sub_cmd->gfx.control_stream);
1636 break;
1637
1638 case PVR_SUB_CMD_TYPE_COMPUTE:
1639 pvr_csb_init(device,
1640 PVR_CMD_STREAM_TYPE_COMPUTE,
1641 &sub_cmd->compute.control_stream);
1642 break;
1643
1644 case PVR_SUB_CMD_TYPE_TRANSFER:
1645 list_inithead(&sub_cmd->transfer.transfer_cmds);
1646 break;
1647
1648 default:
1649 pvr_finishme("Unsupported sub-command type %d", type);
1650 break;
1651 }
1652
1653 list_addtail(&sub_cmd->link, &cmd_buffer->sub_cmds);
1654 state->current_sub_cmd = sub_cmd;
1655
1656 return VK_SUCCESS;
1657 }
1658
1659 VkResult pvr_cmd_buffer_alloc_mem(struct pvr_cmd_buffer *cmd_buffer,
1660 struct pvr_winsys_heap *heap,
1661 uint64_t size,
1662 uint32_t flags,
1663 struct pvr_bo **const pvr_bo_out)
1664 {
1665 const uint32_t cache_line_size =
1666 rogue_get_slc_cache_line_size(&cmd_buffer->device->pdevice->dev_info);
1667 struct pvr_bo *pvr_bo;
1668 VkResult result;
1669
1670 result = pvr_bo_alloc(cmd_buffer->device,
1671 heap,
1672 size,
1673 cache_line_size,
1674 flags,
1675 &pvr_bo);
1676 if (result != VK_SUCCESS) {
1677 cmd_buffer->state.status = result;
1678 return result;
1679 }
1680
1681 list_add(&pvr_bo->link, &cmd_buffer->bo_list);
1682
1683 *pvr_bo_out = pvr_bo;
1684
1685 return VK_SUCCESS;
1686 }
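/* Informal usage sketch (not part of the driver; it mirrors how
 * pvr_setup_vertex_buffers() below uses this helper). Callers typically
 * allocate from one of the device heaps with CPU mapping enabled, fill the
 * mapping and then unmap; the BO is tracked in cmd_buffer->bo_list and is
 * freed on command buffer reset. "size", "data" and "data_size_in_bytes"
 * are placeholders.
 *
 *    struct pvr_bo *pvr_bo;
 *    VkResult result =
 *       pvr_cmd_buffer_alloc_mem(cmd_buffer,
 *                                cmd_buffer->device->heaps.pds_heap,
 *                                size,
 *                                PVR_BO_ALLOC_FLAG_CPU_MAPPED,
 *                                &pvr_bo);
 *    if (result == VK_SUCCESS) {
 *       memcpy(pvr_bo->bo->map, data, data_size_in_bytes);
 *       pvr_bo_cpu_unmap(cmd_buffer->device, pvr_bo);
 *    }
 */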
1687
1688 VkResult pvr_ResetCommandBuffer(VkCommandBuffer commandBuffer,
1689 VkCommandBufferResetFlags flags)
1690 {
1691 assert(!"Unimplemented");
1692 return VK_SUCCESS;
1693 }
1694
1695 static void pvr_cmd_bind_compute_pipeline(
1696 const struct pvr_compute_pipeline *const compute_pipeline,
1697 struct pvr_cmd_buffer *const cmd_buffer)
1698 {
1699 cmd_buffer->state.compute_pipeline = compute_pipeline;
1700 cmd_buffer->state.dirty.compute_pipeline_binding = true;
1701 }
1702
1703 static void pvr_cmd_bind_graphics_pipeline(
1704 const struct pvr_graphics_pipeline *const gfx_pipeline,
1705 struct pvr_cmd_buffer *const cmd_buffer)
1706 {
1707 struct pvr_dynamic_state *const dest_state =
1708 &cmd_buffer->state.dynamic.common;
1709 const struct pvr_dynamic_state *const src_state =
1710 &gfx_pipeline->dynamic_state;
1711 struct pvr_cmd_buffer_state *const cmd_buffer_state = &cmd_buffer->state;
1712 const uint32_t state_mask = src_state->mask;
1713
1714 cmd_buffer_state->gfx_pipeline = gfx_pipeline;
1715 cmd_buffer_state->dirty.gfx_pipeline_binding = true;
1716
1717 /* FIXME: Handle PVR_DYNAMIC_STATE_BIT_VIEWPORT. */
1718 if (!(state_mask & PVR_DYNAMIC_STATE_BIT_VIEWPORT)) {
1719 assert(!"Unimplemented");
1720 }
1721
1722 /* FIXME: Handle PVR_DYNAMIC_STATE_BIT_SCISSOR. */
1723 if (!(state_mask & PVR_DYNAMIC_STATE_BIT_SCISSOR)) {
1724 assert(!"Unimplemented");
1725 }
1726
1727 if (!(state_mask & PVR_DYNAMIC_STATE_BIT_LINE_WIDTH)) {
1728 dest_state->line_width = src_state->line_width;
1729
1730 cmd_buffer_state->dirty.line_width = true;
1731 }
1732
1733 if (!(state_mask & PVR_DYNAMIC_STATE_BIT_DEPTH_BIAS)) {
1734 memcpy(&dest_state->depth_bias,
1735 &src_state->depth_bias,
1736 sizeof(src_state->depth_bias));
1737
1738 cmd_buffer_state->dirty.depth_bias = true;
1739 }
1740
1741 if (!(state_mask & PVR_DYNAMIC_STATE_BIT_BLEND_CONSTANTS)) {
1742 STATIC_ASSERT(
1743 __same_type(dest_state->blend_constants, src_state->blend_constants));
1744
1745 typed_memcpy(dest_state->blend_constants,
1746 src_state->blend_constants,
1747 ARRAY_SIZE(dest_state->blend_constants));
1748
1749 cmd_buffer_state->dirty.blend_constants = true;
1750 }
1751
1752 if (!(state_mask & PVR_DYNAMIC_STATE_BIT_STENCIL_COMPARE_MASK)) {
1753 dest_state->compare_mask.front = src_state->compare_mask.front;
1754 dest_state->compare_mask.back = src_state->compare_mask.back;
1755
1756 cmd_buffer_state->dirty.compare_mask = true;
1757 }
1758
1759 if (!(state_mask & PVR_DYNAMIC_STATE_BIT_STENCIL_WRITE_MASK)) {
1760 dest_state->write_mask.front = src_state->write_mask.front;
1761 dest_state->write_mask.back = src_state->write_mask.back;
1762
1763 cmd_buffer_state->dirty.write_mask = true;
1764 }
1765
1766 if (!(state_mask & PVR_DYNAMIC_STATE_BIT_STENCIL_REFERENCE)) {
1767 dest_state->reference.front = src_state->reference.front;
1768 dest_state->reference.back = src_state->reference.back;
1769
1770 cmd_buffer_state->dirty.reference = true;
1771 }
1772 }
1773
1774 void pvr_CmdBindPipeline(VkCommandBuffer commandBuffer,
1775 VkPipelineBindPoint pipelineBindPoint,
1776 VkPipeline _pipeline)
1777 {
1778 PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
1779 PVR_FROM_HANDLE(pvr_pipeline, pipeline, _pipeline);
1780
1781 switch (pipelineBindPoint) {
1782 case VK_PIPELINE_BIND_POINT_COMPUTE:
1783 pvr_cmd_bind_compute_pipeline(to_pvr_compute_pipeline(pipeline),
1784 cmd_buffer);
1785 break;
1786
1787 case VK_PIPELINE_BIND_POINT_GRAPHICS:
1788 pvr_cmd_bind_graphics_pipeline(to_pvr_graphics_pipeline(pipeline),
1789 cmd_buffer);
1790 break;
1791
1792 default:
1793 unreachable("Invalid bind point.");
1794 break;
1795 }
1796 }
1797
1798 #if defined(DEBUG)
1799 static void check_viewport_quirk_70165(const struct pvr_device *device,
1800 const VkViewport *pViewport)
1801 {
1802 const struct pvr_device_info *dev_info = &device->pdevice->dev_info;
1803 float min_vertex_x, max_vertex_x, min_vertex_y, max_vertex_y;
1804 float min_screen_space_value, max_screen_space_value;
1805 float sign_to_unsigned_offset, fixed_point_max;
1806 float guardband_width, guardband_height;
1807
1808 if (PVR_HAS_FEATURE(dev_info, simple_internal_parameter_format)) {
1809 /* Max representable value in 13.4 fixed point format.
1810 * Round-down to avoid precision issues.
1811 * Calculated as (2 ** 13) - 2*(2 ** -4)
1812 */
1813 fixed_point_max = 8192.0f - 2.0f / 16.0f;
1814
1815 if (PVR_HAS_FEATURE(dev_info, screen_size8K)) {
1816 if (pViewport->width <= 4096 && pViewport->height <= 4096) {
1817 guardband_width = pViewport->width / 4.0f;
1818 guardband_height = pViewport->height / 4.0f;
1819
1820 /* 2k of the range is negative */
1821 sign_to_unsigned_offset = 2048.0f;
1822 } else {
1823 guardband_width = 0.0f;
1824 guardband_height = 0.0f;
1825
1826 /* For > 4k renders, the entire range is positive */
1827 sign_to_unsigned_offset = 0.0f;
1828 }
1829 } else {
1830 guardband_width = pViewport->width / 4.0f;
1831 guardband_height = pViewport->height / 4.0f;
1832
1833 /* 2k of the range is negative */
1834 sign_to_unsigned_offset = 2048.0f;
1835 }
1836 } else {
1837 /* Max representable value in 16.8 fixed point format
1838 * Calculated as (2 ** 16) - (2 ** -8)
1839 */
1840 fixed_point_max = 65535.99609375f;
1841 guardband_width = pViewport->width / 4.0f;
1842 guardband_height = pViewport->height / 4.0f;
1843
1844 /* 4k/20k of the range is negative */
1845 sign_to_unsigned_offset = (float)PVR_MAX_NEG_OFFSCREEN_OFFSET;
1846 }
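/* Worked example, purely illustrative: with
 * simple_internal_parameter_format and a 1920x1080 viewport at (0, 0) the
 * values computed above are guardband_width = 480, guardband_height = 270
 * and sign_to_unsigned_offset = 2048, giving a representable screen-space
 * range of [-2048.0, 8191.875 - 2048.0 = 6143.875]. The vertex ranges
 * checked below, x in [-480, 2400] and y in [-270, 1350], fit inside that
 * range, so no warning is emitted.
 */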
1847
1848 min_screen_space_value = -sign_to_unsigned_offset;
1849 max_screen_space_value = fixed_point_max - sign_to_unsigned_offset;
1850
1851 min_vertex_x = pViewport->x - guardband_width;
1852 max_vertex_x = pViewport->x + pViewport->width + guardband_width;
1853 min_vertex_y = pViewport->y - guardband_height;
1854 max_vertex_y = pViewport->y + pViewport->height + guardband_height;
1855 if (min_vertex_x < min_screen_space_value ||
1856 max_vertex_x > max_screen_space_value ||
1857 min_vertex_y < min_screen_space_value ||
1858 max_vertex_y > max_screen_space_value) {
1859 mesa_logw("Viewport is affected by BRN70165, geometry outside "
1860 "the viewport could be corrupted");
1861 }
1862 }
1863 #endif
1864
1865 void pvr_CmdSetViewport(VkCommandBuffer commandBuffer,
1866 uint32_t firstViewport,
1867 uint32_t viewportCount,
1868 const VkViewport *pViewports)
1869 {
1870 PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
1871 const uint32_t total_count = firstViewport + viewportCount;
1872 struct pvr_cmd_buffer_state *const state = &cmd_buffer->state;
1873
1874 assert(firstViewport < PVR_MAX_VIEWPORTS && viewportCount > 0);
1875 assert(total_count >= 1 && total_count <= PVR_MAX_VIEWPORTS);
1876
1877 PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer);
1878
1879 #if defined(DEBUG)
1880 if (PVR_HAS_QUIRK(&cmd_buffer->device->pdevice->dev_info, 70165)) {
1881 for (uint32_t viewport = 0; viewport < viewportCount; viewport++) {
1882 check_viewport_quirk_70165(cmd_buffer->device, &pViewports[viewport]);
1883 }
1884 }
1885 #endif
1886
1887 if (state->dynamic.common.viewport.count < total_count)
1888 state->dynamic.common.viewport.count = total_count;
1889
1890 memcpy(&state->dynamic.common.viewport.viewports[firstViewport],
1891 pViewports,
1892 viewportCount * sizeof(*pViewports));
1893
1894 state->dirty.viewport = true;
1895 }
1896
1897 void pvr_CmdSetScissor(VkCommandBuffer commandBuffer,
1898 uint32_t firstScissor,
1899 uint32_t scissorCount,
1900 const VkRect2D *pScissors)
1901 {
1902 PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
1903 const uint32_t total_count = firstScissor + scissorCount;
1904 struct pvr_cmd_buffer_state *const state = &cmd_buffer->state;
1905
1906 assert(firstScissor < PVR_MAX_VIEWPORTS && scissorCount > 0);
1907 assert(total_count >= 1 && total_count <= PVR_MAX_VIEWPORTS);
1908
1909 PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer);
1910
1911 if (state->dynamic.common.scissor.count < total_count)
1912 state->dynamic.common.scissor.count = total_count;
1913
1914 memcpy(&state->dynamic.common.scissor.scissors[firstScissor],
1915 pScissors,
1916 scissorCount * sizeof(*pScissors));
1917
1918 state->dirty.scissor = true;
1919 }
1920
1921 void pvr_CmdSetLineWidth(VkCommandBuffer commandBuffer, float lineWidth)
1922 {
1923 PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
1924 struct pvr_cmd_buffer_state *const state = &cmd_buffer->state;
1925
1926 state->dynamic.common.line_width = lineWidth;
1927 state->dirty.line_width = true;
1928 }
1929
1930 void pvr_CmdSetDepthBias(VkCommandBuffer commandBuffer,
1931 float depthBiasConstantFactor,
1932 float depthBiasClamp,
1933 float depthBiasSlopeFactor)
1934 {
1935 PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
1936 struct pvr_cmd_buffer_state *const state = &cmd_buffer->state;
1937
1938 state->dynamic.common.depth_bias.constant_factor = depthBiasConstantFactor;
1939 state->dynamic.common.depth_bias.clamp = depthBiasClamp;
1940 state->dynamic.common.depth_bias.slope_factor = depthBiasSlopeFactor;
1941 state->dirty.depth_bias = true;
1942 }
1943
1944 void pvr_CmdSetBlendConstants(VkCommandBuffer commandBuffer,
1945 const float blendConstants[4])
1946 {
1947 PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
1948 struct pvr_cmd_buffer_state *const state = &cmd_buffer->state;
1949
1950 STATIC_ASSERT(ARRAY_SIZE(state->dynamic.common.blend_constants) == 4);
1951 memcpy(state->dynamic.common.blend_constants,
1952 blendConstants,
1953 sizeof(state->dynamic.common.blend_constants));
1954
1955 state->dirty.blend_constants = true;
1956 }
1957
1958 void pvr_CmdSetDepthBounds(VkCommandBuffer commandBuffer,
1959 float minDepthBounds,
1960 float maxDepthBounds)
1961 {
1962 mesa_logd("No support for depth bounds testing.");
1963 }
1964
1965 void pvr_CmdSetStencilCompareMask(VkCommandBuffer commandBuffer,
1966 VkStencilFaceFlags faceMask,
1967 uint32_t compareMask)
1968 {
1969 PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
1970 struct pvr_cmd_buffer_state *const state = &cmd_buffer->state;
1971
1972 if (faceMask & VK_STENCIL_FACE_FRONT_BIT)
1973 state->dynamic.common.compare_mask.front = compareMask;
1974
1975 if (faceMask & VK_STENCIL_FACE_BACK_BIT)
1976 state->dynamic.common.compare_mask.back = compareMask;
1977
1978 state->dirty.compare_mask = true;
1979 }
1980
1981 void pvr_CmdSetStencilWriteMask(VkCommandBuffer commandBuffer,
1982 VkStencilFaceFlags faceMask,
1983 uint32_t writeMask)
1984 {
1985 PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
1986 struct pvr_cmd_buffer_state *const state = &cmd_buffer->state;
1987
1988 if (faceMask & VK_STENCIL_FACE_FRONT_BIT)
1989 state->dynamic.common.write_mask.front = writeMask;
1990
1991 if (faceMask & VK_STENCIL_FACE_BACK_BIT)
1992 state->dynamic.common.write_mask.back = writeMask;
1993
1994 state->dirty.write_mask = true;
1995 }
1996
1997 void pvr_CmdSetStencilReference(VkCommandBuffer commandBuffer,
1998 VkStencilFaceFlags faceMask,
1999 uint32_t reference)
2000 {
2001 PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
2002 struct pvr_cmd_buffer_state *const state = &cmd_buffer->state;
2003
2004 if (faceMask & VK_STENCIL_FACE_FRONT_BIT)
2005 state->dynamic.common.reference.front = reference;
2006
2007 if (faceMask & VK_STENCIL_FACE_BACK_BIT)
2008 state->dynamic.common.reference.back = reference;
2009
2010 state->dirty.reference = true;
2011 }
2012
2013 void pvr_CmdBindDescriptorSets(VkCommandBuffer commandBuffer,
2014 VkPipelineBindPoint pipelineBindPoint,
2015 VkPipelineLayout _layout,
2016 uint32_t firstSet,
2017 uint32_t descriptorSetCount,
2018 const VkDescriptorSet *pDescriptorSets,
2019 uint32_t dynamicOffsetCount,
2020 const uint32_t *pDynamicOffsets)
2021 {
2022 PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
2023 struct pvr_descriptor_state *descriptor_state;
2024
2025 assert(firstSet + descriptorSetCount <= PVR_MAX_DESCRIPTOR_SETS);
2026
2027 PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer);
2028
2029 switch (pipelineBindPoint) {
2030 case VK_PIPELINE_BIND_POINT_GRAPHICS:
2031 case VK_PIPELINE_BIND_POINT_COMPUTE:
2032 break;
2033
2034 default:
2035 unreachable("Unsupported bind point.");
2036 break;
2037 }
2038
2039 if (pipelineBindPoint == VK_PIPELINE_BIND_POINT_GRAPHICS) {
2040 descriptor_state = &cmd_buffer->state.gfx_desc_state;
2041 cmd_buffer->state.dirty.gfx_desc_dirty = true;
2042 } else {
2043 descriptor_state = &cmd_buffer->state.compute_desc_state;
2044 cmd_buffer->state.dirty.compute_desc_dirty = true;
2045 }
2046
2047 for (uint32_t i = 0; i < descriptorSetCount; i++) {
2048 PVR_FROM_HANDLE(pvr_descriptor_set, set, pDescriptorSets[i]);
2049 uint32_t index = firstSet + i;
2050
2051 if (descriptor_state->descriptor_sets[index] != set) {
2052 descriptor_state->descriptor_sets[index] = set;
2053 descriptor_state->valid_mask |= (1u << index);
2054 }
2055 }
2056 }
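/* Informal example (application side, names are placeholders): binding two
 * sets with firstSet == 1,
 *
 *    vkCmdBindDescriptorSets(cmd, VK_PIPELINE_BIND_POINT_GRAPHICS, layout,
 *                            1, 2, sets, 0, NULL);
 *
 * stores them at indices 1 and 2 and sets bits 1 and 2 of valid_mask,
 * leaving whatever was previously bound at index 0 untouched.
 */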
2057
2058 void pvr_CmdBindVertexBuffers(VkCommandBuffer commandBuffer,
2059 uint32_t firstBinding,
2060 uint32_t bindingCount,
2061 const VkBuffer *pBuffers,
2062 const VkDeviceSize *pOffsets)
2063 {
2064 PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
2065 struct pvr_vertex_binding *const vb = cmd_buffer->state.vertex_bindings;
2066
2067 /* We have to defer setting up the vertex buffers since we need the buffer
2068 * strides from the pipeline.
2069 */
2070
2071 assert(firstBinding < PVR_MAX_VERTEX_INPUT_BINDINGS &&
2072 bindingCount <= PVR_MAX_VERTEX_INPUT_BINDINGS);
2073
2074 PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer);
2075
2076 for (uint32_t i = 0; i < bindingCount; i++) {
2077 vb[firstBinding + i].buffer = pvr_buffer_from_handle(pBuffers[i]);
2078 vb[firstBinding + i].offset = pOffsets[i];
2079 }
2080
2081 cmd_buffer->state.dirty.vertex_bindings = true;
2082 }
2083
2084 void pvr_CmdBindIndexBuffer(VkCommandBuffer commandBuffer,
2085 VkBuffer buffer,
2086 VkDeviceSize offset,
2087 VkIndexType indexType)
2088 {
2089 PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
2090 PVR_FROM_HANDLE(pvr_buffer, index_buffer, buffer);
2091 struct pvr_cmd_buffer_state *const state = &cmd_buffer->state;
2092
2093 assert(offset < index_buffer->vk.size);
2094 assert(indexType == VK_INDEX_TYPE_UINT32 ||
2095 indexType == VK_INDEX_TYPE_UINT16);
2096
2097 PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer);
2098
2099 state->index_buffer_binding.buffer = index_buffer;
2100 state->index_buffer_binding.offset = offset;
2101 state->index_buffer_binding.type = indexType;
2102 state->dirty.index_buffer_binding = true;
2103 }
2104
2105 void pvr_CmdPushConstants(VkCommandBuffer commandBuffer,
2106 VkPipelineLayout layout,
2107 VkShaderStageFlags stageFlags,
2108 uint32_t offset,
2109 uint32_t size,
2110 const void *pValues)
2111 {
2112 #if defined(DEBUG)
2113 const uint64_t ending = (uint64_t)offset + (uint64_t)size;
2114 #endif
2115
2116 PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
2117 struct pvr_cmd_buffer_state *const state = &cmd_buffer->state;
2118
2119 PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer);
2120
2121 pvr_assert(ending <= PVR_MAX_PUSH_CONSTANTS_SIZE);
2122
2123 memcpy(&state->push_constants.data[offset], pValues, size);
2124
2125 state->push_constants.dirty_stages |= stageFlags;
2126 }
2127
2128 static VkResult
2129 pvr_cmd_buffer_setup_attachments(struct pvr_cmd_buffer *cmd_buffer,
2130 const struct pvr_render_pass *pass,
2131 const struct pvr_framebuffer *framebuffer)
2132 {
2133 struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
2134 struct pvr_render_pass_info *info = &state->render_pass_info;
2135
2136 assert(pass->attachment_count == framebuffer->attachment_count);
2137
2138 /* Free any previously allocated attachments. */
2139 vk_free(&cmd_buffer->vk.pool->alloc, state->render_pass_info.attachments);
2140
2141 if (pass->attachment_count == 0) {
2142 info->attachments = NULL;
2143 return VK_SUCCESS;
2144 }
2145
2146 info->attachments =
2147 vk_zalloc(&cmd_buffer->vk.pool->alloc,
2148 pass->attachment_count * sizeof(*info->attachments),
2149 8,
2150 VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
2151 if (!info->attachments) {
2152 /* Propagate VK_ERROR_OUT_OF_HOST_MEMORY to vkEndCommandBuffer */
2153 state->status = vk_error(cmd_buffer, VK_ERROR_OUT_OF_HOST_MEMORY);
2154 return state->status;
2155 }
2156
2157 if (framebuffer) {
2158 for (uint32_t i = 0; i < pass->attachment_count; i++)
2159 info->attachments[i] = framebuffer->attachments[i];
2160 }
2161
2162 return VK_SUCCESS;
2163 }
2164
2165 static VkResult pvr_init_render_targets(struct pvr_device *device,
2166 struct pvr_render_pass *pass,
2167 struct pvr_framebuffer *framebuffer)
2168 {
2169 for (uint32_t i = 0; i < pass->hw_setup->render_count; i++) {
2170 struct pvr_render_target *render_target =
2171 pvr_get_render_target(pass, framebuffer, i);
2172
2173 pthread_mutex_lock(&render_target->mutex);
2174
2175 if (!render_target->valid) {
2176 const struct pvr_renderpass_hwsetup_render *hw_render =
2177 &pass->hw_setup->renders[i];
2178 VkResult result;
2179
2180 result = pvr_render_target_dataset_create(device,
2181 framebuffer->width,
2182 framebuffer->height,
2183 hw_render->sample_count,
2184 framebuffer->layers,
2185 &render_target->rt_dataset);
2186 if (result != VK_SUCCESS) {
2187 pthread_mutex_unlock(&render_target->mutex);
2188 return result;
2189 }
2190
2191 render_target->valid = true;
2192 }
2193
2194 pthread_mutex_unlock(&render_target->mutex);
2195 }
2196
2197 return VK_SUCCESS;
2198 }
2199
2200 static const struct pvr_renderpass_hwsetup_subpass *
2201 pvr_get_hw_subpass(const struct pvr_render_pass *pass, const uint32_t subpass)
2202 {
2203 const struct pvr_renderpass_hw_map *map =
2204 &pass->hw_setup->subpass_map[subpass];
2205
2206 return &pass->hw_setup->renders[map->render].subpasses[map->subpass];
2207 }
2208
2209 static void pvr_perform_start_of_render_attachment_clear(
2210 struct pvr_cmd_buffer *cmd_buffer,
2211 const struct pvr_framebuffer *framebuffer,
2212 uint32_t index,
2213 bool is_depth_stencil,
2214 uint32_t *index_list_clear_mask)
2215 {
2216 struct pvr_render_pass_info *info = &cmd_buffer->state.render_pass_info;
2217 const struct pvr_render_pass *pass = info->pass;
2218 const struct pvr_renderpass_hwsetup_render *hw_render;
2219 const struct pvr_renderpass_hwsetup *hw_setup;
2220 struct pvr_image_view *iview;
2221 uint32_t view_idx;
2222 uint32_t height;
2223 uint32_t width;
2224
2225 hw_setup = pass->hw_setup;
2226 hw_render =
2227 &hw_setup->renders[hw_setup->subpass_map[info->subpass_idx].render];
2228
2229 if (is_depth_stencil) {
2230 bool stencil_clear;
2231 bool depth_clear;
2232 bool is_stencil;
2233 bool is_depth;
2234
2235 assert(hw_render->ds_surface_id != -1);
2236 assert(index == 0);
2237
2238 view_idx = hw_render->ds_surface_id;
2239
2240 is_depth = vk_format_has_depth(pass->attachments[view_idx].vk_format);
2241 is_stencil = vk_format_has_stencil(pass->attachments[view_idx].vk_format);
2242 depth_clear = hw_render->depth_init == RENDERPASS_SURFACE_INITOP_CLEAR;
2243 stencil_clear = hw_render->stencil_init ==
2244 RENDERPASS_SURFACE_INITOP_CLEAR;
2245
2246 /* Attempt to clear the ds attachment. Do not erroneously discard an
2247 * attachment that has no depth clear but does have a stencil clear.
2248 */
2249 /* Skip unless (is_depth && depth_clear) || (is_stencil && stencil_clear). */
2250 if (!((is_depth && depth_clear) || (is_stencil && stencil_clear)))
2251 return;
2252 } else if (hw_render->color_init[index].op !=
2253 RENDERPASS_SURFACE_INITOP_CLEAR) {
2254 return;
2255 } else {
2256 view_idx = hw_render->color_init[index].driver_id;
2257 }
2258
2259 iview = info->attachments[view_idx];
2260 width = iview->vk.extent.width;
2261 height = iview->vk.extent.height;
2262
2263 /* FIXME: It would be nice if this function and pvr_sub_cmd_gfx_job_init()
2264 * were doing the same check (even if it's just an assert) to determine if a
2265 * clear is needed.
2266 */
2267 /* If this is single-layer fullscreen, we already do the clears in
2268 * pvr_sub_cmd_gfx_job_init().
2269 */
2270 if (info->render_area.offset.x == 0 && info->render_area.offset.y == 0 &&
2271 info->render_area.extent.width == width &&
2272 info->render_area.extent.height == height && framebuffer->layers == 1) {
2273 return;
2274 }
2275
2276 pvr_finishme("Unimplemented path!");
2277 }
2278
2279 static void
2280 pvr_perform_start_of_render_clears(struct pvr_cmd_buffer *cmd_buffer)
2281 {
2282 struct pvr_render_pass_info *info = &cmd_buffer->state.render_pass_info;
2283 const struct pvr_framebuffer *framebuffer = info->framebuffer;
2284 const struct pvr_render_pass *pass = info->pass;
2285 const struct pvr_renderpass_hwsetup *hw_setup = pass->hw_setup;
2286 const struct pvr_renderpass_hwsetup_render *hw_render;
2287
2288 /* Mask of attachments that are cleared using index lists instead of the
2289 * background object.
2290 */
2291 uint32_t index_list_clear_mask = 0;
2292
2293 hw_render =
2294 &hw_setup->renders[hw_setup->subpass_map[info->subpass_idx].render];
2295 if (!hw_render) {
2296 info->process_empty_tiles = false;
2297 info->enable_bg_tag = false;
2298 return;
2299 }
2300
2301 for (uint32_t i = 0; i < hw_render->color_init_count; i++) {
2302 pvr_perform_start_of_render_attachment_clear(cmd_buffer,
2303 framebuffer,
2304 i,
2305 false,
2306 &index_list_clear_mask);
2307 }
2308
2309 info->enable_bg_tag = !!hw_render->color_init_count;
2310
2311 /* If we're not using index list for all clears/loads then we need to run
2312 * the background object on empty tiles.
2313 */
2314 if (hw_render->color_init_count &&
2315 index_list_clear_mask != ((1u << hw_render->color_init_count) - 1u)) {
2316 info->process_empty_tiles = true;
2317 } else {
2318 info->process_empty_tiles = false;
2319 }
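/* Illustrative example: with color_init_count == 3 and
 * index_list_clear_mask == 0x3, attachment 2 is not covered by an index
 * list, so the background object must also run on empty tiles. Only when
 * the mask covers every attachment (0x7 here) can empty tiles be skipped.
 */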
2320
2321 if (hw_render->ds_surface_id != -1) {
2322 uint32_t ds_index_list = 0;
2323
2324 pvr_perform_start_of_render_attachment_clear(cmd_buffer,
2325 framebuffer,
2326 0,
2327 true,
2328 &ds_index_list);
2329 }
2330
2331 if (index_list_clear_mask)
2332 pvr_finishme("Add support for generating loadops shaders!");
2333 }
2334
2335 static void pvr_stash_depth_format(struct pvr_cmd_buffer_state *state,
2336 struct pvr_sub_cmd_gfx *const sub_cmd)
2337 {
2338 const struct pvr_render_pass *pass = state->render_pass_info.pass;
2339 const struct pvr_renderpass_hwsetup_render *hw_render =
2340 &pass->hw_setup->renders[sub_cmd->hw_render_idx];
2341
2342 if (hw_render->ds_surface_id != -1) {
2343 struct pvr_image_view **iviews = state->render_pass_info.attachments;
2344
2345 state->depth_format = iviews[hw_render->ds_surface_id]->vk.format;
2346 }
2347 }
2348
2349 static bool pvr_loadops_contain_clear(struct pvr_renderpass_hwsetup *hw_setup)
2350 {
2351 for (uint32_t i = 0; i < hw_setup->render_count; i++) {
2352 struct pvr_renderpass_hwsetup_render *hw_render = &hw_setup->renders[i];
2353 uint32_t render_targets_count =
2354 hw_render->init_setup.render_targets_count;
2355
2356 for (uint32_t j = 0;
2357 j < (hw_render->color_init_count * render_targets_count);
2358 j += render_targets_count) {
2359 for (uint32_t k = 0; k < hw_render->init_setup.render_targets_count;
2360 k++) {
2361 if (hw_render->color_init[j + k].op ==
2362 RENDERPASS_SURFACE_INITOP_CLEAR) {
2363 return true;
2364 }
2365 }
2366 }
2367 if (hw_render->depth_init == RENDERPASS_SURFACE_INITOP_CLEAR ||
2368 hw_render->stencil_init == RENDERPASS_SURFACE_INITOP_CLEAR) {
2369 return true;
2370 }
2371 }
2372
2373 return false;
2374 }
2375
2376 static VkResult
2377 pvr_cmd_buffer_set_clear_values(struct pvr_cmd_buffer *cmd_buffer,
2378 const VkRenderPassBeginInfo *pRenderPassBegin)
2379 {
2380 struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
2381
2382 /* Free any previously allocated clear values. */
2383 vk_free(&cmd_buffer->vk.pool->alloc, state->render_pass_info.clear_values);
2384
2385 if (pRenderPassBegin->clearValueCount) {
2386 const size_t size = pRenderPassBegin->clearValueCount *
2387 sizeof(*state->render_pass_info.clear_values);
2388
2389 state->render_pass_info.clear_values =
2390 vk_zalloc(&cmd_buffer->vk.pool->alloc,
2391 size,
2392 8,
2393 VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
2394 if (!state->render_pass_info.clear_values) {
2395 state->status = vk_error(cmd_buffer, VK_ERROR_OUT_OF_HOST_MEMORY);
2396 return state->status;
2397 }
2398
2399 memcpy(state->render_pass_info.clear_values,
2400 pRenderPassBegin->pClearValues,
2401 size);
2402 } else {
2403 state->render_pass_info.clear_values = NULL;
2404 }
2405
2406 state->render_pass_info.clear_value_count =
2407 pRenderPassBegin->clearValueCount;
2408
2409 return VK_SUCCESS;
2410 }
2411
2412 void pvr_CmdBeginRenderPass2(VkCommandBuffer commandBuffer,
2413 const VkRenderPassBeginInfo *pRenderPassBeginInfo,
2414 const VkSubpassBeginInfo *pSubpassBeginInfo)
2415 {
2416 PVR_FROM_HANDLE(pvr_framebuffer,
2417 framebuffer,
2418 pRenderPassBeginInfo->framebuffer);
2419 PVR_FROM_HANDLE(pvr_render_pass, pass, pRenderPassBeginInfo->renderPass);
2420 PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
2421 const struct pvr_renderpass_hwsetup_subpass *hw_subpass;
2422 struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
2423 VkResult result;
2424
2425 PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer);
2426
2427 assert(!state->render_pass_info.pass);
2428 assert(cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_PRIMARY);
2429
2430 /* FIXME: Create a separate function for everything using pass->subpasses,
2431 * look at cmd_buffer_begin_subpass() for example. */
2432 state->render_pass_info.pass = pass;
2433 state->render_pass_info.framebuffer = framebuffer;
2434 state->render_pass_info.subpass_idx = 0;
2435 state->render_pass_info.render_area = pRenderPassBeginInfo->renderArea;
2436 state->render_pass_info.current_hw_subpass = 0;
2437 state->render_pass_info.pipeline_bind_point =
2438 pass->subpasses[0].pipeline_bind_point;
2439 state->render_pass_info.userpass_spawn = pass->subpasses[0].userpass_spawn;
2440 state->dirty.userpass_spawn = true;
2441
2442 result = pvr_cmd_buffer_setup_attachments(cmd_buffer, pass, framebuffer);
2443 if (result != VK_SUCCESS)
2444 return;
2445
2446 state->status =
2447 pvr_init_render_targets(cmd_buffer->device, pass, framebuffer);
2448 if (state->status != VK_SUCCESS)
2449 return;
2450
2451 result = pvr_cmd_buffer_set_clear_values(cmd_buffer, pRenderPassBeginInfo);
2452 if (result != VK_SUCCESS)
2453 return;
2454
2455 assert(pass->subpasses[0].pipeline_bind_point ==
2456 VK_PIPELINE_BIND_POINT_GRAPHICS);
2457
2458 result = pvr_cmd_buffer_start_sub_cmd(cmd_buffer, PVR_SUB_CMD_TYPE_GRAPHICS);
2459 if (result != VK_SUCCESS)
2460 return;
2461
2462 /* Run subpass 0 "soft" background object after the actual background
2463 * object.
2464 */
2465 hw_subpass = pvr_get_hw_subpass(pass, 0);
2466 if (hw_subpass->client_data)
2467 pvr_finishme("Unimplemented path!");
2468
2469 pvr_perform_start_of_render_clears(cmd_buffer);
2470 pvr_stash_depth_format(&cmd_buffer->state,
2471 &cmd_buffer->state.current_sub_cmd->gfx);
2472
2473 if (!pvr_loadops_contain_clear(pass->hw_setup)) {
2474 state->dynamic.scissor_accum_state = PVR_SCISSOR_ACCUM_CHECK_FOR_CLEAR;
2475 state->dynamic.scissor_accum_bounds.offset.x = 0;
2476 state->dynamic.scissor_accum_bounds.offset.y = 0;
2477 state->dynamic.scissor_accum_bounds.extent.width = 0;
2478 state->dynamic.scissor_accum_bounds.extent.height = 0;
2479 } else {
2480 state->dynamic.scissor_accum_state = PVR_SCISSOR_ACCUM_DISABLED;
2481 }
2482 }
2483
2484 static void pvr_cmd_buffer_reset(struct pvr_cmd_buffer *cmd_buffer)
2485 {
2486 if (cmd_buffer->status != PVR_CMD_BUFFER_STATUS_INITIAL) {
2487 /* FIXME: For now we always free all resources as if
2488 * VK_COMMAND_BUFFER_RESET_RELEASE_RESOURCES_BIT was set.
2489 */
2490 pvr_cmd_buffer_free_sub_cmds(cmd_buffer);
2491
2492 list_for_each_entry_safe (struct pvr_bo, bo, &cmd_buffer->bo_list, link) {
2493 list_del(&bo->link);
2494 pvr_bo_free(cmd_buffer->device, bo);
2495 }
2496
2497 util_dynarray_clear(&cmd_buffer->scissor_array);
2498 util_dynarray_clear(&cmd_buffer->depth_bias_array);
2499
2500 cmd_buffer->state.status = VK_SUCCESS;
2501 cmd_buffer->status = PVR_CMD_BUFFER_STATUS_INITIAL;
2502 }
2503 }
2504
2505 VkResult pvr_BeginCommandBuffer(VkCommandBuffer commandBuffer,
2506 const VkCommandBufferBeginInfo *pBeginInfo)
2507 {
2508 PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
2509 struct pvr_cmd_buffer_state *state;
2510 VkResult result;
2511
2512 pvr_cmd_buffer_reset(cmd_buffer);
2513
2514 cmd_buffer->usage_flags = pBeginInfo->flags;
2515 state = &cmd_buffer->state;
2516
2517 /* VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT must be ignored for
2518 * primary level command buffers.
2519 *
2520 * From the Vulkan 1.0 spec:
2521 *
2522 * VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT specifies that a
2523 * secondary command buffer is considered to be entirely inside a render
2524 * pass. If this is a primary command buffer, then this bit is ignored.
2525 */
2526 if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_PRIMARY) {
2527 cmd_buffer->usage_flags &=
2528 ~VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT;
2529 }
2530
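/* Informal example of when the branch below is taken (application side,
 * names are placeholders): a secondary command buffer begun with
 *
 *    const VkCommandBufferInheritanceInfo inheritance_info = {
 *       .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_INHERITANCE_INFO,
 *       .renderPass = render_pass,
 *       .subpass = 0,
 *       .framebuffer = framebuffer,
 *    };
 *    const VkCommandBufferBeginInfo begin_info = {
 *       .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO,
 *       .flags = VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT,
 *       .pInheritanceInfo = &inheritance_info,
 *    };
 *    vkBeginCommandBuffer(secondary_cmd_buffer, &begin_info);
 *
 * inherits the render pass state and starts a graphics sub command right
 * away.
 */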
2531 if (cmd_buffer->usage_flags &
2532 VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT) {
2533 const VkCommandBufferInheritanceInfo *inheritance_info =
2534 pBeginInfo->pInheritanceInfo;
2535 struct pvr_render_pass *pass;
2536
2537 pass = pvr_render_pass_from_handle(inheritance_info->renderPass);
2538 state->render_pass_info.pass = pass;
2539 state->render_pass_info.framebuffer =
2540 pvr_framebuffer_from_handle(inheritance_info->framebuffer);
2541 state->render_pass_info.subpass_idx = inheritance_info->subpass;
2542 state->render_pass_info.userpass_spawn =
2543 pass->subpasses[inheritance_info->subpass].userpass_spawn;
2544
2545 result =
2546 pvr_cmd_buffer_start_sub_cmd(cmd_buffer, PVR_SUB_CMD_TYPE_GRAPHICS);
2547 if (result != VK_SUCCESS)
2548 return result;
2549 }
2550
2551 memset(state->barriers_needed,
2552 0xFF,
2553 sizeof(*state->barriers_needed) * ARRAY_SIZE(state->barriers_needed));
2554
2555 cmd_buffer->status = PVR_CMD_BUFFER_STATUS_RECORDING;
2556
2557 return VK_SUCCESS;
2558 }
2559
2560 VkResult pvr_cmd_buffer_add_transfer_cmd(struct pvr_cmd_buffer *cmd_buffer,
2561 struct pvr_transfer_cmd *transfer_cmd)
2562 {
2563 struct pvr_sub_cmd_transfer *sub_cmd;
2564 VkResult result;
2565
2566 result = pvr_cmd_buffer_start_sub_cmd(cmd_buffer, PVR_SUB_CMD_TYPE_TRANSFER);
2567 if (result != VK_SUCCESS)
2568 return result;
2569
2570 sub_cmd = &cmd_buffer->state.current_sub_cmd->transfer;
2571
2572 list_addtail(&transfer_cmd->link, &sub_cmd->transfer_cmds);
2573
2574 return VK_SUCCESS;
2575 }
2576
2577 static void
2578 pvr_validate_push_descriptors(struct pvr_cmd_buffer *cmd_buffer,
2579 bool *const push_descriptors_dirty_out)
2580 {
2581 /* TODO: Implement this function, based on ValidatePushDescriptors. */
2582 pvr_finishme("Add support for push descriptors!");
2583 *push_descriptors_dirty_out = false;
2584 }
2585
2586 #define PVR_WRITE(_buffer, _value, _offset, _max) \
2587 do { \
2588 __typeof__(_value) __value = _value; \
2589 uint64_t __offset = _offset; \
2590 uint32_t __nr_dwords = sizeof(__value) / sizeof(uint32_t); \
2591 static_assert(__same_type(*_buffer, __value), \
2592 "Buffer and value type mismatch"); \
2593 assert((__offset + __nr_dwords) <= (_max)); \
2594 assert((__offset % __nr_dwords) == 0U); \
2595 _buffer[__offset / __nr_dwords] = __value; \
2596 } while (0)
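/* Informal examples of the macro above (offsets are hypothetical): offsets
 * are expressed in dwords and must be aligned to the size of the value
 * being written, while the destination buffer is indexed in units of that
 * size. A 32-bit write at dword offset 3 therefore lands in
 * dword_buffer[3], and a 64-bit write at dword offset 4 lands in
 * qword_buffer[2], i.e. dwords 4 and 5.
 *
 *    PVR_WRITE(dword_buffer, UINT32_C(0xcafe), 3, data_size_in_dwords);
 *    PVR_WRITE(qword_buffer, UINT64_C(0xdeadbeef), 4, data_size_in_dwords);
 */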
2597
2598 static VkResult
2599 pvr_setup_vertex_buffers(struct pvr_cmd_buffer *cmd_buffer,
2600 const struct pvr_graphics_pipeline *const gfx_pipeline)
2601 {
2602 const struct pvr_vertex_shader_state *const vertex_state =
2603 &gfx_pipeline->vertex_shader_state;
2604 struct pvr_cmd_buffer_state *const state = &cmd_buffer->state;
2605 const struct pvr_pds_info *const pds_info = state->pds_shader.info;
2606 const uint8_t *entries;
2607 uint32_t *dword_buffer;
2608 uint64_t *qword_buffer;
2609 struct pvr_bo *pvr_bo;
2610 VkResult result;
2611
2612 result = pvr_cmd_buffer_alloc_mem(cmd_buffer,
2613 cmd_buffer->device->heaps.pds_heap,
2614 pds_info->data_size_in_dwords,
2615 PVR_BO_ALLOC_FLAG_CPU_MAPPED,
2616 &pvr_bo);
2617 if (result != VK_SUCCESS)
2618 return result;
2619
2620 dword_buffer = (uint32_t *)pvr_bo->bo->map;
2621 qword_buffer = (uint64_t *)pvr_bo->bo->map;
2622
2623 entries = (uint8_t *)pds_info->entries;
2624
2625 for (uint32_t i = 0; i < pds_info->entry_count; i++) {
2626 const struct pvr_const_map_entry *const entry_header =
2627 (struct pvr_const_map_entry *)entries;
2628
2629 switch (entry_header->type) {
2630 case PVR_PDS_CONST_MAP_ENTRY_TYPE_LITERAL32: {
2631 const struct pvr_const_map_entry_literal32 *const literal =
2632 (struct pvr_const_map_entry_literal32 *)entries;
2633
2634 PVR_WRITE(dword_buffer,
2635 literal->literal_value,
2636 literal->const_offset,
2637 pds_info->data_size_in_dwords);
2638
2639 entries += sizeof(*literal);
2640 break;
2641 }
2642
2643 case PVR_PDS_CONST_MAP_ENTRY_TYPE_DOUTU_ADDRESS: {
2644 const struct pvr_const_map_entry_doutu_address *const doutu_addr =
2645 (struct pvr_const_map_entry_doutu_address *)entries;
2646 const pvr_dev_addr_t exec_addr =
2647 PVR_DEV_ADDR_OFFSET(vertex_state->bo->vma->dev_addr,
2648 vertex_state->entry_offset);
2649 uint64_t addr = 0ULL;
2650
2651 pvr_set_usc_execution_address64(&addr, exec_addr.addr);
2652
2653 PVR_WRITE(qword_buffer,
2654 addr | doutu_addr->doutu_control,
2655 doutu_addr->const_offset,
2656 pds_info->data_size_in_dwords);
2657
2658 entries += sizeof(*doutu_addr);
2659 break;
2660 }
2661
2662 case PVR_PDS_CONST_MAP_ENTRY_TYPE_BASE_INSTANCE: {
2663 const struct pvr_const_map_entry_base_instance *const base_instance =
2664 (struct pvr_const_map_entry_base_instance *)entries;
2665
2666 PVR_WRITE(dword_buffer,
2667 state->draw_state.base_instance,
2668 base_instance->const_offset,
2669 pds_info->data_size_in_dwords);
2670
2671 entries += sizeof(*base_instance);
2672 break;
2673 }
2674
2675 case PVR_PDS_CONST_MAP_ENTRY_TYPE_VERTEX_ATTRIBUTE_ADDRESS: {
2676 const struct pvr_const_map_entry_vertex_attribute_address
2677 *const attribute =
2678 (struct pvr_const_map_entry_vertex_attribute_address *)entries;
2679 const struct pvr_vertex_binding *const binding =
2680 &state->vertex_bindings[attribute->binding_index];
2681 const pvr_dev_addr_t addr =
2682 PVR_DEV_ADDR_OFFSET(binding->buffer->dev_addr,
2683 binding->offset + attribute->offset);
2684
2685 PVR_WRITE(qword_buffer,
2686 addr.addr,
2687 attribute->const_offset,
2688 pds_info->data_size_in_dwords);
2689
2690 entries += sizeof(*attribute);
2691 break;
2692 }
2693
2694 default:
2695 unreachable("Unsupported data section map");
2696 break;
2697 }
2698 }
2699
2700 state->pds_vertex_attrib_offset =
2701 pvr_bo->vma->dev_addr.addr -
2702 cmd_buffer->device->heaps.pds_heap->base_addr.addr;
2703
2704 pvr_bo_cpu_unmap(cmd_buffer->device, pvr_bo);
2705
2706 return VK_SUCCESS;
2707 }
2708
2709 static VkResult pvr_setup_descriptor_mappings(
2710 struct pvr_cmd_buffer *const cmd_buffer,
2711 enum pvr_stage_allocation stage,
2712 const struct pvr_stage_allocation_descriptor_state *descriptor_state,
2713 UNUSED const pvr_dev_addr_t *const num_workgroups_buff_addr,
2714 uint32_t *const descriptor_data_offset_out)
2715 {
2716 const struct pvr_pds_info *const pds_info = &descriptor_state->pds_info;
2717 const struct pvr_descriptor_state *desc_state;
2718 const uint8_t *entries;
2719 uint32_t *dword_buffer;
2720 uint64_t *qword_buffer;
2721 struct pvr_bo *pvr_bo;
2722 VkResult result;
2723
2724 pvr_finishme("Handle num_worgroups_buff_addr");
2725
2726 if (!pds_info->data_size_in_dwords)
2727 return VK_SUCCESS;
2728
2729 result = pvr_cmd_buffer_alloc_mem(cmd_buffer,
2730 cmd_buffer->device->heaps.pds_heap,
2731 pds_info->data_size_in_dwords,
2732 PVR_BO_ALLOC_FLAG_CPU_MAPPED,
2733 &pvr_bo);
2734 if (result != VK_SUCCESS)
2735 return result;
2736
2737 dword_buffer = (uint32_t *)pvr_bo->bo->map;
2738 qword_buffer = (uint64_t *)pvr_bo->bo->map;
2739
2740 entries = (uint8_t *)pds_info->entries;
2741
2742 switch (stage) {
2743 case PVR_STAGE_ALLOCATION_VERTEX_GEOMETRY:
2744 case PVR_STAGE_ALLOCATION_FRAGMENT:
2745 desc_state = &cmd_buffer->state.gfx_desc_state;
2746 break;
2747
2748 case PVR_STAGE_ALLOCATION_COMPUTE:
2749 desc_state = &cmd_buffer->state.compute_desc_state;
2750 break;
2751
2752 default:
2753 unreachable("Unsupported stage.");
2754 break;
2755 }
2756
2757 for (uint32_t i = 0; i < pds_info->entry_count; i++) {
2758 const struct pvr_const_map_entry *const entry_header =
2759 (struct pvr_const_map_entry *)entries;
2760
2761 /* TODO: See if instead of reusing the blend constant buffer type entry,
2762 * we can set up a new buffer type specifically for num_workgroups or
2763 * other built-in variables. The mappings are set up at pipeline creation
2764 * when creating the descriptor program.
2765 */
2766 pvr_finishme("Handle blend constant reuse for compute.");
2767
2768 switch (entry_header->type) {
2769 case PVR_PDS_CONST_MAP_ENTRY_TYPE_LITERAL32: {
2770 const struct pvr_const_map_entry_literal32 *const literal =
2771 (struct pvr_const_map_entry_literal32 *)entries;
2772
2773 PVR_WRITE(dword_buffer,
2774 literal->literal_value,
2775 literal->const_offset,
2776 pds_info->data_size_in_dwords);
2777
2778 entries += sizeof(*literal);
2779 break;
2780 }
2781
2782 case PVR_PDS_CONST_MAP_ENTRY_TYPE_CONSTANT_BUFFER: {
2783 const struct pvr_const_map_entry_constant_buffer *const_buffer_entry =
2784 (struct pvr_const_map_entry_constant_buffer *)entries;
2785 const uint32_t desc_set = const_buffer_entry->desc_set;
2786 const uint32_t binding = const_buffer_entry->binding;
2787 const struct pvr_descriptor_set *descriptor_set;
2788 const struct pvr_descriptor *descriptor;
2789 pvr_dev_addr_t buffer_addr;
2790
2791 /* TODO: Handle push descriptors. */
2792
2793 assert(desc_set < PVR_MAX_DESCRIPTOR_SETS);
2794 descriptor_set = desc_state->descriptor_sets[desc_set];
2795
2796 /* TODO: Handle dynamic buffers. */
2797 descriptor = &descriptor_set->descriptors[binding];
2798 assert(descriptor->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER);
2799
2800 assert(descriptor->buffer_desc_range ==
2801 const_buffer_entry->size_in_dwords * sizeof(uint32_t));
2802 assert(descriptor->buffer_create_info_size ==
2803 const_buffer_entry->size_in_dwords * sizeof(uint32_t));
2804
2805 buffer_addr =
2806 PVR_DEV_ADDR_OFFSET(descriptor->buffer_dev_addr,
2807 const_buffer_entry->offset * sizeof(uint32_t));
2808
2809 PVR_WRITE(qword_buffer,
2810 buffer_addr.addr,
2811 const_buffer_entry->const_offset,
2812 pds_info->data_size_in_dwords);
2813
2814 entries += sizeof(*const_buffer_entry);
2815 break;
2816 }
2817
2818 case PVR_PDS_CONST_MAP_ENTRY_TYPE_DESCRIPTOR_SET: {
2819 const struct pvr_const_map_entry_descriptor_set *desc_set_entry =
2820 (struct pvr_const_map_entry_descriptor_set *)entries;
2821 const uint32_t desc_set_num = desc_set_entry->descriptor_set;
2822 const struct pvr_descriptor_set *descriptor_set;
2823 pvr_dev_addr_t desc_set_addr;
2824
2825 assert(desc_set_num < PVR_MAX_DESCRIPTOR_SETS);
2826
2827 /* TODO: Remove this when the compiler provides us with usage info?
2828 */
2829 /* We skip DMAing unbound descriptor sets. */
2830 if (!(desc_state->valid_mask & BITFIELD_BIT(desc_set_num))) {
2831 const struct pvr_const_map_entry_literal32 *literal;
2832 uint32_t zero_literal_value;
2833
2834 entries += sizeof(*desc_set_entry);
2835 literal = (struct pvr_const_map_entry_literal32 *)entries;
2836
2837 /* TODO: Is there any guarantee that a literal will follow the
2838 * descriptor set entry?
2839 */
2840 assert(literal->type == PVR_PDS_CONST_MAP_ENTRY_TYPE_LITERAL32);
2841
2842 /* We zero out the DMA size so the DMA isn't performed. */
2843 zero_literal_value =
2844 literal->literal_value &
2845 PVR_ROGUE_PDSINST_DOUT_FIELDS_DOUTD_SRC1_BSIZE_CLRMSK;
2846
2847 PVR_WRITE(qword_buffer,
2848 UINT64_C(0),
2849 desc_set_entry->const_offset,
2850 pds_info->data_size_in_dwords);
2851
2852 PVR_WRITE(dword_buffer,
2853 zero_literal_value,
2854 desc_set_entry->const_offset,
2855 pds_info->data_size_in_dwords);
2856
2857 entries += sizeof(*literal);
2858 i++;
2859 continue;
2860 }
2861
2862 descriptor_set = desc_state->descriptor_sets[desc_set_num];
2863
2864 pvr_finishme("Handle push descriptor entry.");
2865
2866 desc_set_addr = descriptor_set->pvr_bo->vma->dev_addr;
2867
2868 if (desc_set_entry->primary) {
2869 desc_set_addr = PVR_DEV_ADDR_OFFSET(
2870 desc_set_addr,
2871 descriptor_set->layout->memory_layout_in_dwords_per_stage[stage]
2872 .primary_offset
2873 << 2U);
2874 } else {
2875 desc_set_addr = PVR_DEV_ADDR_OFFSET(
2876 desc_set_addr,
2877 descriptor_set->layout->memory_layout_in_dwords_per_stage[stage]
2878 .secondary_offset
2879 << 2U);
2880 }
2881
2882 desc_set_addr = PVR_DEV_ADDR_OFFSET(
2883 desc_set_addr,
2884 (uint64_t)desc_set_entry->offset_in_dwords << 2U);
2885
2886 PVR_WRITE(qword_buffer,
2887 desc_set_addr.addr,
2888 desc_set_entry->const_offset,
2889 pds_info->data_size_in_dwords);
2890
2891 entries += sizeof(*desc_set_entry);
2892 break;
2893 }
2894
2895 case PVR_PDS_CONST_MAP_ENTRY_TYPE_SPECIAL_BUFFER: {
2896 const struct pvr_const_map_entry_special_buffer *special_buff_entry =
2897 (struct pvr_const_map_entry_special_buffer *)entries;
2898
2899 switch (special_buff_entry->buffer_type) {
2900 case PVR_BUFFER_TYPES_COMPILE_TIME: {
2901 uint64_t addr = descriptor_state->static_consts->vma->dev_addr.addr;
2902
2903 PVR_WRITE(qword_buffer,
2904 addr,
2905 special_buff_entry->const_offset,
2906 pds_info->data_size_in_dwords);
2907 break;
2908 }
2909
2910 default:
2911 unreachable("Unsupported special buffer type.");
2912 }
2913
2914 entries += sizeof(*special_buff_entry);
2915 break;
2916 }
2917
2918 default:
2919 unreachable("Unsupported map entry type.");
2920 }
2921 }
2922
2923 pvr_bo_cpu_unmap(cmd_buffer->device, pvr_bo);
2924
2925 *descriptor_data_offset_out =
2926 pvr_bo->vma->dev_addr.addr -
2927 cmd_buffer->device->heaps.pds_heap->base_addr.addr;
2928
2929 return VK_SUCCESS;
2930 }
2931
2932 #undef PVR_WRITE
2933
2934 static void pvr_compute_update_shared(struct pvr_cmd_buffer *cmd_buffer,
2935 struct pvr_sub_cmd_compute *const sub_cmd)
2936 {
2937 const struct pvr_physical_device *pdevice = cmd_buffer->device->pdevice;
2938 struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
2939 struct pvr_csb *csb = &sub_cmd->control_stream;
2940 const struct pvr_compute_pipeline *pipeline = state->compute_pipeline;
2941 const uint32_t const_shared_reg_count =
2942 pipeline->state.shader.const_shared_reg_count;
2943 struct pvr_compute_kernel_info info;
2944
2945 /* No shared regs, no need to use an allocation kernel. */
2946 if (!const_shared_reg_count)
2947 return;
2948
2949 info = (struct pvr_compute_kernel_info){
2950 .indirect_buffer_addr = PVR_DEV_ADDR_INVALID,
2951 .sd_type = PVRX(CDMCTRL_SD_TYPE_NONE),
2952
2953 .usc_target = PVRX(CDMCTRL_USC_TARGET_ALL),
2954 .usc_common_shared = true,
2955 .usc_common_size =
2956 DIV_ROUND_UP(const_shared_reg_count,
2957 PVRX(CDMCTRL_KERNEL0_USC_COMMON_SIZE_UNIT_SIZE)),
2958
2959 .local_size = { 1, 1, 1 },
2960 .global_size = { 1, 1, 1 },
2961 };
2962
2963 /* If there were no constants to write we may not have a secondary
2964 * program. We still need to run a PDS program to perform the allocation
2965 * of the local/common store shared registers, so we repurpose the
2966 * deallocation PDS program.
2967 */
2968 if (pipeline->state.descriptor.pds_info.code_size_in_dwords) {
2969 uint32_t pds_data_size_in_dwords =
2970 pipeline->state.descriptor.pds_info.data_size_in_dwords;
2971
2972 info.pds_data_offset = state->pds_compute_descriptor_data_offset;
2973 info.pds_data_size =
2974 DIV_ROUND_UP(pds_data_size_in_dwords << 2U,
2975 PVRX(CDMCTRL_KERNEL0_PDS_DATA_SIZE_UNIT_SIZE));
2976
2977 /* Check that we have uploaded the code section. */
2978 assert(pipeline->state.descriptor.pds_code.code_size);
2979 info.pds_code_offset = pipeline->state.descriptor.pds_code.code_offset;
2980 } else {
2981 /* FIXME: There should be a deallocation pds program already uploaded
2982 * that we use at this point.
2983 */
2984 assert(!"Unimplemented");
2985 }
2986
2987 /* We don't need to pad the workgroup size. */
2988
2989 info.max_instances =
2990 pvr_compute_flat_slot_size(pdevice, const_shared_reg_count, false, 1U);
2991
2992 pvr_compute_generate_control_stream(csb, sub_cmd, &info);
2993 }
2994
2995 static uint32_t
2996 pvr_compute_flat_pad_workgroup_size(const struct pvr_physical_device *pdevice,
2997 uint32_t workgroup_size,
2998 uint32_t coeff_regs_count)
2999 {
3000 const struct pvr_device_runtime_info *dev_runtime_info =
3001 &pdevice->dev_runtime_info;
3002 const struct pvr_device_info *dev_info = &pdevice->dev_info;
3003 uint32_t max_avail_coeff_regs =
3004 dev_runtime_info->cdm_max_local_mem_size_regs;
3005 uint32_t coeff_regs_count_aligned =
3006 ALIGN_POT(coeff_regs_count,
3007 PVRX(CDMCTRL_KERNEL0_USC_COMMON_SIZE_UNIT_SIZE) >> 2U);
3008
3009 /* If the work group size is greater than ROGUE_MAX_INSTANCES_PER_TASK we
3010 * always pad the work group size up to the next multiple of
3011 * ROGUE_MAX_INSTANCES_PER_TASK.
3012 *
3013 * Likewise, if we use more than 1/8th of the max coefficient registers we
3014 * also round the work group size up to that multiple.
3015 */
3016 /* TODO: See if this can be optimized. */
3017 if (workgroup_size > ROGUE_MAX_INSTANCES_PER_TASK ||
3018 coeff_regs_count_aligned > (max_avail_coeff_regs / 8)) {
3019 assert(workgroup_size < rogue_get_compute_max_work_group_size(dev_info));
3020
3021 return ALIGN_POT(workgroup_size, ROGUE_MAX_INSTANCES_PER_TASK);
3022 }
3023
3024 return workgroup_size;
3025 }
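/* Worked example, assuming purely for illustration that
 * ROGUE_MAX_INSTANCES_PER_TASK is 32: a flattened work group size of 48
 * exceeds 32 and is padded with ALIGN_POT(48, 32) to 64. A work group size
 * of 24 is padded to 32 only if its aligned coefficient register count
 * exceeds 1/8th of cdm_max_local_mem_size_regs; otherwise it is returned
 * unchanged.
 */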
3026
3027 /* TODO: Wire up the base_workgroup variant program when implementing
3028 * VK_KHR_device_group. The values will also need patching into the program.
3029 */
3030 static void pvr_compute_update_kernel(
3031 struct pvr_cmd_buffer *cmd_buffer,
3032 struct pvr_sub_cmd_compute *const sub_cmd,
3033 const uint32_t global_workgroup_size[static const PVR_WORKGROUP_DIMENSIONS])
3034 {
3035 const struct pvr_physical_device *pdevice = cmd_buffer->device->pdevice;
3036 const struct pvr_device_runtime_info *dev_runtime_info =
3037 &pdevice->dev_runtime_info;
3038 struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
3039 struct pvr_csb *csb = &sub_cmd->control_stream;
3040 const struct pvr_compute_pipeline *pipeline = state->compute_pipeline;
3041 const struct pvr_pds_info *program_info =
3042 &pipeline->state.primary_program_info;
3043
3044 struct pvr_compute_kernel_info info = {
3045 .indirect_buffer_addr = PVR_DEV_ADDR_INVALID,
3046 .usc_target = PVRX(CDMCTRL_USC_TARGET_ANY),
3047 .pds_temp_size =
3048 DIV_ROUND_UP(program_info->temps_required << 2U,
3049 PVRX(CDMCTRL_KERNEL0_PDS_TEMP_SIZE_UNIT_SIZE)),
3050
3051 .pds_data_size =
3052 DIV_ROUND_UP(program_info->data_size_in_dwords << 2U,
3053 PVRX(CDMCTRL_KERNEL0_PDS_DATA_SIZE_UNIT_SIZE)),
3054 .pds_data_offset = pipeline->state.primary_program.data_offset,
3055 .pds_code_offset = pipeline->state.primary_program.code_offset,
3056
3057 .sd_type = PVRX(CDMCTRL_SD_TYPE_USC),
3058
3059 .usc_unified_size =
3060 DIV_ROUND_UP(pipeline->state.shader.input_register_count << 2U,
3061 PVRX(CDMCTRL_KERNEL0_USC_UNIFIED_SIZE_UNIT_SIZE)),
3062
3063 /* clang-format off */
3064 .global_size = {
3065 global_workgroup_size[0],
3066 global_workgroup_size[1],
3067 global_workgroup_size[2]
3068 },
3069 /* clang-format on */
3070 };
3071
3072 uint32_t work_size = pipeline->state.shader.work_size;
3073 uint32_t coeff_regs;
3074
3075 if (work_size > ROGUE_MAX_INSTANCES_PER_TASK) {
3076 /* Enforce a single workgroup per cluster through allocation starvation.
3077 */
3078 coeff_regs = dev_runtime_info->cdm_max_local_mem_size_regs;
3079 } else {
3080 coeff_regs = pipeline->state.shader.coefficient_register_count;
3081 }
3082
3083 info.usc_common_size =
3084 DIV_ROUND_UP(coeff_regs << 2U,
3085 PVRX(CDMCTRL_KERNEL0_USC_COMMON_SIZE_UNIT_SIZE));
3086
3087 /* Use a whole slot per workgroup. */
3088 work_size = MAX2(work_size, ROGUE_MAX_INSTANCES_PER_TASK);
3089
3090 coeff_regs += pipeline->state.shader.const_shared_reg_count;
3091
3092 work_size =
3093 pvr_compute_flat_pad_workgroup_size(pdevice, work_size, coeff_regs);
3094
3095 info.local_size[0] = work_size;
3096 info.local_size[1] = 1U;
3097 info.local_size[2] = 1U;
3098
3099 info.max_instances =
3100 pvr_compute_flat_slot_size(pdevice, coeff_regs, false, work_size);
3101
3102 pvr_compute_generate_control_stream(csb, sub_cmd, &info);
3103 }
3104
3105 void pvr_CmdDispatch(VkCommandBuffer commandBuffer,
3106 uint32_t groupCountX,
3107 uint32_t groupCountY,
3108 uint32_t groupCountZ)
3109 {
3110 const uint32_t workgroup_size[] = { groupCountX, groupCountY, groupCountZ };
3111 PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
3112 struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
3113 const struct pvr_compute_pipeline *compute_pipeline =
3114 state->compute_pipeline;
3115 const VkShaderStageFlags push_consts_stage_mask =
3116 compute_pipeline->base.layout->push_constants_shader_stages;
3117 bool push_descriptors_dirty;
3118 struct pvr_sub_cmd_compute *sub_cmd;
3119 VkResult result;
3120
3121 PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer);
3122 assert(compute_pipeline);
3123
3124 if (!groupCountX || !groupCountY || !groupCountZ)
3125 return;
3126
3127 pvr_cmd_buffer_start_sub_cmd(cmd_buffer, PVR_SUB_CMD_TYPE_COMPUTE);
3128
3129 sub_cmd = &state->current_sub_cmd->compute;
3130
3131 sub_cmd->uses_atomic_ops |= compute_pipeline->state.shader.uses_atomic_ops;
3132 sub_cmd->uses_barrier |= compute_pipeline->state.shader.uses_barrier;
3133
3134 if (push_consts_stage_mask & VK_SHADER_STAGE_COMPUTE_BIT) {
3135 /* TODO: Add a dirty push constants mask in the cmd_buffer state and
3136 * check for dirty compute stage.
3137 */
3138 pvr_finishme("Add support for push constants.");
3139 }
3140
3141 pvr_validate_push_descriptors(cmd_buffer, &push_descriptors_dirty);
3142
3143 if (compute_pipeline->state.shader.uses_num_workgroups) {
3144 struct pvr_bo *num_workgroups_bo;
3145
3146 result = pvr_cmd_buffer_upload_general(cmd_buffer,
3147 workgroup_size,
3148 sizeof(workgroup_size),
3149 &num_workgroups_bo);
3150 if (result != VK_SUCCESS)
3151 return;
3152
3153 result = pvr_setup_descriptor_mappings(
3154 cmd_buffer,
3155 PVR_STAGE_ALLOCATION_COMPUTE,
3156 &compute_pipeline->state.descriptor,
3157 &num_workgroups_bo->vma->dev_addr,
3158 &state->pds_compute_descriptor_data_offset);
3159 if (result != VK_SUCCESS)
3160 return;
3161 } else if ((compute_pipeline->base.layout
3162 ->per_stage_descriptor_masks[PVR_STAGE_ALLOCATION_COMPUTE] &&
3163 state->dirty.compute_desc_dirty) ||
3164 state->dirty.compute_pipeline_binding || push_descriptors_dirty) {
3165 result = pvr_setup_descriptor_mappings(
3166 cmd_buffer,
3167 PVR_STAGE_ALLOCATION_COMPUTE,
3168 &compute_pipeline->state.descriptor,
3169 NULL,
3170 &state->pds_compute_descriptor_data_offset);
3171 if (result != VK_SUCCESS)
3172 return;
3173 }
3174
3175 pvr_compute_update_shared(cmd_buffer, sub_cmd);
3176
3177 pvr_compute_update_kernel(cmd_buffer, sub_cmd, workgroup_size);
3178 }
3179
3180 void pvr_CmdDispatchIndirect(VkCommandBuffer commandBuffer,
3181 VkBuffer _buffer,
3182 VkDeviceSize offset)
3183 {
3184 assert(!"Unimplemented");
3185 }
3186
3187 static void
3188 pvr_update_draw_state(struct pvr_cmd_buffer_state *const state,
3189 const struct pvr_cmd_buffer_draw_state *const draw_state)
3190 {
3191 /* We don't have a separate flag telling us whether base_instance is in
3192 * use, so the value itself doubles as a boolean: 0 means we use the PDS
3193 * program that skips the base-instance addition. If base_instance becomes
3194 * non-zero (and the last draw's base_instance was 0) then we switch to the
3195 * BASE_INSTANCE attrib program.
3196 *
3197 * If base_instance changes then we only need to update the data section.
3198 *
3199 * The only draw call state that doesn't really matter is the start vertex
3200 * as that is handled properly in the VDM state in all cases.
3201 */
3202 if ((state->draw_state.draw_indexed != draw_state->draw_indexed) ||
3203 (state->draw_state.draw_indirect != draw_state->draw_indirect) ||
3204 (state->draw_state.base_instance == 0 &&
3205 draw_state->base_instance != 0)) {
3206 state->dirty.draw_variant = true;
3207 } else if (state->draw_state.base_instance != draw_state->base_instance) {
3208 state->dirty.draw_base_instance = true;
3209 }
3210
3211 state->draw_state = *draw_state;
3212 }
3213
3214 static uint32_t pvr_calc_shared_regs_count(
3215 const struct pvr_graphics_pipeline *const gfx_pipeline)
3216 {
3217 const struct pvr_pipeline_stage_state *const vertex_state =
3218 &gfx_pipeline->vertex_shader_state.stage_state;
3219 uint32_t shared_regs = vertex_state->const_shared_reg_count +
3220 vertex_state->const_shared_reg_offset;
3221
3222 if (gfx_pipeline->fragment_shader_state.bo) {
3223 const struct pvr_pipeline_stage_state *const fragment_state =
3224 &gfx_pipeline->fragment_shader_state.stage_state;
3225 uint32_t fragment_regs = fragment_state->const_shared_reg_count +
3226 fragment_state->const_shared_reg_offset;
3227
3228 shared_regs = MAX2(shared_regs, fragment_regs);
3229 }
3230
3231 return shared_regs;
3232 }
3233
3234 static void
3235 pvr_emit_dirty_pds_state(const struct pvr_cmd_buffer *const cmd_buffer,
3236 struct pvr_sub_cmd_gfx *const sub_cmd,
3237 const uint32_t pds_vertex_descriptor_data_offset)
3238 {
3239 const struct pvr_cmd_buffer_state *const state = &cmd_buffer->state;
3240 const struct pvr_stage_allocation_descriptor_state
3241 *const vertex_descriptor_state =
3242 &state->gfx_pipeline->vertex_shader_state.descriptor_state;
3243 const struct pvr_pipeline_stage_state *const vertex_stage_state =
3244 &state->gfx_pipeline->vertex_shader_state.stage_state;
3245 struct pvr_csb *const csb = &sub_cmd->control_stream;
3246
3247 if (!vertex_descriptor_state->pds_info.code_size_in_dwords)
3248 return;
3249
3250 pvr_csb_emit (csb, VDMCTRL_PDS_STATE0, state0) {
3251 state0.usc_target = PVRX(VDMCTRL_USC_TARGET_ALL);
3252
3253 state0.usc_common_size =
3254 DIV_ROUND_UP(vertex_stage_state->const_shared_reg_count << 2,
3255 PVRX(VDMCTRL_PDS_STATE0_USC_COMMON_SIZE_UNIT_SIZE));
3256
3257 state0.pds_data_size = DIV_ROUND_UP(
3258 vertex_descriptor_state->pds_info.data_size_in_dwords << 2,
3259 PVRX(VDMCTRL_PDS_STATE0_PDS_DATA_SIZE_UNIT_SIZE));
3260 }
3261
3262 pvr_csb_emit (csb, VDMCTRL_PDS_STATE1, state1) {
3263 state1.pds_data_addr = PVR_DEV_ADDR(pds_vertex_descriptor_data_offset);
3264 state1.sd_type = PVRX(VDMCTRL_SD_TYPE_NONE);
3265 }
3266
3267 pvr_csb_emit (csb, VDMCTRL_PDS_STATE2, state2) {
3268 state2.pds_code_addr =
3269 PVR_DEV_ADDR(vertex_descriptor_state->pds_code.code_offset);
3270 }
3271 }
3272
3273 static void pvr_setup_output_select(struct pvr_cmd_buffer *const cmd_buffer)
3274 {
3275 struct pvr_emit_state *const emit_state = &cmd_buffer->state.emit_state;
3276 const struct pvr_graphics_pipeline *const gfx_pipeline =
3277 cmd_buffer->state.gfx_pipeline;
3278 struct pvr_ppp_state *const ppp_state = &cmd_buffer->state.ppp_state;
3279 const struct pvr_vertex_shader_state *const vertex_state =
3280 &gfx_pipeline->vertex_shader_state;
3281 uint32_t output_selects;
3282
3283 /* TODO: Handle vertex and fragment shader state flags. */
3284
3285 pvr_csb_pack (&output_selects, TA_OUTPUT_SEL, state) {
3286 const VkPrimitiveTopology topology =
3287 gfx_pipeline->input_asm_state.topology;
3288
3289 state.rhw_pres = true;
3290 state.vtxsize = DIV_ROUND_UP(vertex_state->vertex_output_size, 4U);
3291 state.psprite_size_pres = (topology == VK_PRIMITIVE_TOPOLOGY_POINT_LIST);
3292 }
3293
3294 if (ppp_state->output_selects != output_selects) {
3295 ppp_state->output_selects = output_selects;
3296 emit_state->output_selects = true;
3297 }
3298
3299 if (ppp_state->varying_word[0] != vertex_state->varying[0]) {
3300 ppp_state->varying_word[0] = vertex_state->varying[0];
3301 emit_state->varying_word0 = true;
3302 }
3303
3304 if (ppp_state->varying_word[1] != vertex_state->varying[1]) {
3305 ppp_state->varying_word[1] = vertex_state->varying[1];
3306 emit_state->varying_word1 = true;
3307 }
3308 }
3309
3310 static void
3311 pvr_setup_isp_faces_and_control(struct pvr_cmd_buffer *const cmd_buffer,
3312 struct PVRX(TA_STATE_ISPA) *const ispa_out)
3313 {
3314 struct pvr_emit_state *const emit_state = &cmd_buffer->state.emit_state;
3315 const struct pvr_graphics_pipeline *const gfx_pipeline =
3316 cmd_buffer->state.gfx_pipeline;
3317 struct pvr_ppp_state *const ppp_state = &cmd_buffer->state.ppp_state;
3318 const struct pvr_dynamic_state *const dynamic_state =
3319 &cmd_buffer->state.dynamic.common;
3320 const struct pvr_render_pass_info *const pass_info =
3321 &cmd_buffer->state.render_pass_info;
3322 const uint32_t subpass_idx = pass_info->subpass_idx;
3323 const uint32_t *depth_stencil_attachment_idx =
3324 pass_info->pass->subpasses[subpass_idx].depth_stencil_attachment;
3325 const struct pvr_image_view *const attachment =
3326 (!depth_stencil_attachment_idx)
3327 ? NULL
3328 : pass_info->attachments[*depth_stencil_attachment_idx];
3329
3330 const VkCullModeFlags cull_mode = gfx_pipeline->raster_state.cull_mode;
3331 const bool raster_discard_enabled =
3332 gfx_pipeline->raster_state.discard_enable;
3333 const bool disable_all = raster_discard_enabled || !attachment;
3334
3335 const VkPrimitiveTopology topology = gfx_pipeline->input_asm_state.topology;
3336 const enum PVRX(TA_OBJTYPE) obj_type = pvr_ta_objtype(topology);
3337
3338 const bool disable_stencil_write = disable_all;
3339 const bool disable_stencil_test =
3340 disable_all || !vk_format_has_stencil(attachment->vk.format);
3341
3342 const bool disable_depth_write = disable_all;
3343 const bool disable_depth_test = disable_all ||
3344 !vk_format_has_depth(attachment->vk.format);
3345
3346 uint32_t ispb_stencil_off;
3347 bool is_two_sided = false;
3348 uint32_t isp_control;
3349
3350 uint32_t line_width;
3351 uint32_t common_a;
3352 uint32_t front_a;
3353 uint32_t front_b;
3354 uint32_t back_a;
3355 uint32_t back_b;
3356
3357 /* Convert to 4.4 fixed point format. */
3358 line_width = util_unsigned_fixed(dynamic_state->line_width, 4);
3359
3360 /* Subtract 1 to shift values from range [0=0,256=16] to [0=1/16,255=16].
3361 * If 0 it stays at 0, otherwise we subtract 1.
3362 */
3363 line_width = (!!line_width) * (line_width - 1);
3364
3365 line_width = MIN2(line_width, PVRX(TA_STATE_ISPA_POINTLINEWIDTH_SIZE_MAX));
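/* For example, a lineWidth of 1.0 packs to 16 in 4.4 fixed point and then
 * maps to 15 after the subtract-by-one shift, i.e. exactly one pixel.
 */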
3366
3367 /* TODO: Part of the logic in this function is duplicated in another part
3368 * of the code. E.g. the dcmpmode, and sop1/2/3. Could we do this earlier?
3369 */
3370
3371 pvr_csb_pack (&common_a, TA_STATE_ISPA, ispa) {
3372 ispa.pointlinewidth = line_width;
3373
3374 if (disable_depth_test)
3375 ispa.dcmpmode = PVRX(TA_CMPMODE_ALWAYS);
3376 else
3377 ispa.dcmpmode = pvr_ta_cmpmode(gfx_pipeline->depth_compare_op);
3378
3379 /* FIXME: Can we just have this and remove the assignment above?
3380 * The user provides a depthTestEnable at vkCreateGraphicsPipelines();
3381 * should we be using that?
3382 */
3383 ispa.dcmpmode |= gfx_pipeline->depth_compare_op;
3384
3385 ispa.dwritedisable = disable_depth_test || disable_depth_write;
3386 /* FIXME: Can we just have this and remove the assignment above? */
3387 ispa.dwritedisable = ispa.dwritedisable ||
3388 gfx_pipeline->depth_write_disable;
3389
3390 ispa.passtype = gfx_pipeline->fragment_shader_state.pass_type;
3391
3392 ispa.objtype = obj_type;
3393
3394 /* Return unpacked ispa structure. dcmpmode, dwritedisable, passtype and
3395 * objtype are needed by pvr_setup_triangle_merging_flag.
3396 */
3397 if (ispa_out)
3398 *ispa_out = ispa;
3399 }
3400
3401 /* FIXME: This logic should be redone and improved. Can we also get rid of
3402 * the front and back variants?
3403 */
3404
3405 pvr_csb_pack (&front_a, TA_STATE_ISPA, ispa) {
3406 ispa.sref = (!disable_stencil_test) * dynamic_state->reference.front;
3407 }
3408 front_a |= common_a;
3409
3410 pvr_csb_pack (&back_a, TA_STATE_ISPA, ispa) {
3411 ispa.sref = (!disable_stencil_test) * dynamic_state->reference.back;
3412 }
3413 back_a |= common_a;
3414
3415 /* TODO: Does this actually represent the ispb control word on stencil off?
3416 * If not, rename the variable.
3417 */
3418 pvr_csb_pack (&ispb_stencil_off, TA_STATE_ISPB, ispb) {
3419 ispb.sop3 = PVRX(TA_ISPB_STENCILOP_KEEP);
3420 ispb.sop2 = PVRX(TA_ISPB_STENCILOP_KEEP);
3421 ispb.sop1 = PVRX(TA_ISPB_STENCILOP_KEEP);
3422 ispb.scmpmode = PVRX(TA_CMPMODE_ALWAYS);
3423 }
3424
3425 if (disable_stencil_test) {
3426 back_b = front_b = ispb_stencil_off;
3427 } else {
3428 pvr_csb_pack (&front_b, TA_STATE_ISPB, ispb) {
3429 ispb.swmask =
3430 (!disable_stencil_write) * dynamic_state->write_mask.front;
3431 ispb.scmpmask = dynamic_state->compare_mask.front;
3432
3433 ispb.sop3 = pvr_ta_stencilop(gfx_pipeline->stencil_front.pass_op);
3434 ispb.sop2 =
3435 pvr_ta_stencilop(gfx_pipeline->stencil_front.depth_fail_op);
3436 ispb.sop1 = pvr_ta_stencilop(gfx_pipeline->stencil_front.fail_op);
3437
3438 ispb.scmpmode = pvr_ta_cmpmode(gfx_pipeline->stencil_front.compare_op);
3439 }
3440
3441 pvr_csb_pack (&back_b, TA_STATE_ISPB, ispb) {
3442 ispb.swmask =
3443 (!disable_stencil_write) * dynamic_state->write_mask.back;
3444 ispb.scmpmask = dynamic_state->compare_mask.back;
3445
3446 ispb.sop3 = pvr_ta_stencilop(gfx_pipeline->stencil_back.pass_op);
3447 ispb.sop2 = pvr_ta_stencilop(gfx_pipeline->stencil_back.depth_fail_op);
3448 ispb.sop1 = pvr_ta_stencilop(gfx_pipeline->stencil_back.fail_op);
3449
3450 ispb.scmpmode = pvr_ta_cmpmode(gfx_pipeline->stencil_back.compare_op);
3451 }
3452 }
3453
3454 if (front_a != back_a || front_b != back_b) {
3455 if (cull_mode & VK_CULL_MODE_BACK_BIT) {
3456 /* Single face, using front state. */
3457 } else if (cull_mode & VK_CULL_MODE_FRONT_BIT) {
3458 /* Single face, using back state. */
3459
3460 front_a = back_a;
3461 front_b = back_b;
3462 } else {
3463 /* Both faces. */
3464
3465 emit_state->isp_ba = is_two_sided = true;
3466
3467 if (gfx_pipeline->raster_state.front_face ==
3468 VK_FRONT_FACE_COUNTER_CLOCKWISE) {
3469 uint32_t tmp = front_a;
3470
3471 front_a = back_a;
3472 back_a = tmp;
3473
3474 tmp = front_b;
3475 front_b = back_b;
3476 back_b = tmp;
3477 }
3478
3479 /* HW defaults to stencil off. */
3480 if (back_b != ispb_stencil_off)
3481 emit_state->isp_fb = emit_state->isp_bb = true;
3482 }
3483 }
3484
3485 if (!disable_stencil_test && front_b != ispb_stencil_off)
3486 emit_state->isp_fb = true;
3487
3488 pvr_csb_pack (&isp_control, TA_STATE_ISPCTL, ispctl) {
3489 ispctl.upass = pass_info->userpass_spawn;
3490
3491 /* TODO: is bo ever NULL? Figure out what to do. */
3492 ispctl.tagwritedisable = raster_discard_enabled ||
3493 !gfx_pipeline->fragment_shader_state.bo;
3494
3495 ispctl.two_sided = is_two_sided;
3496 ispctl.bpres = emit_state->isp_fb || emit_state->isp_bb;
3497
3498 ispctl.dbenable = !raster_discard_enabled &&
3499 gfx_pipeline->raster_state.depth_bias_enable &&
3500 obj_type == PVRX(TA_OBJTYPE_TRIANGLE);
3501 ispctl.scenable = !raster_discard_enabled;
3502
3503 ppp_state->isp.control_struct = ispctl;
3504 }
3505
3506 emit_state->isp = true;
3507
3508 ppp_state->isp.control = isp_control;
3509 ppp_state->isp.front_a = front_a;
3510 ppp_state->isp.front_b = front_b;
3511 ppp_state->isp.back_a = back_a;
3512 ppp_state->isp.back_b = back_b;
3513 }
3514
3515 static void pvr_get_viewport_scissor_overlap(const VkViewport *const viewport,
3516 const VkRect2D *const scissor,
3517 VkRect2D *const rect_out)
3518 {
3519 /* TODO: See if we can remove this struct. */
3520 struct pvr_rect {
3521 int32_t x0, y0;
3522 int32_t x1, y1;
3523 };
3524
3525 /* TODO: Worry about overflow? */
3526 const struct pvr_rect scissor_rect = {
3527 .x0 = scissor->offset.x,
3528 .y0 = scissor->offset.y,
3529 .x1 = scissor->offset.x + scissor->extent.width,
3530 .y1 = scissor->offset.y + scissor->extent.height
3531 };
3532 struct pvr_rect viewport_rect = { 0 };
3533
3534 assert(viewport->width >= 0.0f);
3535 assert(scissor_rect.x0 >= 0);
3536 assert(scissor_rect.y0 >= 0);
3537
3538 if (scissor->extent.width == 0 || scissor->extent.height == 0) {
3539 *rect_out = (VkRect2D){ 0 };
3540 return;
3541 }
3542
3543 viewport_rect.x0 = (int32_t)viewport->x;
3544 viewport_rect.x1 = (int32_t)viewport->x + (int32_t)viewport->width;
3545
3546 /* TODO: Is there a mathematical way of doing all this and then clamp at
3547 * the end?
3548 */
3549 /* We flip the y0 and y1 when height is negative. */
3550 viewport_rect.y0 = (int32_t)viewport->y + MIN2(0, (int32_t)viewport->height);
3551 viewport_rect.y1 = (int32_t)viewport->y + MAX2(0, (int32_t)viewport->height);
3552
3553 if (scissor_rect.x1 <= viewport_rect.x0 ||
3554 scissor_rect.y1 <= viewport_rect.y0 ||
3555 scissor_rect.x0 >= viewport_rect.x1 ||
3556 scissor_rect.y0 >= viewport_rect.y1) {
3557 *rect_out = (VkRect2D){ 0 };
3558 return;
3559 }
3560
3561 /* Determine the overlapping rectangle. */
3562 viewport_rect.x0 = MAX2(viewport_rect.x0, scissor_rect.x0);
3563 viewport_rect.y0 = MAX2(viewport_rect.y0, scissor_rect.y0);
3564 viewport_rect.x1 = MIN2(viewport_rect.x1, scissor_rect.x1);
3565 viewport_rect.y1 = MIN2(viewport_rect.y1, scissor_rect.y1);
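/* For example, a viewport at (10, 10) sized 100x100 clipped by a scissor
 * at (0, 0) sized 50x50 yields an overlap at (10, 10) sized 40x40.
 */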
3566
3567 /* TODO: Is this conversion safe? Is this logic right? */
3568 rect_out->offset.x = (uint32_t)viewport_rect.x0;
3569 rect_out->offset.y = (uint32_t)viewport_rect.y0;
3570 rect_out->extent.height = (uint32_t)(viewport_rect.y1 - viewport_rect.y0);
3571 rect_out->extent.width = (uint32_t)(viewport_rect.x1 - viewport_rect.x0);
3572 }
3573
3574 static inline uint32_t
3575 pvr_get_geom_region_clip_align_size(struct pvr_device_info *const dev_info)
3576 {
3577 /* TODO: This should come from rogue_ppp.xml. */
3578 return 16U + 16U * (!PVR_HAS_FEATURE(dev_info, tile_size_16x16));
3579 }
3580
3581 /* FIXME: Remove device param when PVR_HAS_FEATURE() accepts const dev_info */
3582 static void
3583 pvr_setup_isp_depth_bias_scissor_state(struct pvr_cmd_buffer *const cmd_buffer)
3584 {
3585 struct pvr_emit_state *const emit_state = &cmd_buffer->state.emit_state;
3586 struct pvr_ppp_state *const ppp_state = &cmd_buffer->state.ppp_state;
3587 const struct pvr_dynamic_state *const dynamic_state =
3588 &cmd_buffer->state.dynamic.common;
3589 const struct PVRX(TA_STATE_ISPCTL) *const ispctl =
3590 &ppp_state->isp.control_struct;
3591 struct pvr_device_info *const dev_info =
3592 &cmd_buffer->device->pdevice->dev_info;
3593
3594 if (ispctl->dbenable)
3595 assert(!"Unimplemented");
3596
3597 if (ispctl->scenable) {
3598 const uint32_t region_clip_align_size =
3599 pvr_get_geom_region_clip_align_size(dev_info);
3600 const VkViewport *const viewport = &dynamic_state->viewport.viewports[0];
3601 const VkRect2D *const scissor = &dynamic_state->scissor.scissors[0];
3602 VkRect2D overlap_rect;
3603 uint32_t scissor_words[2];
3604 uint32_t height;
3605 uint32_t width;
3606 uint32_t x;
3607 uint32_t y;
3608
3609 /* For region clip. */
3610 uint32_t bottom;
3611 uint32_t right;
3612 uint32_t left;
3613 uint32_t top;
3614
3615 /* We don't support multiple viewport calculations. */
3616 assert(dynamic_state->viewport.count == 1);
3617 /* We don't support multiple scissor calculations. */
3618 assert(dynamic_state->scissor.count == 1);
3619
3620 pvr_get_viewport_scissor_overlap(viewport, scissor, &overlap_rect);
3621
3622 x = overlap_rect.offset.x;
3623 y = overlap_rect.offset.y;
3624 width = overlap_rect.extent.width;
3625 height = overlap_rect.extent.height;
3626
3627 pvr_csb_pack (&scissor_words[0], IPF_SCISSOR_WORD_0, word0) {
3628 word0.scw0_xmax = x + width;
3629 word0.scw0_xmin = x;
3630 }
3631
3632 pvr_csb_pack (&scissor_words[1], IPF_SCISSOR_WORD_1, word1) {
3633 word1.scw1_ymax = y + height;
3634 word1.scw1_ymin = y;
3635 }
3636
3637 if (cmd_buffer->scissor_array.size &&
3638 cmd_buffer->scissor_words[0] == scissor_words[0] &&
3639 cmd_buffer->scissor_words[1] == scissor_words[1]) {
3640 return;
3641 }
3642
3643 cmd_buffer->scissor_words[0] = scissor_words[0];
3644 cmd_buffer->scissor_words[1] = scissor_words[1];
3645
3646 /* Calculate region clip. */
3647
3648 left = x / region_clip_align_size;
3649 top = y / region_clip_align_size;
3650
3651 /* Guard against right underflowing to -1 when (x + width) is 0. */
3652 /* TODO: Is there a better way of doing this? */
3653 if ((x + width) != 0U)
3654 right = DIV_ROUND_UP(x + width, region_clip_align_size) - 1;
3655 else
3656 right = 0;
3657
3658 if ((y + height) != 0U)
3659 bottom = DIV_ROUND_UP(y + height, region_clip_align_size) - 1;
3660 else
3661 bottom = 0U;
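/* For example, with a 16-pixel region clip alignment a 100x100 scissor at
 * (0, 0) gives left = top = 0 and right = bottom = DIV_ROUND_UP(100, 16) - 1
 * = 6.
 */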
3662
3663 /* Setup region clip to clip everything outside what was calculated. */
3664
3665 /* FIXME: Should we mask to prevent writing over other words? */
3666 pvr_csb_pack (&ppp_state->region_clipping.word0, TA_REGION_CLIP0, word0) {
3667 word0.right = right;
3668 word0.left = left;
3669 word0.mode = PVRX(TA_REGION_CLIP_MODE_OUTSIDE);
3670 }
3671
3672 pvr_csb_pack (&ppp_state->region_clipping.word1, TA_REGION_CLIP1, word1) {
3673 word1.bottom = bottom;
3674 word1.top = top;
3675 }
3676
3677 ppp_state->depthbias_scissor_indices.scissor_index =
3678 util_dynarray_num_elements(&cmd_buffer->scissor_array,
3679 __typeof__(cmd_buffer->scissor_words));
3680
3681 memcpy(util_dynarray_grow_bytes(&cmd_buffer->scissor_array,
3682 1,
3683 sizeof(cmd_buffer->scissor_words)),
3684 cmd_buffer->scissor_words,
3685 sizeof(cmd_buffer->scissor_words));
3686
3687 emit_state->isp_dbsc = true;
3688 emit_state->region_clip = true;
3689 }
3690 }
3691
3692 static void
3693 pvr_setup_triangle_merging_flag(struct pvr_cmd_buffer *const cmd_buffer,
3694 struct PVRX(TA_STATE_ISPA) * ispa)
3695 {
3696 struct pvr_emit_state *const emit_state = &cmd_buffer->state.emit_state;
3697 struct pvr_ppp_state *const ppp_state = &cmd_buffer->state.ppp_state;
3698 uint32_t merge_word;
3699 uint32_t mask;
3700
3701 pvr_csb_pack (&merge_word, TA_STATE_PDS_SIZEINFO2, size_info) {
3702 /* Disable for lines or punch-through or for DWD and depth compare
3703 * always.
3704 */
3705 if (ispa->objtype == PVRX(TA_OBJTYPE_LINE) ||
3706 ispa->passtype == PVRX(TA_PASSTYPE_PUNCH_THROUGH) ||
3707 (ispa->dwritedisable && ispa->dcmpmode == PVRX(TA_CMPMODE_ALWAYS))) {
3708 size_info.pds_tri_merge_disable = true;
3709 }
3710 }
3711
3712 pvr_csb_pack (&mask, TA_STATE_PDS_SIZEINFO2, size_info) {
3713 size_info.pds_tri_merge_disable = true;
3714 }
3715
3716 merge_word |= ppp_state->pds.size_info2 & ~mask;
3717
3718 if (merge_word != ppp_state->pds.size_info2) {
3719 ppp_state->pds.size_info2 = merge_word;
3720 emit_state->pds_fragment_stateptr0 = true;
3721 }
3722 }
3723
3724 static void
3725 pvr_setup_fragment_state_pointers(struct pvr_cmd_buffer *const cmd_buffer,
3726 struct pvr_sub_cmd_gfx *const sub_cmd)
3727 {
3728 struct pvr_cmd_buffer_state *const state = &cmd_buffer->state;
3729 const struct pvr_stage_allocation_descriptor_state *descriptor_shader_state =
3730 &state->gfx_pipeline->fragment_shader_state.descriptor_state;
3731 const struct pvr_pds_upload *pds_coeff_program =
3732 &state->gfx_pipeline->fragment_shader_state.pds_coeff_program;
3733 const struct pvr_pipeline_stage_state *fragment_state =
3734 &state->gfx_pipeline->fragment_shader_state.stage_state;
3735 const struct pvr_physical_device *pdevice = cmd_buffer->device->pdevice;
3736 struct pvr_emit_state *const emit_state = &state->emit_state;
3737 struct pvr_ppp_state *const ppp_state = &state->ppp_state;
3738
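/* All of the sizes below are expressed in the hardware's allocation units,
 * hence the DIV_ROUND_UP against the corresponding *_UNIT_SIZE constants.
 */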
3739 const uint32_t pds_uniform_size =
3740 DIV_ROUND_UP(descriptor_shader_state->pds_info.data_size_in_dwords,
3741 PVRX(TA_STATE_PDS_SIZEINFO1_PDS_UNIFORMSIZE_UNIT_SIZE));
3742
3743 const uint32_t pds_varying_state_size =
3744 DIV_ROUND_UP(pds_coeff_program->data_size,
3745 PVRX(TA_STATE_PDS_SIZEINFO1_PDS_VARYINGSIZE_UNIT_SIZE));
3746
3747 const uint32_t usc_varying_size =
3748 DIV_ROUND_UP(fragment_state->coefficient_size,
3749 PVRX(TA_STATE_PDS_SIZEINFO1_USC_VARYINGSIZE_UNIT_SIZE));
3750
3751 const uint32_t pds_temp_size =
3752 DIV_ROUND_UP(fragment_state->temps_count,
3753 PVRX(TA_STATE_PDS_SIZEINFO1_PDS_TEMPSIZE_UNIT_SIZE));
3754
3755 const uint32_t usc_shared_size =
3756 DIV_ROUND_UP(fragment_state->const_shared_reg_count,
3757 PVRX(TA_STATE_PDS_SIZEINFO2_USC_SHAREDSIZE_UNIT_SIZE));
3758
3759 const uint32_t max_tiles_in_flight =
3760 pvr_calc_fscommon_size_and_tiles_in_flight(
3761 pdevice,
3762 usc_shared_size *
3763 PVRX(TA_STATE_PDS_SIZEINFO2_USC_SHAREDSIZE_UNIT_SIZE),
3764 1);
3765 uint32_t size_info_mask;
3766 uint32_t size_info2;
3767
3768 if (max_tiles_in_flight < sub_cmd->max_tiles_in_flight)
3769 sub_cmd->max_tiles_in_flight = max_tiles_in_flight;
3770
3771 pvr_csb_pack (&ppp_state->pds.pixel_shader_base,
3772 TA_STATE_PDS_SHADERBASE,
3773 shader_base) {
3774 const struct pvr_pds_upload *const pds_upload =
3775 &state->gfx_pipeline->fragment_shader_state.pds_fragment_program;
3776
3777 shader_base.addr = PVR_DEV_ADDR(pds_upload->data_offset);
3778 }
3779
3780 if (descriptor_shader_state->pds_code.pvr_bo) {
3781 pvr_csb_pack (&ppp_state->pds.texture_uniform_code_base,
3782 TA_STATE_PDS_TEXUNICODEBASE,
3783 tex_base) {
3784 tex_base.addr =
3785 PVR_DEV_ADDR(descriptor_shader_state->pds_code.code_offset);
3786 }
3787 } else {
3788 ppp_state->pds.texture_uniform_code_base = 0U;
3789 }
3790
3791 pvr_csb_pack (&ppp_state->pds.size_info1, TA_STATE_PDS_SIZEINFO1, info1) {
3792 info1.pds_uniformsize = pds_uniform_size;
3793 info1.pds_texturestatesize = 0U;
3794 info1.pds_varyingsize = pds_varying_state_size;
3795 info1.usc_varyingsize = usc_varying_size;
3796 info1.pds_tempsize = pds_temp_size;
3797 }
3798
3799 pvr_csb_pack (&size_info_mask, TA_STATE_PDS_SIZEINFO2, mask) {
3800 mask.pds_tri_merge_disable = true;
3801 }
3802
3803 ppp_state->pds.size_info2 &= size_info_mask;
3804
3805 pvr_csb_pack (&size_info2, TA_STATE_PDS_SIZEINFO2, info2) {
3806 info2.usc_sharedsize = usc_shared_size;
3807 }
3808
3809 ppp_state->pds.size_info2 |= size_info2;
3810
3811 if (pds_coeff_program->pvr_bo) {
3812 state->emit_state.pds_fragment_stateptr1 = true;
3813
3814 pvr_csb_pack (&ppp_state->pds.varying_base,
3815 TA_STATE_PDS_VARYINGBASE,
3816 base) {
3817 base.addr = PVR_DEV_ADDR(pds_coeff_program->data_offset);
3818 }
3819 } else {
3820 ppp_state->pds.varying_base = 0U;
3821 }
3822
3823 pvr_csb_pack (&ppp_state->pds.uniform_state_data_base,
3824 TA_STATE_PDS_UNIFORMDATABASE,
3825 base) {
3826 base.addr = PVR_DEV_ADDR(state->pds_fragment_descriptor_data_offset);
3827 }
3828
3829 emit_state->pds_fragment_stateptr0 = true;
3830 emit_state->pds_fragment_stateptr3 = true;
3831 }
3832
3833 static void pvr_setup_viewport(struct pvr_cmd_buffer *const cmd_buffer)
3834 {
3835 struct pvr_cmd_buffer_state *const state = &cmd_buffer->state;
3836 struct pvr_emit_state *const emit_state = &state->emit_state;
3837 struct pvr_ppp_state *const ppp_state = &state->ppp_state;
3838
3839 if (ppp_state->viewport_count != state->dynamic.common.viewport.count) {
3840 ppp_state->viewport_count = state->dynamic.common.viewport.count;
3841 emit_state->viewport = true;
3842 }
3843
3844 if (state->gfx_pipeline->raster_state.discard_enable) {
3845 /* We don't want to emit any viewport data as it'll just get thrown
3846 * away. It's after the previous condition because we still want to
3847 * stash the viewport_count as it's our trigger for when
3848 * rasterizer discard gets disabled.
3849 */
3850 emit_state->viewport = false;
3851 return;
3852 }
3853
3854 for (uint32_t i = 0; i < ppp_state->viewport_count; i++) {
3855 VkViewport *viewport = &state->dynamic.common.viewport.viewports[i];
3856 uint32_t x_scale = fui(viewport->width * 0.5f);
3857 uint32_t y_scale = fui(viewport->height * 0.5f);
3858 uint32_t z_scale = fui(viewport->maxDepth - viewport->minDepth);
3859 uint32_t x_center = fui(viewport->x + viewport->width * 0.5f);
3860 uint32_t y_center = fui(viewport->y + viewport->height * 0.5f);
3861 uint32_t z_center = fui(viewport->minDepth);
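/* For example, a 1920x1080 viewport at the origin with depth range [0, 1]
 * gives x_scale/x_center = 960.0, y_scale/y_center = 540.0, z_scale = 1.0
 * and z_center = 0.0, each stored as a float bit pattern via fui().
 */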
3862
3863 if (ppp_state->viewports[i].a0 != x_center ||
3864 ppp_state->viewports[i].m0 != x_scale ||
3865 ppp_state->viewports[i].a1 != y_center ||
3866 ppp_state->viewports[i].m1 != y_scale ||
3867 ppp_state->viewports[i].a2 != z_center ||
3868 ppp_state->viewports[i].m2 != z_scale) {
3869 ppp_state->viewports[i].a0 = x_center;
3870 ppp_state->viewports[i].m0 = x_scale;
3871 ppp_state->viewports[i].a1 = y_center;
3872 ppp_state->viewports[i].m1 = y_scale;
3873 ppp_state->viewports[i].a2 = z_center;
3874 ppp_state->viewports[i].m2 = z_scale;
3875
3876 emit_state->viewport = true;
3877 }
3878 }
3879 }
3880
3881 static void pvr_setup_ppp_control(struct pvr_cmd_buffer *const cmd_buffer)
3882 {
3883 struct pvr_cmd_buffer_state *const state = &cmd_buffer->state;
3884 const struct pvr_graphics_pipeline *const gfx_pipeline = state->gfx_pipeline;
3885 struct pvr_emit_state *const emit_state = &state->emit_state;
3886 struct pvr_ppp_state *const ppp_state = &state->ppp_state;
3887 uint32_t ppp_control;
3888
3889 pvr_csb_pack (&ppp_control, TA_STATE_PPP_CTRL, control) {
3890 const struct pvr_raster_state *raster_state = &gfx_pipeline->raster_state;
3891 VkPrimitiveTopology topology = gfx_pipeline->input_asm_state.topology;
3892 control.drawclippededges = true;
3893 control.wclampen = true;
3894
3895 if (topology == VK_PRIMITIVE_TOPOLOGY_TRIANGLE_FAN)
3896 control.flatshade_vtx = PVRX(TA_FLATSHADE_VTX_VERTEX_1);
3897 else
3898 control.flatshade_vtx = PVRX(TA_FLATSHADE_VTX_VERTEX_0);
3899
3900 if (raster_state->depth_clamp_enable)
3901 control.clip_mode = PVRX(TA_CLIP_MODE_NO_FRONT_OR_REAR);
3902 else
3903 control.clip_mode = PVRX(TA_CLIP_MODE_FRONT_REAR);
3904
3905 /* +--- FrontIsCCW?
3906 * | +--- Cull Front?
3907 * v v
3908 * 0|0 CULLMODE_CULL_CCW,
3909 * 0|1 CULLMODE_CULL_CW,
3910 * 1|0 CULLMODE_CULL_CW,
3911 * 1|1 CULLMODE_CULL_CCW,
3912 */
3913 switch (raster_state->cull_mode) {
3914 case VK_CULL_MODE_BACK_BIT:
3915 case VK_CULL_MODE_FRONT_BIT:
3916 if ((raster_state->front_face == VK_FRONT_FACE_COUNTER_CLOCKWISE) ^
3917 (raster_state->cull_mode == VK_CULL_MODE_FRONT_BIT)) {
3918 control.cullmode = PVRX(TA_CULLMODE_CULL_CW);
3919 } else {
3920 control.cullmode = PVRX(TA_CULLMODE_CULL_CCW);
3921 }
3922
3923 break;
3924
3925 case VK_CULL_MODE_NONE:
3926 control.cullmode = PVRX(TA_CULLMODE_NO_CULLING);
3927 break;
3928
3929 default:
3930 unreachable("Unsupported cull mode!");
3931 }
3932 }
3933
3934 if (ppp_control != ppp_state->ppp_control) {
3935 ppp_state->ppp_control = ppp_control;
3936 emit_state->ppp_control = true;
3937 }
3938 }
3939
3940 /* Largest valid PPP State update in words = 31
3941 * 1 - Header
3942 * 3 - Stream Out Config words 0, 1 and 2
3943 * 1 - PPP Control word
3944 * 3 - Varying Config words 0, 1 and 2
3945 * 1 - Output Select
3946 * 1 - WClamp
3947 * 6 - Viewport Transform words
3948 * 2 - Region Clip words
3949 * 3 - PDS State for fragment phase (PDSSTATEPTR 1-3)
3950 * 4 - PDS State for fragment phase (PDSSTATEPTR0)
3951 * 6 - ISP Control Words
3952 */
3953 #define PVR_MAX_PPP_STATE_DWORDS 31
3954
3955 static VkResult pvr_emit_ppp_state(struct pvr_cmd_buffer *const cmd_buffer,
3956 struct pvr_sub_cmd_gfx *const sub_cmd)
3957 {
3958 struct pvr_cmd_buffer_state *const state = &cmd_buffer->state;
3959 struct pvr_emit_state *const emit_state = &state->emit_state;
3960 struct pvr_ppp_state *const ppp_state = &state->ppp_state;
3961 struct pvr_csb *const control_stream = &sub_cmd->control_stream;
3962 uint32_t ppp_state_words[PVR_MAX_PPP_STATE_DWORDS];
3963 uint32_t ppp_state_words_count;
3964 uint32_t ppp_state_header;
3965 bool deferred_secondary;
3966 struct pvr_bo *pvr_bo;
3967 uint32_t *buffer_ptr;
3968 VkResult result;
3969
3970 buffer_ptr = ppp_state_words;
3971
3972 pvr_csb_pack (&ppp_state_header, TA_STATE_HEADER, header) {
3973 header.view_port_count = (ppp_state->viewport_count == 0)
3974 ? 0U
3975 : (ppp_state->viewport_count - 1);
3976
3977 /* Skip over header. */
3978 buffer_ptr++;
3979
3980 /* Set ISP state. */
3981 if (emit_state->isp) {
3982 header.pres_ispctl = true;
3983 *buffer_ptr++ = ppp_state->isp.control;
3984 header.pres_ispctl_fa = true;
3985 *buffer_ptr++ = ppp_state->isp.front_a;
3986
3987 if (emit_state->isp_fb) {
3988 header.pres_ispctl_fb = true;
3989 *buffer_ptr++ = ppp_state->isp.front_b;
3990 }
3991
3992 if (emit_state->isp_ba) {
3993 header.pres_ispctl_ba = true;
3994 *buffer_ptr++ = ppp_state->isp.back_a;
3995 }
3996
3997 if (emit_state->isp_bb) {
3998 header.pres_ispctl_bb = true;
3999 *buffer_ptr++ = ppp_state->isp.back_b;
4000 }
4001 }
4002
4003 /* Depth bias / scissor
4004 * If deferred_secondary is true then we do a separate state update
4005 * which gets patched in ExecuteDeferredCommandBuffer.
4006 */
4007 /* TODO: Update above comment when we port ExecuteDeferredCommandBuffer.
4008 */
4009 deferred_secondary =
4010 cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY &&
4011 cmd_buffer->usage_flags & VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT;
4012
4013 if (emit_state->isp_dbsc && !deferred_secondary) {
4014 header.pres_ispctl_dbsc = true;
4015
4016 pvr_csb_pack (buffer_ptr++, TA_STATE_ISPDBSC, ispdbsc) {
4017 ispdbsc.dbindex =
4018 ppp_state->depthbias_scissor_indices.depthbias_index;
4019 ispdbsc.scindex =
4020 ppp_state->depthbias_scissor_indices.scissor_index;
4021 }
4022 }
4023
4024 /* PDS state. */
4025 if (emit_state->pds_fragment_stateptr0) {
4026 header.pres_pds_state_ptr0 = true;
4027
4028 *buffer_ptr++ = ppp_state->pds.pixel_shader_base;
4029 *buffer_ptr++ = ppp_state->pds.texture_uniform_code_base;
4030 *buffer_ptr++ = ppp_state->pds.size_info1;
4031 *buffer_ptr++ = ppp_state->pds.size_info2;
4032 }
4033
4034 if (emit_state->pds_fragment_stateptr1) {
4035 header.pres_pds_state_ptr1 = true;
4036 *buffer_ptr++ = ppp_state->pds.varying_base;
4037 }
4038
4039 /* We don't use the pds_fragment_stateptr2 (texture state programs)
4040 * control word, and we don't need to zero it either: the hardware only
4041 * runs the texture state program when the pds_texturestatesize field of
4042 * PDS_SIZEINFO1 is non-zero.
4043 */
4044
4045 if (emit_state->pds_fragment_stateptr3) {
4046 header.pres_pds_state_ptr3 = true;
4047 *buffer_ptr++ = ppp_state->pds.uniform_state_data_base;
4048 }
4049
4050 /* Region clip. */
4051 if (emit_state->region_clip) {
4052 header.pres_region_clip = true;
4053 *buffer_ptr++ = ppp_state->region_clipping.word0;
4054 *buffer_ptr++ = ppp_state->region_clipping.word1;
4055 }
4056
4057 /* Viewport. */
4058 if (emit_state->viewport) {
4059 const uint32_t viewports = MAX2(1, ppp_state->viewport_count);
4060
4061 header.pres_viewport = true;
4062 for (uint32_t i = 0; i < viewports; i++) {
4063 *buffer_ptr++ = ppp_state->viewports[i].a0;
4064 *buffer_ptr++ = ppp_state->viewports[i].m0;
4065 *buffer_ptr++ = ppp_state->viewports[i].a1;
4066 *buffer_ptr++ = ppp_state->viewports[i].m1;
4067 *buffer_ptr++ = ppp_state->viewports[i].a2;
4068 *buffer_ptr++ = ppp_state->viewports[i].m2;
4069 }
4070 }
4071
4072 /* W clamp. */
4073 if (emit_state->wclamp) {
4074 const float wclamp = 0.00001f;
4075
4076 header.pres_wclamp = true;
4077 *buffer_ptr++ = fui(wclamp);
4078 }
4079
4080 /* Output selects. */
4081 if (emit_state->output_selects) {
4082 header.pres_outselects = true;
4083 *buffer_ptr++ = ppp_state->output_selects;
4084 }
4085
4086 /* Varying words. */
4087 if (emit_state->varying_word0) {
4088 header.pres_varying_word0 = true;
4089 *buffer_ptr++ = ppp_state->varying_word[0];
4090 }
4091
4092 if (emit_state->varying_word1) {
4093 header.pres_varying_word1 = true;
4094 *buffer_ptr++ = ppp_state->varying_word[1];
4095 }
4096
4097 if (emit_state->varying_word2) {
4098 /* We only emit this on the first draw of a render job to prevent us
4099 * from inheriting a non-zero value set elsewhere.
4100 */
4101 header.pres_varying_word2 = true;
4102 *buffer_ptr++ = 0;
4103 }
4104
4105 /* PPP control. */
4106 if (emit_state->ppp_control) {
4107 header.pres_ppp_ctrl = true;
4108 *buffer_ptr++ = ppp_state->ppp_control;
4109 }
4110
4111 if (emit_state->stream_out) {
4112 /* We only emit this on the first draw of a render job to prevent us
4113 * from inheriting a non-zero value set elsewhere.
4114 */
4115 header.pres_stream_out_size = true;
4116 *buffer_ptr++ = 0;
4117 }
4118 }
4119
4120 if (!ppp_state_header)
4121 return VK_SUCCESS;
4122
4123 ppp_state_words_count = buffer_ptr - ppp_state_words;
4124 ppp_state_words[0] = ppp_state_header;
4125
4126 result = pvr_cmd_buffer_alloc_mem(cmd_buffer,
4127 cmd_buffer->device->heaps.general_heap,
4128 ppp_state_words_count * sizeof(uint32_t),
4129 PVR_BO_ALLOC_FLAG_CPU_MAPPED,
4130 &pvr_bo);
4131 if (result != VK_SUCCESS)
4132 return result;
4133
4134 memcpy(pvr_bo->bo->map,
4135 ppp_state_words,
4136 ppp_state_words_count * sizeof(uint32_t));
4137
4138 /* Write the VDM state update into the VDM control stream. */
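/* The state block's device address is split across the two control words:
 * VDMCTRL_PPP_STATE0 carries the address MSBs together with the word count,
 * while VDMCTRL_PPP_STATE1 carries the LSBs.
 */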
4139 pvr_csb_emit (control_stream, VDMCTRL_PPP_STATE0, state0) {
4140 state0.word_count = ppp_state_words_count;
4141 state0.addrmsb = pvr_bo->vma->dev_addr;
4142 }
4143
4144 pvr_csb_emit (control_stream, VDMCTRL_PPP_STATE1, state1) {
4145 state1.addrlsb = pvr_bo->vma->dev_addr;
4146 }
4147
4148 if (emit_state->isp_dbsc &&
4149 cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY) {
4150 pvr_finishme("Unimplemented path!!");
4151 }
4152
4153 state->emit_state_bits = 0;
4154
4155 return VK_SUCCESS;
4156 }
4157
4158 static VkResult
4159 pvr_emit_dirty_ppp_state(struct pvr_cmd_buffer *const cmd_buffer,
4160 struct pvr_sub_cmd_gfx *const sub_cmd)
4161 {
4162 struct pvr_cmd_buffer_state *const state = &cmd_buffer->state;
4163 const struct pvr_graphics_pipeline *const gfx_pipeline = state->gfx_pipeline;
4164 const bool dirty_stencil = state->dirty.compare_mask ||
4165 state->dirty.write_mask || state->dirty.reference;
4166 VkResult result;
4167
4168 if (!(dirty_stencil || state->dirty.depth_bias ||
4169 state->dirty.fragment_descriptors || state->dirty.line_width ||
4170 state->dirty.gfx_pipeline_binding || state->dirty.scissor ||
4171 state->dirty.userpass_spawn || state->dirty.viewport ||
4172 state->emit_state_bits)) {
4173 return VK_SUCCESS;
4174 }
4175
4176 if (state->dirty.gfx_pipeline_binding) {
4177 struct PVRX(TA_STATE_ISPA) ispa;
4178
4179 pvr_setup_output_select(cmd_buffer);
4180 pvr_setup_isp_faces_and_control(cmd_buffer, &ispa);
4181 pvr_setup_triangle_merging_flag(cmd_buffer, &ispa);
4182 } else if (dirty_stencil || state->dirty.line_width ||
4183 state->dirty.userpass_spawn) {
4184 pvr_setup_isp_faces_and_control(cmd_buffer, NULL);
4185 }
4186
4187 if (!gfx_pipeline->raster_state.discard_enable &&
4188 state->dirty.fragment_descriptors &&
4189 gfx_pipeline->fragment_shader_state.bo) {
4190 pvr_setup_fragment_state_pointers(cmd_buffer, sub_cmd);
4191 }
4192
4193 pvr_setup_isp_depth_bias_scissor_state(cmd_buffer);
4194
4195 if (state->dirty.viewport)
4196 pvr_setup_viewport(cmd_buffer);
4197
4198 pvr_setup_ppp_control(cmd_buffer);
4199
4200 if (gfx_pipeline->raster_state.cull_mode == VK_CULL_MODE_FRONT_AND_BACK) {
4201 /* FIXME: Port SetNegativeViewport(). */
4202 }
4203
4204 result = pvr_emit_ppp_state(cmd_buffer, sub_cmd);
4205 if (result != VK_SUCCESS)
4206 return result;
4207
4208 return VK_SUCCESS;
4209 }
4210
4211 static void
4212 pvr_calculate_vertex_cam_size(const struct pvr_device_info *dev_info,
4213 const uint32_t vs_output_size,
4214 const bool raster_enable,
4215 uint32_t *const cam_size_out,
4216 uint32_t *const vs_max_instances_out)
4217 {
4218 /* First work out the size of a vertex in the UVS and multiply by 4 for
4219 * column ordering.
4220 */
4221 const uint32_t uvs_vertex_vector_size_in_dwords =
4222 (vs_output_size + 1U + raster_enable * 4U) * 4U;
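/* For example, a vs_output_size of 8 with rasterization enabled gives
 * (8 + 1 + 4) * 4 = 52 dwords per vertex vector.
 */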
4223 const uint32_t vdm_cam_size =
4224 PVR_GET_FEATURE_VALUE(dev_info, vdm_cam_size, 32U);
4225
4226 /* This is a proxy for 8XE. */
4227 if (PVR_HAS_FEATURE(dev_info, simple_internal_parameter_format) &&
4228 vdm_cam_size < 96U) {
4229 /* Comparisons are based on size including scratch per vertex vector. */
4230 if (uvs_vertex_vector_size_in_dwords < (14U * 4U)) {
4231 *cam_size_out = MIN2(31U, vdm_cam_size - 1U);
4232 *vs_max_instances_out = 16U;
4233 } else if (uvs_vertex_vector_size_in_dwords < (20U * 4U)) {
4234 *cam_size_out = 15U;
4235 *vs_max_instances_out = 16U;
4236 } else if (uvs_vertex_vector_size_in_dwords < (28U * 4U)) {
4237 *cam_size_out = 11U;
4238 *vs_max_instances_out = 12U;
4239 } else if (uvs_vertex_vector_size_in_dwords < (44U * 4U)) {
4240 *cam_size_out = 7U;
4241 *vs_max_instances_out = 8U;
4242 } else if (PVR_HAS_FEATURE(dev_info,
4243 simple_internal_parameter_format_v2) ||
4244 uvs_vertex_vector_size_in_dwords < (64U * 4U)) {
4245 *cam_size_out = 7U;
4246 *vs_max_instances_out = 4U;
4247 } else {
4248 *cam_size_out = 3U;
4249 *vs_max_instances_out = 2U;
4250 }
4251 } else {
4252 /* Comparisons are based on size including scratch per vertex vector. */
4253 if (uvs_vertex_vector_size_in_dwords <= (32U * 4U)) {
4254 /* output size <= 27 + 5 scratch. */
4255 *cam_size_out = MIN2(95U, vdm_cam_size - 1U);
4256 *vs_max_instances_out = 0U;
4257 } else if (uvs_vertex_vector_size_in_dwords <= 48U * 4U) {
4258 /* output size <= 43 + 5 scratch */
4259 *cam_size_out = 63U;
4260 if (PVR_GET_FEATURE_VALUE(dev_info, uvs_vtx_entries, 144U) < 288U)
4261 *vs_max_instances_out = 16U;
4262 else
4263 *vs_max_instances_out = 0U;
4264 } else if (uvs_vertex_vector_size_in_dwords <= 64U * 4U) {
4265 /* output size <= 59 + 5 scratch. */
4266 *cam_size_out = 31U;
4267 if (PVR_GET_FEATURE_VALUE(dev_info, uvs_vtx_entries, 144U) < 288U)
4268 *vs_max_instances_out = 16U;
4269 else
4270 *vs_max_instances_out = 0U;
4271 } else {
4272 *cam_size_out = 15U;
4273 *vs_max_instances_out = 16U;
4274 }
4275 }
4276 }
4277
4278 static void
4279 pvr_emit_dirty_vdm_state(const struct pvr_cmd_buffer *const cmd_buffer,
4280 struct pvr_sub_cmd_gfx *const sub_cmd)
4281 {
4282 /* FIXME: Assume all state is dirty for the moment. */
4283 struct pvr_device_info *const dev_info =
4284 &cmd_buffer->device->pdevice->dev_info;
4285 ASSERTED const uint32_t max_user_vertex_output_components =
4286 pvr_get_max_user_vertex_output_components(dev_info);
4287 struct PVRX(VDMCTRL_VDM_STATE0)
4288 header = { pvr_cmd_header(VDMCTRL_VDM_STATE0) };
4289 const struct pvr_cmd_buffer_state *const state = &cmd_buffer->state;
4290 const struct pvr_graphics_pipeline *const gfx_pipeline = state->gfx_pipeline;
4291 struct pvr_csb *const csb = &sub_cmd->control_stream;
4292 uint32_t vs_output_size;
4293 uint32_t max_instances;
4294 uint32_t cam_size;
4295
4296 assert(gfx_pipeline);
4297
4298 /* CAM Calculations and HW state take vertex size aligned to DWORDS. */
4299 vs_output_size =
4300 DIV_ROUND_UP(gfx_pipeline->vertex_shader_state.vertex_output_size,
4301 PVRX(VDMCTRL_VDM_STATE4_VS_OUTPUT_SIZE_UNIT_SIZE));
4302
4303 assert(vs_output_size <= max_user_vertex_output_components);
4304
4305 pvr_calculate_vertex_cam_size(dev_info,
4306 vs_output_size,
4307 true,
4308 &cam_size,
4309 &max_instances);
4310
4311 pvr_csb_emit (csb, VDMCTRL_VDM_STATE0, state0) {
4312 state0.cam_size = cam_size;
4313
4314 if (gfx_pipeline->input_asm_state.primitive_restart) {
4315 state0.cut_index_enable = true;
4316 state0.cut_index_present = true;
4317 }
4318
4319 switch (gfx_pipeline->input_asm_state.topology) {
4320 case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_FAN:
4321 state0.flatshade_control = PVRX(VDMCTRL_FLATSHADE_CONTROL_VERTEX_1);
4322 break;
4323
4324 default:
4325 state0.flatshade_control = PVRX(VDMCTRL_FLATSHADE_CONTROL_VERTEX_0);
4326 break;
4327 }
4328
4329 /* If we've bound a different vertex buffer, or this draw-call requires
4330 * a different PDS attrib data-section from the last draw call (changed
4331 * base_instance) then we need to specify a new data section. This is
4332 * also the case if we've switched pipeline or attrib program as the
4333 * data-section layout will be different.
4334 */
4335 state0.vs_data_addr_present =
4336 state->dirty.gfx_pipeline_binding || state->dirty.vertex_bindings ||
4337 state->dirty.draw_base_instance || state->dirty.draw_variant;
4338
4339 /* Need to specify new PDS Attrib program if we've bound a different
4340 * pipeline or we needed a different PDS Attrib variant for this
4341 * draw-call.
4342 */
4343 state0.vs_other_present = state->dirty.gfx_pipeline_binding ||
4344 state->dirty.draw_variant;
4345
4346 /* UVB_SCRATCH_SELECT_ONE with no rasterization is only valid when
4347 * stream output is enabled. We use UVB_SCRATCH_SELECT_FIVE because
4348 * Vulkan doesn't support stream output and the vertex position is
4349 * always emitted to the UVB.
4350 */
4351 state0.uvs_scratch_size_select =
4352 PVRX(VDMCTRL_UVS_SCRATCH_SIZE_SELECT_FIVE);
4353
4354 header = state0;
4355 }
4356
4357 if (header.cut_index_present) {
4358 pvr_csb_emit (csb, VDMCTRL_VDM_STATE1, state1) {
4359 switch (state->index_buffer_binding.type) {
4360 case VK_INDEX_TYPE_UINT32:
4361 /* FIXME: Defines for these? These seem to come from the Vulkan
4362 * spec. for VkPipelineInputAssemblyStateCreateInfo
4363 * primitiveRestartEnable.
4364 */
4365 state1.cut_index = 0xFFFFFFFF;
4366 break;
4367
4368 case VK_INDEX_TYPE_UINT16:
4369 state1.cut_index = 0xFFFF;
4370 break;
4371
4372 default:
4373 unreachable("Invalid index type");
4374 }
4375 }
4376 }
4377
4378 if (header.vs_data_addr_present) {
4379 pvr_csb_emit (csb, VDMCTRL_VDM_STATE2, state2) {
4380 state2.vs_pds_data_base_addr =
4381 PVR_DEV_ADDR(state->pds_vertex_attrib_offset);
4382 }
4383 }
4384
4385 if (header.vs_other_present) {
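/* vertex_input_size is in 32-bit units; << 2 converts it to bytes before
 * scaling to the VDM unified size unit below.
 */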
4386 const uint32_t usc_unified_store_size_in_bytes =
4387 gfx_pipeline->vertex_shader_state.vertex_input_size << 2;
4388
4389 pvr_csb_emit (csb, VDMCTRL_VDM_STATE3, state3) {
4390 state3.vs_pds_code_base_addr =
4391 PVR_DEV_ADDR(state->pds_shader.code_offset);
4392 }
4393
4394 pvr_csb_emit (csb, VDMCTRL_VDM_STATE4, state4) {
4395 state4.vs_output_size = vs_output_size;
4396 }
4397
4398 pvr_csb_emit (csb, VDMCTRL_VDM_STATE5, state5) {
4399 state5.vs_max_instances = max_instances;
4400 state5.vs_usc_common_size = 0U;
4401 state5.vs_usc_unified_size = DIV_ROUND_UP(
4402 usc_unified_store_size_in_bytes,
4403 PVRX(VDMCTRL_VDM_STATE5_VS_USC_UNIFIED_SIZE_UNIT_SIZE));
4404 state5.vs_pds_temp_size =
4405 DIV_ROUND_UP(state->pds_shader.info->temps_required << 2,
4406 PVRX(VDMCTRL_VDM_STATE5_VS_PDS_TEMP_SIZE_UNIT_SIZE));
4407 state5.vs_pds_data_size =
4408 DIV_ROUND_UP(state->pds_shader.info->data_size_in_dwords << 2,
4409 PVRX(VDMCTRL_VDM_STATE5_VS_PDS_DATA_SIZE_UNIT_SIZE));
4410 }
4411 }
4412 }
4413
4414 static VkResult pvr_validate_draw_state(struct pvr_cmd_buffer *cmd_buffer)
4415 {
4416 struct pvr_cmd_buffer_state *const state = &cmd_buffer->state;
4417 const struct pvr_graphics_pipeline *const gfx_pipeline = state->gfx_pipeline;
4418 const struct pvr_pipeline_stage_state *const fragment_state =
4419 &gfx_pipeline->fragment_shader_state.stage_state;
4420 struct pvr_sub_cmd_gfx *sub_cmd;
4421 bool fstencil_writemask_zero;
4422 bool bstencil_writemask_zero;
4423 bool push_descriptors_dirty;
4424 bool fstencil_keep;
4425 bool bstencil_keep;
4426 VkResult result;
4427
4428 pvr_cmd_buffer_start_sub_cmd(cmd_buffer, PVR_SUB_CMD_TYPE_GRAPHICS);
4429
4430 sub_cmd = &state->current_sub_cmd->gfx;
4431 sub_cmd->empty_cmd = false;
4432
4433 /* Determine pipeline depth/stencil usage. If a pipeline uses depth or
4434 * stencil testing, those attachments are using their loaded values, and
4435 * the loadOps cannot be optimized out.
4436 */
4437 /* Pipeline uses depth testing. */
4438 if (sub_cmd->depth_usage == PVR_DEPTH_STENCIL_USAGE_UNDEFINED &&
4439 gfx_pipeline->depth_compare_op != VK_COMPARE_OP_ALWAYS) {
4440 sub_cmd->depth_usage = PVR_DEPTH_STENCIL_USAGE_NEEDED;
4441 }
4442
4443 /* Pipeline uses stencil testing. */
4444 if (sub_cmd->stencil_usage == PVR_DEPTH_STENCIL_USAGE_UNDEFINED &&
4445 (gfx_pipeline->stencil_front.compare_op != VK_COMPARE_OP_ALWAYS ||
4446 gfx_pipeline->stencil_back.compare_op != VK_COMPARE_OP_ALWAYS)) {
4447 sub_cmd->stencil_usage = PVR_DEPTH_STENCIL_USAGE_NEEDED;
4448 }
4449
4450 if (PVR_HAS_FEATURE(&cmd_buffer->device->pdevice->dev_info,
4451 compute_overlap)) {
4452 uint32_t coefficient_size =
4453 DIV_ROUND_UP(fragment_state->coefficient_size,
4454 PVRX(TA_STATE_PDS_SIZEINFO1_USC_VARYINGSIZE_UNIT_SIZE));
4455
4456 if (coefficient_size >
4457 PVRX(TA_STATE_PDS_SIZEINFO1_USC_VARYINGSIZE_MAX_SIZE))
4458 sub_cmd->disable_compute_overlap = true;
4459 }
4460
4461 sub_cmd->frag_uses_atomic_ops |= fragment_state->uses_atomic_ops;
4462 sub_cmd->frag_has_side_effects |= fragment_state->has_side_effects;
4463 sub_cmd->frag_uses_texture_rw |= fragment_state->uses_texture_rw;
4464 sub_cmd->vertex_uses_texture_rw |=
4465 gfx_pipeline->vertex_shader_state.stage_state.uses_texture_rw;
4466
4467 fstencil_keep =
4468 (gfx_pipeline->stencil_front.fail_op == VK_STENCIL_OP_KEEP) &&
4469 (gfx_pipeline->stencil_front.pass_op == VK_STENCIL_OP_KEEP);
4470 bstencil_keep = (gfx_pipeline->stencil_back.fail_op == VK_STENCIL_OP_KEEP) &&
4471 (gfx_pipeline->stencil_back.pass_op == VK_STENCIL_OP_KEEP);
4472 fstencil_writemask_zero = (state->dynamic.common.write_mask.front == 0);
4473 bstencil_writemask_zero = (state->dynamic.common.write_mask.back == 0);
4474
4475 /* Set stencil modified flag if:
4476 * - Neither front nor back-facing stencil has a fail_op/pass_op of KEEP.
4477 * - Neither front nor back-facing stencil has a write_mask of zero.
4478 */
4479 if (!(fstencil_keep && bstencil_keep) &&
4480 !(fstencil_writemask_zero && bstencil_writemask_zero)) {
4481 sub_cmd->modifies_stencil = true;
4482 }
4483
4484 /* Set depth modified flag if depth write is enabled. */
4485 if (!gfx_pipeline->depth_write_disable)
4486 sub_cmd->modifies_depth = true;
4487
4488 /* If either the data or code changes for pds vertex attribs, regenerate the
4489 * data segment.
4490 */
4491 if (state->dirty.vertex_bindings || state->dirty.gfx_pipeline_binding ||
4492 state->dirty.draw_variant || state->dirty.draw_base_instance) {
4493 enum pvr_pds_vertex_attrib_program_type prog_type;
4494 const struct pvr_pds_attrib_program *program;
4495
4496 if (state->draw_state.draw_indirect)
4497 prog_type = PVR_PDS_VERTEX_ATTRIB_PROGRAM_DRAW_INDIRECT;
4498 else if (state->draw_state.base_instance)
4499 prog_type = PVR_PDS_VERTEX_ATTRIB_PROGRAM_BASE_INSTANCE;
4500 else
4501 prog_type = PVR_PDS_VERTEX_ATTRIB_PROGRAM_BASIC;
4502
4503 program =
4504 &gfx_pipeline->vertex_shader_state.pds_attrib_programs[prog_type];
4505 state->pds_shader.info = &program->info;
4506 state->pds_shader.code_offset = program->program.code_offset;
4507
4508 state->max_shared_regs =
4509 MAX2(state->max_shared_regs, pvr_calc_shared_regs_count(gfx_pipeline));
4510
4511 pvr_setup_vertex_buffers(cmd_buffer, gfx_pipeline);
4512 }
4513
4514 /* TODO: Check for dirty push constants */
4515
4516 pvr_validate_push_descriptors(cmd_buffer, &push_descriptors_dirty);
4517
4518 state->dirty.vertex_descriptors = push_descriptors_dirty ||
4519 state->dirty.gfx_pipeline_binding;
4520 state->dirty.fragment_descriptors = state->dirty.vertex_descriptors;
4521
4522 if (state->dirty.fragment_descriptors) {
4523 result = pvr_setup_descriptor_mappings(
4524 cmd_buffer,
4525 PVR_STAGE_ALLOCATION_FRAGMENT,
4526 &state->gfx_pipeline->fragment_shader_state.descriptor_state,
4527 NULL,
4528 &state->pds_fragment_descriptor_data_offset);
4529 if (result != VK_SUCCESS) {
4530 mesa_loge("Could not setup fragment descriptor mappings.");
4531 return result;
4532 }
4533 }
4534
4535 if (state->dirty.vertex_descriptors) {
4536 uint32_t pds_vertex_descriptor_data_offset;
4537
4538 result = pvr_setup_descriptor_mappings(
4539 cmd_buffer,
4540 PVR_STAGE_ALLOCATION_VERTEX_GEOMETRY,
4541 &state->gfx_pipeline->vertex_shader_state.descriptor_state,
4542 NULL,
4543 &pds_vertex_descriptor_data_offset);
4544 if (result != VK_SUCCESS) {
4545 mesa_loge("Could not setup vertex descriptor mappings.");
4546 return result;
4547 }
4548
4549 pvr_emit_dirty_pds_state(cmd_buffer,
4550 sub_cmd,
4551 pds_vertex_descriptor_data_offset);
4552 }
4553
4554 pvr_emit_dirty_ppp_state(cmd_buffer, sub_cmd);
4555 pvr_emit_dirty_vdm_state(cmd_buffer, sub_cmd);
4556
4557 state->dirty.gfx_desc_dirty = false;
4558 state->dirty.blend_constants = false;
4559 state->dirty.compare_mask = false;
4560 state->dirty.depth_bias = false;
4561 state->dirty.draw_base_instance = false;
4562 state->dirty.draw_variant = false;
4563 state->dirty.fragment_descriptors = false;
4564 state->dirty.line_width = false;
4565 state->dirty.gfx_pipeline_binding = false;
4566 state->dirty.reference = false;
4567 state->dirty.scissor = false;
4568 state->dirty.userpass_spawn = false;
4569 state->dirty.vertex_bindings = false;
4570 state->dirty.viewport = false;
4571 state->dirty.write_mask = false;
4572
4573 return VK_SUCCESS;
4574 }
4575
4576 static uint32_t pvr_get_hw_primitive_topology(VkPrimitiveTopology topology)
4577 {
4578 switch (topology) {
4579 case VK_PRIMITIVE_TOPOLOGY_POINT_LIST:
4580 return PVRX(VDMCTRL_PRIMITIVE_TOPOLOGY_POINT_LIST);
4581 case VK_PRIMITIVE_TOPOLOGY_LINE_LIST:
4582 return PVRX(VDMCTRL_PRIMITIVE_TOPOLOGY_LINE_LIST);
4583 case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP:
4584 return PVRX(VDMCTRL_PRIMITIVE_TOPOLOGY_LINE_STRIP);
4585 case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST:
4586 return PVRX(VDMCTRL_PRIMITIVE_TOPOLOGY_TRI_LIST);
4587 case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP:
4588 return PVRX(VDMCTRL_PRIMITIVE_TOPOLOGY_TRI_STRIP);
4589 case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_FAN:
4590 return PVRX(VDMCTRL_PRIMITIVE_TOPOLOGY_TRI_FAN);
4591 case VK_PRIMITIVE_TOPOLOGY_LINE_LIST_WITH_ADJACENCY:
4592 return PVRX(VDMCTRL_PRIMITIVE_TOPOLOGY_LINE_LIST_ADJ);
4593 case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP_WITH_ADJACENCY:
4594 return PVRX(VDMCTRL_PRIMITIVE_TOPOLOGY_LINE_STRIP_ADJ);
4595 case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST_WITH_ADJACENCY:
4596 return PVRX(VDMCTRL_PRIMITIVE_TOPOLOGY_TRI_LIST_ADJ);
4597 case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP_WITH_ADJACENCY:
4598 return PVRX(VDMCTRL_PRIMITIVE_TOPOLOGY_TRI_STRIP_ADJ);
4599 case VK_PRIMITIVE_TOPOLOGY_PATCH_LIST:
4600 return PVRX(VDMCTRL_PRIMITIVE_TOPOLOGY_PATCH_LIST);
4601 default:
4602 unreachable("Undefined primitive topology");
4603 }
4604 }
4605
4606 static void pvr_emit_vdm_index_list(struct pvr_cmd_buffer *cmd_buffer,
4607 struct pvr_sub_cmd_gfx *const sub_cmd,
4608 VkPrimitiveTopology topology,
4609 uint32_t first_vertex,
4610 uint32_t vertex_count,
4611 uint32_t first_index,
4612 uint32_t index_count,
4613 uint32_t instance_count)
4614 {
4615 struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
4616 struct pvr_csb *const csb = &sub_cmd->control_stream;
4617 struct PVRX(VDMCTRL_INDEX_LIST0)
4618 list_hdr = { pvr_cmd_header(VDMCTRL_INDEX_LIST0) };
4619 pvr_dev_addr_t index_buffer_addr = PVR_DEV_ADDR_INVALID;
4620 unsigned int index_stride = 0;
4621
4622 pvr_csb_emit (csb, VDMCTRL_INDEX_LIST0, list0) {
4623 const bool vertex_shader_has_side_effects =
4624 cmd_buffer->state.gfx_pipeline->vertex_shader_state.stage_state
4625 .has_side_effects;
4626
4627 list0.primitive_topology = pvr_get_hw_primitive_topology(topology);
4628
4629 /* First instance is not handled in the VDM state, it's implemented as
4630 * an addition in the PDS vertex fetch.
4631 */
4632 list0.index_count_present = true;
4633
4634 if (instance_count > 1)
4635 list0.index_instance_count_present = true;
4636
4637 if (first_vertex != 0)
4638 list0.index_offset_present = true;
4639
4640 if (state->draw_state.draw_indexed) {
4641 struct pvr_buffer *buffer = state->index_buffer_binding.buffer;
4642
4643 switch (state->index_buffer_binding.type) {
4644 case VK_INDEX_TYPE_UINT32:
4645 list0.index_size = PVRX(VDMCTRL_INDEX_SIZE_B32);
4646 index_stride = 4;
4647 break;
4648
4649 case VK_INDEX_TYPE_UINT16:
4650 list0.index_size = PVRX(VDMCTRL_INDEX_SIZE_B16);
4651 index_stride = 2;
4652 break;
4653
4654 default:
4655 unreachable("Invalid index type");
4656 }
4657
4658 list0.index_addr_present = true;
4659 index_buffer_addr = PVR_DEV_ADDR_OFFSET(
4660 buffer->dev_addr,
4661 state->index_buffer_binding.offset + first_index * index_stride);
4662 list0.index_base_addrmsb = index_buffer_addr;
4663 }
4664
4665 list0.degen_cull_enable =
4666 PVR_HAS_FEATURE(&cmd_buffer->device->pdevice->dev_info,
4667 vdm_degenerate_culling) &&
4668 !vertex_shader_has_side_effects;
4669
4670 list_hdr = list0;
4671 }
4672
4673 if (list_hdr.index_addr_present) {
4674 pvr_csb_emit (csb, VDMCTRL_INDEX_LIST1, list1) {
4675 list1.index_base_addrlsb = index_buffer_addr;
4676 }
4677 }
4678
4679 if (list_hdr.index_count_present) {
4680 pvr_csb_emit (csb, VDMCTRL_INDEX_LIST2, list2) {
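/* Only one of vertex_count and index_count is non-zero for a given draw
 * (see pvr_CmdDraw and pvr_CmdDrawIndexed), so OR-ing them selects
 * whichever applies.
 */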
4681 list2.index_count = vertex_count | index_count;
4682 }
4683 }
4684
4685 if (list_hdr.index_instance_count_present) {
4686 pvr_csb_emit (csb, VDMCTRL_INDEX_LIST3, list3) {
4687 list3.instance_count = instance_count - 1;
4688 }
4689 }
4690
4691 if (list_hdr.index_offset_present) {
4692 pvr_csb_emit (csb, VDMCTRL_INDEX_LIST4, list4) {
4693 list4.index_offset = first_vertex;
4694 }
4695 }
4696
4697 /* TODO: See if we need list_words[5-9]. */
4698 }
4699
4700 void pvr_CmdDraw(VkCommandBuffer commandBuffer,
4701 uint32_t vertexCount,
4702 uint32_t instanceCount,
4703 uint32_t firstVertex,
4704 uint32_t firstInstance)
4705 {
4706 PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
4707 struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
4708 struct pvr_cmd_buffer_draw_state draw_state;
4709 VkResult result;
4710
4711 PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer);
4712
4713 draw_state.base_vertex = firstVertex;
4714 draw_state.base_instance = firstInstance;
4715 draw_state.draw_indirect = false;
4716 draw_state.draw_indexed = false;
4717 pvr_update_draw_state(state, &draw_state);
4718
4719 result = pvr_validate_draw_state(cmd_buffer);
4720 if (result != VK_SUCCESS)
4721 return;
4722
4723 /* Write the VDM control stream for the primitive. */
4724 pvr_emit_vdm_index_list(cmd_buffer,
4725 &state->current_sub_cmd->gfx,
4726 state->gfx_pipeline->input_asm_state.topology,
4727 firstVertex,
4728 vertexCount,
4729 0U,
4730 0U,
4731 instanceCount);
4732 }
4733
4734 void pvr_CmdDrawIndexed(VkCommandBuffer commandBuffer,
4735 uint32_t indexCount,
4736 uint32_t instanceCount,
4737 uint32_t firstIndex,
4738 int32_t vertexOffset,
4739 uint32_t firstInstance)
4740 {
4741 PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
4742 struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
4743 struct pvr_cmd_buffer_draw_state draw_state;
4744 VkResult result;
4745
4746 PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer);
4747
4748 draw_state.base_vertex = vertexOffset;
4749 draw_state.base_instance = firstInstance;
4750 draw_state.draw_indirect = false;
4751 draw_state.draw_indexed = true;
4752 pvr_update_draw_state(state, &draw_state);
4753
4754 result = pvr_validate_draw_state(cmd_buffer);
4755 if (result != VK_SUCCESS)
4756 return;
4757
4758 /* Write the VDM control stream for the primitive. */
4759 pvr_emit_vdm_index_list(cmd_buffer,
4760 &state->current_sub_cmd->gfx,
4761 state->gfx_pipeline->input_asm_state.topology,
4762 vertexOffset,
4763 0,
4764 firstIndex,
4765 indexCount,
4766 instanceCount);
4767 }
4768
4769 void pvr_CmdDrawIndexedIndirect(VkCommandBuffer commandBuffer,
4770 VkBuffer _buffer,
4771 VkDeviceSize offset,
4772 uint32_t drawCount,
4773 uint32_t stride)
4774 {
4775 assert(!"Unimplemented");
4776 }
4777
4778 void pvr_CmdDrawIndirect(VkCommandBuffer commandBuffer,
4779 VkBuffer _buffer,
4780 VkDeviceSize offset,
4781 uint32_t drawCount,
4782 uint32_t stride)
4783 {
4784 assert(!"Unimplemented");
4785 }
4786
4787 static VkResult
4788 pvr_resolve_unemitted_resolve_attachments(struct pvr_cmd_buffer *cmd_buffer)
4789 {
4790 pvr_finishme("Add attachment resolve support!");
4791 return pvr_cmd_buffer_end_sub_cmd(cmd_buffer);
4792 }
4793
4794 void pvr_CmdEndRenderPass2(VkCommandBuffer commandBuffer,
4795 const VkSubpassEndInfo *pSubpassEndInfo)
4796 {
4797 PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
4798 struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
4799 struct pvr_image_view **attachments;
4800 VkClearValue *clear_values;
4801 VkResult result;
4802
4803 PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer);
4804
4805 assert(state->render_pass_info.pass);
4806 assert(state->render_pass_info.framebuffer);
4807
4808 /* TODO: Investigate why pvr_cmd_buffer_end_sub_cmd/EndSubCommand is called
4809 * twice in this path: once here and once from
4810 * pvr_resolve_unemitted_resolve_attachments.
4811 */
4812 result = pvr_cmd_buffer_end_sub_cmd(cmd_buffer);
4813 if (result != VK_SUCCESS)
4814 return;
4815
4816 result = pvr_resolve_unemitted_resolve_attachments(cmd_buffer);
4817 if (result != VK_SUCCESS)
4818 return;
4819
4820 /* Save the required fields before clearing render_pass_info struct. */
4821 attachments = state->render_pass_info.attachments;
4822 clear_values = state->render_pass_info.clear_values;
4823
4824 memset(&state->render_pass_info, 0, sizeof(state->render_pass_info));
4825
4826 state->render_pass_info.attachments = attachments;
4827 state->render_pass_info.clear_values = clear_values;
4828 }
4829
4830 void pvr_CmdExecuteCommands(VkCommandBuffer commandBuffer,
4831 uint32_t commandBufferCount,
4832 const VkCommandBuffer *pCommandBuffers)
4833 {
4834 assert(!"Unimplemented");
4835 }
4836
4837 void pvr_CmdNextSubpass2(VkCommandBuffer commandBuffer,
4838 const VkSubpassBeginInfo *pSubpassBeginInfo,
4839 const VkSubpassEndInfo *pSubpassEndInfo)
4840 {
4841 assert(!"Unimplemented");
4842 }
4843
4844 /* This is just enough to handle vkCmdPipelineBarrier().
4845 * TODO: Complete?
4846 */
4847 void pvr_CmdPipelineBarrier2(VkCommandBuffer commandBuffer,
4848 const VkDependencyInfo *pDependencyInfo)
4849 {
4850 PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
4851 struct pvr_cmd_buffer_state *const state = &cmd_buffer->state;
4852 const struct pvr_render_pass *const render_pass =
4853 state->render_pass_info.pass;
4854 VkPipelineStageFlags vk_src_stage_mask = 0U;
4855 VkPipelineStageFlags vk_dst_stage_mask = 0U;
4856 uint32_t required_stage_mask = 0U;
4857 uint32_t src_stage_mask;
4858 uint32_t dst_stage_mask;
4859 bool is_barrier_needed;
4860
4861 PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer);
4862
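   /* With synchronization2 the stage masks live in the individual barrier
    * structures rather than in top-level parameters, so accumulate them
    * across all three barrier arrays before translating them to the driver's
    * internal pipeline stage bits.
    */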
4863 for (uint32_t i = 0; i < pDependencyInfo->memoryBarrierCount; i++) {
4864 vk_src_stage_mask |= pDependencyInfo->pMemoryBarriers[i].srcStageMask;
4865 vk_dst_stage_mask |= pDependencyInfo->pMemoryBarriers[i].dstStageMask;
4866 }
4867
4868 for (uint32_t i = 0; i < pDependencyInfo->bufferMemoryBarrierCount; i++) {
4869 vk_src_stage_mask |=
4870 pDependencyInfo->pBufferMemoryBarriers[i].srcStageMask;
4871 vk_dst_stage_mask |=
4872 pDependencyInfo->pBufferMemoryBarriers[i].dstStageMask;
4873 }
4874
4875 for (uint32_t i = 0; i < pDependencyInfo->imageMemoryBarrierCount; i++) {
4876 vk_src_stage_mask |=
4877 pDependencyInfo->pImageMemoryBarriers[i].srcStageMask;
4878 vk_dst_stage_mask |=
4879 pDependencyInfo->pImageMemoryBarriers[i].dstStageMask;
4880 }
4881
4882 src_stage_mask = pvr_stage_mask_src(vk_src_stage_mask);
4883 dst_stage_mask = pvr_stage_mask_dst(vk_dst_stage_mask);
4884
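   /* barriers_needed[] tracks, per destination stage, which source stages
    * still have outstanding work to wait on. Gather those bits for every
    * requested destination stage, then clear the source stages that this
    * barrier will satisfy.
    */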
4885 for (uint32_t stage = 0U; stage != PVR_NUM_SYNC_PIPELINE_STAGES; stage++) {
4886 if (!(dst_stage_mask & BITFIELD_BIT(stage)))
4887 continue;
4888
4889 required_stage_mask |= state->barriers_needed[stage];
4890 }
4891
4892 src_stage_mask &= required_stage_mask;
4893 for (uint32_t stage = 0U; stage != PVR_NUM_SYNC_PIPELINE_STAGES; stage++) {
4894 if (!(dst_stage_mask & BITFIELD_BIT(stage)))
4895 continue;
4896
4897 state->barriers_needed[stage] &= ~src_stage_mask;
4898 }
4899
4900 if (src_stage_mask == 0 || dst_stage_mask == 0) {
4901 is_barrier_needed = false;
4902 } else if (src_stage_mask == PVR_PIPELINE_STAGE_GEOM_BIT &&
4903 dst_stage_mask == PVR_PIPELINE_STAGE_FRAG_BIT) {
4904 /* This is implicit so no need to barrier. */
4905 is_barrier_needed = false;
4906 } else if (src_stage_mask == dst_stage_mask &&
4907 util_bitcount(src_stage_mask) == 1) {
4908 switch (src_stage_mask) {
4909 case PVR_PIPELINE_STAGE_FRAG_BIT:
4910 pvr_finishme("Handle fragment stage pipeline barrier.");
4911 is_barrier_needed = true;
4912 break;
4913
4914 case PVR_PIPELINE_STAGE_COMPUTE_BIT: {
4915 struct pvr_sub_cmd *const current_sub_cmd = state->current_sub_cmd;
4916
4917 is_barrier_needed = false;
4918
4919 if (!current_sub_cmd ||
4920 current_sub_cmd->type != PVR_SUB_CMD_TYPE_COMPUTE) {
4921 break;
4922 }
4923
4924 /* Multiple dispatches can be merged into a single job. When
4925 * back-to-back dispatches have a sequential dependency (CDM -> CDM
4926 * pipeline barrier) we need to do the following:
4927 * - Dispatch a kernel which fences all previous memory writes and
4928 * flushes the MADD cache.
4929 * - Issue a CDM fence which ensures all previous tasks emitted by
4930 * the CDM are completed before starting anything new.
4931 */
4932
4933 /* Issue Data Fence, Wait for Data Fence (IDFWDF) makes the PDS wait
4934 * for data.
4935 */
4936 pvr_compute_generate_idfwdf(cmd_buffer, &current_sub_cmd->compute);
4937
4938 pvr_compute_generate_fence(cmd_buffer,
4939 &current_sub_cmd->compute,
4940 false);
4941 break;
4942 }
4943
4944 default:
4945 is_barrier_needed = false;
4946 break;
4947 }
4948 } else {
4949 is_barrier_needed = true;
4950 }
4951
4952 if (render_pass) {
4953 pvr_finishme("Insert mid fragment stage barrier if needed.");
4954 } else {
4955 if (is_barrier_needed)
4956 pvr_finishme("Insert barrier if needed.");
4957 }
4958 }
4959
4960 void pvr_CmdResetEvent2KHR(VkCommandBuffer commandBuffer,
4961 VkEvent _event,
4962 VkPipelineStageFlags2 stageMask)
4963 {
4964 assert(!"Unimplemented");
4965 }
4966
4967 void pvr_CmdSetEvent2KHR(VkCommandBuffer commandBuffer,
4968 VkEvent _event,
4969 const VkDependencyInfo *pDependencyInfo)
4970 {
4971 assert(!"Unimplemented");
4972 }
4973
4974 void pvr_CmdWaitEvents2KHR(VkCommandBuffer commandBuffer,
4975 uint32_t eventCount,
4976 const VkEvent *pEvents,
4977 const VkDependencyInfo *pDependencyInfos)
4978 {
4979 assert(!"Unimplemented");
4980 }
4981
4982 void pvr_CmdWriteTimestamp2KHR(VkCommandBuffer commandBuffer,
4983 VkPipelineStageFlags2 stage,
4984 VkQueryPool queryPool,
4985 uint32_t query)
4986 {
4987 unreachable("Timestamp queries are not supported.");
4988 }
4989
4990 VkResult pvr_EndCommandBuffer(VkCommandBuffer commandBuffer)
4991 {
4992 PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
4993 struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
4994 VkResult result;
4995
4996 /* From the Vulkan 1.0 spec:
4997 *
4998 * commandBuffer must be in the recording state.
4999 */
5000 assert(cmd_buffer->status == PVR_CMD_BUFFER_STATUS_RECORDING);
5001
5002 if (state->status != VK_SUCCESS)
5003 return state->status;
5004
5005 result = pvr_cmd_buffer_end_sub_cmd(cmd_buffer);
5006 if (result != VK_SUCCESS)
5007 return result;
5008
5009 cmd_buffer->status = PVR_CMD_BUFFER_STATUS_EXECUTABLE;
5010
5011 return VK_SUCCESS;
5012 }
5013