1 /*
2  * Copyright © 2022 Imagination Technologies Ltd.
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a copy
5  * of this software and associated documentation files (the "Software"), to deal
6  * in the Software without restriction, including without limitation the rights
7  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
8  * copies of the Software, and to permit persons to whom the Software is
9  * furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21  * SOFTWARE.
22  */
23 
24 #include <assert.h>
25 #include <limits.h>
26 #include <stdbool.h>
27 #include <stddef.h>
28 #include <stdint.h>
29 #include <string.h>
30 #include <vulkan/vulkan.h>
31 
32 #include "hwdef/rogue_hw_defs.h"
33 #include "hwdef/rogue_hw_utils.h"
34 #include "pvr_blit.h"
35 #include "pvr_bo.h"
36 #include "pvr_clear.h"
37 #include "pvr_common.h"
38 #include "pvr_csb.h"
39 #include "pvr_csb_enum_helpers.h"
40 #include "pvr_device_info.h"
41 #include "pvr_formats.h"
42 #include "pvr_hardcode.h"
43 #include "pvr_hw_pass.h"
44 #include "pvr_job_common.h"
45 #include "pvr_job_render.h"
46 #include "pvr_limits.h"
47 #include "pvr_pds.h"
48 #include "pvr_private.h"
49 #include "pvr_tex_state.h"
50 #include "pvr_types.h"
51 #include "pvr_uscgen.h"
52 #include "pvr_winsys.h"
53 #include "util/bitscan.h"
54 #include "util/bitset.h"
55 #include "util/compiler.h"
56 #include "util/list.h"
57 #include "util/macros.h"
58 #include "util/u_dynarray.h"
59 #include "util/u_math.h"
60 #include "util/u_pack_color.h"
61 #include "vk_alloc.h"
62 #include "vk_command_buffer.h"
63 #include "vk_command_pool.h"
64 #include "vk_common_entrypoints.h"
65 #include "vk_format.h"
66 #include "vk_graphics_state.h"
67 #include "vk_log.h"
68 #include "vk_object.h"
69 #include "vk_util.h"
70 
71 /* Structure used to pass data into the
72  * pvr_compute_generate_control_stream() function.
73  */
74 struct pvr_compute_kernel_info {
75    pvr_dev_addr_t indirect_buffer_addr;
76    bool global_offsets_present;
77    uint32_t usc_common_size;
78    uint32_t usc_unified_size;
79    uint32_t pds_temp_size;
80    uint32_t pds_data_size;
81    enum PVRX(CDMCTRL_USC_TARGET) usc_target;
82    bool is_fence;
83    uint32_t pds_data_offset;
84    uint32_t pds_code_offset;
85    enum PVRX(CDMCTRL_SD_TYPE) sd_type;
86    bool usc_common_shared;
87    uint32_t local_size[PVR_WORKGROUP_DIMENSIONS];
88    uint32_t global_size[PVR_WORKGROUP_DIMENSIONS];
89    uint32_t max_instances;
90 };
91 
92 static void pvr_cmd_buffer_free_sub_cmd(struct pvr_cmd_buffer *cmd_buffer,
93                                         struct pvr_sub_cmd *sub_cmd)
94 {
95    if (sub_cmd->owned) {
96       switch (sub_cmd->type) {
97       case PVR_SUB_CMD_TYPE_GRAPHICS:
98          util_dynarray_fini(&sub_cmd->gfx.sec_query_indices);
99          pvr_csb_finish(&sub_cmd->gfx.control_stream);
100          pvr_bo_free(cmd_buffer->device, sub_cmd->gfx.terminate_ctrl_stream);
101          pvr_bo_suballoc_free(sub_cmd->gfx.depth_bias_bo);
102          pvr_bo_suballoc_free(sub_cmd->gfx.scissor_bo);
103          break;
104 
105       case PVR_SUB_CMD_TYPE_COMPUTE:
106       case PVR_SUB_CMD_TYPE_OCCLUSION_QUERY:
107          pvr_csb_finish(&sub_cmd->compute.control_stream);
108          break;
109 
110       case PVR_SUB_CMD_TYPE_TRANSFER:
111          list_for_each_entry_safe (struct pvr_transfer_cmd,
112                                    transfer_cmd,
113                                    &sub_cmd->transfer.transfer_cmds,
114                                    link) {
115             list_del(&transfer_cmd->link);
116             if (!transfer_cmd->is_deferred_clear)
117                vk_free(&cmd_buffer->vk.pool->alloc, transfer_cmd);
118          }
119          break;
120 
121       case PVR_SUB_CMD_TYPE_EVENT:
122          if (sub_cmd->event.type == PVR_EVENT_TYPE_WAIT)
123             vk_free(&cmd_buffer->vk.pool->alloc, sub_cmd->event.wait.events);
124          break;
125 
126       default:
127          unreachable("Unsupported sub-command type");
128       }
129    }
130 
131    list_del(&sub_cmd->link);
132    vk_free(&cmd_buffer->vk.pool->alloc, sub_cmd);
133 }
134 
135 static void pvr_cmd_buffer_free_sub_cmds(struct pvr_cmd_buffer *cmd_buffer)
136 {
137    list_for_each_entry_safe (struct pvr_sub_cmd,
138                              sub_cmd,
139                              &cmd_buffer->sub_cmds,
140                              link) {
141       pvr_cmd_buffer_free_sub_cmd(cmd_buffer, sub_cmd);
142    }
143 }
144 
145 static void pvr_cmd_buffer_free_resources(struct pvr_cmd_buffer *cmd_buffer)
146 {
147    vk_free(&cmd_buffer->vk.pool->alloc,
148            cmd_buffer->state.render_pass_info.attachments);
149    vk_free(&cmd_buffer->vk.pool->alloc,
150            cmd_buffer->state.render_pass_info.clear_values);
151 
152    util_dynarray_fini(&cmd_buffer->state.query_indices);
153 
154    pvr_cmd_buffer_free_sub_cmds(cmd_buffer);
155 
156    list_for_each_entry_safe (struct pvr_suballoc_bo,
157                              suballoc_bo,
158                              &cmd_buffer->bo_list,
159                              link) {
160       list_del(&suballoc_bo->link);
161       pvr_bo_suballoc_free(suballoc_bo);
162    }
163 
164    util_dynarray_fini(&cmd_buffer->deferred_clears);
165    util_dynarray_fini(&cmd_buffer->deferred_csb_commands);
166    util_dynarray_fini(&cmd_buffer->scissor_array);
167    util_dynarray_fini(&cmd_buffer->depth_bias_array);
168 }
169 
170 static void pvr_cmd_buffer_reset(struct vk_command_buffer *vk_cmd_buffer,
171                                  VkCommandBufferResetFlags flags)
172 {
173    struct pvr_cmd_buffer *cmd_buffer =
174       container_of(vk_cmd_buffer, struct pvr_cmd_buffer, vk);
175 
176    /* FIXME: For now we always free all resources as if
177     * VK_COMMAND_BUFFER_RESET_RELEASE_RESOURCES_BIT was set.
178     */
179    pvr_cmd_buffer_free_resources(cmd_buffer);
180 
181    vk_command_buffer_reset(&cmd_buffer->vk);
182 
183    memset(&cmd_buffer->state, 0, sizeof(cmd_buffer->state));
184    memset(&cmd_buffer->scissor_words, 0, sizeof(cmd_buffer->scissor_words));
185 
186    cmd_buffer->usage_flags = 0;
187 }
188 
189 static void pvr_cmd_buffer_destroy(struct vk_command_buffer *vk_cmd_buffer)
190 {
191    struct pvr_cmd_buffer *cmd_buffer =
192       container_of(vk_cmd_buffer, struct pvr_cmd_buffer, vk);
193 
194    pvr_cmd_buffer_free_resources(cmd_buffer);
195    vk_command_buffer_finish(&cmd_buffer->vk);
196    vk_free(&cmd_buffer->vk.pool->alloc, cmd_buffer);
197 }
198 
199 static const struct vk_command_buffer_ops cmd_buffer_ops = {
200    .reset = pvr_cmd_buffer_reset,
201    .destroy = pvr_cmd_buffer_destroy,
202 };
203 
204 static VkResult pvr_cmd_buffer_create(struct pvr_device *device,
205                                       struct vk_command_pool *pool,
206                                       VkCommandBufferLevel level,
207                                       VkCommandBuffer *pCommandBuffer)
208 {
209    struct pvr_cmd_buffer *cmd_buffer;
210    VkResult result;
211 
212    cmd_buffer = vk_zalloc(&pool->alloc,
213                           sizeof(*cmd_buffer),
214                           8U,
215                           VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
216    if (!cmd_buffer)
217       return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
218 
219    result =
220       vk_command_buffer_init(pool, &cmd_buffer->vk, &cmd_buffer_ops, level);
221    if (result != VK_SUCCESS) {
222       vk_free(&pool->alloc, cmd_buffer);
223       return result;
224    }
225 
226    cmd_buffer->device = device;
227 
228    util_dynarray_init(&cmd_buffer->depth_bias_array, NULL);
229    util_dynarray_init(&cmd_buffer->scissor_array, NULL);
230    util_dynarray_init(&cmd_buffer->deferred_csb_commands, NULL);
231    util_dynarray_init(&cmd_buffer->deferred_clears, NULL);
232 
233    list_inithead(&cmd_buffer->sub_cmds);
234    list_inithead(&cmd_buffer->bo_list);
235 
236    *pCommandBuffer = pvr_cmd_buffer_to_handle(cmd_buffer);
237 
238    return VK_SUCCESS;
239 }
240 
241 VkResult
242 pvr_AllocateCommandBuffers(VkDevice _device,
243                            const VkCommandBufferAllocateInfo *pAllocateInfo,
244                            VkCommandBuffer *pCommandBuffers)
245 {
246    VK_FROM_HANDLE(vk_command_pool, pool, pAllocateInfo->commandPool);
247    PVR_FROM_HANDLE(pvr_device, device, _device);
248    VkResult result = VK_SUCCESS;
249    uint32_t i;
250 
251    for (i = 0; i < pAllocateInfo->commandBufferCount; i++) {
252       result = pvr_cmd_buffer_create(device,
253                                      pool,
254                                      pAllocateInfo->level,
255                                      &pCommandBuffers[i]);
256       if (result != VK_SUCCESS)
257          break;
258    }
259 
260    if (result != VK_SUCCESS) {
261       while (i--) {
262          VK_FROM_HANDLE(vk_command_buffer, cmd_buffer, pCommandBuffers[i]);
263          pvr_cmd_buffer_destroy(cmd_buffer);
264       }
265 
266       for (i = 0; i < pAllocateInfo->commandBufferCount; i++)
267          pCommandBuffers[i] = VK_NULL_HANDLE;
268    }
269 
270    return result;
271 }
272 
273 static void pvr_cmd_buffer_update_barriers(struct pvr_cmd_buffer *cmd_buffer,
274                                            enum pvr_sub_cmd_type type)
275 {
276    struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
277    uint32_t barriers;
278 
279    switch (type) {
280    case PVR_SUB_CMD_TYPE_GRAPHICS:
281       barriers = PVR_PIPELINE_STAGE_GEOM_BIT | PVR_PIPELINE_STAGE_FRAG_BIT;
282       break;
283 
284    case PVR_SUB_CMD_TYPE_COMPUTE:
285       barriers = PVR_PIPELINE_STAGE_COMPUTE_BIT;
286       break;
287 
288    case PVR_SUB_CMD_TYPE_OCCLUSION_QUERY:
289    case PVR_SUB_CMD_TYPE_TRANSFER:
290       /* Compute jobs are used for occlusion queries, but to copy the results
291        * we have to sync with transfer jobs because the spec defines
292        * vkCmdCopyQueryPoolResults() as a transfer operation.
293        */
294       barriers = PVR_PIPELINE_STAGE_TRANSFER_BIT;
295       break;
296 
297    case PVR_SUB_CMD_TYPE_EVENT:
298       barriers = 0;
299       break;
300 
301    default:
302       unreachable("Unsupported sub-command type");
303    }
304 
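   /* OR the stages used by this sub-command into every barriers_needed entry
    * so that subsequent barriers know these stages have outstanding work.
    */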
305    for (uint32_t i = 0; i < ARRAY_SIZE(state->barriers_needed); i++)
306       state->barriers_needed[i] |= barriers;
307 }
308 
309 static VkResult
310 pvr_cmd_buffer_upload_tables(struct pvr_device *device,
311                              struct pvr_cmd_buffer *cmd_buffer,
312                              struct pvr_sub_cmd_gfx *const sub_cmd)
313 {
314    const uint32_t cache_line_size =
315       rogue_get_slc_cache_line_size(&device->pdevice->dev_info);
316    VkResult result;
317 
318    assert(!sub_cmd->depth_bias_bo && !sub_cmd->scissor_bo);
319 
320    if (cmd_buffer->depth_bias_array.size > 0) {
321       result =
322          pvr_gpu_upload(device,
323                         device->heaps.general_heap,
324                         util_dynarray_begin(&cmd_buffer->depth_bias_array),
325                         cmd_buffer->depth_bias_array.size,
326                         cache_line_size,
327                         &sub_cmd->depth_bias_bo);
328       if (result != VK_SUCCESS)
329          return result;
330    }
331 
332    if (cmd_buffer->scissor_array.size > 0) {
333       result = pvr_gpu_upload(device,
334                               device->heaps.general_heap,
335                               util_dynarray_begin(&cmd_buffer->scissor_array),
336                               cmd_buffer->scissor_array.size,
337                               cache_line_size,
338                               &sub_cmd->scissor_bo);
339       if (result != VK_SUCCESS)
340          goto err_free_depth_bias_bo;
341    }
342 
343    util_dynarray_clear(&cmd_buffer->depth_bias_array);
344    util_dynarray_clear(&cmd_buffer->scissor_array);
345 
346    return VK_SUCCESS;
347 
348 err_free_depth_bias_bo:
349    pvr_bo_suballoc_free(sub_cmd->depth_bias_bo);
350    sub_cmd->depth_bias_bo = NULL;
351 
352    return result;
353 }
354 
355 static VkResult
356 pvr_cmd_buffer_emit_ppp_state(const struct pvr_cmd_buffer *const cmd_buffer,
357                               struct pvr_csb *const csb)
358 {
359    const struct pvr_framebuffer *const framebuffer =
360       cmd_buffer->state.render_pass_info.framebuffer;
361 
362    assert(csb->stream_type == PVR_CMD_STREAM_TYPE_GRAPHICS ||
363           csb->stream_type == PVR_CMD_STREAM_TYPE_GRAPHICS_DEFERRED);
364 
365    pvr_csb_set_relocation_mark(csb);
366 
367    pvr_csb_emit (csb, VDMCTRL_PPP_STATE0, state0) {
368       state0.addrmsb = framebuffer->ppp_state_bo->dev_addr;
369       state0.word_count = framebuffer->ppp_state_size;
370    }
371 
372    pvr_csb_emit (csb, VDMCTRL_PPP_STATE1, state1) {
373       state1.addrlsb = framebuffer->ppp_state_bo->dev_addr;
374    }
375 
376    pvr_csb_clear_relocation_mark(csb);
377 
378    return csb->status;
379 }
380 
381 VkResult
382 pvr_cmd_buffer_upload_general(struct pvr_cmd_buffer *const cmd_buffer,
383                               const void *const data,
384                               const size_t size,
385                               struct pvr_suballoc_bo **const pvr_bo_out)
386 {
387    struct pvr_device *const device = cmd_buffer->device;
388    const uint32_t cache_line_size =
389       rogue_get_slc_cache_line_size(&device->pdevice->dev_info);
390    struct pvr_suballoc_bo *suballoc_bo;
391    VkResult result;
392 
393    result = pvr_gpu_upload(device,
394                            device->heaps.general_heap,
395                            data,
396                            size,
397                            cache_line_size,
398                            &suballoc_bo);
399    if (result != VK_SUCCESS)
400       return pvr_cmd_buffer_set_error_unwarned(cmd_buffer, result);
401 
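   /* Track the upload on the command buffer's bo_list so it gets freed when
    * the command buffer is reset or destroyed.
    */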
402    list_add(&suballoc_bo->link, &cmd_buffer->bo_list);
403 
404    *pvr_bo_out = suballoc_bo;
405 
406    return VK_SUCCESS;
407 }
408 
409 static VkResult
410 pvr_cmd_buffer_upload_usc(struct pvr_cmd_buffer *const cmd_buffer,
411                           const void *const code,
412                           const size_t code_size,
413                           uint64_t code_alignment,
414                           struct pvr_suballoc_bo **const pvr_bo_out)
415 {
416    struct pvr_device *const device = cmd_buffer->device;
417    const uint32_t cache_line_size =
418       rogue_get_slc_cache_line_size(&device->pdevice->dev_info);
419    struct pvr_suballoc_bo *suballoc_bo;
420    VkResult result;
421 
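   /* USC shader code is uploaded with at least SLC cache line alignment. */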
422    code_alignment = MAX2(code_alignment, cache_line_size);
423 
424    result =
425       pvr_gpu_upload_usc(device, code, code_size, code_alignment, &suballoc_bo);
426    if (result != VK_SUCCESS)
427       return pvr_cmd_buffer_set_error_unwarned(cmd_buffer, result);
428 
429    list_add(&suballoc_bo->link, &cmd_buffer->bo_list);
430 
431    *pvr_bo_out = suballoc_bo;
432 
433    return VK_SUCCESS;
434 }
435 
436 VkResult pvr_cmd_buffer_upload_pds(struct pvr_cmd_buffer *const cmd_buffer,
437                                    const uint32_t *data,
438                                    uint32_t data_size_dwords,
439                                    uint32_t data_alignment,
440                                    const uint32_t *code,
441                                    uint32_t code_size_dwords,
442                                    uint32_t code_alignment,
443                                    uint64_t min_alignment,
444                                    struct pvr_pds_upload *const pds_upload_out)
445 {
446    struct pvr_device *const device = cmd_buffer->device;
447    VkResult result;
448 
449    result = pvr_gpu_upload_pds(device,
450                                data,
451                                data_size_dwords,
452                                data_alignment,
453                                code,
454                                code_size_dwords,
455                                code_alignment,
456                                min_alignment,
457                                pds_upload_out);
458    if (result != VK_SUCCESS)
459       return pvr_cmd_buffer_set_error_unwarned(cmd_buffer, result);
460 
461    list_add(&pds_upload_out->pvr_bo->link, &cmd_buffer->bo_list);
462 
463    return VK_SUCCESS;
464 }
465 
466 static inline VkResult
467 pvr_cmd_buffer_upload_pds_data(struct pvr_cmd_buffer *const cmd_buffer,
468                                const uint32_t *data,
469                                uint32_t data_size_dwords,
470                                uint32_t data_alignment,
471                                struct pvr_pds_upload *const pds_upload_out)
472 {
473    return pvr_cmd_buffer_upload_pds(cmd_buffer,
474                                     data,
475                                     data_size_dwords,
476                                     data_alignment,
477                                     NULL,
478                                     0,
479                                     0,
480                                     data_alignment,
481                                     pds_upload_out);
482 }
483 
484 /* pbe_cs_words must be an array of emit_count entries, each containing
485  * ROGUE_NUM_PBESTATE_STATE_WORDS state words.
486  */
487 static VkResult pvr_sub_cmd_gfx_per_job_fragment_programs_create_and_upload(
488    struct pvr_cmd_buffer *const cmd_buffer,
489    const uint32_t emit_count,
490    const uint32_t *pbe_cs_words,
491    struct pvr_pds_upload *const pds_upload_out)
492 {
493    struct pvr_pds_event_program pixel_event_program = {
494       /* No data to DMA, just a DOUTU needed. */
495       .num_emit_word_pairs = 0,
496    };
497    const uint32_t staging_buffer_size =
498       PVR_DW_TO_BYTES(cmd_buffer->device->pixel_event_data_size_in_dwords);
499    const VkAllocationCallbacks *const allocator = &cmd_buffer->vk.pool->alloc;
500    struct pvr_device *const device = cmd_buffer->device;
501    struct pvr_suballoc_bo *usc_eot_program = NULL;
502    struct util_dynarray eot_program_bin;
503    uint32_t *staging_buffer;
504    uint32_t usc_temp_count;
505    VkResult result;
506 
507    assert(emit_count > 0);
508 
509    pvr_uscgen_eot("per-job EOT",
510                   emit_count,
511                   pbe_cs_words,
512                   &usc_temp_count,
513                   &eot_program_bin);
514 
515    result = pvr_cmd_buffer_upload_usc(cmd_buffer,
516                                       eot_program_bin.data,
517                                       eot_program_bin.size,
518                                       4,
519                                       &usc_eot_program);
520 
521    util_dynarray_fini(&eot_program_bin);
522 
523    if (result != VK_SUCCESS)
524       return result;
525 
526    pvr_pds_setup_doutu(&pixel_event_program.task_control,
527                        usc_eot_program->dev_addr.addr,
528                        usc_temp_count,
529                        PVRX(PDSINST_DOUTU_SAMPLE_RATE_INSTANCE),
530                        false);
531 
532    /* TODO: We could skip allocating this and generate directly into the device
533     * buffer, removing one allocation and memcpy() per job. Would this speed
534     * things up in a noticeable way?
535     */
536    staging_buffer = vk_alloc(allocator,
537                              staging_buffer_size,
538                              8,
539                              VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
540    if (!staging_buffer) {
541       result = vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
542       goto err_free_usc_pixel_program;
543    }
544 
545    /* Generate the data segment. The code segment was uploaded earlier when
546     * setting up the PDS static heap data.
547     */
548    pvr_pds_generate_pixel_event_data_segment(&pixel_event_program,
549                                              staging_buffer,
550                                              &device->pdevice->dev_info);
551 
552    result = pvr_cmd_buffer_upload_pds_data(
553       cmd_buffer,
554       staging_buffer,
555       cmd_buffer->device->pixel_event_data_size_in_dwords,
556       4,
557       pds_upload_out);
558    if (result != VK_SUCCESS)
559       goto err_free_pixel_event_staging_buffer;
560 
561    vk_free(allocator, staging_buffer);
562 
563    return VK_SUCCESS;
564 
565 err_free_pixel_event_staging_buffer:
566    vk_free(allocator, staging_buffer);
567 
568 err_free_usc_pixel_program:
569    list_del(&usc_eot_program->link);
570    pvr_bo_suballoc_free(usc_eot_program);
571 
572    return result;
573 }
574 
575 static VkResult pvr_sub_cmd_gfx_build_terminate_ctrl_stream(
576    struct pvr_device *const device,
577    const struct pvr_cmd_buffer *const cmd_buffer,
578    struct pvr_sub_cmd_gfx *const gfx_sub_cmd)
579 {
580    struct list_head bo_list;
581    struct pvr_csb csb;
582    VkResult result;
583 
584    pvr_csb_init(device, PVR_CMD_STREAM_TYPE_GRAPHICS, &csb);
585 
586    result = pvr_cmd_buffer_emit_ppp_state(cmd_buffer, &csb);
587    if (result != VK_SUCCESS)
588       goto err_csb_finish;
589 
590    result = pvr_csb_emit_terminate(&csb);
591    if (result != VK_SUCCESS)
592       goto err_csb_finish;
593 
594    result = pvr_csb_bake(&csb, &bo_list);
595    if (result != VK_SUCCESS)
596       goto err_csb_finish;
597 
598    /* This is a trivial control stream; there's no reason it should ever require
599     * more memory than a single bo can provide.
600     */
601    assert(list_is_singular(&bo_list));
602    gfx_sub_cmd->terminate_ctrl_stream =
603       list_first_entry(&bo_list, struct pvr_bo, link);
604 
605    return VK_SUCCESS;
606 
607 err_csb_finish:
608    pvr_csb_finish(&csb);
609 
610    return result;
611 }
612 
613 static VkResult pvr_setup_texture_state_words(
614    struct pvr_device *device,
615    struct pvr_combined_image_sampler_descriptor *descriptor,
616    const struct pvr_image_view *image_view)
617 {
618    const struct pvr_image *image = vk_to_pvr_image(image_view->vk.image);
619    struct pvr_texture_state_info info = {
620       .format = image_view->vk.format,
621       .mem_layout = image->memlayout,
622       .type = image_view->vk.view_type,
623       .is_cube = image_view->vk.view_type == VK_IMAGE_VIEW_TYPE_CUBE ||
624                  image_view->vk.view_type == VK_IMAGE_VIEW_TYPE_CUBE_ARRAY,
625       .tex_state_type = PVR_TEXTURE_STATE_SAMPLE,
626       .extent = image_view->vk.extent,
627       .mip_levels = 1,
628       .sample_count = image_view->vk.image->samples,
629       .stride = image->physical_extent.width,
630       .addr = image->dev_addr,
631    };
632    const uint8_t *const swizzle = pvr_get_format_swizzle(info.format);
633    VkResult result;
634 
635    memcpy(&info.swizzle, swizzle, sizeof(info.swizzle));
636 
637    /* TODO: Can we use image_view->texture_state instead of generating here? */
638    result = pvr_pack_tex_state(device, &info, descriptor->image);
639    if (result != VK_SUCCESS)
640       return result;
641 
642    descriptor->sampler = (union pvr_sampler_descriptor){ 0 };
643 
644    pvr_csb_pack (&descriptor->sampler.data.sampler_word,
645                  TEXSTATE_SAMPLER,
646                  sampler) {
647       sampler.non_normalized_coords = true;
648       sampler.addrmode_v = PVRX(TEXSTATE_ADDRMODE_CLAMP_TO_EDGE);
649       sampler.addrmode_u = PVRX(TEXSTATE_ADDRMODE_CLAMP_TO_EDGE);
650       sampler.minfilter = PVRX(TEXSTATE_FILTER_POINT);
651       sampler.magfilter = PVRX(TEXSTATE_FILTER_POINT);
652       sampler.dadjust = PVRX(TEXSTATE_DADJUST_ZERO_UINT);
653    }
654 
655    return VK_SUCCESS;
656 }
657 
658 static VkResult
659 pvr_load_op_constants_create_and_upload(struct pvr_cmd_buffer *cmd_buffer,
660                                         const struct pvr_load_op *load_op,
661                                         pvr_dev_addr_t *const addr_out)
662 {
663    const struct pvr_render_pass_info *render_pass_info =
664       &cmd_buffer->state.render_pass_info;
665    const struct pvr_render_pass *pass = render_pass_info->pass;
666    const struct pvr_renderpass_hwsetup_render *hw_render = load_op->hw_render;
667    const struct pvr_renderpass_colorinit *color_init =
668       &hw_render->color_init[0];
669    const VkClearValue *clear_value =
670       &render_pass_info->clear_values[color_init->index];
671    struct pvr_suballoc_bo *clear_bo;
672    uint32_t attachment_count;
673    bool has_depth_clear;
674    bool has_depth_load;
675    VkResult result;
676 
677    /* These are only set up and never used for now. They will need to be
678     * uploaded into a buffer based on some compiler info.
679     */
680    /* TODO: Remove the above comment once the compiler is hooked up and we're
681     * setting up + uploading the buffer.
682     */
683    struct pvr_combined_image_sampler_descriptor
684       texture_states[PVR_LOAD_OP_CLEARS_LOADS_MAX_RTS];
685    uint32_t texture_count = 0;
686    uint32_t hw_clear_value[PVR_LOAD_OP_CLEARS_LOADS_MAX_RTS *
687                            PVR_CLEAR_COLOR_ARRAY_SIZE];
688    uint32_t next_clear_consts = 0;
689 
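   /* For each colour attachment either pack texture state words (loads) or
    * pack the clear colour into the constants buffer (clears).
    */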
690    if (load_op->is_hw_object)
691       attachment_count = load_op->hw_render->color_init_count;
692    else
693       attachment_count = load_op->subpass->color_count;
694 
695    for (uint32_t i = 0; i < attachment_count; i++) {
696       struct pvr_image_view *image_view;
697       uint32_t attachment_idx;
698 
699       if (load_op->is_hw_object)
700          attachment_idx = load_op->hw_render->color_init[i].index;
701       else
702          attachment_idx = load_op->subpass->color_attachments[i];
703 
704       image_view = render_pass_info->attachments[attachment_idx];
705 
706       assert((load_op->clears_loads_state.rt_load_mask &
707               load_op->clears_loads_state.rt_clear_mask) == 0);
708       if (load_op->clears_loads_state.rt_load_mask & BITFIELD_BIT(i)) {
709          result = pvr_setup_texture_state_words(cmd_buffer->device,
710                                                 &texture_states[texture_count],
711                                                 image_view);
712          if (result != VK_SUCCESS)
713             return result;
714 
715          texture_count++;
716       } else if (load_op->clears_loads_state.rt_clear_mask & BITFIELD_BIT(i)) {
717          const uint32_t accum_fmt_size =
718             pvr_get_pbe_accum_format_size_in_bytes(image_view->vk.format);
719 
720          assert(next_clear_consts +
721                    vk_format_get_blocksize(image_view->vk.format) <=
722                 ARRAY_SIZE(hw_clear_value));
723 
724          /* FIXME: do this at the point we store the clear values? */
725          pvr_get_hw_clear_color(image_view->vk.format,
726                                 clear_value->color,
727                                 &hw_clear_value[next_clear_consts]);
728 
729          next_clear_consts += DIV_ROUND_UP(accum_fmt_size, sizeof(uint32_t));
730       }
731    }
732 
733    has_depth_load = false;
734    for (uint32_t i = 0;
735         i < ARRAY_SIZE(load_op->clears_loads_state.dest_vk_format);
736         i++) {
737       if (load_op->clears_loads_state.dest_vk_format[i] ==
738           VK_FORMAT_D32_SFLOAT) {
739          has_depth_load = true;
740          break;
741       }
742    }
743 
744    has_depth_clear = load_op->clears_loads_state.depth_clear_to_reg != -1;
745 
746    assert(!(has_depth_clear && has_depth_load));
747 
748    if (has_depth_load) {
749       const struct pvr_render_pass_attachment *attachment;
750       const struct pvr_image_view *image_view;
751 
752       assert(load_op->subpass->depth_stencil_attachment !=
753              VK_ATTACHMENT_UNUSED);
754       assert(!load_op->is_hw_object);
755       attachment =
756          &pass->attachments[load_op->subpass->depth_stencil_attachment];
757 
758       image_view = render_pass_info->attachments[attachment->index];
759 
760       result = pvr_setup_texture_state_words(cmd_buffer->device,
761                                              &texture_states[texture_count],
762                                              image_view);
763       if (result != VK_SUCCESS)
764          return result;
765 
766       texture_count++;
767    } else if (has_depth_clear) {
768       const struct pvr_render_pass_attachment *attachment;
769       VkClearValue clear_value;
770 
771       assert(load_op->subpass->depth_stencil_attachment !=
772              VK_ATTACHMENT_UNUSED);
773       attachment =
774          &pass->attachments[load_op->subpass->depth_stencil_attachment];
775 
776       clear_value = render_pass_info->clear_values[attachment->index];
777 
778       assert(next_clear_consts < ARRAY_SIZE(hw_clear_value));
779       hw_clear_value[next_clear_consts++] = fui(clear_value.depthStencil.depth);
780    }
781 
782    result = pvr_cmd_buffer_upload_general(cmd_buffer,
783                                           &hw_clear_value[0],
784                                           sizeof(hw_clear_value),
785                                           &clear_bo);
786    if (result != VK_SUCCESS)
787       return result;
788 
789    *addr_out = clear_bo->dev_addr;
790 
791    return VK_SUCCESS;
792 }
793 
794 static VkResult pvr_load_op_pds_data_create_and_upload(
795    struct pvr_cmd_buffer *cmd_buffer,
796    const struct pvr_load_op *load_op,
797    pvr_dev_addr_t constants_addr,
798    struct pvr_pds_upload *const pds_upload_out)
799 {
800    struct pvr_device *device = cmd_buffer->device;
801    const struct pvr_device_info *dev_info = &device->pdevice->dev_info;
802    struct pvr_pds_pixel_shader_sa_program program = { 0 };
803    uint32_t staging_buffer_size;
804    uint32_t *staging_buffer;
805    VkResult result;
806 
807    program.num_texture_dma_kicks = 1;
808 
809    pvr_csb_pack (&program.texture_dma_address[0],
810                  PDSINST_DOUT_FIELDS_DOUTD_SRC0,
811                  value) {
812       value.sbase = constants_addr;
813    }
814 
815    pvr_csb_pack (&program.texture_dma_control[0],
816                  PDSINST_DOUT_FIELDS_DOUTD_SRC1,
817                  value) {
818       value.dest = PVRX(PDSINST_DOUTD_DEST_COMMON_STORE);
819       value.a0 = load_op->shareds_dest_offset;
820       value.bsize = load_op->shareds_count;
821    }
822 
823    pvr_pds_set_sizes_pixel_shader_sa_texture_data(&program, dev_info);
824 
825    staging_buffer_size = PVR_DW_TO_BYTES(program.data_size);
826 
827    staging_buffer = vk_alloc(&cmd_buffer->vk.pool->alloc,
828                              staging_buffer_size,
829                              8,
830                              VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
831    if (!staging_buffer)
832       return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
833 
834    pvr_pds_generate_pixel_shader_sa_texture_state_data(&program,
835                                                        staging_buffer,
836                                                        dev_info);
837 
838    result = pvr_cmd_buffer_upload_pds_data(cmd_buffer,
839                                            staging_buffer,
840                                            program.data_size,
841                                            1,
842                                            pds_upload_out);
843    if (result != VK_SUCCESS) {
844       vk_free(&cmd_buffer->vk.pool->alloc, staging_buffer);
845       return result;
846    }
847 
848    vk_free(&cmd_buffer->vk.pool->alloc, staging_buffer);
849 
850    return VK_SUCCESS;
851 }
852 
853 /* FIXME: Should this function be specific to the HW background object, in
854  * which case its name should be changed, or should it have the load op
855  * structure passed in?
856  */
857 static VkResult
858 pvr_load_op_data_create_and_upload(struct pvr_cmd_buffer *cmd_buffer,
859                                    const struct pvr_load_op *load_op,
860                                    struct pvr_pds_upload *const pds_upload_out)
861 {
862    pvr_dev_addr_t constants_addr;
863    VkResult result;
864 
865    result = pvr_load_op_constants_create_and_upload(cmd_buffer,
866                                                     load_op,
867                                                     &constants_addr);
868    if (result != VK_SUCCESS)
869       return result;
870 
871    return pvr_load_op_pds_data_create_and_upload(cmd_buffer,
872                                                  load_op,
873                                                  constants_addr,
874                                                  pds_upload_out);
875 }
876 
877 static void pvr_pds_bgnd_pack_state(
878    const struct pvr_load_op *load_op,
879    const struct pvr_pds_upload *load_op_program,
880    uint64_t pds_reg_values[static const ROGUE_NUM_CR_PDS_BGRND_WORDS])
881 {
882    pvr_csb_pack (&pds_reg_values[0], CR_PDS_BGRND0_BASE, value) {
883       value.shader_addr = PVR_DEV_ADDR(load_op->pds_frag_prog.data_offset);
884       value.texunicode_addr =
885          PVR_DEV_ADDR(load_op->pds_tex_state_prog.code_offset);
886    }
887 
888    pvr_csb_pack (&pds_reg_values[1], CR_PDS_BGRND1_BASE, value) {
889       value.texturedata_addr = PVR_DEV_ADDR(load_op_program->data_offset);
890    }
891 
892    pvr_csb_pack (&pds_reg_values[2], CR_PDS_BGRND3_SIZEINFO, value) {
893       value.usc_sharedsize =
894          DIV_ROUND_UP(load_op->const_shareds_count,
895                       PVRX(CR_PDS_BGRND3_SIZEINFO_USC_SHAREDSIZE_UNIT_SIZE));
896       value.pds_texturestatesize = DIV_ROUND_UP(
897          load_op_program->data_size,
898          PVRX(CR_PDS_BGRND3_SIZEINFO_PDS_TEXTURESTATESIZE_UNIT_SIZE));
899       value.pds_tempsize =
900          DIV_ROUND_UP(load_op->temps_count,
901                       PVRX(CR_PDS_BGRND3_SIZEINFO_PDS_TEMPSIZE_UNIT_SIZE));
902    }
903 }
904 
905 /**
906  * \brief Calculates the stride in pixels based on the pitch in bytes and pixel
907  * format.
908  *
909  * \param[in] pitch     Width pitch in bytes.
910  * \param[in] vk_format Vulkan image format.
911  * \return Stride in pixels.
912  */
913 static inline uint32_t pvr_stride_from_pitch(uint32_t pitch, VkFormat vk_format)
914 {
915    const unsigned int cpp = vk_format_get_blocksize(vk_format);
916 
917    assert(pitch % cpp == 0);
918 
919    return pitch / cpp;
920 }
921 
922 static void pvr_setup_pbe_state(
923    const struct pvr_device_info *dev_info,
924    const struct pvr_framebuffer *framebuffer,
925    uint32_t mrt_index,
926    const struct usc_mrt_resource *mrt_resource,
927    const struct pvr_image_view *const iview,
928    const VkRect2D *render_area,
929    const bool down_scale,
930    const uint32_t samples,
931    uint32_t pbe_cs_words[static const ROGUE_NUM_PBESTATE_STATE_WORDS],
932    uint64_t pbe_reg_words[static const ROGUE_NUM_PBESTATE_REG_WORDS])
933 {
934    const struct pvr_image *image = pvr_image_view_get_image(iview);
935    uint32_t level_pitch = image->mip_levels[iview->vk.base_mip_level].pitch;
936 
937    struct pvr_pbe_surf_params surface_params;
938    struct pvr_pbe_render_params render_params;
939    bool with_packed_usc_channel;
940    const uint8_t *swizzle;
941    uint32_t position;
942 
943    /* down_scale should be true when performing a resolve, in which case there
944     * should be more than one sample.
945     */
946    assert((down_scale && samples > 1U) || (!down_scale && samples == 1U));
947 
948    /* Setup surface parameters. */
949 
950    if (PVR_HAS_FEATURE(dev_info, usc_f16sop_u8)) {
951       with_packed_usc_channel = vk_format_is_unorm(iview->vk.format) ||
952                                 vk_format_is_snorm(iview->vk.format);
953    } else {
954       with_packed_usc_channel = false;
955    }
956 
957    swizzle = pvr_get_format_swizzle(iview->vk.format);
958    memcpy(surface_params.swizzle, swizzle, sizeof(surface_params.swizzle));
959 
960    pvr_pbe_get_src_format_and_gamma(iview->vk.format,
961                                     PVR_PBE_GAMMA_NONE,
962                                     with_packed_usc_channel,
963                                     &surface_params.source_format,
964                                     &surface_params.gamma);
965 
966    surface_params.is_normalized = vk_format_is_normalized(iview->vk.format);
967    surface_params.pbe_packmode = pvr_get_pbe_packmode(iview->vk.format);
968    surface_params.nr_components = vk_format_get_nr_components(iview->vk.format);
969 
970    /* FIXME: Should we have an inline function to return the address of a mip
971     * level?
972     */
973    surface_params.addr =
974       PVR_DEV_ADDR_OFFSET(image->vma->dev_addr,
975                           image->mip_levels[iview->vk.base_mip_level].offset);
976    surface_params.addr =
977       PVR_DEV_ADDR_OFFSET(surface_params.addr,
978                           iview->vk.base_array_layer * image->layer_size);
979 
980    surface_params.mem_layout = image->memlayout;
981    surface_params.stride = pvr_stride_from_pitch(level_pitch, iview->vk.format);
982    surface_params.depth = iview->vk.extent.depth;
983    surface_params.width = iview->vk.extent.width;
984    surface_params.height = iview->vk.extent.height;
985    surface_params.z_only_render = false;
986    surface_params.down_scale = down_scale;
987 
988    /* Setup render parameters. */
989 
990    if (mrt_resource->type == USC_MRT_RESOURCE_TYPE_MEMORY) {
991       position = mrt_resource->mem.offset_dw;
992    } else {
993       assert(mrt_resource->type == USC_MRT_RESOURCE_TYPE_OUTPUT_REG);
994       assert(mrt_resource->reg.offset == 0);
995 
996       position = mrt_resource->reg.output_reg;
997    }
998 
999    assert(position <= 3 || PVR_HAS_FEATURE(dev_info, eight_output_registers));
1000 
1001    switch (position) {
1002    case 0:
1003    case 4:
1004       render_params.source_start = PVR_PBE_STARTPOS_BIT0;
1005       break;
1006    case 1:
1007    case 5:
1008       render_params.source_start = PVR_PBE_STARTPOS_BIT32;
1009       break;
1010    case 2:
1011    case 6:
1012       render_params.source_start = PVR_PBE_STARTPOS_BIT64;
1013       break;
1014    case 3:
1015    case 7:
1016       render_params.source_start = PVR_PBE_STARTPOS_BIT96;
1017       break;
1018    default:
1019       assert(!"Invalid output register");
1020       break;
1021    }
1022 
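/* The PBE clip coordinates are inclusive, so the maximum is
 * (offset + extent) - 1; the macro guards against underflow when the sum is
 * zero.
 */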
1023 #define PVR_DEC_IF_NOT_ZERO(_v) (((_v) > 0) ? (_v)-1 : 0)
1024 
1025    render_params.min_x_clip = MAX2(0, render_area->offset.x);
1026    render_params.min_y_clip = MAX2(0, render_area->offset.y);
1027    render_params.max_x_clip = MIN2(
1028       framebuffer->width - 1,
1029       PVR_DEC_IF_NOT_ZERO(render_area->offset.x + render_area->extent.width));
1030    render_params.max_y_clip = MIN2(
1031       framebuffer->height - 1,
1032       PVR_DEC_IF_NOT_ZERO(render_area->offset.y + render_area->extent.height));
1033 
1034 #undef PVR_DEC_IF_NOT_ZERO
1035 
1036    render_params.slice = 0;
1037    render_params.mrt_index = mrt_index;
1038 
1039    pvr_pbe_pack_state(dev_info,
1040                       &surface_params,
1041                       &render_params,
1042                       pbe_cs_words,
1043                       pbe_reg_words);
1044 }
1045 
1046 static struct pvr_render_target *
1047 pvr_get_render_target(const struct pvr_render_pass *pass,
1048                       const struct pvr_framebuffer *framebuffer,
1049                       uint32_t idx)
1050 {
1051    const struct pvr_renderpass_hwsetup_render *hw_render =
1052       &pass->hw_setup->renders[idx];
1053    uint32_t rt_idx = 0;
1054 
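   /* The framebuffer keeps one render target per supported sample count,
    * indexed by log2(sample_count).
    */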
1055    switch (hw_render->sample_count) {
1056    case 1:
1057    case 2:
1058    case 4:
1059    case 8:
1060       rt_idx = util_logbase2(hw_render->sample_count);
1061       break;
1062 
1063    default:
1064       unreachable("Unsupported sample count");
1065       break;
1066    }
1067 
1068    return &framebuffer->render_targets[rt_idx];
1069 }
1070 
1071 static uint32_t
1072 pvr_pass_get_pixel_output_width(const struct pvr_render_pass *pass,
1073                                 uint32_t idx,
1074                                 const struct pvr_device_info *dev_info)
1075 {
1076    const struct pvr_renderpass_hwsetup_render *hw_render =
1077       &pass->hw_setup->renders[idx];
1078    /* Default to the maximum value found across all existing cores. Since this
1079     * value is treated as a lower bound, the maximum is a "safer" choice than
1080     * the minimum found in existing cores.
1081     */
1082    const uint32_t min_output_regs =
1083       PVR_GET_FEATURE_VALUE(dev_info, usc_min_output_registers_per_pix, 2U);
1084    const uint32_t width = MAX2(hw_render->output_regs_count, min_output_regs);
1085 
1086    return util_next_power_of_two(width);
1087 }
1088 
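/* ZLS (Z/stencil load/store) is only required if any depth or stencil load or
 * store has been requested for the attachment.
 */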
1089 static inline bool
1090 pvr_ds_attachment_requires_zls(const struct pvr_ds_attachment *attachment)
1091 {
1092    bool zls_used;
1093 
1094    zls_used = attachment->load.d || attachment->load.s;
1095    zls_used |= attachment->store.d || attachment->store.s;
1096 
1097    return zls_used;
1098 }
1099 
1100 /**
1101  * \brief If depth and/or stencil attachment dimensions are not tile-aligned,
1102  * then we may need to insert some additional transfer subcommands.
1103  *
1104  * It's worth noting that we check whether the dimensions are smaller than a
1105  * tile here, rather than checking whether they're tile-aligned - this relies
1106  * on the assumption that we can safely use any attachment with dimensions
1107  * larger than a tile. If the attachment is twiddled, it will be over-allocated
1108  * to the nearest power-of-two (which will be tile-aligned). If the attachment
1109  * is not twiddled, we don't need to worry about tile-alignment at all.
1110  */
1111 static bool pvr_sub_cmd_gfx_requires_ds_subtile_alignment(
1112    const struct pvr_device_info *dev_info,
1113    const struct pvr_render_job *job)
1114 {
1115    const struct pvr_image *const ds_image =
1116       pvr_image_view_get_image(job->ds.iview);
1117    uint32_t zls_tile_size_x;
1118    uint32_t zls_tile_size_y;
1119 
1120    rogue_get_zls_tile_size_xy(dev_info, &zls_tile_size_x, &zls_tile_size_y);
1121 
1122    if (ds_image->physical_extent.width >= zls_tile_size_x &&
1123        ds_image->physical_extent.height >= zls_tile_size_y) {
1124       return false;
1125    }
1126 
1127    /* If we have the zls_subtile feature, we can skip the alignment if:
1128     *  - The attachment is not multisampled, and
1129     *  - The depth and stencil attachments are the same.
1130     */
1131    if (PVR_HAS_FEATURE(dev_info, zls_subtile) &&
1132        ds_image->vk.samples == VK_SAMPLE_COUNT_1_BIT &&
1133        job->has_stencil_attachment == job->has_depth_attachment) {
1134       return false;
1135    }
1136 
1137    /* No ZLS functions enabled; nothing to do. */
1138    if ((!job->has_depth_attachment && !job->has_stencil_attachment) ||
1139        !pvr_ds_attachment_requires_zls(&job->ds)) {
1140       return false;
1141    }
1142 
1143    return true;
1144 }
1145 
1146 static VkResult
1147 pvr_sub_cmd_gfx_align_ds_subtiles(struct pvr_cmd_buffer *const cmd_buffer,
1148                                   struct pvr_sub_cmd_gfx *const gfx_sub_cmd)
1149 {
1150    struct pvr_sub_cmd *const prev_sub_cmd =
1151       container_of(gfx_sub_cmd, struct pvr_sub_cmd, gfx);
1152    struct pvr_ds_attachment *const ds = &gfx_sub_cmd->job.ds;
1153    const struct pvr_image *const ds_image = pvr_image_view_get_image(ds->iview);
1154    const VkFormat copy_format = pvr_get_raw_copy_format(ds_image->vk.format);
1155 
1156    struct pvr_suballoc_bo *buffer;
1157    uint32_t buffer_layer_size;
1158    VkBufferImageCopy2 region;
1159    VkExtent2D zls_tile_size;
1160    VkExtent2D rounded_size;
1161    uint32_t buffer_size;
1162    VkExtent2D scale;
1163    VkResult result;
1164 
1165    /* The operations below assume the last command in the buffer was the target
1166     * gfx subcommand. Assert that this is the case.
1167     */
1168    assert(list_last_entry(&cmd_buffer->sub_cmds, struct pvr_sub_cmd, link) ==
1169           prev_sub_cmd);
1170 
1171    if (!pvr_ds_attachment_requires_zls(ds))
1172       return VK_SUCCESS;
1173 
1174    rogue_get_zls_tile_size_xy(&cmd_buffer->device->pdevice->dev_info,
1175                               &zls_tile_size.width,
1176                               &zls_tile_size.height);
1177    rogue_get_isp_scale_xy_from_samples(ds_image->vk.samples,
1178                                        &scale.width,
1179                                        &scale.height);
1180 
1181    rounded_size = (VkExtent2D){
1182       .width = ALIGN_POT(ds_image->physical_extent.width, zls_tile_size.width),
1183       .height =
1184          ALIGN_POT(ds_image->physical_extent.height, zls_tile_size.height),
1185    };
1186 
1187    buffer_layer_size = vk_format_get_blocksize(ds_image->vk.format) *
1188                        rounded_size.width * rounded_size.height * scale.width *
1189                        scale.height;
1190 
1191    if (ds->iview->vk.layer_count > 1)
1192       buffer_layer_size = ALIGN_POT(buffer_layer_size, ds_image->alignment);
1193 
1194    buffer_size = buffer_layer_size * ds->iview->vk.layer_count;
1195 
1196    result = pvr_cmd_buffer_alloc_mem(cmd_buffer,
1197                                      cmd_buffer->device->heaps.general_heap,
1198                                      buffer_size,
1199                                      &buffer);
1200    if (result != VK_SUCCESS)
1201       return result;
1202 
1203    region = (VkBufferImageCopy2){
1204       .sType = VK_STRUCTURE_TYPE_BUFFER_IMAGE_COPY_2,
1205       .pNext = NULL,
1206       .bufferOffset = 0,
1207       .bufferRowLength = rounded_size.width,
1208       .bufferImageHeight = 0,
1209       .imageSubresource = {
1210          .aspectMask = VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT,
1211          .mipLevel = ds->iview->vk.base_mip_level,
1212          .baseArrayLayer = ds->iview->vk.base_array_layer,
1213          .layerCount = ds->iview->vk.layer_count,
1214       },
1215       .imageOffset = { 0 },
1216       .imageExtent = {
1217          .width = ds->iview->vk.extent.width,
1218          .height = ds->iview->vk.extent.height,
1219          .depth = 1,
1220       },
1221    };
1222 
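   /* If the depth/stencil contents are loaded, insert a transfer sub-command
    * that copies the image into the tile-aligned scratch buffer and schedule
    * it before this render.
    */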
1223    if (ds->load.d || ds->load.s) {
1224       cmd_buffer->state.current_sub_cmd = NULL;
1225 
1226       result =
1227          pvr_cmd_buffer_start_sub_cmd(cmd_buffer, PVR_SUB_CMD_TYPE_TRANSFER);
1228       if (result != VK_SUCCESS)
1229          return result;
1230 
1231       result = pvr_copy_image_to_buffer_region_format(cmd_buffer,
1232                                                       ds_image,
1233                                                       buffer->dev_addr,
1234                                                       &region,
1235                                                       copy_format,
1236                                                       copy_format);
1237       if (result != VK_SUCCESS)
1238          return result;
1239 
1240       cmd_buffer->state.current_sub_cmd->transfer.serialize_with_frag = true;
1241 
1242       result = pvr_cmd_buffer_end_sub_cmd(cmd_buffer);
1243       if (result != VK_SUCCESS)
1244          return result;
1245 
1246       /* Now we have to fiddle with cmd_buffer to place this transfer command
1247        * *before* the target gfx subcommand.
1248        */
1249       list_move_to(&cmd_buffer->state.current_sub_cmd->link,
1250                    &prev_sub_cmd->link);
1251 
1252       cmd_buffer->state.current_sub_cmd = prev_sub_cmd;
1253    }
1254 
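   /* If the depth/stencil contents are stored, insert a transfer sub-command
    * that copies the scratch buffer back into the image after this render.
    */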
1255    if (ds->store.d || ds->store.s) {
1256       cmd_buffer->state.current_sub_cmd = NULL;
1257 
1258       result =
1259          pvr_cmd_buffer_start_sub_cmd(cmd_buffer, PVR_SUB_CMD_TYPE_TRANSFER);
1260       if (result != VK_SUCCESS)
1261          return result;
1262 
1263       result = pvr_copy_buffer_to_image_region_format(cmd_buffer,
1264                                                       buffer->dev_addr,
1265                                                       ds_image,
1266                                                       &region,
1267                                                       copy_format,
1268                                                       copy_format,
1269                                                       0);
1270       if (result != VK_SUCCESS)
1271          return result;
1272 
1273       cmd_buffer->state.current_sub_cmd->transfer.serialize_with_frag = true;
1274 
1275       result = pvr_cmd_buffer_end_sub_cmd(cmd_buffer);
1276       if (result != VK_SUCCESS)
1277          return result;
1278 
1279       cmd_buffer->state.current_sub_cmd = prev_sub_cmd;
1280    }
1281 
1282    /* Finally, patch up the target graphics sub_cmd to use the correctly-strided
1283     * buffer.
1284     */
1285    ds->has_alignment_transfers = true;
1286    ds->addr = buffer->dev_addr;
1287    ds->physical_extent = rounded_size;
1288 
1289    gfx_sub_cmd->wait_on_previous_transfer = true;
1290 
1291    return VK_SUCCESS;
1292 }
1293 
1294 struct pvr_emit_state {
1295    uint32_t pbe_cs_words[PVR_MAX_COLOR_ATTACHMENTS]
1296                         [ROGUE_NUM_PBESTATE_STATE_WORDS];
1297 
1298    uint64_t pbe_reg_words[PVR_MAX_COLOR_ATTACHMENTS]
1299                          [ROGUE_NUM_PBESTATE_REG_WORDS];
1300 
1301    uint32_t emit_count;
1302 };
1303 
1304 static void
1305 pvr_setup_emit_state(const struct pvr_device_info *dev_info,
1306                      const struct pvr_renderpass_hwsetup_render *hw_render,
1307                      struct pvr_render_pass_info *render_pass_info,
1308                      struct pvr_emit_state *emit_state)
1309 {
1310    assert(hw_render->pbe_emits <= PVR_NUM_PBE_EMIT_REGS);
1311 
1312    if (hw_render->eot_surface_count == 0) {
1313       emit_state->emit_count = 1;
1314       pvr_csb_pack (&emit_state->pbe_cs_words[0][1],
1315                     PBESTATE_STATE_WORD1,
1316                     state) {
1317          state.emptytile = true;
1318       }
1319       return;
1320    }
1321 
1322    static_assert(USC_MRT_RESOURCE_TYPE_OUTPUT_REG + 1 ==
1323                     USC_MRT_RESOURCE_TYPE_MEMORY,
1324                  "The loop below needs adjusting.");
1325 
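   /* Walk the EOT surfaces once per MRT resource type so that output-register
    * surfaces are emitted before memory surfaces.
    */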
1326    emit_state->emit_count = 0;
1327    for (uint32_t resource_type = USC_MRT_RESOURCE_TYPE_OUTPUT_REG;
1328         resource_type <= USC_MRT_RESOURCE_TYPE_MEMORY;
1329         resource_type++) {
1330       for (uint32_t i = 0; i < hw_render->eot_surface_count; i++) {
1331          const struct pvr_framebuffer *framebuffer =
1332             render_pass_info->framebuffer;
1333          const struct pvr_renderpass_hwsetup_eot_surface *surface =
1334             &hw_render->eot_surfaces[i];
1335          const struct pvr_image_view *iview =
1336             render_pass_info->attachments[surface->attachment_idx];
1337          const struct usc_mrt_resource *mrt_resource =
1338             &hw_render->eot_setup.mrt_resources[surface->mrt_idx];
1339          uint32_t samples = 1;
1340 
1341          if (mrt_resource->type != resource_type)
1342             continue;
1343 
1344          if (surface->need_resolve) {
1345             const struct pvr_image_view *resolve_src =
1346                render_pass_info->attachments[surface->src_attachment_idx];
1347 
1348             /* Attachments that are the destination of resolve operations must
1349              * be loaded before their next use.
1350              */
1351             render_pass_info->enable_bg_tag = true;
1352             render_pass_info->process_empty_tiles = true;
1353 
1354             if (surface->resolve_type != PVR_RESOLVE_TYPE_PBE)
1355                continue;
1356 
1357             samples = (uint32_t)resolve_src->vk.image->samples;
1358          }
1359 
1360          assert(emit_state->emit_count < ARRAY_SIZE(emit_state->pbe_cs_words));
1361          assert(emit_state->emit_count < ARRAY_SIZE(emit_state->pbe_reg_words));
1362 
1363          pvr_setup_pbe_state(dev_info,
1364                              framebuffer,
1365                              emit_state->emit_count,
1366                              mrt_resource,
1367                              iview,
1368                              &render_pass_info->render_area,
1369                              surface->need_resolve,
1370                              samples,
1371                              emit_state->pbe_cs_words[emit_state->emit_count],
1372                              emit_state->pbe_reg_words[emit_state->emit_count]);
1373          emit_state->emit_count += 1;
1374       }
1375    }
1376 
1377    assert(emit_state->emit_count == hw_render->pbe_emits);
1378 }
1379 
1380 static inline bool
1381 pvr_is_render_area_tile_aligned(const struct pvr_cmd_buffer *cmd_buffer,
1382                                 const struct pvr_image_view *iview)
1383 {
1384    const VkRect2D *render_area =
1385       &cmd_buffer->state.render_pass_info.render_area;
1386 
1387    return render_area->offset.x == 0 && render_area->offset.y == 0 &&
1388           render_area->extent.height == iview->vk.extent.height &&
1389           render_area->extent.width == iview->vk.extent.width;
1390 }
1391 
1392 static VkResult pvr_sub_cmd_gfx_job_init(const struct pvr_device_info *dev_info,
1393                                          struct pvr_cmd_buffer *cmd_buffer,
1394                                          struct pvr_sub_cmd_gfx *sub_cmd)
1395 {
1396    static const VkClearDepthStencilValue default_ds_clear_value = {
1397       .depth = 1.0f,
1398       .stencil = 0xFFFFFFFF,
1399    };
1400 
1401    const struct vk_dynamic_graphics_state *dynamic_state =
1402       &cmd_buffer->vk.dynamic_graphics_state;
1403    struct pvr_render_pass_info *render_pass_info =
1404       &cmd_buffer->state.render_pass_info;
1405    const struct pvr_renderpass_hwsetup_render *hw_render =
1406       &render_pass_info->pass->hw_setup->renders[sub_cmd->hw_render_idx];
1407    struct pvr_render_job *job = &sub_cmd->job;
1408    struct pvr_pds_upload pds_pixel_event_program;
1409    struct pvr_framebuffer *framebuffer = render_pass_info->framebuffer;
1410    struct pvr_spm_bgobj_state *spm_bgobj_state =
1411       &framebuffer->spm_bgobj_state_per_render[sub_cmd->hw_render_idx];
1412    struct pvr_render_target *render_target;
1413    VkResult result;
1414 
1415    if (sub_cmd->barrier_store) {
1416       /* There can only ever be one frag job running on the hardware at any one
1417        * time, and a context switch is not allowed mid-tile, so instead of
1418        * allocating a new scratch buffer we can reuse the SPM scratch buffer to
1419        * perform the store.
1420        * So use the SPM EOT program with the SPM PBE reg words in order to store
1421        * the render to the SPM scratch buffer.
1422        */
1423 
1424       memcpy(job->pbe_reg_words,
1425              &framebuffer->spm_eot_state_per_render[0].pbe_reg_words,
1426              sizeof(job->pbe_reg_words));
1427       job->pds_pixel_event_data_offset =
1428          framebuffer->spm_eot_state_per_render[0]
1429             .pixel_event_program_data_offset;
1430    } else {
1431       struct pvr_emit_state emit_state = { 0 };
1432 
1433       pvr_setup_emit_state(dev_info, hw_render, render_pass_info, &emit_state);
1434 
1435       memcpy(job->pbe_reg_words,
1436              emit_state.pbe_reg_words,
1437              sizeof(job->pbe_reg_words));
1438 
1439       result = pvr_sub_cmd_gfx_per_job_fragment_programs_create_and_upload(
1440          cmd_buffer,
1441          emit_state.emit_count,
1442          emit_state.pbe_cs_words[0],
1443          &pds_pixel_event_program);
1444       if (result != VK_SUCCESS)
1445          return result;
1446 
1447       job->pds_pixel_event_data_offset = pds_pixel_event_program.data_offset;
1448    }
1449 
1450    if (sub_cmd->barrier_load) {
1451       job->enable_bg_tag = true;
1452       job->process_empty_tiles = true;
1453 
1454       /* Load the previously stored render from the SPM scratch buffer. */
1455 
1456       STATIC_ASSERT(ARRAY_SIZE(job->pds_bgnd_reg_values) ==
1457                     ARRAY_SIZE(spm_bgobj_state->pds_reg_values));
1458       typed_memcpy(job->pds_bgnd_reg_values,
1459                    spm_bgobj_state->pds_reg_values,
1460                    ARRAY_SIZE(spm_bgobj_state->pds_reg_values));
1461    } else if (hw_render->load_op) {
1462       const struct pvr_load_op *load_op = hw_render->load_op;
1463       struct pvr_pds_upload load_op_program;
1464 
1465       /* Recalculate Background Object(s). */
1466 
1467       /* FIXME: Should we free the PDS pixel event data or let it be freed
1468        * when the pool gets emptied?
1469        */
1470       result = pvr_load_op_data_create_and_upload(cmd_buffer,
1471                                                   load_op,
1472                                                   &load_op_program);
1473       if (result != VK_SUCCESS)
1474          return result;
1475 
1476       job->enable_bg_tag = render_pass_info->enable_bg_tag;
1477       job->process_empty_tiles = render_pass_info->process_empty_tiles;
1478 
1479       pvr_pds_bgnd_pack_state(load_op,
1480                               &load_op_program,
1481                               job->pds_bgnd_reg_values);
1482    }
1483 
1484    /* TODO: In some cases a PR can be removed by storing to the color attachment
1485     * and having the background object load directly from it instead of using the
1486     * scratch buffer. In those cases we can also set this to "false" and avoid
1487     * extra fw overhead.
1488     */
1489    /* The scratch buffer is always needed and allocated to avoid data loss in
1490     * case SPM is hit, so set the flag unconditionally.
1491     */
1492    job->requires_spm_scratch_buffer = true;
1493 
1494    memcpy(job->pr_pbe_reg_words,
1495           &framebuffer->spm_eot_state_per_render[0].pbe_reg_words,
1496           sizeof(job->pr_pbe_reg_words));
1497    job->pr_pds_pixel_event_data_offset =
1498       framebuffer->spm_eot_state_per_render[0].pixel_event_program_data_offset;
1499 
1500    STATIC_ASSERT(ARRAY_SIZE(job->pds_pr_bgnd_reg_values) ==
1501                  ARRAY_SIZE(spm_bgobj_state->pds_reg_values));
1502    typed_memcpy(job->pds_pr_bgnd_reg_values,
1503                 spm_bgobj_state->pds_reg_values,
1504                 ARRAY_SIZE(spm_bgobj_state->pds_reg_values));
1505 
1506    render_target = pvr_get_render_target(render_pass_info->pass,
1507                                          framebuffer,
1508                                          sub_cmd->hw_render_idx);
1509    job->rt_dataset = render_target->rt_dataset;
1510 
1511    job->ctrl_stream_addr = pvr_csb_get_start_address(&sub_cmd->control_stream);
1512 
1513    if (sub_cmd->depth_bias_bo)
1514       job->depth_bias_table_addr = sub_cmd->depth_bias_bo->dev_addr;
1515    else
1516       job->depth_bias_table_addr = PVR_DEV_ADDR_INVALID;
1517 
1518    if (sub_cmd->scissor_bo)
1519       job->scissor_table_addr = sub_cmd->scissor_bo->dev_addr;
1520    else
1521       job->scissor_table_addr = PVR_DEV_ADDR_INVALID;
1522 
1523    job->pixel_output_width =
1524       pvr_pass_get_pixel_output_width(render_pass_info->pass,
1525                                       sub_cmd->hw_render_idx,
1526                                       dev_info);
1527 
1528    /* Setup depth/stencil job information. */
1529    if (hw_render->ds_attach_idx != VK_ATTACHMENT_UNUSED) {
1530       struct pvr_image_view *ds_iview =
1531          render_pass_info->attachments[hw_render->ds_attach_idx];
1532       const struct pvr_image *ds_image = pvr_image_view_get_image(ds_iview);
1533 
1534       job->has_depth_attachment = vk_format_has_depth(ds_image->vk.format);
1535       job->has_stencil_attachment = vk_format_has_stencil(ds_image->vk.format);
1536 
1537       if (job->has_depth_attachment || job->has_stencil_attachment) {
1538          uint32_t level_pitch =
1539             ds_image->mip_levels[ds_iview->vk.base_mip_level].pitch;
1540          const bool render_area_is_tile_aligned =
1541             pvr_is_render_area_tile_aligned(cmd_buffer, ds_iview);
1542          bool store_was_optimised_out = false;
1543          bool d_store = false, s_store = false;
1544          bool d_load = false, s_load = false;
1545 
1546          job->ds.iview = ds_iview;
1547          job->ds.addr = ds_image->dev_addr;
1548 
1549          job->ds.stride =
1550             pvr_stride_from_pitch(level_pitch, ds_iview->vk.format);
1551          job->ds.height = ds_iview->vk.extent.height;
1552          job->ds.physical_extent = (VkExtent2D){
1553             .width = u_minify(ds_image->physical_extent.width,
1554                               ds_iview->vk.base_mip_level),
1555             .height = u_minify(ds_image->physical_extent.height,
1556                                ds_iview->vk.base_mip_level),
1557          };
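         /* Worked example with hypothetical numbers: a depth/stencil image
          * with a 1024x1024 physical extent viewed at base_mip_level 2
          * minifies to a 256x256 physical extent here.
          */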
1558          job->ds.layer_size = ds_image->layer_size;
1559 
1560          job->ds_clear_value = default_ds_clear_value;
1561 
1562          if (hw_render->ds_attach_idx < render_pass_info->clear_value_count) {
1563             const VkClearDepthStencilValue *const clear_values =
1564                &render_pass_info->clear_values[hw_render->ds_attach_idx]
1565                    .depthStencil;
1566 
1567             if (job->has_depth_attachment)
1568                job->ds_clear_value.depth = clear_values->depth;
1569 
1570             if (job->has_stencil_attachment)
1571                job->ds_clear_value.stencil = clear_values->stencil;
1572          }
1573 
1574          switch (ds_iview->vk.format) {
1575          case VK_FORMAT_D16_UNORM:
1576             job->ds.zls_format = PVRX(CR_ZLS_FORMAT_TYPE_16BITINT);
1577             break;
1578 
1579          case VK_FORMAT_S8_UINT:
1580          case VK_FORMAT_D32_SFLOAT:
1581             job->ds.zls_format = PVRX(CR_ZLS_FORMAT_TYPE_F32Z);
1582             break;
1583 
1584          case VK_FORMAT_D24_UNORM_S8_UINT:
1585             job->ds.zls_format = PVRX(CR_ZLS_FORMAT_TYPE_24BITINT);
1586             break;
1587 
1588          default:
1589             unreachable("Unsupported depth stencil format");
1590          }
1591 
1592          job->ds.memlayout = ds_image->memlayout;
1593 
1594          if (job->has_depth_attachment) {
1595             if (hw_render->depth_store || sub_cmd->barrier_store) {
1596                const bool depth_init_is_clear = hw_render->depth_init ==
1597                                                 VK_ATTACHMENT_LOAD_OP_CLEAR;
1598 
1599                d_store = true;
1600 
1601                if (hw_render->depth_store && render_area_is_tile_aligned &&
1602                    !(sub_cmd->modifies_depth || depth_init_is_clear)) {
1603                   d_store = false;
1604                   store_was_optimised_out = true;
1605                }
1606             }
1607 
1608             if (d_store && !render_area_is_tile_aligned) {
1609                d_load = true;
1610             } else if (hw_render->depth_init == VK_ATTACHMENT_LOAD_OP_LOAD) {
1611                enum pvr_depth_stencil_usage depth_usage = sub_cmd->depth_usage;
1612 
1613                assert(depth_usage != PVR_DEPTH_STENCIL_USAGE_UNDEFINED);
1614                d_load = (depth_usage != PVR_DEPTH_STENCIL_USAGE_NEVER);
1615             } else {
1616                d_load = sub_cmd->barrier_load;
1617             }
1618          }
1619 
1620          if (job->has_stencil_attachment) {
1621             if (hw_render->stencil_store || sub_cmd->barrier_store) {
1622                const bool stencil_init_is_clear = hw_render->stencil_init ==
1623                                                   VK_ATTACHMENT_LOAD_OP_CLEAR;
1624 
1625                s_store = true;
1626 
1627                if (hw_render->stencil_store && render_area_is_tile_aligned &&
1628                    !(sub_cmd->modifies_stencil || stencil_init_is_clear)) {
1629                   s_store = false;
1630                   store_was_optimised_out = true;
1631                }
1632             }
1633 
1634             if (s_store && !render_area_is_tile_aligned) {
1635                s_load = true;
1636             } else if (hw_render->stencil_init == VK_ATTACHMENT_LOAD_OP_LOAD) {
1637                enum pvr_depth_stencil_usage stencil_usage =
1638                   sub_cmd->stencil_usage;
1639 
1640                assert(stencil_usage != PVR_DEPTH_STENCIL_USAGE_UNDEFINED);
1641                s_load = (stencil_usage != PVR_DEPTH_STENCIL_USAGE_NEVER);
1642             } else {
1643                s_load = sub_cmd->barrier_load;
1644             }
1645          }
1646 
1647          job->ds.load.d = d_load;
1648          job->ds.load.s = s_load;
1649          job->ds.store.d = d_store;
1650          job->ds.store.s = s_store;
1651 
1652          /* ZLS can't do masked writes for packed depth stencil formats so if
1653           * we store anything, we have to store everything.
1654           */
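         /* Concrete illustration (hypothetical case): with a packed
          * D24_UNORM_S8_UINT attachment where only depth was going to be
          * stored, the code below forces the stencil store as well and,
          * unless stencil was cleared this render, also loads stencil so the
          * forced store writes back its original contents.
          */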
1655          if ((job->ds.store.d || job->ds.store.s) &&
1656              pvr_zls_format_type_is_packed(job->ds.zls_format)) {
1657             job->ds.store.d = true;
1658             job->ds.store.s = true;
1659 
1660             /* In case we are only operating on one aspect of the attachment, we
1661              * need to load the unused one in order to preserve its contents, since
1662              * the forced store might otherwise corrupt it.
1663              */
1664             if (hw_render->depth_init != VK_ATTACHMENT_LOAD_OP_CLEAR)
1665                job->ds.load.d = true;
1666 
1667             if (hw_render->stencil_init != VK_ATTACHMENT_LOAD_OP_CLEAR)
1668                job->ds.load.s = true;
1669          }
1670 
1671          if (pvr_ds_attachment_requires_zls(&job->ds) ||
1672              store_was_optimised_out) {
1673             job->process_empty_tiles = true;
1674          }
1675 
1676          if (pvr_sub_cmd_gfx_requires_ds_subtile_alignment(dev_info, job)) {
1677             result = pvr_sub_cmd_gfx_align_ds_subtiles(cmd_buffer, sub_cmd);
1678             if (result != VK_SUCCESS)
1679                return result;
1680          }
1681       }
1682    } else {
1683       job->has_depth_attachment = false;
1684       job->has_stencil_attachment = false;
1685       job->ds_clear_value = default_ds_clear_value;
1686    }
1687 
1688    if (hw_render->ds_attach_idx != VK_ATTACHMENT_UNUSED) {
1689       struct pvr_image_view *iview =
1690          render_pass_info->attachments[hw_render->ds_attach_idx];
1691       const struct pvr_image *image = pvr_image_view_get_image(iview);
1692 
1693       /* If the HW render pass has a valid depth/stencil surface, determine the
1694        * sample count from the attachment's image.
1695        */
1696       job->samples = image->vk.samples;
1697    } else if (hw_render->output_regs_count) {
1698       /* If the HW render pass has output registers, we have color attachments
1699        * to write to, so determine the sample count from the count specified for
1700        * every color attachment in this render.
1701        */
1702       job->samples = hw_render->sample_count;
1703    } else if (cmd_buffer->state.gfx_pipeline) {
1704       /* If the HW render pass has no color or depth/stencil attachments, we
1705        * determine the sample count from the count given during pipeline
1706        * creation.
1707        */
1708       job->samples = dynamic_state->ms.rasterization_samples;
1709    } else if (render_pass_info->pass->attachment_count > 0) {
1710       /* If we get here, we have a render pass with subpasses containing no
1711        * attachments. The next best thing is the largest of the sample counts
1712        * specified by the render pass attachment descriptions.
1713        */
1714       job->samples = render_pass_info->pass->max_sample_count;
1715    } else {
1716       /* No appropriate framebuffer attachment is available. */
1717       mesa_logw("Defaulting render job sample count to 1.");
1718       job->samples = VK_SAMPLE_COUNT_1_BIT;
1719    }
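   /* Summary of the fallback order above: depth/stencil attachment samples,
    * then the render's color sample count, then the bound pipeline's
    * rasterization sample count, then the largest sample count declared by
    * the render pass attachments, and finally a default of a single sample.
    */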
1720 
1721    if (sub_cmd->max_tiles_in_flight ==
1722        PVR_GET_FEATURE_VALUE(dev_info, isp_max_tiles_in_flight, 1U)) {
1723       /* Use the default limit based on the partition store. */
1724       job->max_tiles_in_flight = 0U;
1725    } else {
1726       job->max_tiles_in_flight = sub_cmd->max_tiles_in_flight;
1727    }
1728 
1729    job->frag_uses_atomic_ops = sub_cmd->frag_uses_atomic_ops;
1730    job->disable_compute_overlap = false;
1731    job->max_shared_registers = cmd_buffer->state.max_shared_regs;
1732    job->run_frag = true;
1733    job->geometry_terminate = true;
1734 
1735    return VK_SUCCESS;
1736 }
1737 
1738 static void
1739 pvr_sub_cmd_compute_job_init(const struct pvr_physical_device *pdevice,
1740                              struct pvr_cmd_buffer *cmd_buffer,
1741                              struct pvr_sub_cmd_compute *sub_cmd)
1742 {
1743    sub_cmd->num_shared_regs = MAX2(cmd_buffer->device->idfwdf_state.usc_shareds,
1744                                    cmd_buffer->state.max_shared_regs);
1745 
1746    cmd_buffer->state.max_shared_regs = 0U;
1747 }
1748 
1749 #define PIXEL_ALLOCATION_SIZE_MAX_IN_BLOCKS \
1750    (1024 / PVRX(CDMCTRL_KERNEL0_USC_COMMON_SIZE_UNIT_SIZE))
1751 
1752 static uint32_t
1753 pvr_compute_flat_slot_size(const struct pvr_physical_device *pdevice,
1754                            uint32_t coeff_regs_count,
1755                            bool use_barrier,
1756                            uint32_t total_workitems)
1757 {
1758    const struct pvr_device_runtime_info *dev_runtime_info =
1759       &pdevice->dev_runtime_info;
1760    const struct pvr_device_info *dev_info = &pdevice->dev_info;
1761    uint32_t max_workgroups_per_task = ROGUE_CDM_MAX_PACKED_WORKGROUPS_PER_TASK;
1762    uint32_t max_avail_coeff_regs =
1763       dev_runtime_info->cdm_max_local_mem_size_regs;
1764    uint32_t localstore_chunks_count =
1765       DIV_ROUND_UP(PVR_DW_TO_BYTES(coeff_regs_count),
1766                    PVRX(CDMCTRL_KERNEL0_USC_COMMON_SIZE_UNIT_SIZE));
1767 
1768    /* Ensure that we cannot have more workgroups in a slot than the available
1769     * number of coefficients allows us to have.
1770     */
1771    if (coeff_regs_count > 0U) {
1772       /* If the geometry or fragment jobs can overlap with the compute job, or
1773        * if there is a vertex shader already running then we need to consider
1774        * this in calculating max allowed work-groups.
1775        */
1776       if (PVR_HAS_QUIRK(dev_info, 52354) &&
1777           (PVR_HAS_FEATURE(dev_info, compute_overlap) ||
1778            PVR_HAS_FEATURE(dev_info, gs_rta_support))) {
1779          /* Solve for n (number of work-groups per task). All values are in
1780           * size of common store alloc blocks:
1781           *
1782           * n + (2n + 7) * (local_memory_size_max - 1) =
1783           * 	(coefficient_memory_pool_size) - (7 * pixel_allocation_size_max)
1784           * ==>
1785           * n + 2n * (local_memory_size_max - 1) =
1786           * 	(coefficient_memory_pool_size) - (7 * pixel_allocation_size_max)
1787           * 	- (7 * (local_memory_size_max - 1))
1788           * ==>
1789           * n * (1 + 2 * (local_memory_size_max - 1)) =
1790           * 	(coefficient_memory_pool_size) - (7 * pixel_allocation_size_max)
1791           * 	- (7 * (local_memory_size_max - 1))
1792           * ==>
1793           * n = ((coefficient_memory_pool_size) -
1794           * 	(7 * pixel_allocation_size_max) -
1795           * 	(7 * (local_memory_size_max - 1)) / (1 +
1796           * 2 * (local_memory_size_max - 1)))
1797           */
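         /* Worked example with purely hypothetical values (the real numbers
          * depend on the device, and ROGUE_MAX_OVERLAPPED_PIXEL_TASK_INSTANCES
          * is taken as 7, matching the derivation above): if
          * max_common_store_blocks evaluates to 1024,
          * PIXEL_ALLOCATION_SIZE_MAX_IN_BLOCKS to 16 and
          * localstore_chunks_count to 4, the steps below give
          *    1024 - (7 * 16)         = 912
          *    912 - (7 * (4 - 1))     = 891
          *    891 / (1 + 2 * (4 - 1)) = 127 work-groups per task,
          * which is then clamped to ROGUE_CDM_MAX_PACKED_WORKGROUPS_PER_TASK.
          */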
1798          uint32_t max_common_store_blocks =
1799             DIV_ROUND_UP(max_avail_coeff_regs * 4U,
1800                          PVRX(CDMCTRL_KERNEL0_USC_COMMON_SIZE_UNIT_SIZE));
1801 
1802          /* (coefficient_memory_pool_size) - (7 * pixel_allocation_size_max)
1803           */
1804          max_common_store_blocks -= ROGUE_MAX_OVERLAPPED_PIXEL_TASK_INSTANCES *
1805                                     PIXEL_ALLOCATION_SIZE_MAX_IN_BLOCKS;
1806 
1807          /* - (7 * (local_memory_size_max - 1)) */
1808          max_common_store_blocks -= (ROGUE_MAX_OVERLAPPED_PIXEL_TASK_INSTANCES *
1809                                      (localstore_chunks_count - 1U));
1810 
1811          /* Divide by (1 + 2 * (local_memory_size_max - 1)) */
1812          max_workgroups_per_task = max_common_store_blocks /
1813                                    (1U + 2U * (localstore_chunks_count - 1U));
1814 
1815          max_workgroups_per_task =
1816             MIN2(max_workgroups_per_task,
1817                  ROGUE_CDM_MAX_PACKED_WORKGROUPS_PER_TASK);
1818 
1819       } else {
1820          max_workgroups_per_task =
1821             MIN2((max_avail_coeff_regs / coeff_regs_count),
1822                  max_workgroups_per_task);
1823       }
1824    }
1825 
1826    /* max_workgroups_per_task should at least be one. */
1827    assert(max_workgroups_per_task >= 1U);
1828 
1829    if (total_workitems >= ROGUE_MAX_INSTANCES_PER_TASK) {
1830       /* In this case, the work group size will have been padded up to the
1831        * next ROGUE_MAX_INSTANCES_PER_TASK, so we just set max instances to be
1832        * ROGUE_MAX_INSTANCES_PER_TASK.
1833        */
1834       return ROGUE_MAX_INSTANCES_PER_TASK;
1835    }
1836 
1837    /* In this case, the number of instances in the slot must be clamped to
1838     * accommodate whole work-groups only.
1839     */
1840    if (PVR_HAS_QUIRK(dev_info, 49032) || use_barrier) {
1841       max_workgroups_per_task =
1842          MIN2(max_workgroups_per_task,
1843               ROGUE_MAX_INSTANCES_PER_TASK / total_workitems);
1844       return total_workitems * max_workgroups_per_task;
1845    }
1846 
1847    return MIN2(total_workitems * max_workgroups_per_task,
1848                ROGUE_MAX_INSTANCES_PER_TASK);
1849 }
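
/* A minimal, self-contained sketch (a hypothetical helper, kept out of the
 * build with #if 0) of the clamping performed at the end of
 * pvr_compute_flat_slot_size() above, with the hardware limits passed in as
 * plain parameters so the arithmetic can be checked in isolation.
 */
#if 0
static uint32_t
example_flat_slot_size(uint32_t total_workitems,
                       uint32_t max_workgroups_per_task,
                       uint32_t max_instances_per_task,
                       bool must_fit_whole_workgroups)
{
   /* An oversized work-group has already been padded up, so a slot is simply
    * one full task.
    */
   if (total_workitems >= max_instances_per_task)
      return max_instances_per_task;

   /* Whole work-groups only: fit as many complete work-groups as possible. */
   if (must_fit_whole_workgroups) {
      const uint32_t groups = MIN2(max_workgroups_per_task,
                                   max_instances_per_task / total_workitems);
      return total_workitems * groups;
   }

   /* Otherwise just cap the total instance count. */
   return MIN2(total_workitems * max_workgroups_per_task,
               max_instances_per_task);
}
#endif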
1850 
1851 static void
1852 pvr_compute_generate_control_stream(struct pvr_csb *csb,
1853                                     struct pvr_sub_cmd_compute *sub_cmd,
1854                                     const struct pvr_compute_kernel_info *info)
1855 {
1856    pvr_csb_set_relocation_mark(csb);
1857 
1858    /* Compute kernel 0. */
1859    pvr_csb_emit (csb, CDMCTRL_KERNEL0, kernel0) {
1860       kernel0.indirect_present = !!info->indirect_buffer_addr.addr;
1861       kernel0.global_offsets_present = info->global_offsets_present;
1862       kernel0.usc_common_size = info->usc_common_size;
1863       kernel0.usc_unified_size = info->usc_unified_size;
1864       kernel0.pds_temp_size = info->pds_temp_size;
1865       kernel0.pds_data_size = info->pds_data_size;
1866       kernel0.usc_target = info->usc_target;
1867       kernel0.fence = info->is_fence;
1868    }
1869 
1870    /* Compute kernel 1. */
1871    pvr_csb_emit (csb, CDMCTRL_KERNEL1, kernel1) {
1872       kernel1.data_addr = PVR_DEV_ADDR(info->pds_data_offset);
1873       kernel1.sd_type = info->sd_type;
1874       kernel1.usc_common_shared = info->usc_common_shared;
1875    }
1876 
1877    /* Compute kernel 2. */
1878    pvr_csb_emit (csb, CDMCTRL_KERNEL2, kernel2) {
1879       kernel2.code_addr = PVR_DEV_ADDR(info->pds_code_offset);
1880    }
1881 
1882    if (info->indirect_buffer_addr.addr) {
1883       /* Compute kernel 6. */
1884       pvr_csb_emit (csb, CDMCTRL_KERNEL6, kernel6) {
1885          kernel6.indirect_addrmsb = info->indirect_buffer_addr;
1886       }
1887 
1888       /* Compute kernel 7. */
1889       pvr_csb_emit (csb, CDMCTRL_KERNEL7, kernel7) {
1890          kernel7.indirect_addrlsb = info->indirect_buffer_addr;
1891       }
1892    } else {
1893       /* Compute kernel 3. */
1894       pvr_csb_emit (csb, CDMCTRL_KERNEL3, kernel3) {
1895          assert(info->global_size[0U] > 0U);
1896          kernel3.workgroup_x = info->global_size[0U] - 1U;
1897       }
1898 
1899       /* Compute kernel 4. */
1900       pvr_csb_emit (csb, CDMCTRL_KERNEL4, kernel4) {
1901          assert(info->global_size[1U] > 0U);
1902          kernel4.workgroup_y = info->global_size[1U] - 1U;
1903       }
1904 
1905       /* Compute kernel 5. */
1906       pvr_csb_emit (csb, CDMCTRL_KERNEL5, kernel5) {
1907          assert(info->global_size[2U] > 0U);
1908          kernel5.workgroup_z = info->global_size[2U] - 1U;
1909       }
1910    }
1911 
1912    /* Compute kernel 8. */
1913    pvr_csb_emit (csb, CDMCTRL_KERNEL8, kernel8) {
1914       if (info->max_instances == ROGUE_MAX_INSTANCES_PER_TASK)
1915          kernel8.max_instances = 0U;
1916       else
1917          kernel8.max_instances = info->max_instances;
1918 
1919       assert(info->local_size[0U] > 0U);
1920       kernel8.workgroup_size_x = info->local_size[0U] - 1U;
1921       assert(info->local_size[1U] > 0U);
1922       kernel8.workgroup_size_y = info->local_size[1U] - 1U;
1923       assert(info->local_size[2U] > 0U);
1924       kernel8.workgroup_size_z = info->local_size[2U] - 1U;
1925    }
1926 
1927    pvr_csb_clear_relocation_mark(csb);
1928 
1929    /* Track the highest shared register usage in this dispatch. This is used
1930     * by the FW for context switching, so it must be large enough to cover all
1931     * the shared registers that might be in use for this compute job.
1932     * Coefficients don't need to be included as a context switch will not
1933     * happen within the execution of a single workgroup, so nothing needs to
1934     * be preserved.
1935     */
1936    if (info->usc_common_shared) {
1937       sub_cmd->num_shared_regs =
1938          MAX2(sub_cmd->num_shared_regs, info->usc_common_size);
1939    }
1940 }
1941 
1942 /* TODO: This can be pre-packed and uploaded directly. Would that provide any
1943  * speed up?
1944  */
1945 static void
1946 pvr_compute_generate_idfwdf(struct pvr_cmd_buffer *cmd_buffer,
1947                             struct pvr_sub_cmd_compute *const sub_cmd)
1948 {
1949    struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
1950    bool *const is_sw_barrier_required =
1951       &state->current_sub_cmd->compute.pds_sw_barrier_requires_clearing;
1952    const struct pvr_physical_device *pdevice = cmd_buffer->device->pdevice;
1953    struct pvr_csb *csb = &sub_cmd->control_stream;
1954    const struct pvr_pds_upload *program;
1955 
1956    if (PVR_NEED_SW_COMPUTE_PDS_BARRIER(&pdevice->dev_info) &&
1957        *is_sw_barrier_required) {
1958       *is_sw_barrier_required = false;
1959       program = &cmd_buffer->device->idfwdf_state.sw_compute_barrier_pds;
1960    } else {
1961       program = &cmd_buffer->device->idfwdf_state.pds;
1962    }
1963 
1964    struct pvr_compute_kernel_info info = {
1965       .indirect_buffer_addr = PVR_DEV_ADDR_INVALID,
1966       .global_offsets_present = false,
1967       .usc_common_size = DIV_ROUND_UP(
1968          PVR_DW_TO_BYTES(cmd_buffer->device->idfwdf_state.usc_shareds),
1969          PVRX(CDMCTRL_KERNEL0_USC_COMMON_SIZE_UNIT_SIZE)),
1970       .usc_unified_size = 0U,
1971       .pds_temp_size = 0U,
1972       .pds_data_size =
1973          DIV_ROUND_UP(PVR_DW_TO_BYTES(program->data_size),
1974                       PVRX(CDMCTRL_KERNEL0_PDS_DATA_SIZE_UNIT_SIZE)),
1975       .usc_target = PVRX(CDMCTRL_USC_TARGET_ALL),
1976       .is_fence = false,
1977       .pds_data_offset = program->data_offset,
1978       .sd_type = PVRX(CDMCTRL_SD_TYPE_USC),
1979       .usc_common_shared = true,
1980       .pds_code_offset = program->code_offset,
1981       .global_size = { 1U, 1U, 1U },
1982       .local_size = { 1U, 1U, 1U },
1983    };
1984 
1985    /* We don't need to pad work-group size for this case. */
1986 
1987    info.max_instances =
1988       pvr_compute_flat_slot_size(pdevice,
1989                                  cmd_buffer->device->idfwdf_state.usc_shareds,
1990                                  false,
1991                                  1U);
1992 
1993    pvr_compute_generate_control_stream(csb, sub_cmd, &info);
1994 }
1995 
1996 void pvr_compute_generate_fence(struct pvr_cmd_buffer *cmd_buffer,
1997                                 struct pvr_sub_cmd_compute *const sub_cmd,
1998                                 bool deallocate_shareds)
1999 {
2000    const struct pvr_pds_upload *program =
2001       &cmd_buffer->device->pds_compute_fence_program;
2002    const struct pvr_physical_device *pdevice = cmd_buffer->device->pdevice;
2003    struct pvr_csb *csb = &sub_cmd->control_stream;
2004 
2005    struct pvr_compute_kernel_info info = {
2006       .indirect_buffer_addr = PVR_DEV_ADDR_INVALID,
2007       .global_offsets_present = false,
2008       .usc_common_size = 0U,
2009       .usc_unified_size = 0U,
2010       .pds_temp_size = 0U,
2011       .pds_data_size =
2012          DIV_ROUND_UP(PVR_DW_TO_BYTES(program->data_size),
2013                       PVRX(CDMCTRL_KERNEL0_PDS_DATA_SIZE_UNIT_SIZE)),
2014       .usc_target = PVRX(CDMCTRL_USC_TARGET_ANY),
2015       .is_fence = true,
2016       .pds_data_offset = program->data_offset,
2017       .sd_type = PVRX(CDMCTRL_SD_TYPE_PDS),
2018       .usc_common_shared = deallocate_shareds,
2019       .pds_code_offset = program->code_offset,
2020       .global_size = { 1U, 1U, 1U },
2021       .local_size = { 1U, 1U, 1U },
2022    };
2023 
2024    /* We don't need to pad work-group size for this case. */
2025    /* Here we calculate the slot size. This can depend on the use of barriers,
2026     * local memory, BRNs or other factors.
2027     */
2028    info.max_instances = pvr_compute_flat_slot_size(pdevice, 0U, false, 1U);
2029 
2030    pvr_compute_generate_control_stream(csb, sub_cmd, &info);
2031 }
2032 
2033 static VkResult
2034 pvr_cmd_buffer_process_deferred_clears(struct pvr_cmd_buffer *cmd_buffer)
2035 {
2036    util_dynarray_foreach (&cmd_buffer->deferred_clears,
2037                           struct pvr_transfer_cmd,
2038                           transfer_cmd) {
2039       VkResult result;
2040 
2041       result = pvr_cmd_buffer_add_transfer_cmd(cmd_buffer, transfer_cmd);
2042       if (result != VK_SUCCESS)
2043          return result;
2044 
2045       cmd_buffer->state.current_sub_cmd->transfer.serialize_with_frag = true;
2046    }
2047 
2048    return VK_SUCCESS;
2049 }
2050 
2051 VkResult pvr_cmd_buffer_end_sub_cmd(struct pvr_cmd_buffer *cmd_buffer)
2052 {
2053    struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
2054    struct pvr_sub_cmd *sub_cmd = state->current_sub_cmd;
2055    struct pvr_device *device = cmd_buffer->device;
2056    const struct pvr_query_pool *query_pool = NULL;
2057    struct pvr_suballoc_bo *query_bo = NULL;
2058    size_t query_indices_size = 0;
2059    VkResult result;
2060 
2061    /* FIXME: Is this NULL check required because this function is called from
2062     * pvr_resolve_unemitted_resolve_attachments()? See comment about this
2063     * function being called twice in a row in pvr_CmdEndRenderPass().
2064     */
2065    if (!sub_cmd)
2066       return VK_SUCCESS;
2067 
2068    if (!sub_cmd->owned) {
2069       state->current_sub_cmd = NULL;
2070       return VK_SUCCESS;
2071    }
2072 
2073    switch (sub_cmd->type) {
2074    case PVR_SUB_CMD_TYPE_GRAPHICS: {
2075       struct pvr_sub_cmd_gfx *const gfx_sub_cmd = &sub_cmd->gfx;
2076 
2077       query_indices_size =
2078          util_dynarray_num_elements(&state->query_indices, char);
2079 
2080       if (query_indices_size > 0) {
2081          const bool secondary_cont =
2082             cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY &&
2083             cmd_buffer->usage_flags &
2084                VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT;
2085 
2086          assert(gfx_sub_cmd->query_pool);
2087 
2088          if (secondary_cont) {
2089             util_dynarray_append_dynarray(&state->query_indices,
2090                                           &gfx_sub_cmd->sec_query_indices);
2091          } else {
2092             const void *data = util_dynarray_begin(&state->query_indices);
2093 
2094             result = pvr_cmd_buffer_upload_general(cmd_buffer,
2095                                                    data,
2096                                                    query_indices_size,
2097                                                    &query_bo);
2098             if (result != VK_SUCCESS)
2099                return pvr_cmd_buffer_set_error_unwarned(cmd_buffer, result);
2100 
2101             query_pool = gfx_sub_cmd->query_pool;
2102          }
2103 
2104          gfx_sub_cmd->has_occlusion_query = true;
2105 
2106          util_dynarray_clear(&state->query_indices);
2107       }
2108 
2109       if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY) {
2110          result = pvr_csb_emit_return(&gfx_sub_cmd->control_stream);
2111          if (result != VK_SUCCESS)
2112             return pvr_cmd_buffer_set_error_unwarned(cmd_buffer, result);
2113 
2114          break;
2115       }
2116 
2117       /* TODO: Check if the sub_cmd can be skipped based on
2118        * sub_cmd->gfx.empty_cmd flag.
2119        */
2120 
2121       /* TODO: Set the state in the functions called with the command buffer
2122        * instead of here.
2123        */
2124 
2125       result = pvr_cmd_buffer_upload_tables(device, cmd_buffer, gfx_sub_cmd);
2126       if (result != VK_SUCCESS)
2127          return pvr_cmd_buffer_set_error_unwarned(cmd_buffer, result);
2128 
2129       result = pvr_cmd_buffer_emit_ppp_state(cmd_buffer,
2130                                              &gfx_sub_cmd->control_stream);
2131       if (result != VK_SUCCESS)
2132          return pvr_cmd_buffer_set_error_unwarned(cmd_buffer, result);
2133 
2134       result = pvr_csb_emit_terminate(&gfx_sub_cmd->control_stream);
2135       if (result != VK_SUCCESS)
2136          return pvr_cmd_buffer_set_error_unwarned(cmd_buffer, result);
2137 
2138       result = pvr_sub_cmd_gfx_job_init(&device->pdevice->dev_info,
2139                                         cmd_buffer,
2140                                         gfx_sub_cmd);
2141       if (result != VK_SUCCESS)
2142          return pvr_cmd_buffer_set_error_unwarned(cmd_buffer, result);
2143 
2144       if (pvr_sub_cmd_gfx_requires_split_submit(gfx_sub_cmd)) {
2145          result = pvr_sub_cmd_gfx_build_terminate_ctrl_stream(device,
2146                                                               cmd_buffer,
2147                                                               gfx_sub_cmd);
2148          if (result != VK_SUCCESS)
2149             return pvr_cmd_buffer_set_error_unwarned(cmd_buffer, result);
2150       }
2151 
2152       break;
2153    }
2154 
2155    case PVR_SUB_CMD_TYPE_OCCLUSION_QUERY:
2156    case PVR_SUB_CMD_TYPE_COMPUTE: {
2157       struct pvr_sub_cmd_compute *const compute_sub_cmd = &sub_cmd->compute;
2158 
2159       pvr_compute_generate_fence(cmd_buffer, compute_sub_cmd, true);
2160 
2161       result = pvr_csb_emit_terminate(&compute_sub_cmd->control_stream);
2162       if (result != VK_SUCCESS)
2163          return pvr_cmd_buffer_set_error_unwarned(cmd_buffer, result);
2164 
2165       pvr_sub_cmd_compute_job_init(device->pdevice,
2166                                    cmd_buffer,
2167                                    compute_sub_cmd);
2168       break;
2169    }
2170 
2171    case PVR_SUB_CMD_TYPE_TRANSFER:
2172       break;
2173 
2174    case PVR_SUB_CMD_TYPE_EVENT:
2175       break;
2176 
2177    default:
2178       unreachable("Unsupported sub-command type");
2179    }
2180 
2181    state->current_sub_cmd = NULL;
2182 
2183    /* pvr_cmd_buffer_process_deferred_clears() must be called with a NULL
2184     * current_sub_cmd.
2185     *
2186     * We can start a sub_cmd of a different type from the current sub_cmd only
2187     * after having ended the current sub_cmd. However, we can't end the current
2188     * sub_cmd if this depends on starting sub_cmd(s) of a different type. Hence,
2189     * don't try to start transfer sub_cmd(s) with
2190     * pvr_cmd_buffer_process_deferred_clears() until the current one has ended.
2191     * Failing to do so would cause a circular dependency between
2192     * pvr_cmd_buffer_{end,start}_sub_cmd and blow the stack.
2193     */
2194    if (sub_cmd->type == PVR_SUB_CMD_TYPE_GRAPHICS) {
2195       result = pvr_cmd_buffer_process_deferred_clears(cmd_buffer);
2196       if (result != VK_SUCCESS)
2197          return pvr_cmd_buffer_set_error_unwarned(cmd_buffer, result);
2198    }
2199 
2200    if (query_pool) {
2201       struct pvr_query_info query_info;
2202 
2203       assert(query_bo);
2204       assert(query_indices_size);
2205 
2206       query_info.type = PVR_QUERY_TYPE_AVAILABILITY_WRITE;
2207 
2208       /* sizeof(uint32_t) is the size of a single query index. */
2209       query_info.availability_write.num_query_indices =
2210          query_indices_size / sizeof(uint32_t);
2211       query_info.availability_write.index_bo = query_bo;
2212 
2213       query_info.availability_write.num_queries = query_pool->query_count;
2214       query_info.availability_write.availability_bo =
2215          query_pool->availability_buffer;
2216 
2217       /* Insert a barrier after the graphics sub command and before the
2218        * query sub command so that the availability write program waits for the
2219        * fragment shader to complete.
2220        */
2221 
2222       result = pvr_cmd_buffer_start_sub_cmd(cmd_buffer, PVR_SUB_CMD_TYPE_EVENT);
2223       if (result != VK_SUCCESS)
2224          return result;
2225 
2226       cmd_buffer->state.current_sub_cmd->event = (struct pvr_sub_cmd_event){
2227          .type = PVR_EVENT_TYPE_BARRIER,
2228          .barrier = {
2229             .wait_for_stage_mask = PVR_PIPELINE_STAGE_FRAG_BIT,
2230             .wait_at_stage_mask = PVR_PIPELINE_STAGE_OCCLUSION_QUERY_BIT,
2231          },
2232       };
2233 
2234       return pvr_add_query_program(cmd_buffer, &query_info);
2235    }
2236 
2237    return VK_SUCCESS;
2238 }
2239 
2240 void pvr_reset_graphics_dirty_state(struct pvr_cmd_buffer *const cmd_buffer,
2241                                     bool start_geom)
2242 {
2243    struct vk_dynamic_graphics_state *const dynamic_state =
2244       &cmd_buffer->vk.dynamic_graphics_state;
2245 
2246    if (start_geom) {
2247       /*
2248        * Initial geometry phase state.
2249        * It's the driver's responsibility to ensure that the state of the
2250        * hardware is correctly initialized at the start of every geometry
2251        * phase. This is required to prevent stale state from a previous
2252        * geometry phase erroneously affecting the next geometry phase.
2253        *
2254        * If a geometry phase does not contain any geometry, this restriction
2255        * can be ignored. If the first draw call in a geometry phase will only
2256        * update the depth or stencil buffers i.e. ISP_TAGWRITEDISABLE is set
2257        * in the ISP State Control Word, the PDS State Pointers
2258        * (TA_PRES_PDSSTATEPTR*) in the first PPP State Update do not need to
2259        * be supplied, since they will never reach the PDS in the fragment
2260        * phase.
2261        */
2262 
2263       cmd_buffer->state.emit_header = (struct PVRX(TA_STATE_HEADER)){
2264          .pres_stream_out_size = true,
2265          .pres_ppp_ctrl = true,
2266          .pres_varying_word2 = true,
2267          .pres_varying_word1 = true,
2268          .pres_varying_word0 = true,
2269          .pres_outselects = true,
2270          .pres_wclamp = true,
2271          .pres_viewport = true,
2272          .pres_region_clip = true,
2273          .pres_pds_state_ptr0 = true,
2274          .pres_ispctl_fb = true,
2275          .pres_ispctl = true,
2276       };
2277    } else {
2278       struct PVRX(TA_STATE_HEADER) *const emit_header =
2279          &cmd_buffer->state.emit_header;
2280 
2281       emit_header->pres_ppp_ctrl = true;
2282       emit_header->pres_varying_word1 = true;
2283       emit_header->pres_varying_word0 = true;
2284       emit_header->pres_outselects = true;
2285       emit_header->pres_viewport = true;
2286       emit_header->pres_region_clip = true;
2287       emit_header->pres_pds_state_ptr0 = true;
2288       emit_header->pres_ispctl_fb = true;
2289       emit_header->pres_ispctl = true;
2290    }
2291 
2292    memset(&cmd_buffer->state.ppp_state,
2293           0U,
2294           sizeof(cmd_buffer->state.ppp_state));
2295 
2296    cmd_buffer->state.dirty.vertex_bindings = true;
2297    cmd_buffer->state.dirty.gfx_pipeline_binding = true;
2298 
2299    BITSET_SET(dynamic_state->dirty, MESA_VK_DYNAMIC_VP_VIEWPORTS);
2300    BITSET_SET(dynamic_state->dirty, MESA_VK_DYNAMIC_VP_VIEWPORT_COUNT);
2301 }
2302 
2303 static inline bool
2304 pvr_cmd_uses_deferred_cs_cmds(const struct pvr_cmd_buffer *const cmd_buffer)
2305 {
2306    const VkCommandBufferUsageFlags deferred_control_stream_flags =
2307       VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT |
2308       VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT;
2309 
2310    return cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY &&
2311           (cmd_buffer->usage_flags & deferred_control_stream_flags) ==
2312              deferred_control_stream_flags;
2313 }
2314 
2315 VkResult pvr_cmd_buffer_start_sub_cmd(struct pvr_cmd_buffer *cmd_buffer,
2316                                       enum pvr_sub_cmd_type type)
2317 {
2318    struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
2319    struct pvr_device *device = cmd_buffer->device;
2320    struct pvr_sub_cmd *sub_cmd;
2321    VkResult result;
2322 
2323    /* Check the current status of the buffer. */
2324    if (vk_command_buffer_has_error(&cmd_buffer->vk))
2325       return vk_command_buffer_get_record_result(&cmd_buffer->vk);
2326 
2327    pvr_cmd_buffer_update_barriers(cmd_buffer, type);
2328 
2329    /* TODO: Add proper support for joining consecutive event sub_cmd? */
2330    if (state->current_sub_cmd) {
2331       if (state->current_sub_cmd->type == type) {
2332          /* Continue adding to the current sub command. */
2333          return VK_SUCCESS;
2334       }
2335 
2336       /* End the current sub command. */
2337       result = pvr_cmd_buffer_end_sub_cmd(cmd_buffer);
2338       if (result != VK_SUCCESS)
2339          return result;
2340    }
2341 
2342    sub_cmd = vk_zalloc(&cmd_buffer->vk.pool->alloc,
2343                        sizeof(*sub_cmd),
2344                        8,
2345                        VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
2346    if (!sub_cmd) {
2347       return vk_command_buffer_set_error(&cmd_buffer->vk,
2348                                          VK_ERROR_OUT_OF_HOST_MEMORY);
2349    }
2350 
2351    sub_cmd->type = type;
2352    sub_cmd->owned = true;
2353 
2354    switch (type) {
2355    case PVR_SUB_CMD_TYPE_GRAPHICS:
2356       sub_cmd->gfx.depth_usage = PVR_DEPTH_STENCIL_USAGE_UNDEFINED;
2357       sub_cmd->gfx.stencil_usage = PVR_DEPTH_STENCIL_USAGE_UNDEFINED;
2358       sub_cmd->gfx.modifies_depth = false;
2359       sub_cmd->gfx.modifies_stencil = false;
2360       sub_cmd->gfx.max_tiles_in_flight =
2361          PVR_GET_FEATURE_VALUE(&device->pdevice->dev_info,
2362                                isp_max_tiles_in_flight,
2363                                1);
2364       sub_cmd->gfx.hw_render_idx = state->render_pass_info.current_hw_subpass;
2365       sub_cmd->gfx.framebuffer = state->render_pass_info.framebuffer;
2366       sub_cmd->gfx.empty_cmd = true;
2367 
2368       if (state->vis_test_enabled)
2369          sub_cmd->gfx.query_pool = state->query_pool;
2370 
2371       pvr_reset_graphics_dirty_state(cmd_buffer, true);
2372 
2373       if (pvr_cmd_uses_deferred_cs_cmds(cmd_buffer)) {
2374          pvr_csb_init(device,
2375                       PVR_CMD_STREAM_TYPE_GRAPHICS_DEFERRED,
2376                       &sub_cmd->gfx.control_stream);
2377       } else {
2378          pvr_csb_init(device,
2379                       PVR_CMD_STREAM_TYPE_GRAPHICS,
2380                       &sub_cmd->gfx.control_stream);
2381       }
2382 
2383       util_dynarray_init(&sub_cmd->gfx.sec_query_indices, NULL);
2384       break;
2385 
2386    case PVR_SUB_CMD_TYPE_OCCLUSION_QUERY:
2387    case PVR_SUB_CMD_TYPE_COMPUTE:
2388       pvr_csb_init(device,
2389                    PVR_CMD_STREAM_TYPE_COMPUTE,
2390                    &sub_cmd->compute.control_stream);
2391       break;
2392 
2393    case PVR_SUB_CMD_TYPE_TRANSFER:
2394       sub_cmd->transfer.transfer_cmds = &sub_cmd->transfer.transfer_cmds_priv;
2395       list_inithead(sub_cmd->transfer.transfer_cmds);
2396       break;
2397 
2398    case PVR_SUB_CMD_TYPE_EVENT:
2399       break;
2400 
2401    default:
2402       unreachable("Unsupported sub-command type");
2403    }
2404 
2405    list_addtail(&sub_cmd->link, &cmd_buffer->sub_cmds);
2406    state->current_sub_cmd = sub_cmd;
2407 
2408    return VK_SUCCESS;
2409 }
2410 
2411 VkResult pvr_cmd_buffer_alloc_mem(struct pvr_cmd_buffer *cmd_buffer,
2412                                   struct pvr_winsys_heap *heap,
2413                                   uint64_t size,
2414                                   struct pvr_suballoc_bo **const pvr_bo_out)
2415 {
2416    const uint32_t cache_line_size =
2417       rogue_get_slc_cache_line_size(&cmd_buffer->device->pdevice->dev_info);
2418    struct pvr_suballoc_bo *suballoc_bo;
2419    struct pvr_suballocator *allocator;
2420    VkResult result;
2421 
2422    if (heap == cmd_buffer->device->heaps.general_heap)
2423       allocator = &cmd_buffer->device->suballoc_general;
2424    else if (heap == cmd_buffer->device->heaps.pds_heap)
2425       allocator = &cmd_buffer->device->suballoc_pds;
2426    else if (heap == cmd_buffer->device->heaps.transfer_frag_heap)
2427       allocator = &cmd_buffer->device->suballoc_transfer;
2428    else if (heap == cmd_buffer->device->heaps.usc_heap)
2429       allocator = &cmd_buffer->device->suballoc_usc;
2430    else
2431       unreachable("Unknown heap type");
2432 
2433    result =
2434       pvr_bo_suballoc(allocator, size, cache_line_size, false, &suballoc_bo);
2435    if (result != VK_SUCCESS)
2436       return pvr_cmd_buffer_set_error_unwarned(cmd_buffer, result);
2437 
2438    list_add(&suballoc_bo->link, &cmd_buffer->bo_list);
2439 
2440    *pvr_bo_out = suballoc_bo;
2441 
2442    return VK_SUCCESS;
2443 }
2444 
2445 static void pvr_cmd_bind_compute_pipeline(
2446    const struct pvr_compute_pipeline *const compute_pipeline,
2447    struct pvr_cmd_buffer *const cmd_buffer)
2448 {
2449    cmd_buffer->state.compute_pipeline = compute_pipeline;
2450    cmd_buffer->state.dirty.compute_pipeline_binding = true;
2451 }
2452 
2453 static void pvr_cmd_bind_graphics_pipeline(
2454    const struct pvr_graphics_pipeline *const gfx_pipeline,
2455    struct pvr_cmd_buffer *const cmd_buffer)
2456 {
2457    cmd_buffer->state.gfx_pipeline = gfx_pipeline;
2458    cmd_buffer->state.dirty.gfx_pipeline_binding = true;
2459 
2460    vk_cmd_set_dynamic_graphics_state(&cmd_buffer->vk,
2461                                      &gfx_pipeline->dynamic_state);
2462 }
2463 
2464 void pvr_CmdBindPipeline(VkCommandBuffer commandBuffer,
2465                          VkPipelineBindPoint pipelineBindPoint,
2466                          VkPipeline _pipeline)
2467 {
2468    PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
2469    PVR_FROM_HANDLE(pvr_pipeline, pipeline, _pipeline);
2470 
2471    switch (pipelineBindPoint) {
2472    case VK_PIPELINE_BIND_POINT_COMPUTE:
2473       pvr_cmd_bind_compute_pipeline(to_pvr_compute_pipeline(pipeline),
2474                                     cmd_buffer);
2475       break;
2476 
2477    case VK_PIPELINE_BIND_POINT_GRAPHICS:
2478       pvr_cmd_bind_graphics_pipeline(to_pvr_graphics_pipeline(pipeline),
2479                                      cmd_buffer);
2480       break;
2481 
2482    default:
2483       unreachable("Invalid bind point.");
2484       break;
2485    }
2486 }
2487 
2488 #if defined(DEBUG)
2489 static void check_viewport_quirk_70165(const struct pvr_device *device,
2490                                        const VkViewport *pViewport)
2491 {
2492    const struct pvr_device_info *dev_info = &device->pdevice->dev_info;
2493    float min_vertex_x, max_vertex_x, min_vertex_y, max_vertex_y;
2494    float min_screen_space_value, max_screen_space_value;
2495    float sign_to_unsigned_offset, fixed_point_max;
2496    float guardband_width, guardband_height;
2497 
2498    if (PVR_HAS_FEATURE(dev_info, simple_internal_parameter_format)) {
2499       /* Max representable value in 13.4 fixed point format.
2500        * Rounded down to avoid precision issues.
2501        * Calculated as (2 ** 13) - 2*(2 ** -4)
2502        */
2503       fixed_point_max = 8192.0f - 2.0f / 16.0f;
2504 
2505       if (PVR_HAS_FEATURE(dev_info, screen_size8K)) {
2506          if (pViewport->width <= 4096 && pViewport->height <= 4096) {
2507             guardband_width = pViewport->width / 4.0f;
2508             guardband_height = pViewport->height / 4.0f;
2509 
2510             /* 2k of the range is negative */
2511             sign_to_unsigned_offset = 2048.0f;
2512          } else {
2513             guardband_width = 0.0f;
2514             guardband_height = 0.0f;
2515 
2516             /* For > 4k renders, the entire range is positive */
2517             sign_to_unsigned_offset = 0.0f;
2518          }
2519       } else {
2520          guardband_width = pViewport->width / 4.0f;
2521          guardband_height = pViewport->height / 4.0f;
2522 
2523          /* 2k of the range is negative */
2524          sign_to_unsigned_offset = 2048.0f;
2525       }
2526    } else {
2527       /* Max representable value in 16.8 fixed point format
2528        * Calculated as (2 ** 16) - (2 ** -8)
2529        */
2530       fixed_point_max = 65535.99609375f;
2531       guardband_width = pViewport->width / 4.0f;
2532       guardband_height = pViewport->height / 4.0f;
2533 
2534       /* 4k/20k of the range is negative */
2535       sign_to_unsigned_offset = (float)PVR_MAX_NEG_OFFSCREEN_OFFSET;
2536    }
2537 
2538    min_screen_space_value = -sign_to_unsigned_offset;
2539    max_screen_space_value = fixed_point_max - sign_to_unsigned_offset;
2540 
2541    min_vertex_x = pViewport->x - guardband_width;
2542    max_vertex_x = pViewport->x + pViewport->width + guardband_width;
2543    min_vertex_y = pViewport->y - guardband_height;
2544    max_vertex_y = pViewport->y + pViewport->height + guardband_height;
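   /* Hypothetical example on a core with simple_internal_parameter_format but
    * without screen_size8K: a viewport at x = -2000.0f with width = 4096.0f
    * gets a 1024.0f guardband, so min_vertex_x = -3024.0f, which is below the
    * -2048.0f minimum screen-space value and would trigger the warning below.
    */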
2545    if (min_vertex_x < min_screen_space_value ||
2546        max_vertex_x > max_screen_space_value ||
2547        min_vertex_y < min_screen_space_value ||
2548        max_vertex_y > max_screen_space_value) {
2549       mesa_logw("Viewport is affected by BRN70165, geometry outside "
2550                 "the viewport could be corrupted");
2551    }
2552 }
2553 #endif
2554 
2555 void pvr_CmdSetViewport(VkCommandBuffer commandBuffer,
2556                         uint32_t firstViewport,
2557                         uint32_t viewportCount,
2558                         const VkViewport *pViewports)
2559 {
2560    PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
2561    const uint32_t total_count = firstViewport + viewportCount;
2562 
2563    assert(firstViewport < PVR_MAX_VIEWPORTS && viewportCount > 0);
2564    assert(total_count >= 1 && total_count <= PVR_MAX_VIEWPORTS);
2565 
2566    PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer);
2567 
2568 #if defined(DEBUG)
2569    if (PVR_HAS_QUIRK(&cmd_buffer->device->pdevice->dev_info, 70165)) {
2570       for (uint32_t viewport = 0; viewport < viewportCount; viewport++) {
2571          check_viewport_quirk_70165(cmd_buffer->device, &pViewports[viewport]);
2572       }
2573    }
2574 #endif
2575 
2576    vk_common_CmdSetViewport(commandBuffer,
2577                             firstViewport,
2578                             viewportCount,
2579                             pViewports);
2580 }
2581 
2582 void pvr_CmdSetDepthBounds(VkCommandBuffer commandBuffer,
2583                            float minDepthBounds,
2584                            float maxDepthBounds)
2585 {
2586    mesa_logd("No support for depth bounds testing.");
2587 }
2588 
2589 void pvr_CmdBindDescriptorSets(VkCommandBuffer commandBuffer,
2590                                VkPipelineBindPoint pipelineBindPoint,
2591                                VkPipelineLayout _layout,
2592                                uint32_t firstSet,
2593                                uint32_t descriptorSetCount,
2594                                const VkDescriptorSet *pDescriptorSets,
2595                                uint32_t dynamicOffsetCount,
2596                                const uint32_t *pDynamicOffsets)
2597 {
2598    PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
2599    struct pvr_descriptor_state *descriptor_state;
2600 
2601    assert(firstSet + descriptorSetCount <= PVR_MAX_DESCRIPTOR_SETS);
2602 
2603    PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer);
2604 
2605    switch (pipelineBindPoint) {
2606    case VK_PIPELINE_BIND_POINT_GRAPHICS:
2607    case VK_PIPELINE_BIND_POINT_COMPUTE:
2608       break;
2609 
2610    default:
2611       unreachable("Unsupported bind point.");
2612       break;
2613    }
2614 
2615    if (pipelineBindPoint == VK_PIPELINE_BIND_POINT_GRAPHICS) {
2616       descriptor_state = &cmd_buffer->state.gfx_desc_state;
2617       cmd_buffer->state.dirty.gfx_desc_dirty = true;
2618    } else {
2619       descriptor_state = &cmd_buffer->state.compute_desc_state;
2620       cmd_buffer->state.dirty.compute_desc_dirty = true;
2621    }
2622 
2623    for (uint32_t i = 0; i < descriptorSetCount; i++) {
2624       PVR_FROM_HANDLE(pvr_descriptor_set, set, pDescriptorSets[i]);
2625       uint32_t index = firstSet + i;
2626 
2627       if (descriptor_state->descriptor_sets[index] != set) {
2628          descriptor_state->descriptor_sets[index] = set;
2629          descriptor_state->valid_mask |= (1u << index);
2630       }
2631    }
2632 
2633    if (dynamicOffsetCount > 0) {
2634       PVR_FROM_HANDLE(pvr_pipeline_layout, pipeline_layout, _layout);
2635       uint32_t set_offset = 0;
2636 
2637       for (uint32_t set = 0; set < firstSet; set++)
2638          set_offset += pipeline_layout->set_layout[set]->dynamic_buffer_count;
2639 
2640       assert(set_offset + dynamicOffsetCount <=
2641              ARRAY_SIZE(descriptor_state->dynamic_offsets));
2642 
2643       /* From the Vulkan 1.3.238 spec. :
2644        *
2645        *    "If any of the sets being bound include dynamic uniform or storage
2646        *    buffers, then pDynamicOffsets includes one element for each array
2647        *    element in each dynamic descriptor type binding in each set."
2648        *
2649        */
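      /* For example (hypothetical counts): if set 0 declares two dynamic
       * buffers and sets are bound starting at firstSet == 1, set_offset
       * ends up as 2, so pDynamicOffsets[0] patches the first dynamic
       * buffer of set 1.
       */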
2650       for (uint32_t i = 0; i < dynamicOffsetCount; i++)
2651          descriptor_state->dynamic_offsets[set_offset + i] = pDynamicOffsets[i];
2652    }
2653 }
2654 
2655 void pvr_CmdBindVertexBuffers(VkCommandBuffer commandBuffer,
2656                               uint32_t firstBinding,
2657                               uint32_t bindingCount,
2658                               const VkBuffer *pBuffers,
2659                               const VkDeviceSize *pOffsets)
2660 {
2661    PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
2662    struct pvr_vertex_binding *const vb = cmd_buffer->state.vertex_bindings;
2663 
2664    /* We have to defer setting up the vertex buffers since we need the
2665     * buffer stride from the pipeline.
2666     */
2667 
2668    assert(firstBinding < PVR_MAX_VERTEX_INPUT_BINDINGS &&
2669           bindingCount <= PVR_MAX_VERTEX_INPUT_BINDINGS);
2670 
2671    PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer);
2672 
2673    for (uint32_t i = 0; i < bindingCount; i++) {
2674       vb[firstBinding + i].buffer = pvr_buffer_from_handle(pBuffers[i]);
2675       vb[firstBinding + i].offset = pOffsets[i];
2676    }
2677 
2678    cmd_buffer->state.dirty.vertex_bindings = true;
2679 }
2680 
2681 void pvr_CmdBindIndexBuffer(VkCommandBuffer commandBuffer,
2682                             VkBuffer buffer,
2683                             VkDeviceSize offset,
2684                             VkIndexType indexType)
2685 {
2686    PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
2687    PVR_FROM_HANDLE(pvr_buffer, index_buffer, buffer);
2688    struct pvr_cmd_buffer_state *const state = &cmd_buffer->state;
2689 
2690    assert(offset < index_buffer->vk.size);
2691    assert(indexType == VK_INDEX_TYPE_UINT32 ||
2692           indexType == VK_INDEX_TYPE_UINT16);
2693 
2694    PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer);
2695 
2696    state->index_buffer_binding.buffer = index_buffer;
2697    state->index_buffer_binding.offset = offset;
2698    state->index_buffer_binding.type = indexType;
2699    state->dirty.index_buffer_binding = true;
2700 }
2701 
2702 void pvr_CmdPushConstants(VkCommandBuffer commandBuffer,
2703                           VkPipelineLayout layout,
2704                           VkShaderStageFlags stageFlags,
2705                           uint32_t offset,
2706                           uint32_t size,
2707                           const void *pValues)
2708 {
2709 #if defined(DEBUG)
2710    const uint64_t ending = (uint64_t)offset + (uint64_t)size;
2711 #endif
2712 
2713    PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
2714    struct pvr_cmd_buffer_state *const state = &cmd_buffer->state;
2715 
2716    PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer);
2717 
2718    pvr_assert(ending <= PVR_MAX_PUSH_CONSTANTS_SIZE);
2719 
2720    memcpy(&state->push_constants.data[offset], pValues, size);
2721 
2722    state->push_constants.dirty_stages |= stageFlags;
2723    state->push_constants.uploaded = false;
2724 }
2725 
2726 static VkResult
2727 pvr_cmd_buffer_setup_attachments(struct pvr_cmd_buffer *cmd_buffer,
2728                                  const struct pvr_render_pass *pass,
2729                                  const struct pvr_framebuffer *framebuffer)
2730 {
2731    struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
2732    struct pvr_render_pass_info *info = &state->render_pass_info;
2733 
2734    assert(pass->attachment_count == framebuffer->attachment_count);
2735 
2736    /* Free any previously allocated attachments. */
2737    vk_free(&cmd_buffer->vk.pool->alloc, state->render_pass_info.attachments);
2738 
2739    if (pass->attachment_count == 0) {
2740       info->attachments = NULL;
2741       return VK_SUCCESS;
2742    }
2743 
2744    info->attachments =
2745       vk_zalloc(&cmd_buffer->vk.pool->alloc,
2746                 pass->attachment_count * sizeof(*info->attachments),
2747                 8,
2748                 VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
2749    if (!info->attachments) {
2750       return vk_command_buffer_set_error(&cmd_buffer->vk,
2751                                          VK_ERROR_OUT_OF_HOST_MEMORY);
2752    }
2753 
2754    for (uint32_t i = 0; i < pass->attachment_count; i++)
2755       info->attachments[i] = framebuffer->attachments[i];
2756 
2757    return VK_SUCCESS;
2758 }
2759 
2760 static VkResult pvr_init_render_targets(struct pvr_device *device,
2761                                         struct pvr_render_pass *pass,
2762                                         struct pvr_framebuffer *framebuffer)
2763 {
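   /* Lazily create the render target dataset for each HW render the first
    * time this framebuffer is used; the render target's mutex guards
    * against concurrent initialisation.
    */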
2764    for (uint32_t i = 0; i < pass->hw_setup->render_count; i++) {
2765       struct pvr_render_target *render_target =
2766          pvr_get_render_target(pass, framebuffer, i);
2767 
2768       pthread_mutex_lock(&render_target->mutex);
2769 
2770       if (!render_target->valid) {
2771          const struct pvr_renderpass_hwsetup_render *hw_render =
2772             &pass->hw_setup->renders[i];
2773          VkResult result;
2774 
2775          result = pvr_render_target_dataset_create(device,
2776                                                    framebuffer->width,
2777                                                    framebuffer->height,
2778                                                    hw_render->sample_count,
2779                                                    framebuffer->layers,
2780                                                    &render_target->rt_dataset);
2781          if (result != VK_SUCCESS) {
2782             pthread_mutex_unlock(&render_target->mutex);
2783             return result;
2784          }
2785 
2786          render_target->valid = true;
2787       }
2788 
2789       pthread_mutex_unlock(&render_target->mutex);
2790    }
2791 
2792    return VK_SUCCESS;
2793 }
2794 
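/* Map an API-level subpass index onto its hardware subpass description via
 * the hw_setup subpass map (hardware render index plus the subpass index
 * within that render).
 */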
2795 const struct pvr_renderpass_hwsetup_subpass *
2796 pvr_get_hw_subpass(const struct pvr_render_pass *pass, const uint32_t subpass)
2797 {
2798    const struct pvr_renderpass_hw_map *map =
2799       &pass->hw_setup->subpass_map[subpass];
2800 
2801    return &pass->hw_setup->renders[map->render].subpasses[map->subpass];
2802 }
2803 
2804 static void pvr_perform_start_of_render_attachment_clear(
2805    struct pvr_cmd_buffer *cmd_buffer,
2806    const struct pvr_framebuffer *framebuffer,
2807    uint32_t index,
2808    bool is_depth_stencil,
2809    uint32_t *index_list_clear_mask)
2810 {
2811    ASSERTED static const VkImageAspectFlags dsc_aspect_flags =
2812       VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT |
2813       VK_IMAGE_ASPECT_COLOR_BIT;
2814    struct pvr_render_pass_info *info = &cmd_buffer->state.render_pass_info;
2815    const struct pvr_render_pass *pass = info->pass;
2816    const struct pvr_renderpass_hwsetup *hw_setup = pass->hw_setup;
2817    const struct pvr_renderpass_hwsetup_render *hw_render =
2818       &hw_setup->renders[hw_setup->subpass_map[info->subpass_idx].render];
2819    VkImageAspectFlags image_aspect;
2820    struct pvr_image_view *iview;
2821    uint32_t view_idx;
2822 
2823    if (is_depth_stencil) {
2824       bool stencil_clear;
2825       bool depth_clear;
2826       bool is_stencil;
2827       bool is_depth;
2828 
2829       assert(hw_render->ds_attach_idx != VK_ATTACHMENT_UNUSED);
2830       assert(index == 0);
2831 
2832       view_idx = hw_render->ds_attach_idx;
2833 
2834       is_depth = vk_format_has_depth(pass->attachments[view_idx].vk_format);
2835       is_stencil = vk_format_has_stencil(pass->attachments[view_idx].vk_format);
2836       depth_clear = hw_render->depth_init == VK_ATTACHMENT_LOAD_OP_CLEAR;
2837       stencil_clear = hw_render->stencil_init == VK_ATTACHMENT_LOAD_OP_CLEAR;
2838 
2839       /* Attempt to clear the ds attachment. Do not erroneously skip an
2840        * attachment that has no depth clear but does have a stencil clear.
2841        */
2842       /* i.e. return unless (is_depth ∧ depth_clear) ∨ (is_stencil ∧ stencil_clear) */
2843       if (!((is_depth && depth_clear) || (is_stencil && stencil_clear)))
2844          return;
2845    } else if (hw_render->color_init[index].op != VK_ATTACHMENT_LOAD_OP_CLEAR) {
2846       return;
2847    } else {
2848       view_idx = hw_render->color_init[index].index;
2849    }
2850 
2851    iview = info->attachments[view_idx];
2852 
2853    /* FIXME: It would be nice if this function and pvr_sub_cmd_gfx_job_init()
2854     * were doing the same check (even if it's just an assert) to determine if a
2855     * clear is needed.
2856     */
2857    /* If this is single-layer fullscreen, we already do the clears in
2858     * pvr_sub_cmd_gfx_job_init().
2859     */
2860    if (pvr_is_render_area_tile_aligned(cmd_buffer, iview) &&
2861        framebuffer->layers == 1) {
2862       return;
2863    }
2864 
2865    image_aspect = vk_format_aspects(pass->attachments[view_idx].vk_format);
2866    assert((image_aspect & ~dsc_aspect_flags) == 0);
2867 
2868    if (image_aspect & VK_IMAGE_ASPECT_DEPTH_BIT &&
2869        hw_render->depth_init != VK_ATTACHMENT_LOAD_OP_CLEAR) {
2870       image_aspect &= ~VK_IMAGE_ASPECT_DEPTH_BIT;
2871    }
2872 
2873    if (image_aspect & VK_IMAGE_ASPECT_STENCIL_BIT &&
2874        hw_render->stencil_init != VK_ATTACHMENT_LOAD_OP_CLEAR) {
2875       image_aspect &= ~VK_IMAGE_ASPECT_STENCIL_BIT;
2876    }
2877 
2878    if (image_aspect != VK_IMAGE_ASPECT_NONE) {
2879       VkClearAttachment clear_attachment = {
2880          .aspectMask = image_aspect,
2881          .colorAttachment = index,
2882          .clearValue = info->clear_values[view_idx],
2883       };
2884       VkClearRect rect = {
2885          .rect = info->render_area,
2886          .baseArrayLayer = 0,
2887          .layerCount = info->framebuffer->layers,
2888       };
2889 
2890       assert(view_idx < info->clear_value_count);
2891 
2892       pvr_clear_attachments_render_init(cmd_buffer, &clear_attachment, &rect);
2893 
2894       *index_list_clear_mask |= (1 << index);
2895    }
2896 }
2897 
2898 static void
2899 pvr_perform_start_of_render_clears(struct pvr_cmd_buffer *cmd_buffer)
2900 {
2901    struct pvr_render_pass_info *info = &cmd_buffer->state.render_pass_info;
2902    const struct pvr_framebuffer *framebuffer = info->framebuffer;
2903    const struct pvr_render_pass *pass = info->pass;
2904    const struct pvr_renderpass_hwsetup *hw_setup = pass->hw_setup;
2905    const struct pvr_renderpass_hwsetup_render *hw_render =
2906       &hw_setup->renders[hw_setup->subpass_map[info->subpass_idx].render];
2907 
2908    /* Mask of attachments whose clears use index lists instead of the
2909     * background object.
2910     */
2911    uint32_t index_list_clear_mask = 0;
2912 
2913    for (uint32_t i = 0; i < hw_render->color_init_count; i++) {
2914       pvr_perform_start_of_render_attachment_clear(cmd_buffer,
2915                                                    framebuffer,
2916                                                    i,
2917                                                    false,
2918                                                    &index_list_clear_mask);
2919    }
2920 
2921    info->enable_bg_tag = !!hw_render->color_init_count;
2922 
2923    /* If we're not using index lists for all clears/loads then we need to
2924     * run the background object on empty tiles.
2925     */
2926    if (hw_render->color_init_count &&
2927        index_list_clear_mask != ((1u << hw_render->color_init_count) - 1u)) {
2928       info->process_empty_tiles = true;
2929    } else {
2930       info->process_empty_tiles = false;
2931    }
2932 
2933    if (hw_render->ds_attach_idx != VK_ATTACHMENT_UNUSED) {
2934       uint32_t ds_index_list = 0;
2935 
2936       pvr_perform_start_of_render_attachment_clear(cmd_buffer,
2937                                                    framebuffer,
2938                                                    0,
2939                                                    true,
2940                                                    &ds_index_list);
2941    }
2942 
2943    if (index_list_clear_mask)
2944       pvr_finishme("Add support for generating loadops shaders!");
2945 }
2946 
2947 static void pvr_stash_depth_format(struct pvr_cmd_buffer_state *state,
2948                                    struct pvr_sub_cmd_gfx *const sub_cmd)
2949 {
2950    const struct pvr_render_pass *pass = state->render_pass_info.pass;
2951    const struct pvr_renderpass_hwsetup_render *hw_render =
2952       &pass->hw_setup->renders[sub_cmd->hw_render_idx];
2953 
2954    if (hw_render->ds_attach_idx != VK_ATTACHMENT_UNUSED) {
2955       struct pvr_image_view **iviews = state->render_pass_info.attachments;
2956 
2957       state->depth_format = iviews[hw_render->ds_attach_idx]->vk.format;
2958    }
2959 }
2960 
2961 static bool pvr_loadops_contain_clear(struct pvr_renderpass_hwsetup *hw_setup)
2962 {
2963    for (uint32_t i = 0; i < hw_setup->render_count; i++) {
2964       struct pvr_renderpass_hwsetup_render *hw_render = &hw_setup->renders[i];
2965       uint32_t render_targets_count = hw_render->init_setup.num_render_targets;
2966 
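      /* color_init is walked as color_init_count groups of
       * render_targets_count entries; return true as soon as any entry (or
       * the depth/stencil init below) requests VK_ATTACHMENT_LOAD_OP_CLEAR.
       */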
2967       for (uint32_t j = 0;
2968            j < (hw_render->color_init_count * render_targets_count);
2969            j += render_targets_count) {
2970          for (uint32_t k = 0; k < hw_render->init_setup.num_render_targets;
2971               k++) {
2972             if (hw_render->color_init[j + k].op ==
2973                 VK_ATTACHMENT_LOAD_OP_CLEAR) {
2974                return true;
2975             }
2976          }
2977       }
2978       if (hw_render->depth_init == VK_ATTACHMENT_LOAD_OP_CLEAR ||
2979           hw_render->stencil_init == VK_ATTACHMENT_LOAD_OP_CLEAR) {
2980          return true;
2981       }
2982    }
2983 
2984    return false;
2985 }
2986 
2987 static VkResult
2988 pvr_cmd_buffer_set_clear_values(struct pvr_cmd_buffer *cmd_buffer,
2989                                 const VkRenderPassBeginInfo *pRenderPassBegin)
2990 {
2991    struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
2992 
2993    /* Free any previously allocated clear values. */
2994    vk_free(&cmd_buffer->vk.pool->alloc, state->render_pass_info.clear_values);
2995 
2996    if (pRenderPassBegin->clearValueCount) {
2997       const size_t size = pRenderPassBegin->clearValueCount *
2998                           sizeof(*state->render_pass_info.clear_values);
2999 
3000       state->render_pass_info.clear_values =
3001          vk_zalloc(&cmd_buffer->vk.pool->alloc,
3002                    size,
3003                    8,
3004                    VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
3005       if (!state->render_pass_info.clear_values) {
3006          return vk_command_buffer_set_error(&cmd_buffer->vk,
3007                                             VK_ERROR_OUT_OF_HOST_MEMORY);
3008       }
3009 
3010       memcpy(state->render_pass_info.clear_values,
3011              pRenderPassBegin->pClearValues,
3012              size);
3013    } else {
3014       state->render_pass_info.clear_values = NULL;
3015    }
3016 
3017    state->render_pass_info.clear_value_count =
3018       pRenderPassBegin->clearValueCount;
3019 
3020    return VK_SUCCESS;
3021 }
3022 
3023 /**
3024  * \brief Indicates whether to use the large or normal clear state words.
3025  *
3026  * If the current render area can fit within a quarter of the max framebuffer
3027  * that the device is capable of, we can use the normal clear state words,
3028  * otherwise the large clear state words are needed.
3029  *
3030  * The requirement of a quarter of the max framebuffer comes from the index
3031  * count used in the normal clear state words and the vertices uploaded at
3032  * device creation.
3033  *
3034  * \param[in] cmd_buffer The command buffer for the clear.
3035  * \return true if large clear state words are required.
3036  */
3037 static bool
3038 pvr_is_large_clear_required(const struct pvr_cmd_buffer *const cmd_buffer)
3039 {
3040    const struct pvr_device_info *const dev_info =
3041       &cmd_buffer->device->pdevice->dev_info;
3042    const VkRect2D render_area = cmd_buffer->state.render_pass_info.render_area;
3043    const uint32_t vf_max_x = rogue_get_param_vf_max_x(dev_info);
3044    const uint32_t vf_max_y = rogue_get_param_vf_max_x(dev_info);
3045 
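   /* Illustrative example (vf_max values are device-specific): with a vf_max
    * of 16384 the normal clear state words cover render areas up to
    * 8191x8191; anything wider or taller needs the large clear words.
    */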
3046    return (render_area.extent.width > (vf_max_x / 2) - 1) ||
3047           (render_area.extent.height > (vf_max_y / 2) - 1);
3048 }
3049 
3050 static void pvr_emit_clear_words(struct pvr_cmd_buffer *const cmd_buffer,
3051                                  struct pvr_sub_cmd_gfx *const sub_cmd)
3052 {
3053    struct pvr_device *device = cmd_buffer->device;
3054    struct pvr_csb *csb = &sub_cmd->control_stream;
3055    uint32_t vdm_state_size_in_dw;
3056    const uint32_t *vdm_state;
3057    uint32_t *stream;
3058 
3059    vdm_state_size_in_dw =
3060       pvr_clear_vdm_state_get_size_in_dw(&device->pdevice->dev_info, 1);
3061 
3062    pvr_csb_set_relocation_mark(csb);
3063 
3064    stream = pvr_csb_alloc_dwords(csb, vdm_state_size_in_dw);
3065    if (!stream) {
3066       pvr_cmd_buffer_set_error_unwarned(cmd_buffer, csb->status);
3067       return;
3068    }
3069 
3070    if (pvr_is_large_clear_required(cmd_buffer))
3071       vdm_state = device->static_clear_state.large_clear_vdm_words;
3072    else
3073       vdm_state = device->static_clear_state.vdm_words;
3074 
3075    memcpy(stream, vdm_state, PVR_DW_TO_BYTES(vdm_state_size_in_dw));
3076 
3077    pvr_csb_clear_relocation_mark(csb);
3078 }
3079 
3080 static VkResult pvr_cs_write_load_op(struct pvr_cmd_buffer *cmd_buffer,
3081                                      struct pvr_sub_cmd_gfx *sub_cmd,
3082                                      struct pvr_load_op *load_op,
3083                                      uint32_t isp_userpass)
3084 {
3085    const struct pvr_device *device = cmd_buffer->device;
3086    struct pvr_static_clear_ppp_template template =
3087       device->static_clear_state.ppp_templates[VK_IMAGE_ASPECT_COLOR_BIT];
3088    uint32_t pds_state[PVR_STATIC_CLEAR_PDS_STATE_COUNT];
3089    struct pvr_pds_upload shareds_update_program;
3090    struct pvr_suballoc_bo *pvr_bo;
3091    VkResult result;
3092 
3093    result = pvr_load_op_data_create_and_upload(cmd_buffer,
3094                                                load_op,
3095                                                &shareds_update_program);
3096    if (result != VK_SUCCESS)
3097       return result;
3098 
3099    template.config.ispctl.upass = isp_userpass;
3100 
3101    /* It might look odd that we aren't specifying the code segment's
3102     * address anywhere. This is because the hardware always assumes that the
3103     * data size is 2 128-bit words and the code segment starts after that.
3104     */
3105    pvr_csb_pack (&pds_state[PVR_STATIC_CLEAR_PPP_PDS_TYPE_SHADERBASE],
3106                  TA_STATE_PDS_SHADERBASE,
3107                  shaderbase) {
3108       shaderbase.addr = PVR_DEV_ADDR(load_op->pds_frag_prog.data_offset);
3109    }
3110 
3111    pvr_csb_pack (&pds_state[PVR_STATIC_CLEAR_PPP_PDS_TYPE_TEXUNICODEBASE],
3112                  TA_STATE_PDS_TEXUNICODEBASE,
3113                  texunicodebase) {
3114       texunicodebase.addr =
3115          PVR_DEV_ADDR(load_op->pds_tex_state_prog.code_offset);
3116    }
3117 
3118    pvr_csb_pack (&pds_state[PVR_STATIC_CLEAR_PPP_PDS_TYPE_SIZEINFO1],
3119                  TA_STATE_PDS_SIZEINFO1,
3120                  sizeinfo1) {
3121       /* Dummy coefficient loading program. */
3122       sizeinfo1.pds_varyingsize = 0;
3123 
3124       sizeinfo1.pds_texturestatesize = DIV_ROUND_UP(
3125          shareds_update_program.data_size,
3126          PVRX(TA_STATE_PDS_SIZEINFO1_PDS_TEXTURESTATESIZE_UNIT_SIZE));
3127 
3128       sizeinfo1.pds_tempsize =
3129          DIV_ROUND_UP(load_op->temps_count,
3130                       PVRX(TA_STATE_PDS_SIZEINFO1_PDS_TEMPSIZE_UNIT_SIZE));
3131    }
3132 
3133    pvr_csb_pack (&pds_state[PVR_STATIC_CLEAR_PPP_PDS_TYPE_SIZEINFO2],
3134                  TA_STATE_PDS_SIZEINFO2,
3135                  sizeinfo2) {
3136       sizeinfo2.usc_sharedsize =
3137          DIV_ROUND_UP(load_op->const_shareds_count,
3138                       PVRX(TA_STATE_PDS_SIZEINFO2_USC_SHAREDSIZE_UNIT_SIZE));
3139    }
3140 
3141    /* Dummy coefficient loading program. */
3142    pds_state[PVR_STATIC_CLEAR_PPP_PDS_TYPE_VARYINGBASE] = 0;
3143 
3144    pvr_csb_pack (&pds_state[PVR_STATIC_CLEAR_PPP_PDS_TYPE_TEXTUREDATABASE],
3145                  TA_STATE_PDS_TEXTUREDATABASE,
3146                  texturedatabase) {
3147       texturedatabase.addr = PVR_DEV_ADDR(shareds_update_program.data_offset);
3148    }
3149 
3150    template.config.pds_state = &pds_state;
3151 
3152    pvr_emit_ppp_from_template(&sub_cmd->control_stream, &template, &pvr_bo);
3153    list_add(&pvr_bo->link, &cmd_buffer->bo_list);
3154 
3155    pvr_emit_clear_words(cmd_buffer, sub_cmd);
3156 
3157    pvr_reset_graphics_dirty_state(cmd_buffer, false);
3158 
3159    return VK_SUCCESS;
3160 }
3161 
3162 void pvr_CmdBeginRenderPass2(VkCommandBuffer commandBuffer,
3163                              const VkRenderPassBeginInfo *pRenderPassBeginInfo,
3164                              const VkSubpassBeginInfo *pSubpassBeginInfo)
3165 {
3166    PVR_FROM_HANDLE(pvr_framebuffer,
3167                    framebuffer,
3168                    pRenderPassBeginInfo->framebuffer);
3169    PVR_FROM_HANDLE(pvr_render_pass, pass, pRenderPassBeginInfo->renderPass);
3170    PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
3171    const struct pvr_renderpass_hwsetup_subpass *hw_subpass;
3172    struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
3173    VkResult result;
3174 
3175    PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer);
3176 
3177    assert(!state->render_pass_info.pass);
3178    assert(cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_PRIMARY);
3179 
3180    /* FIXME: Create a separate function for everything using pass->subpasses,
3181     * look at cmd_buffer_begin_subpass() for example. */
3182    state->render_pass_info.pass = pass;
3183    state->render_pass_info.framebuffer = framebuffer;
3184    state->render_pass_info.subpass_idx = 0;
3185    state->render_pass_info.render_area = pRenderPassBeginInfo->renderArea;
3186    state->render_pass_info.current_hw_subpass = 0;
3187    state->render_pass_info.pipeline_bind_point =
3188       pass->subpasses[0].pipeline_bind_point;
3189    state->render_pass_info.isp_userpass = pass->subpasses[0].isp_userpass;
3190    state->dirty.isp_userpass = true;
3191 
3192    result = pvr_cmd_buffer_setup_attachments(cmd_buffer, pass, framebuffer);
3193    if (result != VK_SUCCESS)
3194       return;
3195 
3196    result = pvr_init_render_targets(cmd_buffer->device, pass, framebuffer);
3197    if (result != VK_SUCCESS) {
3198       pvr_cmd_buffer_set_error_unwarned(cmd_buffer, result);
3199       return;
3200    }
3201 
3202    result = pvr_cmd_buffer_set_clear_values(cmd_buffer, pRenderPassBeginInfo);
3203    if (result != VK_SUCCESS)
3204       return;
3205 
3206    assert(pass->subpasses[0].pipeline_bind_point ==
3207           VK_PIPELINE_BIND_POINT_GRAPHICS);
3208 
3209    result = pvr_cmd_buffer_start_sub_cmd(cmd_buffer, PVR_SUB_CMD_TYPE_GRAPHICS);
3210    if (result != VK_SUCCESS)
3211       return;
3212 
3213    /* Run subpass 0 "soft" background object after the actual background
3214     * object.
3215     */
3216    hw_subpass = pvr_get_hw_subpass(pass, 0);
3217    if (hw_subpass->load_op) {
3218       result = pvr_cs_write_load_op(cmd_buffer,
3219                                     &cmd_buffer->state.current_sub_cmd->gfx,
3220                                     hw_subpass->load_op,
3221                                     0);
3222       if (result != VK_SUCCESS)
3223          return;
3224    }
3225 
3226    pvr_perform_start_of_render_clears(cmd_buffer);
3227    pvr_stash_depth_format(&cmd_buffer->state,
3228                           &cmd_buffer->state.current_sub_cmd->gfx);
3229 }
3230 
3231 VkResult pvr_BeginCommandBuffer(VkCommandBuffer commandBuffer,
3232                                 const VkCommandBufferBeginInfo *pBeginInfo)
3233 {
3234    PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
3235    struct pvr_cmd_buffer_state *state;
3236    VkResult result;
3237 
3238    vk_command_buffer_begin(&cmd_buffer->vk, pBeginInfo);
3239 
3240    cmd_buffer->usage_flags = pBeginInfo->flags;
3241    state = &cmd_buffer->state;
3242 
3243    /* VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT must be ignored for
3244     * primary level command buffers.
3245     *
3246     * From the Vulkan 1.0 spec:
3247     *
3248     *    VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT specifies that a
3249     *    secondary command buffer is considered to be entirely inside a render
3250     *    pass. If this is a primary command buffer, then this bit is ignored.
3251     */
3252    if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_PRIMARY) {
3253       cmd_buffer->usage_flags &=
3254          ~VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT;
3255    }
3256 
3257    if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY) {
3258       if (cmd_buffer->usage_flags &
3259           VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT) {
3260          const VkCommandBufferInheritanceInfo *inheritance_info =
3261             pBeginInfo->pInheritanceInfo;
3262          struct pvr_render_pass *pass;
3263 
3264          pass = pvr_render_pass_from_handle(inheritance_info->renderPass);
3265          state->render_pass_info.pass = pass;
3266          state->render_pass_info.framebuffer =
3267             pvr_framebuffer_from_handle(inheritance_info->framebuffer);
3268          state->render_pass_info.subpass_idx = inheritance_info->subpass;
3269          state->render_pass_info.isp_userpass =
3270             pass->subpasses[inheritance_info->subpass].isp_userpass;
3271 
3272          result =
3273             pvr_cmd_buffer_start_sub_cmd(cmd_buffer, PVR_SUB_CMD_TYPE_GRAPHICS);
3274          if (result != VK_SUCCESS)
3275             return result;
3276 
3277          state->vis_test_enabled = inheritance_info->occlusionQueryEnable;
3278       }
3279 
3280       state->dirty.isp_userpass = true;
3281    }
3282 
3283    util_dynarray_init(&state->query_indices, NULL);
3284 
3285    memset(state->barriers_needed,
3286           0xFF,
3287           sizeof(*state->barriers_needed) * ARRAY_SIZE(state->barriers_needed));
3288 
3289    return VK_SUCCESS;
3290 }
3291 
3292 VkResult pvr_cmd_buffer_add_transfer_cmd(struct pvr_cmd_buffer *cmd_buffer,
3293                                          struct pvr_transfer_cmd *transfer_cmd)
3294 {
3295    struct pvr_sub_cmd_transfer *sub_cmd;
3296    VkResult result;
3297 
3298    result = pvr_cmd_buffer_start_sub_cmd(cmd_buffer, PVR_SUB_CMD_TYPE_TRANSFER);
3299    if (result != VK_SUCCESS)
3300       return result;
3301 
3302    sub_cmd = &cmd_buffer->state.current_sub_cmd->transfer;
3303 
3304    list_addtail(&transfer_cmd->link, sub_cmd->transfer_cmds);
3305 
3306    return VK_SUCCESS;
3307 }
3308 
3309 static VkResult
3310 pvr_setup_vertex_buffers(struct pvr_cmd_buffer *cmd_buffer,
3311                          const struct pvr_graphics_pipeline *const gfx_pipeline)
3312 {
3313    const struct pvr_vertex_shader_state *const vertex_state =
3314       &gfx_pipeline->shader_state.vertex;
3315    struct pvr_cmd_buffer_state *const state = &cmd_buffer->state;
3316    const struct pvr_pds_info *const pds_info = state->pds_shader.info;
3317    struct pvr_suballoc_bo *pvr_bo;
3318    const uint8_t *entries;
3319    uint32_t *dword_buffer;
3320    uint64_t *qword_buffer;
3321    VkResult result;
3322 
3323    result =
3324       pvr_cmd_buffer_alloc_mem(cmd_buffer,
3325                                cmd_buffer->device->heaps.pds_heap,
3326                                PVR_DW_TO_BYTES(pds_info->data_size_in_dwords),
3327                                &pvr_bo);
3328    if (result != VK_SUCCESS)
3329       return result;
3330 
3331    dword_buffer = (uint32_t *)pvr_bo_suballoc_get_map_addr(pvr_bo);
3332    qword_buffer = (uint64_t *)pvr_bo_suballoc_get_map_addr(pvr_bo);
3333 
3334    entries = (uint8_t *)pds_info->entries;
3335 
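   /* Walk the PDS constant map and patch each entry's value (literals, the
    * USC execution address and the per-binding vertex attribute addresses)
    * into the freshly allocated PDS data section.
    */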
3336    for (uint32_t i = 0; i < pds_info->entry_count; i++) {
3337       const struct pvr_const_map_entry *const entry_header =
3338          (struct pvr_const_map_entry *)entries;
3339 
3340       switch (entry_header->type) {
3341       case PVR_PDS_CONST_MAP_ENTRY_TYPE_LITERAL32: {
3342          const struct pvr_const_map_entry_literal32 *const literal =
3343             (struct pvr_const_map_entry_literal32 *)entries;
3344 
3345          PVR_WRITE(dword_buffer,
3346                    literal->literal_value,
3347                    literal->const_offset,
3348                    pds_info->data_size_in_dwords);
3349 
3350          entries += sizeof(*literal);
3351          break;
3352       }
3353 
3354       case PVR_PDS_CONST_MAP_ENTRY_TYPE_DOUTU_ADDRESS: {
3355          const struct pvr_const_map_entry_doutu_address *const doutu_addr =
3356             (struct pvr_const_map_entry_doutu_address *)entries;
3357          const pvr_dev_addr_t exec_addr =
3358             PVR_DEV_ADDR_OFFSET(vertex_state->bo->dev_addr,
3359                                 vertex_state->entry_offset);
3360          uint64_t addr = 0ULL;
3361 
3362          pvr_set_usc_execution_address64(&addr, exec_addr.addr);
3363 
3364          PVR_WRITE(qword_buffer,
3365                    addr | doutu_addr->doutu_control,
3366                    doutu_addr->const_offset,
3367                    pds_info->data_size_in_dwords);
3368 
3369          entries += sizeof(*doutu_addr);
3370          break;
3371       }
3372 
3373       case PVR_PDS_CONST_MAP_ENTRY_TYPE_BASE_INSTANCE: {
3374          const struct pvr_const_map_entry_base_instance *const base_instance =
3375             (struct pvr_const_map_entry_base_instance *)entries;
3376 
3377          PVR_WRITE(dword_buffer,
3378                    state->draw_state.base_instance,
3379                    base_instance->const_offset,
3380                    pds_info->data_size_in_dwords);
3381 
3382          entries += sizeof(*base_instance);
3383          break;
3384       }
3385 
3386       case PVR_PDS_CONST_MAP_ENTRY_TYPE_BASE_VERTEX: {
3387          const struct pvr_const_map_entry_base_instance *const base_instance =
3388             (struct pvr_const_map_entry_base_instance *)entries;
3389 
3390          PVR_WRITE(dword_buffer,
3391                    state->draw_state.base_vertex,
3392                    base_instance->const_offset,
3393                    pds_info->data_size_in_dwords);
3394 
3395          entries += sizeof(*base_instance);
3396          break;
3397       }
3398 
3399       case PVR_PDS_CONST_MAP_ENTRY_TYPE_VERTEX_ATTRIBUTE_ADDRESS: {
3400          const struct pvr_const_map_entry_vertex_attribute_address
3401             *const attribute =
3402                (struct pvr_const_map_entry_vertex_attribute_address *)entries;
3403          const struct pvr_vertex_binding *const binding =
3404             &state->vertex_bindings[attribute->binding_index];
3405          /* In relation to the Vulkan spec. 22.4. Vertex Input Address
3406           * Calculation:
3407           *    Adding binding->offset corresponds to calculating the
3408           *    `bufferBindingAddress`. Adding attribute->offset corresponds to
3409           *    adding the `attribDesc.offset`. The `effectiveVertexOffset` is
3410           *    taken care of by the PDS program itself with a DDMAD which will
3411           *    multiply the vertex/instance idx by the binding's stride and
3412           *    add that to the address provided here.
3413           */
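         /* In short: addr = buffer_base + binding->offset + attribute->offset,
          * with stride * vertex/instance index added later by the DDMAD.
          */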
3414          const pvr_dev_addr_t addr =
3415             PVR_DEV_ADDR_OFFSET(binding->buffer->dev_addr,
3416                                 binding->offset + attribute->offset);
3417 
3418          PVR_WRITE(qword_buffer,
3419                    addr.addr,
3420                    attribute->const_offset,
3421                    pds_info->data_size_in_dwords);
3422 
3423          entries += sizeof(*attribute);
3424          break;
3425       }
3426 
3427       case PVR_PDS_CONST_MAP_ENTRY_TYPE_ROBUST_VERTEX_ATTRIBUTE_ADDRESS: {
3428          const struct pvr_const_map_entry_robust_vertex_attribute_address
3429             *const attribute =
3430                (struct pvr_const_map_entry_robust_vertex_attribute_address *)
3431                   entries;
3432          const struct pvr_vertex_binding *const binding =
3433             &state->vertex_bindings[attribute->binding_index];
3434          pvr_dev_addr_t addr;
3435 
3436          if (binding->buffer->vk.size <
3437              (attribute->offset + attribute->component_size_in_bytes)) {
3438             /* Replace with load from robustness buffer when no attribute is in
3439              * range
3440              */
3441             addr = PVR_DEV_ADDR_OFFSET(
3442                cmd_buffer->device->robustness_buffer->vma->dev_addr,
3443                attribute->robustness_buffer_offset);
3444          } else {
3445             addr = PVR_DEV_ADDR_OFFSET(binding->buffer->dev_addr,
3446                                        binding->offset + attribute->offset);
3447          }
3448 
3449          PVR_WRITE(qword_buffer,
3450                    addr.addr,
3451                    attribute->const_offset,
3452                    pds_info->data_size_in_dwords);
3453 
3454          entries += sizeof(*attribute);
3455          break;
3456       }
3457 
3458       case PVR_PDS_CONST_MAP_ENTRY_TYPE_VERTEX_ATTRIBUTE_MAX_INDEX: {
3459          const struct pvr_const_map_entry_vertex_attribute_max_index *attribute =
3460             (struct pvr_const_map_entry_vertex_attribute_max_index *)entries;
3461          const struct pvr_vertex_binding *const binding =
3462             &state->vertex_bindings[attribute->binding_index];
3463          const uint64_t bound_size = binding->buffer->vk.size - binding->offset;
3464          const uint32_t attribute_end =
3465             attribute->offset + attribute->component_size_in_bytes;
3466          uint32_t max_index;
3467 
3468          if (PVR_HAS_FEATURE(&cmd_buffer->device->pdevice->dev_info,
3469                              pds_ddmadt)) {
3470             /* TODO: PVR_PDS_CONST_MAP_ENTRY_TYPE_VERTEX_ATTRIBUTE_MAX_INDEX
3471              * has the same define value as
3472              * PVR_PDS_CONST_MAP_ENTRY_TYPE_VERTEX_ATTR_DDMADT_OOB_BUFFER_SIZE
3473              * so maybe we want to remove one of the defines or change the
3474              * values.
3475              */
3476             pvr_finishme("Unimplemented robust buffer access with DDMADT");
3477             assert(false);
3478          }
3479 
3480          /* If the stride is 0 then all attributes use the same single element
3481           * from the binding, so the maximum index is 0.
3482           */
3483          if (bound_size < attribute_end || attribute->stride == 0) {
3484             max_index = 0;
3485          } else {
3486             max_index = (uint32_t)(bound_size / attribute->stride) - 1;
3487 
3488             /* There's one last attribute that can fit in. */
3489             if (bound_size % attribute->stride >= attribute_end)
3490                max_index++;
3491          }
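         /* Worked example (hypothetical values): bound_size = 100, stride = 16
          * and attribute_end = 12 give 100 / 16 - 1 = 5, and since
          * 100 % 16 = 4 < 12 no extra attribute fits, so max_index = 5.
          */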
3492 
3493          PVR_WRITE(dword_buffer,
3494                    max_index,
3495                    attribute->const_offset,
3496                    pds_info->data_size_in_dwords);
3497 
3498          entries += sizeof(*attribute);
3499          break;
3500       }
3501 
3502       default:
3503          unreachable("Unsupported data section map");
3504          break;
3505       }
3506    }
3507 
3508    state->pds_vertex_attrib_offset =
3509       pvr_bo->dev_addr.addr -
3510       cmd_buffer->device->heaps.pds_heap->base_addr.addr;
3511 
3512    return VK_SUCCESS;
3513 }
3514 
3515 static VkResult pvr_setup_descriptor_mappings_old(
3516    struct pvr_cmd_buffer *const cmd_buffer,
3517    enum pvr_stage_allocation stage,
3518    const struct pvr_stage_allocation_descriptor_state *descriptor_state,
3519    const pvr_dev_addr_t *const num_worgroups_buff_addr,
3520    uint32_t *const descriptor_data_offset_out)
3521 {
3522    const struct pvr_pds_info *const pds_info = &descriptor_state->pds_info;
3523    const struct pvr_descriptor_state *desc_state;
3524    struct pvr_suballoc_bo *pvr_bo;
3525    const uint8_t *entries;
3526    uint32_t *dword_buffer;
3527    uint64_t *qword_buffer;
3528    VkResult result;
3529 
3530    if (!pds_info->data_size_in_dwords)
3531       return VK_SUCCESS;
3532 
3533    result =
3534       pvr_cmd_buffer_alloc_mem(cmd_buffer,
3535                                cmd_buffer->device->heaps.pds_heap,
3536                                PVR_DW_TO_BYTES(pds_info->data_size_in_dwords),
3537                                &pvr_bo);
3538    if (result != VK_SUCCESS)
3539       return result;
3540 
3541    dword_buffer = (uint32_t *)pvr_bo_suballoc_get_map_addr(pvr_bo);
3542    qword_buffer = (uint64_t *)pvr_bo_suballoc_get_map_addr(pvr_bo);
3543 
3544    entries = (uint8_t *)pds_info->entries;
3545 
3546    switch (stage) {
3547    case PVR_STAGE_ALLOCATION_VERTEX_GEOMETRY:
3548    case PVR_STAGE_ALLOCATION_FRAGMENT:
3549       desc_state = &cmd_buffer->state.gfx_desc_state;
3550       break;
3551 
3552    case PVR_STAGE_ALLOCATION_COMPUTE:
3553       desc_state = &cmd_buffer->state.compute_desc_state;
3554       break;
3555 
3556    default:
3557       unreachable("Unsupported stage.");
3558       break;
3559    }
3560 
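   /* As with the vertex attribute setup, walk the PDS constant map and patch
    * the data section: literals, constant buffer addresses, descriptor set
    * base addresses and the special (compile-time / built-in) buffers.
    */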
3561    for (uint32_t i = 0; i < pds_info->entry_count; i++) {
3562       const struct pvr_const_map_entry *const entry_header =
3563          (struct pvr_const_map_entry *)entries;
3564 
3565       switch (entry_header->type) {
3566       case PVR_PDS_CONST_MAP_ENTRY_TYPE_LITERAL32: {
3567          const struct pvr_const_map_entry_literal32 *const literal =
3568             (struct pvr_const_map_entry_literal32 *)entries;
3569 
3570          PVR_WRITE(dword_buffer,
3571                    literal->literal_value,
3572                    literal->const_offset,
3573                    pds_info->data_size_in_dwords);
3574 
3575          entries += sizeof(*literal);
3576          break;
3577       }
3578 
3579       case PVR_PDS_CONST_MAP_ENTRY_TYPE_CONSTANT_BUFFER: {
3580          const struct pvr_const_map_entry_constant_buffer *const_buffer_entry =
3581             (struct pvr_const_map_entry_constant_buffer *)entries;
3582          const uint32_t desc_set = const_buffer_entry->desc_set;
3583          const uint32_t binding = const_buffer_entry->binding;
3584          const struct pvr_descriptor_set *descriptor_set;
3585          const struct pvr_descriptor *descriptor;
3586          pvr_dev_addr_t buffer_addr;
3587 
3588          assert(desc_set < PVR_MAX_DESCRIPTOR_SETS);
3589          descriptor_set = desc_state->descriptor_sets[desc_set];
3590 
3591          /* TODO: Handle dynamic buffers. */
3592          descriptor = &descriptor_set->descriptors[binding];
3593          assert(descriptor->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER);
3594 
3595          assert(descriptor->buffer_desc_range ==
3596                 PVR_DW_TO_BYTES(const_buffer_entry->size_in_dwords));
3597          assert(descriptor->buffer_whole_range ==
3598                 PVR_DW_TO_BYTES(const_buffer_entry->size_in_dwords));
3599 
3600          buffer_addr =
3601             PVR_DEV_ADDR_OFFSET(descriptor->buffer_dev_addr,
3602                                 const_buffer_entry->offset * sizeof(uint32_t));
3603 
3604          PVR_WRITE(qword_buffer,
3605                    buffer_addr.addr,
3606                    const_buffer_entry->const_offset,
3607                    pds_info->data_size_in_dwords);
3608 
3609          entries += sizeof(*const_buffer_entry);
3610          break;
3611       }
3612 
3613       case PVR_PDS_CONST_MAP_ENTRY_TYPE_DESCRIPTOR_SET: {
3614          const struct pvr_const_map_entry_descriptor_set *desc_set_entry =
3615             (struct pvr_const_map_entry_descriptor_set *)entries;
3616          const uint32_t desc_set_num = desc_set_entry->descriptor_set;
3617          const struct pvr_descriptor_set *descriptor_set;
3618          pvr_dev_addr_t desc_set_addr;
3619          uint64_t desc_portion_offset;
3620 
3621          assert(desc_set_num < PVR_MAX_DESCRIPTOR_SETS);
3622 
3623          /* TODO: Remove this when the compiler provides us with usage info?
3624           */
3625          /* We skip DMAing unbound descriptor sets. */
3626          if (!(desc_state->valid_mask & BITFIELD_BIT(desc_set_num))) {
3627             const struct pvr_const_map_entry_literal32 *literal;
3628             uint32_t zero_literal_value;
3629 
3630             /* The code segment contains a DOUT instruction so in the data
3631              * section we have to write a DOUTD_SRC0 and DOUTD_SRC1.
3632              * We'll write 0 for DOUTD_SRC0 since we don't have a buffer to DMA.
3633              * We're expecting a LITERAL32 entry containing the value for
3634              * DOUTD_SRC1 next so let's make sure we get it and write it
3635              * with BSIZE set to 0, disabling the DMA operation.
3636              * We don't want the LITERAL32 to be processed as normal, otherwise
3637              * we'd be DMAing from an address of 0.
3638              */
3639 
3640             entries += sizeof(*desc_set_entry);
3641             literal = (struct pvr_const_map_entry_literal32 *)entries;
3642 
3643             assert(literal->type == PVR_PDS_CONST_MAP_ENTRY_TYPE_LITERAL32);
3644 
3645             zero_literal_value =
3646                literal->literal_value &
3647                PVR_ROGUE_PDSINST_DOUT_FIELDS_DOUTD_SRC1_BSIZE_CLRMSK;
3648 
3649             PVR_WRITE(qword_buffer,
3650                       UINT64_C(0),
3651                       desc_set_entry->const_offset,
3652                       pds_info->data_size_in_dwords);
3653 
3654             PVR_WRITE(dword_buffer,
3655                       zero_literal_value,
3656                       desc_set_entry->const_offset,
3657                       pds_info->data_size_in_dwords);
3658 
3659             entries += sizeof(*literal);
3660             i++;
3661             continue;
3662          }
3663 
3664          descriptor_set = desc_state->descriptor_sets[desc_set_num];
3665 
3666          desc_set_addr = descriptor_set->pvr_bo->dev_addr;
3667 
3668          if (desc_set_entry->primary) {
3669             desc_portion_offset =
3670                descriptor_set->layout->memory_layout_in_dwords_per_stage[stage]
3671                   .primary_offset;
3672          } else {
3673             desc_portion_offset =
3674                descriptor_set->layout->memory_layout_in_dwords_per_stage[stage]
3675                   .secondary_offset;
3676          }
3677          desc_portion_offset = PVR_DW_TO_BYTES(desc_portion_offset);
3678 
3679          desc_set_addr =
3680             PVR_DEV_ADDR_OFFSET(desc_set_addr, desc_portion_offset);
3681 
3682          desc_set_addr = PVR_DEV_ADDR_OFFSET(
3683             desc_set_addr,
3684             PVR_DW_TO_BYTES((uint64_t)desc_set_entry->offset_in_dwords));
3685 
3686          PVR_WRITE(qword_buffer,
3687                    desc_set_addr.addr,
3688                    desc_set_entry->const_offset,
3689                    pds_info->data_size_in_dwords);
3690 
3691          entries += sizeof(*desc_set_entry);
3692          break;
3693       }
3694 
3695       case PVR_PDS_CONST_MAP_ENTRY_TYPE_SPECIAL_BUFFER: {
3696          const struct pvr_const_map_entry_special_buffer *special_buff_entry =
3697             (struct pvr_const_map_entry_special_buffer *)entries;
3698 
3699          switch (special_buff_entry->buffer_type) {
3700          case PVR_BUFFER_TYPE_COMPILE_TIME: {
3701             uint64_t addr = descriptor_state->static_consts->dev_addr.addr;
3702 
3703             PVR_WRITE(qword_buffer,
3704                       addr,
3705                       special_buff_entry->const_offset,
3706                       pds_info->data_size_in_dwords);
3707             break;
3708          }
3709 
3710          case PVR_BUFFER_TYPE_BLEND_CONSTS:
3711             /* TODO: See if instead of reusing the blend constant buffer type
3712              * entry, we can set up a new buffer type specifically for
3713              * num_workgroups or other built-in variables. The mappings are
3714              * set up at pipeline creation when creating the descriptor program.
3715              */
3716             if (stage == PVR_STAGE_ALLOCATION_COMPUTE) {
3717                assert(num_worgroups_buff_addr->addr);
3718 
3719                /* TODO: Check if we need to offset this (e.g. for just y and z),
3720                 * or cope with any reordering?
3721                 */
3722                PVR_WRITE(qword_buffer,
3723                          num_worgroups_buff_addr->addr,
3724                          special_buff_entry->const_offset,
3725                          pds_info->data_size_in_dwords);
3726             } else {
3727                pvr_finishme("Add blend constants support.");
3728             }
3729             break;
3730 
3731          default:
3732             unreachable("Unsupported special buffer type.");
3733          }
3734 
3735          entries += sizeof(*special_buff_entry);
3736          break;
3737       }
3738 
3739       default:
3740          unreachable("Unsupported map entry type.");
3741       }
3742    }
3743 
3744    *descriptor_data_offset_out =
3745       pvr_bo->dev_addr.addr -
3746       cmd_buffer->device->heaps.pds_heap->base_addr.addr;
3747 
3748    return VK_SUCCESS;
3749 }
3750 
3751 /* Note that the descriptor set doesn't have any space for dynamic buffer
3752  * descriptors so this works on the assumption that you have a buffer with space
3753  * for them at the end.
3754  */
3755 static uint16_t pvr_get_dynamic_descriptor_primary_offset(
3756    const struct pvr_device *device,
3757    const struct pvr_descriptor_set_layout *layout,
3758    const struct pvr_descriptor_set_layout_binding *binding,
3759    const uint32_t stage,
3760    const uint32_t desc_idx)
3761 {
3762    struct pvr_descriptor_size_info size_info;
3763    uint32_t offset;
3764 
3765    assert(binding->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC ||
3766           binding->type == VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC);
3767    assert(desc_idx < binding->descriptor_count);
3768 
3769    pvr_descriptor_size_info_init(device, binding->type, &size_info);
3770 
3771    offset = layout->total_size_in_dwords;
3772    offset += binding->per_stage_offset_in_dwords[stage].primary;
3773    offset += (desc_idx * size_info.primary);
3774 
3775    /* The offset must fit in 16 bits. */
3776    assert(offset < UINT16_MAX);
3777 
3778    return (uint16_t)offset;
3779 }
3780 
3781 /* Note that the descriptor set doesn't have any space for dynamic buffer
3782  * descriptors so this works on the assumption that you have a buffer with space
3783  * for them at the end.
3784  */
3785 static uint16_t pvr_get_dynamic_descriptor_secondary_offset(
3786    const struct pvr_device *device,
3787    const struct pvr_descriptor_set_layout *layout,
3788    const struct pvr_descriptor_set_layout_binding *binding,
3789    const uint32_t stage,
3790    const uint32_t desc_idx)
3791 {
3792    struct pvr_descriptor_size_info size_info;
3793    uint32_t offset;
3794 
3795    assert(binding->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC ||
3796           binding->type == VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC);
3797    assert(desc_idx < binding->descriptor_count);
3798 
3799    pvr_descriptor_size_info_init(device, binding->type, &size_info);
3800 
3801    offset = layout->total_size_in_dwords;
3802    offset +=
3803       layout->memory_layout_in_dwords_per_stage[stage].primary_dynamic_size;
3804    offset += binding->per_stage_offset_in_dwords[stage].secondary;
3805    offset += (desc_idx * size_info.secondary);
3806 
3807    /* The offset must fit in 16 bits. */
3808    assert(offset < UINT16_MAX);
3809 
3810    return (uint16_t)offset;
3811 }
3812 
3813 /**
3814  * \brief Upload a copy of the descriptor set with dynamic buffer offsets
3815  * applied.
3816  */
3817 /* TODO: We should probably make the compiler aware of the dynamic descriptors.
3818  * We could use push constants like Anv seems to do. This would avoid having to
3819  * duplicate all sets containing dynamic descriptors each time the offsets are
3820  * updated.
3821  */
3822 static VkResult pvr_cmd_buffer_upload_patched_desc_set(
3823    struct pvr_cmd_buffer *cmd_buffer,
3824    const struct pvr_descriptor_set *desc_set,
3825    const uint32_t *dynamic_offsets,
3826    struct pvr_suballoc_bo **const bo_out)
3827 {
3828    const struct pvr_descriptor_set_layout *layout = desc_set->layout;
3829    const uint64_t normal_desc_set_size =
3830       PVR_DW_TO_BYTES(layout->total_size_in_dwords);
3831    const uint64_t dynamic_descs_size =
3832       PVR_DW_TO_BYTES(layout->total_dynamic_size_in_dwords);
3833    struct pvr_descriptor_size_info dynamic_uniform_buffer_size_info;
3834    struct pvr_descriptor_size_info dynamic_storage_buffer_size_info;
3835    struct pvr_device *device = cmd_buffer->device;
3836    struct pvr_suballoc_bo *patched_desc_set_bo;
3837    uint32_t *src_mem_ptr, *dst_mem_ptr;
3838    uint32_t desc_idx_offset = 0;
3839    VkResult result;
3840 
3841    assert(desc_set->layout->dynamic_buffer_count > 0);
3842 
3843    pvr_descriptor_size_info_init(device,
3844                                  VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC,
3845                                  &dynamic_uniform_buffer_size_info);
3846    pvr_descriptor_size_info_init(device,
3847                                  VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC,
3848                                  &dynamic_storage_buffer_size_info);
3849 
3850    /* TODO: In the descriptor set we don't account for dynamic buffer
3851     * descriptors and take care of them in the pipeline layout. The pipeline
3852     * layout allocates them at the beginning but let's put them at the end just
3853     * because it makes things a bit easier. Ideally we should be using the
3854     * pipeline layout and use the offsets from the pipeline layout to patch
3855     * descriptors.
3856     */
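   /* Layout of the patched copy uploaded below:
    *    [ normal set contents        : normal_desc_set_size bytes ]
    *    [ dynamic buffer descriptors : dynamic_descs_size bytes   ]
    */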
3857    result = pvr_cmd_buffer_alloc_mem(cmd_buffer,
3858                                      cmd_buffer->device->heaps.general_heap,
3859                                      normal_desc_set_size + dynamic_descs_size,
3860                                      &patched_desc_set_bo);
3861    if (result != VK_SUCCESS)
3862       return result;
3863 
3864    src_mem_ptr = (uint32_t *)pvr_bo_suballoc_get_map_addr(desc_set->pvr_bo);
3865    dst_mem_ptr = (uint32_t *)pvr_bo_suballoc_get_map_addr(patched_desc_set_bo);
3866 
3867    memcpy(dst_mem_ptr, src_mem_ptr, normal_desc_set_size);
3868 
3869    for (uint32_t i = 0; i < desc_set->layout->binding_count; i++) {
3870       const struct pvr_descriptor_set_layout_binding *binding =
3871          &desc_set->layout->bindings[i];
3872       const struct pvr_descriptor *descriptors =
3873          &desc_set->descriptors[binding->descriptor_index];
3874       const struct pvr_descriptor_size_info *size_info;
3875 
3876       if (binding->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC)
3877          size_info = &dynamic_uniform_buffer_size_info;
3878       else if (binding->type == VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC)
3879          size_info = &dynamic_storage_buffer_size_info;
3880       else
3881          continue;
3882 
3883       for (uint32_t stage = 0; stage < PVR_STAGE_ALLOCATION_COUNT; stage++) {
3884          uint32_t primary_offset;
3885          uint32_t secondary_offset;
3886 
3887          if (!(binding->shader_stage_mask & BITFIELD_BIT(stage)))
3888             continue;
3889 
3890          /* Get the offsets for the first dynamic descriptor in the current
3891           * binding.
3892           */
3893          primary_offset =
3894             pvr_get_dynamic_descriptor_primary_offset(device,
3895                                                       desc_set->layout,
3896                                                       binding,
3897                                                       stage,
3898                                                       0);
3899          secondary_offset =
3900             pvr_get_dynamic_descriptor_secondary_offset(device,
3901                                                         desc_set->layout,
3902                                                         binding,
3903                                                         stage,
3904                                                         0);
3905 
3906          /* clang-format off */
3907          for (uint32_t desc_idx = 0;
3908               desc_idx < binding->descriptor_count;
3909               desc_idx++) {
3910             /* clang-format on */
3911             const pvr_dev_addr_t addr =
3912                PVR_DEV_ADDR_OFFSET(descriptors[desc_idx].buffer_dev_addr,
3913                                    dynamic_offsets[desc_idx + desc_idx_offset]);
3914             const VkDeviceSize range =
3915                MIN2(descriptors[desc_idx].buffer_desc_range,
3916                     descriptors[desc_idx].buffer_whole_range -
3917                        dynamic_offsets[desc_idx + desc_idx_offset]);
3918 
3919 #if defined(DEBUG)
3920             uint32_t desc_primary_offset;
3921             uint32_t desc_secondary_offset;
3922 
3923             desc_primary_offset =
3924                pvr_get_dynamic_descriptor_primary_offset(device,
3925                                                          desc_set->layout,
3926                                                          binding,
3927                                                          stage,
3928                                                          desc_idx);
3929             desc_secondary_offset =
3930                pvr_get_dynamic_descriptor_secondary_offset(device,
3931                                                            desc_set->layout,
3932                                                            binding,
3933                                                            stage,
3934                                                            desc_idx);
3935 
3936             /* Check the assumption that the descriptors within a binding, for
3937              * a particular stage, are allocated consecutively.
3938              */
3939             assert(desc_primary_offset ==
3940                    primary_offset + size_info->primary * desc_idx);
3941             assert(desc_secondary_offset ==
3942                    secondary_offset + size_info->secondary * desc_idx);
3943 #endif
3944 
3945             assert(descriptors[desc_idx].type == binding->type);
3946 
3947             memcpy(dst_mem_ptr + primary_offset + size_info->primary * desc_idx,
3948                    &addr.addr,
3949                    PVR_DW_TO_BYTES(size_info->primary));
3950             memcpy(dst_mem_ptr + secondary_offset +
3951                       size_info->secondary * desc_idx,
3952                    &range,
3953                    PVR_DW_TO_BYTES(size_info->secondary));
3954          }
3955       }
3956 
3957       desc_idx_offset += binding->descriptor_count;
3958    }
3959 
3960    *bo_out = patched_desc_set_bo;
3961 
3962    return VK_SUCCESS;
3963 }
3964 
3965 #define PVR_SELECT(_geom, _frag, _compute)         \
3966    (stage == PVR_STAGE_ALLOCATION_VERTEX_GEOMETRY) \
3967       ? (_geom)                                    \
3968       : (stage == PVR_STAGE_ALLOCATION_FRAGMENT) ? (_frag) : (_compute)
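
/* Usage sketch (illustrative): PVR_SELECT() relies on a local variable named
 * `stage` being in scope, e.g.
 *
 *    desc_state = PVR_SELECT(&cmd_buffer->state.gfx_desc_state,
 *                            &cmd_buffer->state.gfx_desc_state,
 *                            &cmd_buffer->state.compute_desc_state);
 *
 * yields the first argument for PVR_STAGE_ALLOCATION_VERTEX_GEOMETRY, the
 * second for PVR_STAGE_ALLOCATION_FRAGMENT and the third for any other stage
 * value.
 */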
3969 
3970 static VkResult
3971 pvr_cmd_buffer_upload_desc_set_table(struct pvr_cmd_buffer *const cmd_buffer,
3972                                      enum pvr_stage_allocation stage,
3973                                      pvr_dev_addr_t *addr_out)
3974 {
3975    uint64_t bound_desc_sets[PVR_MAX_DESCRIPTOR_SETS];
3976    const struct pvr_descriptor_state *desc_state;
3977    struct pvr_suballoc_bo *suballoc_bo;
3978    uint32_t dynamic_offset_idx = 0;
3979    VkResult result;
3980 
3981    switch (stage) {
3982    case PVR_STAGE_ALLOCATION_VERTEX_GEOMETRY:
3983    case PVR_STAGE_ALLOCATION_FRAGMENT:
3984    case PVR_STAGE_ALLOCATION_COMPUTE:
3985       break;
3986 
3987    default:
3988       unreachable("Unsupported stage.");
3989       break;
3990    }
3991 
3992    desc_state = PVR_SELECT(&cmd_buffer->state.gfx_desc_state,
3993                            &cmd_buffer->state.gfx_desc_state,
3994                            &cmd_buffer->state.compute_desc_state);
3995 
3996    for (uint32_t set = 0; set < ARRAY_SIZE(bound_desc_sets); set++)
3997       bound_desc_sets[set] = ~0;
3998 
3999    assert(util_last_bit(desc_state->valid_mask) <= ARRAY_SIZE(bound_desc_sets));
4000    for (uint32_t set = 0; set < util_last_bit(desc_state->valid_mask); set++) {
4001       const struct pvr_descriptor_set *desc_set;
4002 
4003       if (!(desc_state->valid_mask & BITFIELD_BIT(set))) {
4004          const struct pvr_pipeline_layout *pipeline_layout =
4005             PVR_SELECT(cmd_buffer->state.gfx_pipeline->base.layout,
4006                        cmd_buffer->state.gfx_pipeline->base.layout,
4007                        cmd_buffer->state.compute_pipeline->base.layout);
4008          const struct pvr_descriptor_set_layout *set_layout;
4009 
4010          assert(set < pipeline_layout->set_count);
4011 
4012          set_layout = pipeline_layout->set_layout[set];
4013          dynamic_offset_idx += set_layout->dynamic_buffer_count;
4014 
4015          continue;
4016       }
4017 
4018       desc_set = desc_state->descriptor_sets[set];
4019 
4020       /* TODO: Is it better if we don't set the valid_mask for empty sets? */
4021       if (desc_set->layout->descriptor_count == 0)
4022          continue;
4023 
4024       if (desc_set->layout->dynamic_buffer_count > 0) {
4025          struct pvr_suballoc_bo *new_desc_set_bo;
4026 
4027          assert(dynamic_offset_idx + desc_set->layout->dynamic_buffer_count <=
4028                 ARRAY_SIZE(desc_state->dynamic_offsets));
4029 
4030          result = pvr_cmd_buffer_upload_patched_desc_set(
4031             cmd_buffer,
4032             desc_set,
4033             &desc_state->dynamic_offsets[dynamic_offset_idx],
4034             &new_desc_set_bo);
4035          if (result != VK_SUCCESS)
4036             return result;
4037 
4038          dynamic_offset_idx += desc_set->layout->dynamic_buffer_count;
4039 
4040          bound_desc_sets[set] = new_desc_set_bo->dev_addr.addr;
4041       } else {
4042          bound_desc_sets[set] = desc_set->pvr_bo->dev_addr.addr;
4043       }
4044    }
4045 
4046    result = pvr_cmd_buffer_upload_general(cmd_buffer,
4047                                           bound_desc_sets,
4048                                           sizeof(bound_desc_sets),
4049                                           &suballoc_bo);
4050    if (result != VK_SUCCESS)
4051       return result;
4052 
4053    *addr_out = suballoc_bo->dev_addr;
4054    return VK_SUCCESS;
4055 }
4056 
4057 static VkResult
4058 pvr_process_addr_literal(struct pvr_cmd_buffer *cmd_buffer,
4059                          enum pvr_pds_addr_literal_type addr_literal_type,
4060                          enum pvr_stage_allocation stage,
4061                          pvr_dev_addr_t *addr_out)
4062 {
4063    VkResult result;
4064 
4065    switch (addr_literal_type) {
4066    case PVR_PDS_ADDR_LITERAL_DESC_SET_ADDRS_TABLE: {
4067       /* TODO: Maybe we should free pvr_bo here and only link all the BOs
4068        * to the command buffer once the data section has been written
4069        * successfully.
4070        */
4071       result =
4072          pvr_cmd_buffer_upload_desc_set_table(cmd_buffer, stage, addr_out);
4073       if (result != VK_SUCCESS)
4074          return result;
4075 
4076       break;
4077    }
4078 
4079    case PVR_PDS_ADDR_LITERAL_PUSH_CONSTS: {
4080       const struct pvr_pipeline_layout *layout =
4081          PVR_SELECT(cmd_buffer->state.gfx_pipeline->base.layout,
4082                     cmd_buffer->state.gfx_pipeline->base.layout,
4083                     cmd_buffer->state.compute_pipeline->base.layout);
4084       const uint32_t push_constants_offset =
4085          PVR_SELECT(layout->vert_push_constants_offset,
4086                     layout->frag_push_constants_offset,
4087                     layout->compute_push_constants_offset);
4088 
4089       *addr_out = PVR_DEV_ADDR_OFFSET(cmd_buffer->state.push_constants.dev_addr,
4090                                       push_constants_offset);
4091       break;
4092    }
4093 
4094    case PVR_PDS_ADDR_LITERAL_BLEND_CONSTANTS: {
4095       float *blend_consts =
4096          cmd_buffer->vk.dynamic_graphics_state.cb.blend_constants;
4097       size_t size =
4098          sizeof(cmd_buffer->vk.dynamic_graphics_state.cb.blend_constants);
4099       struct pvr_suballoc_bo *blend_consts_bo;
4100 
4101       result = pvr_cmd_buffer_upload_general(cmd_buffer,
4102                                              blend_consts,
4103                                              size,
4104                                              &blend_consts_bo);
4105       if (result != VK_SUCCESS)
4106          return result;
4107 
4108       *addr_out = blend_consts_bo->dev_addr;
4109 
4110       break;
4111    }
4112 
4113    default:
4114       unreachable("Invalid addr literal type.");
4115    }
4116 
4117    return VK_SUCCESS;
4118 }
4119 
4120 #undef PVR_SELECT
4121 
4122 static VkResult pvr_setup_descriptor_mappings_new(
4123    struct pvr_cmd_buffer *const cmd_buffer,
4124    enum pvr_stage_allocation stage,
4125    const struct pvr_stage_allocation_descriptor_state *descriptor_state,
4126    uint32_t *const descriptor_data_offset_out)
4127 {
4128    const struct pvr_pds_info *const pds_info = &descriptor_state->pds_info;
4129    struct pvr_suballoc_bo *pvr_bo;
4130    const uint8_t *entries;
4131    uint32_t *dword_buffer;
4132    uint64_t *qword_buffer;
4133    VkResult result;
4134 
4135    if (!pds_info->data_size_in_dwords)
4136       return VK_SUCCESS;
4137 
4138    result =
4139       pvr_cmd_buffer_alloc_mem(cmd_buffer,
4140                                cmd_buffer->device->heaps.pds_heap,
4141                                PVR_DW_TO_BYTES(pds_info->data_size_in_dwords),
4142                                &pvr_bo);
4143    if (result != VK_SUCCESS)
4144       return result;
4145 
4146    dword_buffer = (uint32_t *)pvr_bo_suballoc_get_map_addr(pvr_bo);
4147    qword_buffer = (uint64_t *)pvr_bo_suballoc_get_map_addr(pvr_bo);
4148 
4149    entries = (uint8_t *)pds_info->entries;
4150 
4151    switch (stage) {
4152    case PVR_STAGE_ALLOCATION_VERTEX_GEOMETRY:
4153    case PVR_STAGE_ALLOCATION_FRAGMENT:
4154    case PVR_STAGE_ALLOCATION_COMPUTE:
4155       break;
4156 
4157    default:
4158       unreachable("Unsupported stage.");
4159       break;
4160    }
4161 
4162    for (uint32_t i = 0; i < pds_info->entry_count; i++) {
4163       const struct pvr_const_map_entry *const entry_header =
4164          (struct pvr_const_map_entry *)entries;
4165 
4166       switch (entry_header->type) {
4167       case PVR_PDS_CONST_MAP_ENTRY_TYPE_LITERAL32: {
4168          const struct pvr_const_map_entry_literal32 *const literal =
4169             (struct pvr_const_map_entry_literal32 *)entries;
4170 
4171          PVR_WRITE(dword_buffer,
4172                    literal->literal_value,
4173                    literal->const_offset,
4174                    pds_info->data_size_in_dwords);
4175 
4176          entries += sizeof(*literal);
4177          break;
4178       }
4179 
4180       case PVR_PDS_CONST_MAP_ENTRY_TYPE_ADDR_LITERAL_BUFFER: {
4181          const struct pvr_pds_const_map_entry_addr_literal_buffer
4182             *const addr_literal_buffer_entry =
4183                (struct pvr_pds_const_map_entry_addr_literal_buffer *)entries;
4184          struct pvr_device *device = cmd_buffer->device;
4185          struct pvr_suballoc_bo *addr_literal_buffer_bo;
4186          uint32_t addr_literal_count = 0;
4187          uint64_t *addr_literal_buffer;
4188 
4189          result = pvr_cmd_buffer_alloc_mem(cmd_buffer,
4190                                            device->heaps.general_heap,
4191                                            addr_literal_buffer_entry->size,
4192                                            &addr_literal_buffer_bo);
4193          if (result != VK_SUCCESS)
4194             return result;
4195 
4196          addr_literal_buffer =
4197             (uint64_t *)pvr_bo_suballoc_get_map_addr(addr_literal_buffer_bo);
4198 
4199          entries += sizeof(*addr_literal_buffer_entry);
4200 
4201          PVR_WRITE(qword_buffer,
4202                    addr_literal_buffer_bo->dev_addr.addr,
4203                    addr_literal_buffer_entry->const_offset,
4204                    pds_info->data_size_in_dwords);
4205 
4206          for (uint32_t j = i + 1; j < pds_info->entry_count; j++) {
4207             const struct pvr_const_map_entry *const entry_header =
4208                (struct pvr_const_map_entry *)entries;
4209             const struct pvr_pds_const_map_entry_addr_literal *addr_literal;
4210             pvr_dev_addr_t dev_addr;
4211 
4212             if (entry_header->type != PVR_PDS_CONST_MAP_ENTRY_TYPE_ADDR_LITERAL)
4213                break;
4214 
4215             addr_literal =
4216                (struct pvr_pds_const_map_entry_addr_literal *)entries;
4217 
4218             result = pvr_process_addr_literal(cmd_buffer,
4219                                               addr_literal->addr_type,
4220                                               stage,
4221                                               &dev_addr);
4222             if (result != VK_SUCCESS)
4223                return result;
4224 
4225             addr_literal_buffer[addr_literal_count++] = dev_addr.addr;
4226 
4227             entries += sizeof(*addr_literal);
4228          }
4229 
4230          assert(addr_literal_count * sizeof(uint64_t) ==
4231                 addr_literal_buffer_entry->size);
4232 
4233          i += addr_literal_count;
4234 
4235          break;
4236       }
4237 
4238       default:
4239          unreachable("Unsupported map entry type.");
4240       }
4241    }
4242 
4243    *descriptor_data_offset_out =
4244       pvr_bo->dev_addr.addr -
4245       cmd_buffer->device->heaps.pds_heap->base_addr.addr;
4246 
4247    return VK_SUCCESS;
4248 }
4249 
4250 static VkResult pvr_setup_descriptor_mappings(
4251    struct pvr_cmd_buffer *const cmd_buffer,
4252    enum pvr_stage_allocation stage,
4253    const struct pvr_stage_allocation_descriptor_state *descriptor_state,
4254    const pvr_dev_addr_t *const num_worgroups_buff_addr,
4255    uint32_t *const descriptor_data_offset_out)
4256 {
4257    const bool old_path =
4258       pvr_has_hard_coded_shaders(&cmd_buffer->device->pdevice->dev_info);
4259 
4260    if (old_path) {
4261       return pvr_setup_descriptor_mappings_old(cmd_buffer,
4262                                                stage,
4263                                                descriptor_state,
4264                                                num_worgroups_buff_addr,
4265                                                descriptor_data_offset_out);
4266    }
4267 
4268    return pvr_setup_descriptor_mappings_new(cmd_buffer,
4269                                             stage,
4270                                             descriptor_state,
4271                                             descriptor_data_offset_out);
4272 }
4273 
4274 static void pvr_compute_update_shared(struct pvr_cmd_buffer *cmd_buffer,
4275                                       struct pvr_sub_cmd_compute *const sub_cmd)
4276 {
4277    const struct pvr_device *device = cmd_buffer->device;
4278    const struct pvr_physical_device *pdevice = device->pdevice;
4279    struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
4280    struct pvr_csb *csb = &sub_cmd->control_stream;
4281    const struct pvr_compute_pipeline *pipeline = state->compute_pipeline;
4282    const uint32_t const_shared_regs =
4283       pipeline->shader_state.const_shared_reg_count;
4284    struct pvr_compute_kernel_info info;
4285 
4286    /* No shared regs, no need to use an allocation kernel. */
4287    if (!const_shared_regs)
4288       return;
4289 
4290    /* Accumulate the MAX number of shared registers across the kernels in this
4291     * dispatch. This is used by the FW for context switching, so must be large
4292     * enough to contain all the shared registers that might be in use for this
4293     * compute job. Coefficients don't need to be included as the context switch
4294     * will not happen within the execution of a single workgroup, thus nothing
4295     * needs to be preserved.
4296     */
4297    state->max_shared_regs = MAX2(state->max_shared_regs, const_shared_regs);
4298 
4299    info = (struct pvr_compute_kernel_info){
4300       .indirect_buffer_addr = PVR_DEV_ADDR_INVALID,
4301       .sd_type = PVRX(CDMCTRL_SD_TYPE_NONE),
4302 
4303       .usc_target = PVRX(CDMCTRL_USC_TARGET_ALL),
4304       .usc_common_shared = true,
4305       .usc_common_size =
4306          DIV_ROUND_UP(const_shared_regs,
4307                       PVRX(CDMCTRL_KERNEL0_USC_COMMON_SIZE_UNIT_SIZE)),
4308 
4309       .local_size = { 1, 1, 1 },
4310       .global_size = { 1, 1, 1 },
4311    };
4312 
4313    /* Sometimes we don't have a secondary program if there were no constants to
4314     * write, but we still need to run a PDS program to accomplish the
4315     * allocation of the local/common store shared registers. Use the
4316     * pre-uploaded empty PDS program in this instance.
4317     */
4318    if (pipeline->descriptor_state.pds_info.code_size_in_dwords) {
4319       uint32_t pds_data_size_in_dwords =
4320          pipeline->descriptor_state.pds_info.data_size_in_dwords;
4321 
4322       info.pds_data_offset = state->pds_compute_descriptor_data_offset;
4323       info.pds_data_size =
4324          DIV_ROUND_UP(PVR_DW_TO_BYTES(pds_data_size_in_dwords),
4325                       PVRX(CDMCTRL_KERNEL0_PDS_DATA_SIZE_UNIT_SIZE));
4326 
4327       /* Check that we have uploaded the code section. */
4328       assert(pipeline->descriptor_state.pds_code.code_size);
4329       info.pds_code_offset = pipeline->descriptor_state.pds_code.code_offset;
4330    } else {
4331       const struct pvr_pds_upload *program = &device->pds_compute_empty_program;
4332 
4333       info.pds_data_offset = program->data_offset;
4334       info.pds_data_size =
4335          DIV_ROUND_UP(PVR_DW_TO_BYTES(program->data_size),
4336                       PVRX(CDMCTRL_KERNEL0_PDS_DATA_SIZE_UNIT_SIZE));
4337       info.pds_code_offset = program->code_offset;
4338    }
4339 
4340    /* We don't need to pad the workgroup size. */
4341 
4342    info.max_instances =
4343       pvr_compute_flat_slot_size(pdevice, const_shared_regs, false, 1U);
4344 
4345    pvr_compute_generate_control_stream(csb, sub_cmd, &info);
4346 }
4347 
4348 void pvr_compute_update_shared_private(
4349    struct pvr_cmd_buffer *cmd_buffer,
4350    struct pvr_sub_cmd_compute *const sub_cmd,
4351    struct pvr_private_compute_pipeline *pipeline)
4352 {
4353    const struct pvr_physical_device *pdevice = cmd_buffer->device->pdevice;
4354    struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
4355    const uint32_t const_shared_regs = pipeline->const_shared_regs_count;
4356    struct pvr_csb *csb = &sub_cmd->control_stream;
4357    struct pvr_compute_kernel_info info;
4358 
4359    /* No shared regs, no need to use an allocation kernel. */
4360    if (!const_shared_regs)
4361       return;
4362 
4363    /* See comment in pvr_compute_update_shared() for details on this. */
4364    state->max_shared_regs = MAX2(state->max_shared_regs, const_shared_regs);
4365 
4366    info = (struct pvr_compute_kernel_info){
4367       .indirect_buffer_addr = PVR_DEV_ADDR_INVALID,
4368       .usc_common_size =
4369          DIV_ROUND_UP(const_shared_regs,
4370                       PVRX(CDMCTRL_KERNEL0_USC_COMMON_SIZE_UNIT_SIZE)),
4371       .pds_data_size =
4372          DIV_ROUND_UP(PVR_DW_TO_BYTES(pipeline->pds_shared_update_data_size_dw),
4373                       PVRX(CDMCTRL_KERNEL0_PDS_DATA_SIZE_UNIT_SIZE)),
4374       .usc_target = PVRX(CDMCTRL_USC_TARGET_ALL),
4375       .pds_data_offset = pipeline->pds_shared_update_data_offset,
4376       .pds_code_offset = pipeline->pds_shared_update_code_offset,
4377       .sd_type = PVRX(CDMCTRL_SD_TYPE_NONE),
4378       .usc_common_shared = true,
4379       .local_size = { 1, 1, 1 },
4380       .global_size = { 1, 1, 1 },
4381    };
4382 
4383    /* We don't need to pad the workgroup size. */
4384 
4385    info.max_instances =
4386       pvr_compute_flat_slot_size(pdevice, const_shared_regs, false, 1U);
4387 
4388    pvr_compute_generate_control_stream(csb, sub_cmd, &info);
4389 }
4390 
4391 static uint32_t
4392 pvr_compute_flat_pad_workgroup_size(const struct pvr_physical_device *pdevice,
4393                                     uint32_t workgroup_size,
4394                                     uint32_t coeff_regs_count)
4395 {
4396    const struct pvr_device_runtime_info *dev_runtime_info =
4397       &pdevice->dev_runtime_info;
4398    const struct pvr_device_info *dev_info = &pdevice->dev_info;
4399    uint32_t max_avail_coeff_regs =
4400       dev_runtime_info->cdm_max_local_mem_size_regs;
4401    uint32_t coeff_regs_count_aligned =
4402       ALIGN_POT(coeff_regs_count,
4403                 PVRX(CDMCTRL_KERNEL0_USC_COMMON_SIZE_UNIT_SIZE) >> 2U);
4404 
4405    /* If the workgroup size is greater than ROGUE_MAX_INSTANCES_PER_TASK, we
4406     * always pad the workgroup size up to the next multiple of
4407     * ROGUE_MAX_INSTANCES_PER_TASK.
4408     *
4409     * If we use more than 1/8th of the max coefficient registers, the
4410     * workgroup size is also rounded up to that multiple.
4411     */
4412    /* TODO: See if this can be optimized. */
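   /* Worked example (illustrative values; assuming ROGUE_MAX_INSTANCES_PER_TASK
    * were 32 and 1024 coefficient registers were available): a workgroup size
    * of 100 exceeds 32 and is padded by ALIGN_POT(100, 32) to 128, while a
    * workgroup size of 20 that needs 200 aligned coefficient registers
    * (> 1024 / 8) is likewise padded to 32.
    */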
4413    if (workgroup_size > ROGUE_MAX_INSTANCES_PER_TASK ||
4414        coeff_regs_count_aligned > (max_avail_coeff_regs / 8)) {
4415       assert(workgroup_size < rogue_get_compute_max_work_group_size(dev_info));
4416 
4417       return ALIGN_POT(workgroup_size, ROGUE_MAX_INSTANCES_PER_TASK);
4418    }
4419 
4420    return workgroup_size;
4421 }
4422 
4423 void pvr_compute_update_kernel_private(
4424    struct pvr_cmd_buffer *cmd_buffer,
4425    struct pvr_sub_cmd_compute *const sub_cmd,
4426    struct pvr_private_compute_pipeline *pipeline,
4427    const uint32_t global_workgroup_size[static const PVR_WORKGROUP_DIMENSIONS])
4428 {
4429    const struct pvr_physical_device *pdevice = cmd_buffer->device->pdevice;
4430    const struct pvr_device_runtime_info *dev_runtime_info =
4431       &pdevice->dev_runtime_info;
4432    struct pvr_csb *csb = &sub_cmd->control_stream;
4433 
4434    struct pvr_compute_kernel_info info = {
4435       .indirect_buffer_addr = PVR_DEV_ADDR_INVALID,
4436       .usc_target = PVRX(CDMCTRL_USC_TARGET_ANY),
4437       .pds_temp_size =
4438          DIV_ROUND_UP(pipeline->pds_temps_used << 2U,
4439                       PVRX(CDMCTRL_KERNEL0_PDS_TEMP_SIZE_UNIT_SIZE)),
4440 
4441       .pds_data_size =
4442          DIV_ROUND_UP(PVR_DW_TO_BYTES(pipeline->pds_data_size_dw),
4443                       PVRX(CDMCTRL_KERNEL0_PDS_DATA_SIZE_UNIT_SIZE)),
4444       .pds_data_offset = pipeline->pds_data_offset,
4445       .pds_code_offset = pipeline->pds_code_offset,
4446 
4447       .sd_type = PVRX(CDMCTRL_SD_TYPE_NONE),
4448 
4449       .usc_unified_size =
4450          DIV_ROUND_UP(pipeline->unified_store_regs_count << 2U,
4451                       PVRX(CDMCTRL_KERNEL0_USC_UNIFIED_SIZE_UNIT_SIZE)),
4452 
4453       /* clang-format off */
4454       .global_size = {
4455          global_workgroup_size[0],
4456          global_workgroup_size[1],
4457          global_workgroup_size[2]
4458       },
4459       /* clang-format on */
4460    };
4461 
4462    uint32_t work_size = pipeline->workgroup_size.width *
4463                         pipeline->workgroup_size.height *
4464                         pipeline->workgroup_size.depth;
4465    uint32_t coeff_regs;
4466 
4467    if (work_size > ROGUE_MAX_INSTANCES_PER_TASK) {
4468       /* Enforce a single workgroup per cluster through allocation starvation.
4469        */
4470       coeff_regs = dev_runtime_info->cdm_max_local_mem_size_regs;
4471    } else {
4472       coeff_regs = pipeline->coeff_regs_count;
4473    }
4474 
4475    info.usc_common_size =
4476       DIV_ROUND_UP(PVR_DW_TO_BYTES(coeff_regs),
4477                    PVRX(CDMCTRL_KERNEL0_USC_COMMON_SIZE_UNIT_SIZE));
4478 
4479    /* Use a whole slot per workgroup. */
4480    work_size = MAX2(work_size, ROGUE_MAX_INSTANCES_PER_TASK);
4481 
4482    coeff_regs += pipeline->const_shared_regs_count;
4483 
4484    if (pipeline->const_shared_regs_count > 0)
4485       info.sd_type = PVRX(CDMCTRL_SD_TYPE_USC);
4486 
4487    work_size =
4488       pvr_compute_flat_pad_workgroup_size(pdevice, work_size, coeff_regs);
4489 
4490    info.local_size[0] = work_size;
4491    info.local_size[1] = 1U;
4492    info.local_size[2] = 1U;
4493 
4494    info.max_instances =
4495       pvr_compute_flat_slot_size(pdevice, coeff_regs, false, work_size);
4496 
4497    pvr_compute_generate_control_stream(csb, sub_cmd, &info);
4498 }
4499 
4500 /* TODO: Wire up the base_workgroup variant program when implementing
4501  * VK_KHR_device_group. The values will also need patching into the program.
4502  */
4503 static void pvr_compute_update_kernel(
4504    struct pvr_cmd_buffer *cmd_buffer,
4505    struct pvr_sub_cmd_compute *const sub_cmd,
4506    pvr_dev_addr_t indirect_addr,
4507    const uint32_t global_workgroup_size[static const PVR_WORKGROUP_DIMENSIONS])
4508 {
4509    const struct pvr_physical_device *pdevice = cmd_buffer->device->pdevice;
4510    const struct pvr_device_runtime_info *dev_runtime_info =
4511       &pdevice->dev_runtime_info;
4512    struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
4513    struct pvr_csb *csb = &sub_cmd->control_stream;
4514    const struct pvr_compute_pipeline *pipeline = state->compute_pipeline;
4515    const struct pvr_compute_shader_state *shader_state =
4516       &pipeline->shader_state;
4517    const struct pvr_pds_info *program_info = &pipeline->primary_program_info;
4518 
4519    struct pvr_compute_kernel_info info = {
4520       .indirect_buffer_addr = indirect_addr,
4521       .usc_target = PVRX(CDMCTRL_USC_TARGET_ANY),
4522       .pds_temp_size =
4523          DIV_ROUND_UP(program_info->temps_required << 2U,
4524                       PVRX(CDMCTRL_KERNEL0_PDS_TEMP_SIZE_UNIT_SIZE)),
4525 
4526       .pds_data_size =
4527          DIV_ROUND_UP(PVR_DW_TO_BYTES(program_info->data_size_in_dwords),
4528                       PVRX(CDMCTRL_KERNEL0_PDS_DATA_SIZE_UNIT_SIZE)),
4529       .pds_data_offset = pipeline->primary_program.data_offset,
4530       .pds_code_offset = pipeline->primary_program.code_offset,
4531 
4532       .sd_type = PVRX(CDMCTRL_SD_TYPE_NONE),
4533 
4534       .usc_unified_size =
4535          DIV_ROUND_UP(shader_state->input_register_count << 2U,
4536                       PVRX(CDMCTRL_KERNEL0_USC_UNIFIED_SIZE_UNIT_SIZE)),
4537 
4538       /* clang-format off */
4539       .global_size = {
4540          global_workgroup_size[0],
4541          global_workgroup_size[1],
4542          global_workgroup_size[2]
4543       },
4544       /* clang-format on */
4545    };
4546 
4547    uint32_t work_size = shader_state->work_size;
4548    uint32_t coeff_regs;
4549 
4550    if (work_size > ROGUE_MAX_INSTANCES_PER_TASK) {
4551       /* Enforce a single workgroup per cluster through allocation starvation.
4552        */
4553       coeff_regs = dev_runtime_info->cdm_max_local_mem_size_regs;
4554    } else {
4555       coeff_regs = shader_state->coefficient_register_count;
4556    }
4557 
4558    info.usc_common_size =
4559       DIV_ROUND_UP(PVR_DW_TO_BYTES(coeff_regs),
4560                    PVRX(CDMCTRL_KERNEL0_USC_COMMON_SIZE_UNIT_SIZE));
4561 
4562    /* Use a whole slot per workgroup. */
4563    work_size = MAX2(work_size, ROGUE_MAX_INSTANCES_PER_TASK);
4564 
4565    coeff_regs += shader_state->const_shared_reg_count;
4566 
4567    if (shader_state->const_shared_reg_count > 0)
4568       info.sd_type = PVRX(CDMCTRL_SD_TYPE_USC);
4569 
4570    work_size =
4571       pvr_compute_flat_pad_workgroup_size(pdevice, work_size, coeff_regs);
4572 
4573    info.local_size[0] = work_size;
4574    info.local_size[1] = 1U;
4575    info.local_size[2] = 1U;
4576 
4577    info.max_instances =
4578       pvr_compute_flat_slot_size(pdevice, coeff_regs, false, work_size);
4579 
4580    pvr_compute_generate_control_stream(csb, sub_cmd, &info);
4581 }
4582 
4583 static VkResult pvr_cmd_upload_push_consts(struct pvr_cmd_buffer *cmd_buffer)
4584 {
4585    struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
4586    struct pvr_suballoc_bo *suballoc_bo;
4587    VkResult result;
4588 
4589    /* TODO: Here are some possible optimizations/things to consider:
4590     *
4591     *    - Currently we upload maxPushConstantsSize. The application might only
4592     *      be using a portion of that so we might end up with unused memory.
4593     *      Should we be smarter about this? If we intend to upload the push
4594     *      consts into shareds, we definitely want to avoid reserving unused
4595     *      regs.
4596     *
4597     *    - For now we have to upload to a new buffer each time since the shaders
4598     *      access the push constants from memory. If we were to reuse the same
4599     *      buffer we might update the contents out of sync with job submission
4600     *      and the shaders would see the updated contents while the command
4601     *      buffer is still being recorded and not yet submitted.
4602     *      If we were to upload the push constants directly to shared regs we
4603     *      could reuse the same buffer (avoiding extra allocation overhead)
4604     *      since the contents will be DMAed only on job submission when the
4605     *      control stream is processed and the PDS program is executed. This
4606     *      approach would also allow us to avoid regenerating the PDS data
4607     *      section in some cases since the buffer address will be constant.
4608     */
4609 
4610    if (cmd_buffer->state.push_constants.uploaded)
4611       return VK_SUCCESS;
4612 
4613    result = pvr_cmd_buffer_upload_general(cmd_buffer,
4614                                           state->push_constants.data,
4615                                           sizeof(state->push_constants.data),
4616                                           &suballoc_bo);
4617    if (result != VK_SUCCESS)
4618       return result;
4619 
4620    cmd_buffer->state.push_constants.dev_addr = suballoc_bo->dev_addr;
4621    cmd_buffer->state.push_constants.uploaded = true;
4622 
4623    return VK_SUCCESS;
4624 }
4625 
4626 static void pvr_cmd_dispatch(
4627    struct pvr_cmd_buffer *const cmd_buffer,
4628    const pvr_dev_addr_t indirect_addr,
4629    const uint32_t workgroup_size[static const PVR_WORKGROUP_DIMENSIONS])
4630 {
4631    struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
4632    const struct pvr_compute_pipeline *compute_pipeline =
4633       state->compute_pipeline;
4634    struct pvr_sub_cmd_compute *sub_cmd;
4635    VkResult result;
4636 
4637    pvr_cmd_buffer_start_sub_cmd(cmd_buffer, PVR_SUB_CMD_TYPE_COMPUTE);
4638 
4639    sub_cmd = &state->current_sub_cmd->compute;
4640    sub_cmd->uses_atomic_ops |= compute_pipeline->shader_state.uses_atomic_ops;
4641    sub_cmd->uses_barrier |= compute_pipeline->shader_state.uses_barrier;
4642 
4643    if (state->push_constants.dirty_stages & VK_SHADER_STAGE_COMPUTE_BIT) {
4644       result = pvr_cmd_upload_push_consts(cmd_buffer);
4645       if (result != VK_SUCCESS)
4646          return;
4647 
4648       /* Regenerate the PDS program to use the new push consts buffer. */
4649       state->dirty.compute_desc_dirty = true;
4650 
4651       state->push_constants.dirty_stages &= ~VK_SHADER_STAGE_COMPUTE_BIT;
4652    }
4653 
4654    if (compute_pipeline->shader_state.uses_num_workgroups) {
4655       pvr_dev_addr_t descriptor_data_offset_out;
4656 
4657       if (indirect_addr.addr) {
4658          descriptor_data_offset_out = indirect_addr;
4659       } else {
4660          struct pvr_suballoc_bo *num_workgroups_bo;
4661 
4662          result = pvr_cmd_buffer_upload_general(cmd_buffer,
4663                                                 workgroup_size,
4664                                                 sizeof(*workgroup_size) *
4665                                                    PVR_WORKGROUP_DIMENSIONS,
4666                                                 &num_workgroups_bo);
4667          if (result != VK_SUCCESS)
4668             return;
4669 
4670          descriptor_data_offset_out = num_workgroups_bo->dev_addr;
4671       }
4672 
4673       result = pvr_setup_descriptor_mappings(
4674          cmd_buffer,
4675          PVR_STAGE_ALLOCATION_COMPUTE,
4676          &compute_pipeline->descriptor_state,
4677          &descriptor_data_offset_out,
4678          &state->pds_compute_descriptor_data_offset);
4679       if (result != VK_SUCCESS)
4680          return;
4681    } else if ((compute_pipeline->base.layout
4682                   ->per_stage_descriptor_masks[PVR_STAGE_ALLOCATION_COMPUTE] &&
4683                state->dirty.compute_desc_dirty) ||
4684               state->dirty.compute_pipeline_binding) {
4685       result = pvr_setup_descriptor_mappings(
4686          cmd_buffer,
4687          PVR_STAGE_ALLOCATION_COMPUTE,
4688          &compute_pipeline->descriptor_state,
4689          NULL,
4690          &state->pds_compute_descriptor_data_offset);
4691       if (result != VK_SUCCESS)
4692          return;
4693    }
4694 
4695    pvr_compute_update_shared(cmd_buffer, sub_cmd);
4696    pvr_compute_update_kernel(cmd_buffer, sub_cmd, indirect_addr, workgroup_size);
4697 }
4698 
4699 void pvr_CmdDispatch(VkCommandBuffer commandBuffer,
4700                      uint32_t groupCountX,
4701                      uint32_t groupCountY,
4702                      uint32_t groupCountZ)
4703 {
4704    PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
4705 
4706    PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer);
4707 
4708    if (!groupCountX || !groupCountY || !groupCountZ)
4709       return;
4710 
4711    pvr_cmd_dispatch(cmd_buffer,
4712                     PVR_DEV_ADDR_INVALID,
4713                     (uint32_t[]){ groupCountX, groupCountY, groupCountZ });
4714 }
4715 
4716 void pvr_CmdDispatchIndirect(VkCommandBuffer commandBuffer,
4717                              VkBuffer _buffer,
4718                              VkDeviceSize offset)
4719 {
4720    PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
4721    PVR_FROM_HANDLE(pvr_buffer, buffer, _buffer);
4722 
4723    PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer);
4724 
4725    pvr_cmd_dispatch(cmd_buffer,
4726                     PVR_DEV_ADDR_OFFSET(buffer->dev_addr, offset),
4727                     (uint32_t[]){ 1, 1, 1 });
4728 }
4729 
4730 static void
4731 pvr_update_draw_state(struct pvr_cmd_buffer_state *const state,
4732                       const struct pvr_cmd_buffer_draw_state *const draw_state)
4733 {
4734    /* We don't have a state to tell us that base_instance is being used so it
4735     * gets used as a boolean - 0 means we'll use a PDS program that skips the
4736     * base instance addition. If the base_instance gets used (and the last
4737     * draw's base_instance was 0) then we switch to the BASE_INSTANCE attrib
4738     * program.
4739     *
4740     * If base_instance changes then we only need to update the data section.
4741     *
4742     * The only draw call state that doesn't really matter is the start vertex
4743     * as that is handled properly in the VDM state in all cases.
4744     */
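   /* For example, going from a draw with base_instance 0 to one with
    * base_instance 5 marks draw_variant dirty (program switch), whereas going
    * from 5 to 7 only marks draw_base_instance dirty (data section update).
    */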
4745    if ((state->draw_state.draw_indexed != draw_state->draw_indexed) ||
4746        (state->draw_state.draw_indirect != draw_state->draw_indirect) ||
4747        (state->draw_state.base_instance == 0 &&
4748         draw_state->base_instance != 0)) {
4749       state->dirty.draw_variant = true;
4750    } else if (state->draw_state.base_instance != draw_state->base_instance) {
4751       state->dirty.draw_base_instance = true;
4752    }
4753 
4754    state->draw_state = *draw_state;
4755 }
4756 
4757 static uint32_t pvr_calc_shared_regs_count(
4758    const struct pvr_graphics_pipeline *const gfx_pipeline)
4759 {
4760    const struct pvr_pipeline_stage_state *const vertex_state =
4761       &gfx_pipeline->shader_state.vertex.stage_state;
4762 
4763    uint32_t shared_regs = vertex_state->const_shared_reg_count +
4764                           vertex_state->const_shared_reg_offset;
4765 
4766    if (gfx_pipeline->shader_state.fragment.bo) {
4767       const struct pvr_pipeline_stage_state *const fragment_state =
4768          &gfx_pipeline->shader_state.fragment.stage_state;
4769 
4770       uint32_t fragment_regs = fragment_state->const_shared_reg_count +
4771                                fragment_state->const_shared_reg_offset;
4772 
4773       shared_regs = MAX2(shared_regs, fragment_regs);
4774    }
4775 
4776    return shared_regs;
4777 }
4778 
4779 static void
4780 pvr_emit_dirty_pds_state(const struct pvr_cmd_buffer *const cmd_buffer,
4781                          struct pvr_sub_cmd_gfx *const sub_cmd,
4782                          const uint32_t pds_vertex_descriptor_data_offset)
4783 {
4784    const struct pvr_cmd_buffer_state *const state = &cmd_buffer->state;
4785    const struct pvr_stage_allocation_descriptor_state
4786       *const vertex_descriptor_state =
4787          &state->gfx_pipeline->shader_state.vertex.descriptor_state;
4788    const struct pvr_pipeline_stage_state *const vertex_stage_state =
4789       &state->gfx_pipeline->shader_state.vertex.stage_state;
4790    struct pvr_csb *const csb = &sub_cmd->control_stream;
4791 
4792    if (!vertex_descriptor_state->pds_info.code_size_in_dwords)
4793       return;
4794 
4795    pvr_csb_set_relocation_mark(csb);
4796 
4797    pvr_csb_emit (csb, VDMCTRL_PDS_STATE0, state0) {
4798       state0.usc_target = PVRX(VDMCTRL_USC_TARGET_ALL);
4799 
4800       state0.usc_common_size =
4801          DIV_ROUND_UP(vertex_stage_state->const_shared_reg_count << 2,
4802                       PVRX(VDMCTRL_PDS_STATE0_USC_COMMON_SIZE_UNIT_SIZE));
4803 
4804       state0.pds_data_size = DIV_ROUND_UP(
4805          PVR_DW_TO_BYTES(vertex_descriptor_state->pds_info.data_size_in_dwords),
4806          PVRX(VDMCTRL_PDS_STATE0_PDS_DATA_SIZE_UNIT_SIZE));
4807    }
4808 
4809    pvr_csb_emit (csb, VDMCTRL_PDS_STATE1, state1) {
4810       state1.pds_data_addr = PVR_DEV_ADDR(pds_vertex_descriptor_data_offset);
4811       state1.sd_type = PVRX(VDMCTRL_SD_TYPE_NONE);
4812    }
4813 
4814    pvr_csb_emit (csb, VDMCTRL_PDS_STATE2, state2) {
4815       state2.pds_code_addr =
4816          PVR_DEV_ADDR(vertex_descriptor_state->pds_code.code_offset);
4817    }
4818 
4819    pvr_csb_clear_relocation_mark(csb);
4820 }
4821 
4822 static void pvr_setup_output_select(struct pvr_cmd_buffer *const cmd_buffer)
4823 {
4824    const struct pvr_graphics_pipeline *const gfx_pipeline =
4825       cmd_buffer->state.gfx_pipeline;
4826    const struct pvr_vertex_shader_state *const vertex_state =
4827       &gfx_pipeline->shader_state.vertex;
4828    struct vk_dynamic_graphics_state *const dynamic_state =
4829       &cmd_buffer->vk.dynamic_graphics_state;
4830    struct PVRX(TA_STATE_HEADER) *const header = &cmd_buffer->state.emit_header;
4831    struct pvr_ppp_state *const ppp_state = &cmd_buffer->state.ppp_state;
4832    uint32_t output_selects;
4833 
4834    /* TODO: Handle vertex and fragment shader state flags. */
4835 
4836    pvr_csb_pack (&output_selects, TA_OUTPUT_SEL, state) {
4837       state.rhw_pres = true;
4838       state.vtxsize = DIV_ROUND_UP(vertex_state->vertex_output_size, 4U);
4839       state.psprite_size_pres = (dynamic_state->ia.primitive_topology ==
4840                                  VK_PRIMITIVE_TOPOLOGY_POINT_LIST);
4841    }
4842 
4843    if (ppp_state->output_selects != output_selects) {
4844       ppp_state->output_selects = output_selects;
4845       header->pres_outselects = true;
4846    }
4847 
4848    if (ppp_state->varying_word[0] != vertex_state->varying[0]) {
4849       ppp_state->varying_word[0] = vertex_state->varying[0];
4850       header->pres_varying_word0 = true;
4851    }
4852 
4853    if (ppp_state->varying_word[1] != vertex_state->varying[1]) {
4854       ppp_state->varying_word[1] = vertex_state->varying[1];
4855       header->pres_varying_word1 = true;
4856    }
4857 }
4858 
4859 static void
4860 pvr_setup_isp_faces_and_control(struct pvr_cmd_buffer *const cmd_buffer,
4861                                 struct PVRX(TA_STATE_ISPA) *const ispa_out)
4862 {
4863    struct PVRX(TA_STATE_HEADER) *const header = &cmd_buffer->state.emit_header;
4864    const struct pvr_fragment_shader_state *const fragment_shader_state =
4865       &cmd_buffer->state.gfx_pipeline->shader_state.fragment;
4866    const struct pvr_render_pass_info *const pass_info =
4867       &cmd_buffer->state.render_pass_info;
4868    struct vk_dynamic_graphics_state *dynamic_state =
4869       &cmd_buffer->vk.dynamic_graphics_state;
4870    struct pvr_ppp_state *const ppp_state = &cmd_buffer->state.ppp_state;
4871 
4872    const bool rasterizer_discard = dynamic_state->rs.rasterizer_discard_enable;
4873    const uint32_t subpass_idx = pass_info->subpass_idx;
4874    const uint32_t depth_stencil_attachment_idx =
4875       pass_info->pass->subpasses[subpass_idx].depth_stencil_attachment;
4876    const struct pvr_render_pass_attachment *const attachment =
4877       depth_stencil_attachment_idx != VK_ATTACHMENT_UNUSED
4878          ? &pass_info->pass->attachments[depth_stencil_attachment_idx]
4879          : NULL;
4880 
4881    const enum PVRX(TA_OBJTYPE)
4882       obj_type = pvr_ta_objtype(dynamic_state->ia.primitive_topology);
4883 
4884    const VkImageAspectFlags ds_aspects =
4885       (!rasterizer_discard && attachment)
4886          ? vk_format_aspects(attachment->vk_format) &
4887               (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT)
4888          : VK_IMAGE_ASPECT_NONE;
4889 
4890    /* This is deliberately a full copy rather than a pointer because
4891     * vk_optimize_depth_stencil_state() can only be run once against any given
4892     * instance of vk_depth_stencil_state.
4893     */
4894    struct vk_depth_stencil_state ds_state = dynamic_state->ds;
4895 
4896    uint32_t ispb_stencil_off;
4897    bool is_two_sided = false;
4898    uint32_t isp_control;
4899 
4900    uint32_t line_width;
4901    uint32_t common_a;
4902    uint32_t front_a;
4903    uint32_t front_b;
4904    uint32_t back_a;
4905    uint32_t back_b;
4906 
4907    vk_optimize_depth_stencil_state(&ds_state, ds_aspects, true);
4908 
4909    /* Convert to 4.4 fixed point format. */
4910    line_width = util_unsigned_fixed(dynamic_state->rs.line.width, 4);
4911 
4912    /* Subtract 1 to shift values from range [0=0,256=16] to [0=1/16,255=16].
4913     * If 0 it stays at 0, otherwise we subtract 1.
4914     */
4915    line_width = (!!line_width) * (line_width - 1);
4916 
4917    line_width = MIN2(line_width, PVRX(TA_STATE_ISPA_POINTLINEWIDTH_SIZE_MAX));
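   /* E.g. a line width of 1.0 becomes 16 in 4.4 fixed point and 15 after the
    * subtraction, which encodes 16/16 = 1.0 in the [0=1/16,255=16] range,
    * while a value of 0 is left at 0, i.e. the minimum 1/16.
    */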
4918 
4919    /* TODO: Part of the logic in this function is duplicated in another part
4920     * of the code. E.g. the dcmpmode, and sop1/2/3. Could we do this earlier?
4921     */
4922 
4923    pvr_csb_pack (&common_a, TA_STATE_ISPA, ispa) {
4924       ispa.pointlinewidth = line_width;
4925 
4926       ispa.dcmpmode = pvr_ta_cmpmode(ds_state.depth.compare_op);
4927       ispa.dwritedisable = !ds_state.depth.write_enable;
4928 
4929       ispa.passtype = fragment_shader_state->pass_type;
4930 
4931       ispa.objtype = obj_type;
4932 
4933       /* Return unpacked ispa structure. dcmpmode, dwritedisable, passtype and
4934        * objtype are needed by pvr_setup_triangle_merging_flag.
4935        */
4936       if (ispa_out)
4937          *ispa_out = ispa;
4938    }
4939 
4940    /* TODO: Does this actually represent the ispb control word on stencil off?
4941     * If not, rename the variable.
4942     */
4943    pvr_csb_pack (&ispb_stencil_off, TA_STATE_ISPB, ispb) {
4944       ispb.sop3 = PVRX(TA_ISPB_STENCILOP_KEEP);
4945       ispb.sop2 = PVRX(TA_ISPB_STENCILOP_KEEP);
4946       ispb.sop1 = PVRX(TA_ISPB_STENCILOP_KEEP);
4947       ispb.scmpmode = PVRX(TA_CMPMODE_ALWAYS);
4948    }
4949 
4950    /* FIXME: This logic should be redone and improved. Can we also get rid of
4951     * the front and back variants?
4952     */
4953 
4954    front_a = common_a;
4955    back_a = common_a;
4956 
4957    if (ds_state.stencil.test_enable) {
4958       uint32_t front_a_sref;
4959       uint32_t back_a_sref;
4960 
4961       pvr_csb_pack (&front_a_sref, TA_STATE_ISPA, ispa) {
4962          ispa.sref = ds_state.stencil.front.reference;
4963       }
4964       front_a |= front_a_sref;
4965 
4966       pvr_csb_pack (&back_a_sref, TA_STATE_ISPA, ispa) {
4967          ispa.sref = ds_state.stencil.back.reference;
4968       }
4969       back_a |= back_a_sref;
4970 
4971       pvr_csb_pack (&front_b, TA_STATE_ISPB, ispb) {
4972          const struct vk_stencil_test_face_state *const front =
4973             &ds_state.stencil.front;
4974 
4975          if (ds_state.stencil.write_enable)
4976             ispb.swmask = front->write_mask;
4977 
4978          ispb.scmpmask = front->compare_mask;
4979 
4980          ispb.sop3 = pvr_ta_stencilop(front->op.pass);
4981          ispb.sop2 = pvr_ta_stencilop(front->op.depth_fail);
4982          ispb.sop1 = pvr_ta_stencilop(front->op.fail);
4983          ispb.scmpmode = pvr_ta_cmpmode(front->op.compare);
4984       }
4985 
4986       pvr_csb_pack (&back_b, TA_STATE_ISPB, ispb) {
4987          const struct vk_stencil_test_face_state *const back =
4988             &ds_state.stencil.back;
4989 
4990          if (ds_state.stencil.write_enable)
4991             ispb.swmask = back->write_mask;
4992 
4993          ispb.scmpmask = back->compare_mask;
4994 
4995          ispb.sop3 = pvr_ta_stencilop(back->op.pass);
4996          ispb.sop2 = pvr_ta_stencilop(back->op.depth_fail);
4997          ispb.sop1 = pvr_ta_stencilop(back->op.fail);
4998          ispb.scmpmode = pvr_ta_cmpmode(back->op.compare);
4999       }
5000    } else {
5001       front_b = ispb_stencil_off;
5002       back_b = ispb_stencil_off;
5003    }
5004 
5005    if (front_a != back_a || front_b != back_b) {
5006       if (dynamic_state->rs.cull_mode & VK_CULL_MODE_BACK_BIT) {
5007          /* Single face, using front state. */
5008       } else if (dynamic_state->rs.cull_mode & VK_CULL_MODE_FRONT_BIT) {
5009          /* Single face, using back state. */
5010 
5011          front_a = back_a;
5012          front_b = back_b;
5013       } else {
5014          /* Both faces. */
5015 
5016          header->pres_ispctl_ba = is_two_sided = true;
5017 
5018          if (dynamic_state->rs.front_face == VK_FRONT_FACE_COUNTER_CLOCKWISE) {
5019             uint32_t tmp = front_a;
5020 
5021             front_a = back_a;
5022             back_a = tmp;
5023 
5024             tmp = front_b;
5025             front_b = back_b;
5026             back_b = tmp;
5027          }
5028 
5029          /* HW defaults to stencil off. */
5030          if (back_b != ispb_stencil_off) {
5031             header->pres_ispctl_fb = true;
5032             header->pres_ispctl_bb = true;
5033          }
5034       }
5035    }
5036 
5037    if (ds_state.stencil.test_enable && front_b != ispb_stencil_off)
5038       header->pres_ispctl_fb = true;
5039 
5040    pvr_csb_pack (&isp_control, TA_STATE_ISPCTL, ispctl) {
5041       ispctl.upass = pass_info->isp_userpass;
5042 
5043       /* TODO: is bo ever NULL? Figure out what to do. */
5044       ispctl.tagwritedisable = rasterizer_discard || !fragment_shader_state->bo;
5045 
5046       ispctl.two_sided = is_two_sided;
5047       ispctl.bpres = header->pres_ispctl_fb || header->pres_ispctl_bb;
5048 
5049       ispctl.dbenable = !rasterizer_discard &&
5050                         dynamic_state->rs.depth_bias.enable &&
5051                         obj_type == PVRX(TA_OBJTYPE_TRIANGLE);
5052       if (!rasterizer_discard && cmd_buffer->state.vis_test_enabled) {
5053          ispctl.vistest = true;
5054          ispctl.visreg = cmd_buffer->state.vis_reg;
5055       }
5056 
5057       ispctl.scenable = !rasterizer_discard;
5058 
5059       ppp_state->isp.control_struct = ispctl;
5060    }
5061 
5062    header->pres_ispctl = true;
5063 
5064    ppp_state->isp.control = isp_control;
5065    ppp_state->isp.front_a = front_a;
5066    ppp_state->isp.front_b = front_b;
5067    ppp_state->isp.back_a = back_a;
5068    ppp_state->isp.back_b = back_b;
5069 }
5070 
5071 static float
5072 pvr_calculate_final_depth_bias_contant_factor(struct pvr_device_info *dev_info,
5073                                               VkFormat format,
5074                                               float depth_bias)
5075 {
5076    /* Information for future modifiers of these depth bias calculations.
5077     * ==================================================================
5078     * Specified depth bias equations scale the specified constant factor by a
5079     * value 'r' that is guaranteed to cause a resolvable difference in depth
5080     * across the entire range of depth values.
5081     * For floating point depth formats 'r' is calculated by taking the maximum
5082     * exponent across the triangle.
5083     * For UNORM formats 'r' is constant.
5084     * Here 'n' is the number of mantissa bits stored in the floating point
5085     * representation (23 for F32).
5086     *
5087     *    UNORM Format -> z += dbcf * r + slope
5088     *    FLOAT Format -> z += dbcf * 2^(e-n) + slope
5089     *
5090     * HW Variations.
5091     * ==============
5092     * The HW either always performs the F32 depth bias equation (exponent based
5093     * r), or in the case of HW that correctly supports the integer depth bias
5094     * equation for UNORM depth formats, we can select between both equations
5095     * using the ROGUE_CR_ISP_CTL.dbias_is_int flag - this is required to
5096     * correctly perform Vulkan UNORM depth bias (constant r).
5097     *
5098     *    if ern42307:
5099     *       if DBIAS_IS_INT_EN:
5100     *          z += dbcf + slope
5101     *       else:
5102     *          z += dbcf * 2^(e-n) + slope
5103     *    else:
5104     *       z += dbcf * 2^(e-n) + slope
5105     *
5106     */
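   /* Worked example (values follow directly from the code below): on cores
    * with ERN 42307 a constant factor of 4.0 is programmed as
    * 4.0 / 2^15 ~= 1.22e-4 for VK_FORMAT_D16_UNORM and as 4.0 / 2^23 ~= 4.77e-7
    * for the D24 formats, while float depth formats pass the value through
    * unchanged.
    */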
5107 
5108    float nudge_factor;
5109 
5110    if (PVR_HAS_ERN(dev_info, 42307)) {
5111       switch (format) {
5112       case VK_FORMAT_D16_UNORM:
5113          return depth_bias / (1 << 15);
5114 
5115       case VK_FORMAT_D24_UNORM_S8_UINT:
5116       case VK_FORMAT_X8_D24_UNORM_PACK32:
5117          return depth_bias / (1 << 23);
5118 
5119       default:
5120          return depth_bias;
5121       }
5122    }
5123 
5124    /* The reasoning behind clamping/nudging the value here is that UNORM
5125     * depth formats can have higher precision than our underlying D32F
5126     * representation for some depth ranges.
5127     *
5128     * When the HW scales the depth bias value by 2^(e-n) [the 'r' term], a
5129     * depth bias of 1 can result in a value smaller than one F32 ULP, which
5130     * will get quantized to 0 - resulting in no bias.
5131     *
5132     * Biasing small values away from zero will ensure that small depth biases of
5133     * 1 still yield a result and overcome Z-fighting.
5134     */
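   /* E.g. for VK_FORMAT_D16_UNORM a constant factor of 0.001 is scaled to
    * 0.512, which falls below the 1.0 nudge threshold and becomes 1.512,
    * whereas a factor of 1.0 is scaled to 512.0 and left untouched.
    */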
5135    switch (format) {
5136    case VK_FORMAT_D16_UNORM:
5137       depth_bias *= 512.0f;
5138       nudge_factor = 1.0f;
5139       break;
5140 
5141    case VK_FORMAT_D24_UNORM_S8_UINT:
5142    case VK_FORMAT_X8_D24_UNORM_PACK32:
5143       depth_bias *= 2.0f;
5144       nudge_factor = 2.0f;
5145       break;
5146 
5147    default:
5148       nudge_factor = 0.0f;
5149       break;
5150    }
5151 
5152    if (nudge_factor != 0.0f) {
5153       if (depth_bias < 0.0f && depth_bias > -nudge_factor)
5154          depth_bias -= nudge_factor;
5155       else if (depth_bias > 0.0f && depth_bias < nudge_factor)
5156          depth_bias += nudge_factor;
5157    }
5158 
5159    return depth_bias;
5160 }
5161 
5162 static void pvr_get_viewport_scissor_overlap(const VkViewport *const viewport,
5163                                              const VkRect2D *const scissor,
5164                                              VkRect2D *const rect_out)
5165 {
5166    /* TODO: See if we can remove this struct. */
5167    struct pvr_rect {
5168       int32_t x0, y0;
5169       int32_t x1, y1;
5170    };
5171 
5172    /* TODO: Worry about overflow? */
5173    const struct pvr_rect scissor_rect = {
5174       .x0 = scissor->offset.x,
5175       .y0 = scissor->offset.y,
5176       .x1 = scissor->offset.x + scissor->extent.width,
5177       .y1 = scissor->offset.y + scissor->extent.height
5178    };
5179    struct pvr_rect viewport_rect = { 0 };
5180 
5181    assert(viewport->width >= 0.0f);
5182    assert(scissor_rect.x0 >= 0);
5183    assert(scissor_rect.y0 >= 0);
5184 
5185    if (scissor->extent.width == 0 || scissor->extent.height == 0) {
5186       *rect_out = (VkRect2D){ 0 };
5187       return;
5188    }
5189 
5190    viewport_rect.x0 = (int32_t)viewport->x;
5191    viewport_rect.x1 = (int32_t)viewport->x + (int32_t)viewport->width;
5192 
5193    /* TODO: Is there a mathematical way of doing all this and then clamp at
5194     * the end?
5195     */
5196    /* We flip the y0 and y1 when height is negative. */
5197    viewport_rect.y0 = (int32_t)viewport->y + MIN2(0, (int32_t)viewport->height);
5198    viewport_rect.y1 = (int32_t)viewport->y + MAX2(0, (int32_t)viewport->height);
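   /* E.g. a viewport with y = 100 and height = -50 yields y0 = 50 and
    * y1 = 100, covering the same rows as y = 50 with height = 50.
    */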
5199 
5200    if (scissor_rect.x1 <= viewport_rect.x0 ||
5201        scissor_rect.y1 <= viewport_rect.y0 ||
5202        scissor_rect.x0 >= viewport_rect.x1 ||
5203        scissor_rect.y0 >= viewport_rect.y1) {
5204       *rect_out = (VkRect2D){ 0 };
5205       return;
5206    }
5207 
5208    /* Determine the overlapping rectangle. */
5209    viewport_rect.x0 = MAX2(viewport_rect.x0, scissor_rect.x0);
5210    viewport_rect.y0 = MAX2(viewport_rect.y0, scissor_rect.y0);
5211    viewport_rect.x1 = MIN2(viewport_rect.x1, scissor_rect.x1);
5212    viewport_rect.y1 = MIN2(viewport_rect.y1, scissor_rect.y1);
5213 
5214    /* TODO: Is this conversion safe? Is this logic right? */
5215    rect_out->offset.x = (uint32_t)viewport_rect.x0;
5216    rect_out->offset.y = (uint32_t)viewport_rect.y0;
5217    rect_out->extent.height = (uint32_t)(viewport_rect.y1 - viewport_rect.y0);
5218    rect_out->extent.width = (uint32_t)(viewport_rect.x1 - viewport_rect.x0);
5219 }
5220 
5221 static inline uint32_t
5222 pvr_get_geom_region_clip_align_size(struct pvr_device_info *const dev_info)
5223 {
5224    /* TODO: This should come from rogue_ppp.xml. */
5225    return 16U + 16U * (!PVR_HAS_FEATURE(dev_info, tile_size_16x16));
5226 }
5227 
5228 static void
5229 pvr_setup_isp_depth_bias_scissor_state(struct pvr_cmd_buffer *const cmd_buffer)
5230 {
5231    struct PVRX(TA_STATE_HEADER) *const header = &cmd_buffer->state.emit_header;
5232    struct pvr_ppp_state *const ppp_state = &cmd_buffer->state.ppp_state;
5233    struct vk_dynamic_graphics_state *const dynamic_state =
5234       &cmd_buffer->vk.dynamic_graphics_state;
5235    const struct PVRX(TA_STATE_ISPCTL) *const ispctl =
5236       &ppp_state->isp.control_struct;
5237    struct pvr_device_info *const dev_info =
5238       &cmd_buffer->device->pdevice->dev_info;
5239 
5240    if (ispctl->dbenable &&
5241        (BITSET_TEST(dynamic_state->dirty,
5242                     MESA_VK_DYNAMIC_RS_DEPTH_BIAS_FACTORS) ||
5243         cmd_buffer->depth_bias_array.size == 0)) {
5244       struct pvr_depth_bias_state depth_bias = {
5245          .constant_factor = pvr_calculate_final_depth_bias_contant_factor(
5246             dev_info,
5247             cmd_buffer->state.depth_format,
5248             dynamic_state->rs.depth_bias.constant),
5249          .slope_factor = dynamic_state->rs.depth_bias.slope,
5250          .clamp = dynamic_state->rs.depth_bias.clamp,
5251       };
5252 
5253       ppp_state->depthbias_scissor_indices.depthbias_index =
5254          util_dynarray_num_elements(&cmd_buffer->depth_bias_array,
5255                                     __typeof__(depth_bias));
5256 
5257       util_dynarray_append(&cmd_buffer->depth_bias_array,
5258                            __typeof__(depth_bias),
5259                            depth_bias);
5260 
5261       header->pres_ispctl_dbsc = true;
5262    }
5263 
5264    if (ispctl->scenable) {
5265       const uint32_t region_clip_align_size =
5266          pvr_get_geom_region_clip_align_size(dev_info);
5267       const VkViewport *const viewport = &dynamic_state->vp.viewports[0];
5268       const VkRect2D *const scissor = &dynamic_state->vp.scissors[0];
5269       struct pvr_scissor_words scissor_words;
5270       VkRect2D overlap_rect;
5271       uint32_t height;
5272       uint32_t width;
5273       uint32_t x;
5274       uint32_t y;
5275 
5276       /* For region clip. */
5277       uint32_t bottom;
5278       uint32_t right;
5279       uint32_t left;
5280       uint32_t top;
5281 
5282       /* We don't support multiple viewport calculations. */
5283       assert(dynamic_state->vp.viewport_count == 1);
5284       /* We don't support multiple scissor calculations. */
5285       assert(dynamic_state->vp.scissor_count == 1);
5286 
5287       pvr_get_viewport_scissor_overlap(viewport, scissor, &overlap_rect);
5288 
5289       x = overlap_rect.offset.x;
5290       y = overlap_rect.offset.y;
5291       width = overlap_rect.extent.width;
5292       height = overlap_rect.extent.height;
5293 
5294       pvr_csb_pack (&scissor_words.w0, IPF_SCISSOR_WORD_0, word0) {
5295          word0.scw0_xmax = x + width;
5296          word0.scw0_xmin = x;
5297       }
5298 
5299       pvr_csb_pack (&scissor_words.w1, IPF_SCISSOR_WORD_1, word1) {
5300          word1.scw1_ymax = y + height;
5301          word1.scw1_ymin = y;
5302       }
5303 
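      /* If the freshly packed scissor words match the last ones we appended,
       * the scissor and region clip state already programmed is still valid
       * and the whole update can be skipped.
       */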
5304       if (cmd_buffer->scissor_array.size &&
5305           cmd_buffer->scissor_words.w0 == scissor_words.w0 &&
5306           cmd_buffer->scissor_words.w1 == scissor_words.w1) {
5307          return;
5308       }
5309 
5310       cmd_buffer->scissor_words = scissor_words;
5311 
5312       /* Calculate region clip. */
5313 
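      /* Worked example (illustrative numbers): with an alignment of 16 and an
       * overlap rectangle spanning x = 8..40, left = 8 / 16 = 0 and
       * right = DIV_ROUND_UP(40, 16) - 1 = 2, i.e. the inclusive region clip
       * range covers tiles 0..2 horizontally.
       */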
5314       left = x / region_clip_align_size;
5315       top = y / region_clip_align_size;
5316 
5317       /* Guard against right underflowing to -1 when (x + width) is 0. */
5318       /* TODO: Is there a better way of doing this? */
5319       if ((x + width) != 0U)
5320          right = DIV_ROUND_UP(x + width, region_clip_align_size) - 1;
5321       else
5322          right = 0;
5323 
5324       if ((y + height) != 0U)
5325          bottom = DIV_ROUND_UP(y + height, region_clip_align_size) - 1;
5326       else
5327          bottom = 0U;
5328 
5329       /* Setup region clip to clip everything outside what was calculated. */
5330 
5331       /* FIXME: Should we mask to prevent writing over other words? */
5332       pvr_csb_pack (&ppp_state->region_clipping.word0, TA_REGION_CLIP0, word0) {
5333          word0.right = right;
5334          word0.left = left;
5335          word0.mode = PVRX(TA_REGION_CLIP_MODE_OUTSIDE);
5336       }
5337 
5338       pvr_csb_pack (&ppp_state->region_clipping.word1, TA_REGION_CLIP1, word1) {
5339          word1.bottom = bottom;
5340          word1.top = top;
5341       }
5342 
5343       ppp_state->depthbias_scissor_indices.scissor_index =
5344          util_dynarray_num_elements(&cmd_buffer->scissor_array,
5345                                     struct pvr_scissor_words);
5346 
5347       util_dynarray_append(&cmd_buffer->scissor_array,
5348                            struct pvr_scissor_words,
5349                            cmd_buffer->scissor_words);
5350 
5351       header->pres_ispctl_dbsc = true;
5352       header->pres_region_clip = true;
5353    }
5354 }
5355 
5356 static void
5357 pvr_setup_triangle_merging_flag(struct pvr_cmd_buffer *const cmd_buffer,
5358                                 struct PVRX(TA_STATE_ISPA) * ispa)
5359 {
5360    struct PVRX(TA_STATE_HEADER) *const header = &cmd_buffer->state.emit_header;
5361    struct pvr_ppp_state *const ppp_state = &cmd_buffer->state.ppp_state;
5362    uint32_t merge_word;
5363    uint32_t mask;
5364 
5365    pvr_csb_pack (&merge_word, TA_STATE_PDS_SIZEINFO2, size_info) {
5366       /* Disable triangle merging for lines, punch-through passes, or when
5367        * depth writes are disabled (DWD) and the depth compare mode is ALWAYS.
5368        */
5369       if (ispa->objtype == PVRX(TA_OBJTYPE_LINE) ||
5370           ispa->passtype == PVRX(TA_PASSTYPE_PUNCH_THROUGH) ||
5371           (ispa->dwritedisable && ispa->dcmpmode == PVRX(TA_CMPMODE_ALWAYS))) {
5372          size_info.pds_tri_merge_disable = true;
5373       }
5374    }
5375 
5376    pvr_csb_pack (&mask, TA_STATE_PDS_SIZEINFO2, size_info) {
5377       size_info.pds_tri_merge_disable = true;
5378    }
5379 
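   /* Preserve all other SIZEINFO2 fields; only the pds_tri_merge_disable bit
    * is recomputed here.
    */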
5380    merge_word |= ppp_state->pds.size_info2 & ~mask;
5381 
5382    if (merge_word != ppp_state->pds.size_info2) {
5383       ppp_state->pds.size_info2 = merge_word;
5384       header->pres_pds_state_ptr0 = true;
5385    }
5386 }
5387 
5388 static void
5389 pvr_setup_fragment_state_pointers(struct pvr_cmd_buffer *const cmd_buffer,
5390                                   struct pvr_sub_cmd_gfx *const sub_cmd)
5391 {
5392    struct pvr_cmd_buffer_state *const state = &cmd_buffer->state;
5393 
5394    const struct pvr_fragment_shader_state *const fragment =
5395       &state->gfx_pipeline->shader_state.fragment;
5396    const struct pvr_stage_allocation_descriptor_state *descriptor_shader_state =
5397       &fragment->descriptor_state;
5398    const struct pvr_pipeline_stage_state *fragment_state =
5399       &fragment->stage_state;
5400    const struct pvr_pds_upload *pds_coeff_program =
5401       &fragment->pds_coeff_program;
5402 
5403    const struct pvr_physical_device *pdevice = cmd_buffer->device->pdevice;
5404    struct PVRX(TA_STATE_HEADER) *const header = &state->emit_header;
5405    struct pvr_ppp_state *const ppp_state = &state->ppp_state;
5406 
5407    const uint32_t pds_uniform_size =
5408       DIV_ROUND_UP(descriptor_shader_state->pds_info.data_size_in_dwords,
5409                    PVRX(TA_STATE_PDS_SIZEINFO1_PDS_UNIFORMSIZE_UNIT_SIZE));
5410 
5411    const uint32_t pds_varying_state_size =
5412       DIV_ROUND_UP(pds_coeff_program->data_size,
5413                    PVRX(TA_STATE_PDS_SIZEINFO1_PDS_VARYINGSIZE_UNIT_SIZE));
5414 
5415    const uint32_t usc_varying_size =
5416       DIV_ROUND_UP(fragment_state->coefficient_size,
5417                    PVRX(TA_STATE_PDS_SIZEINFO1_USC_VARYINGSIZE_UNIT_SIZE));
5418 
5419    const uint32_t pds_temp_size =
5420       DIV_ROUND_UP(fragment_state->pds_temps_count,
5421                    PVRX(TA_STATE_PDS_SIZEINFO1_PDS_TEMPSIZE_UNIT_SIZE));
5422 
5423    const uint32_t usc_shared_size =
5424       DIV_ROUND_UP(fragment_state->const_shared_reg_count,
5425                    PVRX(TA_STATE_PDS_SIZEINFO2_USC_SHAREDSIZE_UNIT_SIZE));
5426 
5427    const uint32_t max_tiles_in_flight =
5428       pvr_calc_fscommon_size_and_tiles_in_flight(
5429          &pdevice->dev_info,
5430          &pdevice->dev_runtime_info,
5431          usc_shared_size *
5432             PVRX(TA_STATE_PDS_SIZEINFO2_USC_SHAREDSIZE_UNIT_SIZE),
5433          1);
5434    uint32_t size_info_mask;
5435    uint32_t size_info2;
5436 
5437    if (max_tiles_in_flight < sub_cmd->max_tiles_in_flight)
5438       sub_cmd->max_tiles_in_flight = max_tiles_in_flight;
5439 
5440    pvr_csb_pack (&ppp_state->pds.pixel_shader_base,
5441                  TA_STATE_PDS_SHADERBASE,
5442                  shader_base) {
5443       const struct pvr_pds_upload *const pds_upload =
5444          &fragment->pds_fragment_program;
5445 
5446       shader_base.addr = PVR_DEV_ADDR(pds_upload->data_offset);
5447    }
5448 
5449    if (descriptor_shader_state->pds_code.pvr_bo) {
5450       pvr_csb_pack (&ppp_state->pds.texture_uniform_code_base,
5451                     TA_STATE_PDS_TEXUNICODEBASE,
5452                     tex_base) {
5453          tex_base.addr =
5454             PVR_DEV_ADDR(descriptor_shader_state->pds_code.code_offset);
5455       }
5456    } else {
5457       ppp_state->pds.texture_uniform_code_base = 0U;
5458    }
5459 
5460    pvr_csb_pack (&ppp_state->pds.size_info1, TA_STATE_PDS_SIZEINFO1, info1) {
5461       info1.pds_uniformsize = pds_uniform_size;
5462       info1.pds_texturestatesize = 0U;
5463       info1.pds_varyingsize = pds_varying_state_size;
5464       info1.usc_varyingsize = usc_varying_size;
5465       info1.pds_tempsize = pds_temp_size;
5466    }
5467 
5468    pvr_csb_pack (&size_info_mask, TA_STATE_PDS_SIZEINFO2, mask) {
5469       mask.pds_tri_merge_disable = true;
5470    }
5471 
5472    ppp_state->pds.size_info2 &= size_info_mask;
5473 
5474    pvr_csb_pack (&size_info2, TA_STATE_PDS_SIZEINFO2, info2) {
5475       info2.usc_sharedsize = usc_shared_size;
5476    }
5477 
5478    ppp_state->pds.size_info2 |= size_info2;
5479 
5480    if (pds_coeff_program->pvr_bo) {
5481       header->pres_pds_state_ptr1 = true;
5482 
5483       pvr_csb_pack (&ppp_state->pds.varying_base,
5484                     TA_STATE_PDS_VARYINGBASE,
5485                     base) {
5486          base.addr = PVR_DEV_ADDR(pds_coeff_program->data_offset);
5487       }
5488    } else {
5489       ppp_state->pds.varying_base = 0U;
5490    }
5491 
5492    pvr_csb_pack (&ppp_state->pds.uniform_state_data_base,
5493                  TA_STATE_PDS_UNIFORMDATABASE,
5494                  base) {
5495       base.addr = PVR_DEV_ADDR(state->pds_fragment_descriptor_data_offset);
5496    }
5497 
5498    header->pres_pds_state_ptr0 = true;
5499    header->pres_pds_state_ptr3 = true;
5500 }
5501 
5502 static void pvr_setup_viewport(struct pvr_cmd_buffer *const cmd_buffer)
5503 {
5504    struct pvr_cmd_buffer_state *const state = &cmd_buffer->state;
5505    struct PVRX(TA_STATE_HEADER) *const header = &state->emit_header;
5506    struct vk_dynamic_graphics_state *const dynamic_state =
5507       &cmd_buffer->vk.dynamic_graphics_state;
5508    struct pvr_ppp_state *const ppp_state = &state->ppp_state;
5509 
5510    if (ppp_state->viewport_count != dynamic_state->vp.viewport_count) {
5511       ppp_state->viewport_count = dynamic_state->vp.viewport_count;
5512       header->pres_viewport = true;
5513    }
5514 
5515    if (dynamic_state->rs.rasterizer_discard_enable) {
5516       /* Don't emit any viewport data as it would just get thrown away.
5517        * This check comes after the one above because we still want to
5518        * stash the viewport_count; it's our trigger for when rasterizer
5519        * discard gets disabled.
5520        */
5521       header->pres_viewport = false;
5522       return;
5523    }
5524 
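   /* The aN and mN values written below are the offset and scale terms of the
    * usual Vulkan viewport transform, i.e. screen = m * ndc + a for each of
    * x, y and z (z uses minDepth as the offset and maxDepth - minDepth as the
    * scale).
    */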
5525    for (uint32_t i = 0; i < ppp_state->viewport_count; i++) {
5526       VkViewport *viewport = &dynamic_state->vp.viewports[i];
5527       uint32_t x_scale = fui(viewport->width * 0.5f);
5528       uint32_t y_scale = fui(viewport->height * 0.5f);
5529       uint32_t z_scale = fui(viewport->maxDepth - viewport->minDepth);
5530       uint32_t x_center = fui(viewport->x + viewport->width * 0.5f);
5531       uint32_t y_center = fui(viewport->y + viewport->height * 0.5f);
5532       uint32_t z_center = fui(viewport->minDepth);
5533 
5534       if (ppp_state->viewports[i].a0 != x_center ||
5535           ppp_state->viewports[i].m0 != x_scale ||
5536           ppp_state->viewports[i].a1 != y_center ||
5537           ppp_state->viewports[i].m1 != y_scale ||
5538           ppp_state->viewports[i].a2 != z_center ||
5539           ppp_state->viewports[i].m2 != z_scale) {
5540          ppp_state->viewports[i].a0 = x_center;
5541          ppp_state->viewports[i].m0 = x_scale;
5542          ppp_state->viewports[i].a1 = y_center;
5543          ppp_state->viewports[i].m1 = y_scale;
5544          ppp_state->viewports[i].a2 = z_center;
5545          ppp_state->viewports[i].m2 = z_scale;
5546 
5547          header->pres_viewport = true;
5548       }
5549    }
5550 }
5551 
5552 static void pvr_setup_ppp_control(struct pvr_cmd_buffer *const cmd_buffer)
5553 {
5554    struct vk_dynamic_graphics_state *const dynamic_state =
5555       &cmd_buffer->vk.dynamic_graphics_state;
5556    const VkPrimitiveTopology topology = dynamic_state->ia.primitive_topology;
5557    struct pvr_cmd_buffer_state *const state = &cmd_buffer->state;
5558    struct PVRX(TA_STATE_HEADER) *const header = &state->emit_header;
5559    struct pvr_ppp_state *const ppp_state = &state->ppp_state;
5560    uint32_t ppp_control;
5561 
5562    pvr_csb_pack (&ppp_control, TA_STATE_PPP_CTRL, control) {
5563       control.drawclippededges = true;
5564       control.wclampen = true;
5565 
5566       if (topology == VK_PRIMITIVE_TOPOLOGY_TRIANGLE_FAN)
5567          control.flatshade_vtx = PVRX(TA_FLATSHADE_VTX_VERTEX_1);
5568       else
5569          control.flatshade_vtx = PVRX(TA_FLATSHADE_VTX_VERTEX_0);
5570 
5571       if (dynamic_state->rs.depth_clamp_enable)
5572          control.clip_mode = PVRX(TA_CLIP_MODE_NO_FRONT_OR_REAR);
5573       else
5574          control.clip_mode = PVRX(TA_CLIP_MODE_FRONT_REAR);
5575 
5576       /* +--- FrontIsCCW?
5577        * | +--- Cull Front?
5578        * v v
5579        * 0|0 CULLMODE_CULL_CCW,
5580        * 0|1 CULLMODE_CULL_CW,
5581        * 1|0 CULLMODE_CULL_CW,
5582        * 1|1 CULLMODE_CULL_CCW,
5583        */
5584       switch (dynamic_state->rs.cull_mode) {
5585       case VK_CULL_MODE_BACK_BIT:
5586       case VK_CULL_MODE_FRONT_BIT:
5587          if ((dynamic_state->rs.front_face == VK_FRONT_FACE_COUNTER_CLOCKWISE) ^
5588              (dynamic_state->rs.cull_mode == VK_CULL_MODE_FRONT_BIT)) {
5589             control.cullmode = PVRX(TA_CULLMODE_CULL_CW);
5590          } else {
5591             control.cullmode = PVRX(TA_CULLMODE_CULL_CCW);
5592          }
5593 
5594          break;
5595 
5596       case VK_CULL_MODE_FRONT_AND_BACK:
5597       case VK_CULL_MODE_NONE:
5598          control.cullmode = PVRX(TA_CULLMODE_NO_CULLING);
5599          break;
5600 
5601       default:
5602          unreachable("Unsupported cull mode!");
5603       }
5604    }
5605 
5606    if (ppp_control != ppp_state->ppp_control) {
5607       ppp_state->ppp_control = ppp_control;
5608       header->pres_ppp_ctrl = true;
5609    }
5610 }
5611 
5612 /* Largest valid PPP State update in words = 31
5613  * 1 - Header
5614  * 3 - Stream Out Config words 0, 1 and 2
5615  * 1 - PPP Control word
5616  * 3 - Varying Config words 0, 1 and 2
5617  * 1 - Output Select
5618  * 1 - WClamp
5619  * 6 - Viewport Transform words
5620  * 2 - Region Clip words
5621  * 3 - PDS State for fragment phase (PDSSTATEPTR 1-3)
5622  * 4 - PDS State for fragment phase (PDSSTATEPTR0)
5623  * 6 - ISP Control Words
5624  */
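/* 1 + 3 + 1 + 3 + 1 + 1 + 6 + 2 + 3 + 4 + 6 = 31. */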
5625 #define PVR_MAX_PPP_STATE_DWORDS 31
5626 
5627 static VkResult pvr_emit_ppp_state(struct pvr_cmd_buffer *const cmd_buffer,
5628                                    struct pvr_sub_cmd_gfx *const sub_cmd)
5629 {
5630    const bool deferred_secondary = pvr_cmd_uses_deferred_cs_cmds(cmd_buffer);
5631    struct pvr_cmd_buffer_state *const state = &cmd_buffer->state;
5632    struct PVRX(TA_STATE_HEADER) *const header = &state->emit_header;
5633    struct pvr_csb *const control_stream = &sub_cmd->control_stream;
5634    struct pvr_ppp_state *const ppp_state = &state->ppp_state;
5635    uint32_t ppp_state_words[PVR_MAX_PPP_STATE_DWORDS];
5636    const bool emit_dbsc = header->pres_ispctl_dbsc;
5637    uint32_t *buffer_ptr = ppp_state_words;
5638    uint32_t dbsc_patching_offset = 0;
5639    uint32_t ppp_state_words_count;
5640    struct pvr_suballoc_bo *pvr_bo;
5641    VkResult result;
5642 
5643 #if !defined(NDEBUG)
5644    struct PVRX(TA_STATE_HEADER) emit_mask = *header;
5645    uint32_t packed_emit_mask;
5646 
5647    static_assert(pvr_cmd_length(TA_STATE_HEADER) == 1,
5648                  "EMIT_MASK_IS_CLEAR assumes 1 dword sized header.");
5649 
5650 #   define EMIT_MASK_GET(field) (emit_mask.field)
5651 #   define EMIT_MASK_SET(field, value) (emit_mask.field = (value))
5652 #   define EMIT_MASK_IS_CLEAR                                        \
5653       (pvr_cmd_pack(TA_STATE_HEADER)(&packed_emit_mask, &emit_mask), \
5654        packed_emit_mask == 0)
5655 #else
5656 #   define EMIT_MASK_GET(field)
5657 #   define EMIT_MASK_SET(field, value)
5658 #endif
5659 
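   /* In debug builds emit_mask mirrors the header: each field is cleared once
    * its state words have been written below, so the final
    * assert(EMIT_MASK_IS_CLEAR) catches header bits that were set but never
    * emitted.
    */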
5660    header->view_port_count =
5661       (ppp_state->viewport_count == 0) ? 0U : (ppp_state->viewport_count - 1);
5662    header->pres_ispctl_fa = header->pres_ispctl;
5663 
5664    /* If deferred_secondary is true then we do a separate state update
5665     * which gets patched in vkCmdExecuteCommands().
5666     */
5667    header->pres_ispctl_dbsc &= !deferred_secondary;
5668 
5669    pvr_csb_write_struct(buffer_ptr, TA_STATE_HEADER, header);
5670 
5671    static_assert(pvr_cmd_length(TA_STATE_HEADER) == 1,
5672                  "Following header check assumes 1 dword sized header.");
5673    /* If the header is empty we exit early and prevent a bo alloc of 0 size. */
5674    if (ppp_state_words[0] == 0)
5675       return VK_SUCCESS;
5676 
5677    if (header->pres_ispctl) {
5678       pvr_csb_write_value(buffer_ptr, TA_STATE_ISPCTL, ppp_state->isp.control);
5679 
5680       assert(header->pres_ispctl_fa);
5681       /* This is not a mistake. FA, BA have the ISPA format, and FB, BB have the
5682        * ISPB format.
5683        */
5684       pvr_csb_write_value(buffer_ptr, TA_STATE_ISPA, ppp_state->isp.front_a);
5685       EMIT_MASK_SET(pres_ispctl_fa, false);
5686 
5687       if (header->pres_ispctl_fb) {
5688          pvr_csb_write_value(buffer_ptr, TA_STATE_ISPB, ppp_state->isp.front_b);
5689          EMIT_MASK_SET(pres_ispctl_fb, false);
5690       }
5691 
5692       if (header->pres_ispctl_ba) {
5693          pvr_csb_write_value(buffer_ptr, TA_STATE_ISPA, ppp_state->isp.back_a);
5694          EMIT_MASK_SET(pres_ispctl_ba, false);
5695       }
5696 
5697       if (header->pres_ispctl_bb) {
5698          pvr_csb_write_value(buffer_ptr, TA_STATE_ISPB, ppp_state->isp.back_b);
5699          EMIT_MASK_SET(pres_ispctl_bb, false);
5700       }
5701 
5702       EMIT_MASK_SET(pres_ispctl, false);
5703    }
5704 
5705    if (header->pres_ispctl_dbsc) {
5706       assert(!deferred_secondary);
5707 
5708       dbsc_patching_offset = buffer_ptr - ppp_state_words;
5709 
5710       pvr_csb_pack (buffer_ptr, TA_STATE_ISPDBSC, ispdbsc) {
5711          ispdbsc.dbindex = ppp_state->depthbias_scissor_indices.depthbias_index;
5712          ispdbsc.scindex = ppp_state->depthbias_scissor_indices.scissor_index;
5713       }
5714       buffer_ptr += pvr_cmd_length(TA_STATE_ISPDBSC);
5715 
5716       EMIT_MASK_SET(pres_ispctl_dbsc, false);
5717    }
5718 
5719    if (header->pres_pds_state_ptr0) {
5720       pvr_csb_write_value(buffer_ptr,
5721                           TA_STATE_PDS_SHADERBASE,
5722                           ppp_state->pds.pixel_shader_base);
5723 
5724       pvr_csb_write_value(buffer_ptr,
5725                           TA_STATE_PDS_TEXUNICODEBASE,
5726                           ppp_state->pds.texture_uniform_code_base);
5727 
5728       pvr_csb_write_value(buffer_ptr,
5729                           TA_STATE_PDS_SIZEINFO1,
5730                           ppp_state->pds.size_info1);
5731       pvr_csb_write_value(buffer_ptr,
5732                           TA_STATE_PDS_SIZEINFO2,
5733                           ppp_state->pds.size_info2);
5734 
5735       EMIT_MASK_SET(pres_pds_state_ptr0, false);
5736    }
5737 
5738    if (header->pres_pds_state_ptr1) {
5739       pvr_csb_write_value(buffer_ptr,
5740                           TA_STATE_PDS_VARYINGBASE,
5741                           ppp_state->pds.varying_base);
5742       EMIT_MASK_SET(pres_pds_state_ptr1, false);
5743    }
5744 
5745    /* We don't use the pds_state_ptr2 (texture state programs) control
5746     * word, but we don't need to zero it either: the hardware only runs
5747     * the texture state program when
5748     * ROGUE_TA_STATE_PDS_SIZEINFO1.pds_texturestatesize is non-zero.
5749     */
5750    assert(pvr_csb_unpack(&ppp_state->pds.size_info1, TA_STATE_PDS_SIZEINFO1)
5751              .pds_texturestatesize == 0);
5752 
5753    if (header->pres_pds_state_ptr3) {
5754       pvr_csb_write_value(buffer_ptr,
5755                           TA_STATE_PDS_UNIFORMDATABASE,
5756                           ppp_state->pds.uniform_state_data_base);
5757       EMIT_MASK_SET(pres_pds_state_ptr3, false);
5758    }
5759 
5760    if (header->pres_region_clip) {
5761       pvr_csb_write_value(buffer_ptr,
5762                           TA_REGION_CLIP0,
5763                           ppp_state->region_clipping.word0);
5764       pvr_csb_write_value(buffer_ptr,
5765                           TA_REGION_CLIP1,
5766                           ppp_state->region_clipping.word1);
5767 
5768       EMIT_MASK_SET(pres_region_clip, false);
5769    }
5770 
5771    if (header->pres_viewport) {
5772       const uint32_t viewports = MAX2(1, ppp_state->viewport_count);
5773       EMIT_MASK_SET(view_port_count, viewports);
5774 
5775       for (uint32_t i = 0; i < viewports; i++) {
5776          /* These don't have any definitions in the csbgen xml files and none
5777           * will be added.
5778           */
5779          *buffer_ptr++ = ppp_state->viewports[i].a0;
5780          *buffer_ptr++ = ppp_state->viewports[i].m0;
5781          *buffer_ptr++ = ppp_state->viewports[i].a1;
5782          *buffer_ptr++ = ppp_state->viewports[i].m1;
5783          *buffer_ptr++ = ppp_state->viewports[i].a2;
5784          *buffer_ptr++ = ppp_state->viewports[i].m2;
5785 
5786          EMIT_MASK_SET(view_port_count, EMIT_MASK_GET(view_port_count) - 1);
5787       }
5788 
5789       EMIT_MASK_SET(pres_viewport, false);
5790    }
5791 
5792    if (header->pres_wclamp) {
5793       pvr_csb_pack (buffer_ptr, TA_WCLAMP, wclamp) {
5794          wclamp.val = fui(0.00001f);
5795       }
5796       buffer_ptr += pvr_cmd_length(TA_WCLAMP);
5797       EMIT_MASK_SET(pres_wclamp, false);
5798    }
5799 
5800    if (header->pres_outselects) {
5801       pvr_csb_write_value(buffer_ptr, TA_OUTPUT_SEL, ppp_state->output_selects);
5802       EMIT_MASK_SET(pres_outselects, false);
5803    }
5804 
5805    if (header->pres_varying_word0) {
5806       pvr_csb_write_value(buffer_ptr,
5807                           TA_STATE_VARYING0,
5808                           ppp_state->varying_word[0]);
5809       EMIT_MASK_SET(pres_varying_word0, false);
5810    }
5811 
5812    if (header->pres_varying_word1) {
5813       pvr_csb_write_value(buffer_ptr,
5814                           TA_STATE_VARYING1,
5815                           ppp_state->varying_word[1]);
5816       EMIT_MASK_SET(pres_varying_word1, false);
5817    }
5818 
5819    /* We only emit this on the first draw of a render job to prevent us from
5820     * inheriting a non-zero value set elsewhere.
5821     */
5822    if (header->pres_varying_word2) {
5823       pvr_csb_write_value(buffer_ptr, TA_STATE_VARYING2, 0);
5824       EMIT_MASK_SET(pres_varying_word2, false);
5825    }
5826 
5827    if (header->pres_ppp_ctrl) {
5828       pvr_csb_write_value(buffer_ptr,
5829                           TA_STATE_PPP_CTRL,
5830                           ppp_state->ppp_control);
5831       EMIT_MASK_SET(pres_ppp_ctrl, false);
5832    }
5833 
5834    /* We only emit this on the first draw of a render job to prevent us from
5835     * inheriting a non-zero value set elsewhere.
5836     */
5837    if (header->pres_stream_out_size) {
5838       pvr_csb_write_value(buffer_ptr, TA_STATE_STREAM_OUT0, 0);
5839       EMIT_MASK_SET(pres_stream_out_size, false);
5840    }
5841 
5842    assert(EMIT_MASK_IS_CLEAR);
5843 
5844 #undef EMIT_MASK_GET
5845 #undef EMIT_MASK_SET
5846 #if !defined(NDEBUG)
5847 #   undef EMIT_MASK_IS_CLEAR
5848 #endif
5849 
5850    ppp_state_words_count = buffer_ptr - ppp_state_words;
5851    assert(ppp_state_words_count <= PVR_MAX_PPP_STATE_DWORDS);
5852 
5853    result = pvr_cmd_buffer_alloc_mem(cmd_buffer,
5854                                      cmd_buffer->device->heaps.general_heap,
5855                                      PVR_DW_TO_BYTES(ppp_state_words_count),
5856                                      &pvr_bo);
5857    if (result != VK_SUCCESS)
5858       return result;
5859 
5860    memcpy(pvr_bo_suballoc_get_map_addr(pvr_bo),
5861           ppp_state_words,
5862           PVR_DW_TO_BYTES(ppp_state_words_count));
5863 
5864    pvr_csb_set_relocation_mark(control_stream);
5865 
5866    /* Write the VDM state update into the VDM control stream. */
5867    pvr_csb_emit (control_stream, VDMCTRL_PPP_STATE0, state0) {
5868       state0.word_count = ppp_state_words_count;
5869       state0.addrmsb = pvr_bo->dev_addr;
5870    }
5871 
5872    pvr_csb_emit (control_stream, VDMCTRL_PPP_STATE1, state1) {
5873       state1.addrlsb = pvr_bo->dev_addr;
5874    }
5875 
5876    pvr_csb_clear_relocation_mark(control_stream);
5877 
5878    if (emit_dbsc && cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY) {
5879       struct pvr_deferred_cs_command cmd;
5880 
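      /* Two patching schemes: when the command buffer uses deferred control
       * stream commands (deferred_secondary) the ISPDBSC update was left out
       * of the block above, so we only reserve space for a separate VDM state
       * update here and let vkCmdExecuteCommands() fill it in (DBSC).
       * Otherwise the VDM words were already emitted and only the ISPDBSC
       * dword inside the PPP state buffer needs patching at the recorded
       * offset (DBSC2).
       */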
5881       if (deferred_secondary) {
5882          const uint32_t num_dwords = pvr_cmd_length(VDMCTRL_PPP_STATE0) +
5883                                      pvr_cmd_length(VDMCTRL_PPP_STATE1);
5884          uint32_t *vdm_state;
5885 
5886          pvr_csb_set_relocation_mark(control_stream);
5887 
5888          vdm_state = pvr_csb_alloc_dwords(control_stream, num_dwords);
5889          if (!vdm_state) {
5890             result = pvr_csb_get_status(control_stream);
5891             return pvr_cmd_buffer_set_error_unwarned(cmd_buffer, result);
5892          }
5893 
5894          pvr_csb_clear_relocation_mark(control_stream);
5895 
5896          cmd = (struct pvr_deferred_cs_command){
5897             .type = PVR_DEFERRED_CS_COMMAND_TYPE_DBSC,
5898             .dbsc = {
5899                .state = ppp_state->depthbias_scissor_indices,
5900                .vdm_state = vdm_state,
5901             },
5902          };
5903       } else {
5904          cmd = (struct pvr_deferred_cs_command){
5905             .type = PVR_DEFERRED_CS_COMMAND_TYPE_DBSC2,
5906             .dbsc2 = {
5907                .state = ppp_state->depthbias_scissor_indices,
5908                .ppp_cs_bo = pvr_bo,
5909                .patch_offset = dbsc_patching_offset,
5910             },
5911          };
5912       }
5913 
5914       util_dynarray_append(&cmd_buffer->deferred_csb_commands,
5915                            struct pvr_deferred_cs_command,
5916                            cmd);
5917    }
5918 
5919    state->emit_header = (struct PVRX(TA_STATE_HEADER)){ 0 };
5920 
5921    return VK_SUCCESS;
5922 }
5923 
5924 static inline bool
5925 pvr_ppp_state_update_required(const struct pvr_cmd_buffer *cmd_buffer)
5926 {
5927    const BITSET_WORD *const dynamic_dirty =
5928       cmd_buffer->vk.dynamic_graphics_state.dirty;
5929    const struct pvr_cmd_buffer_state *const state = &cmd_buffer->state;
5930    const struct PVRX(TA_STATE_HEADER) *const header = &state->emit_header;
5931 
5932    /* For push constants we only need to check whether they were updated
5933     * for the fragment stage, since only the PDS programs used in the
5934     * fragment stage are updated here.
5935     */
5936 
5937    return header->pres_ppp_ctrl || header->pres_ispctl ||
5938           header->pres_ispctl_fb || header->pres_ispctl_ba ||
5939           header->pres_ispctl_bb || header->pres_ispctl_dbsc ||
5940           header->pres_pds_state_ptr0 || header->pres_pds_state_ptr1 ||
5941           header->pres_pds_state_ptr2 || header->pres_pds_state_ptr3 ||
5942           header->pres_region_clip || header->pres_viewport ||
5943           header->pres_wclamp || header->pres_outselects ||
5944           header->pres_varying_word0 || header->pres_varying_word1 ||
5945           header->pres_varying_word2 || header->pres_stream_out_program ||
5946           state->dirty.fragment_descriptors || state->dirty.vis_test ||
5947           state->dirty.gfx_pipeline_binding || state->dirty.isp_userpass ||
5948           state->push_constants.dirty_stages & VK_SHADER_STAGE_FRAGMENT_BIT ||
5949           BITSET_TEST(dynamic_dirty, MESA_VK_DYNAMIC_DS_STENCIL_COMPARE_MASK) ||
5950           BITSET_TEST(dynamic_dirty, MESA_VK_DYNAMIC_DS_STENCIL_WRITE_MASK) ||
5951           BITSET_TEST(dynamic_dirty, MESA_VK_DYNAMIC_DS_STENCIL_REFERENCE) ||
5952           BITSET_TEST(dynamic_dirty, MESA_VK_DYNAMIC_RS_DEPTH_BIAS_ENABLE) ||
5953           BITSET_TEST(dynamic_dirty, MESA_VK_DYNAMIC_RS_DEPTH_BIAS_FACTORS) ||
5954           BITSET_TEST(dynamic_dirty, MESA_VK_DYNAMIC_RS_LINE_WIDTH) ||
5955           BITSET_TEST(dynamic_dirty, MESA_VK_DYNAMIC_VP_SCISSORS) ||
5956           BITSET_TEST(dynamic_dirty, MESA_VK_DYNAMIC_VP_SCISSOR_COUNT) ||
5957           BITSET_TEST(dynamic_dirty, MESA_VK_DYNAMIC_VP_VIEWPORTS) ||
5958           BITSET_TEST(dynamic_dirty, MESA_VK_DYNAMIC_VP_VIEWPORT_COUNT);
5959 }
5960 
5961 static VkResult
5962 pvr_emit_dirty_ppp_state(struct pvr_cmd_buffer *const cmd_buffer,
5963                          struct pvr_sub_cmd_gfx *const sub_cmd)
5964 {
5965    struct pvr_cmd_buffer_state *const state = &cmd_buffer->state;
5966    struct vk_dynamic_graphics_state *const dynamic_state =
5967       &cmd_buffer->vk.dynamic_graphics_state;
5968    VkResult result;
5969 
5970    /* TODO: The emit_header will be dirty only if
5971     * pvr_reset_graphics_dirty_state() was called before this (so when command
5972     * buffer begins recording or when it's reset). Otherwise it will have been
5973     * zeroed out by the previous pvr_emit_ppp_state(). We can probably set a
5974     * flag in there and check it here instead of checking the header.
5975     * Check if this is true and implement the flag.
5976     */
5977    if (!pvr_ppp_state_update_required(cmd_buffer))
5978       return VK_SUCCESS;
5979 
5980    if (state->dirty.gfx_pipeline_binding) {
5981       struct PVRX(TA_STATE_ISPA) ispa;
5982 
5983       pvr_setup_output_select(cmd_buffer);
5984       pvr_setup_isp_faces_and_control(cmd_buffer, &ispa);
5985       pvr_setup_triangle_merging_flag(cmd_buffer, &ispa);
5986    } else if (BITSET_TEST(dynamic_state->dirty,
5987                           MESA_VK_DYNAMIC_DS_STENCIL_COMPARE_MASK) ||
5988               BITSET_TEST(dynamic_state->dirty,
5989                           MESA_VK_DYNAMIC_DS_STENCIL_REFERENCE) ||
5990               BITSET_TEST(dynamic_state->dirty,
5991                           MESA_VK_DYNAMIC_DS_STENCIL_WRITE_MASK) ||
5992               BITSET_TEST(dynamic_state->dirty,
5993                           MESA_VK_DYNAMIC_RS_LINE_WIDTH) ||
5994               state->dirty.isp_userpass || state->dirty.vis_test) {
5995       pvr_setup_isp_faces_and_control(cmd_buffer, NULL);
5996    }
5997 
5998    if (!dynamic_state->rs.rasterizer_discard_enable &&
5999        state->dirty.fragment_descriptors &&
6000        state->gfx_pipeline->shader_state.fragment.bo) {
6001       pvr_setup_fragment_state_pointers(cmd_buffer, sub_cmd);
6002    }
6003 
6004    pvr_setup_isp_depth_bias_scissor_state(cmd_buffer);
6005 
6006    if (BITSET_TEST(dynamic_state->dirty, MESA_VK_DYNAMIC_VP_VIEWPORTS) ||
6007        BITSET_TEST(dynamic_state->dirty, MESA_VK_DYNAMIC_VP_VIEWPORT_COUNT))
6008       pvr_setup_viewport(cmd_buffer);
6009 
6010    pvr_setup_ppp_control(cmd_buffer);
6011 
6012    /* The hardware doesn't have an explicit mode for this so we use a
6013     * negative viewport to make sure all objects are culled out early.
6014     */
6015    if (dynamic_state->rs.cull_mode == VK_CULL_MODE_FRONT_AND_BACK) {
6016       /* Shift the viewport out of the guard-band, culling everything. */
6017       const uint32_t negative_vp_val = fui(-2.0f);
6018 
6019       state->ppp_state.viewports[0].a0 = negative_vp_val;
6020       state->ppp_state.viewports[0].m0 = 0;
6021       state->ppp_state.viewports[0].a1 = negative_vp_val;
6022       state->ppp_state.viewports[0].m1 = 0;
6023       state->ppp_state.viewports[0].a2 = negative_vp_val;
6024       state->ppp_state.viewports[0].m2 = 0;
6025 
6026       state->ppp_state.viewport_count = 1;
6027 
6028       state->emit_header.pres_viewport = true;
6029    }
6030 
6031    result = pvr_emit_ppp_state(cmd_buffer, sub_cmd);
6032    if (result != VK_SUCCESS)
6033       return result;
6034 
6035    return VK_SUCCESS;
6036 }
6037 
6038 void pvr_calculate_vertex_cam_size(const struct pvr_device_info *dev_info,
6039                                    const uint32_t vs_output_size,
6040                                    const bool raster_enable,
6041                                    uint32_t *const cam_size_out,
6042                                    uint32_t *const vs_max_instances_out)
6043 {
6044    /* First work out the size of a vertex in the UVS and multiply by 4 for
6045     * column ordering.
6046     */
6047    const uint32_t uvs_vertex_vector_size_in_dwords =
6048       (vs_output_size + 1U + raster_enable * 4U) * 4U;
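   /* Illustrative example: vs_output_size = 8 with rasterization enabled
    * gives (8 + 1 + 4) * 4 = 52 dwords per vertex vector.
    */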
6049    const uint32_t vdm_cam_size =
6050       PVR_GET_FEATURE_VALUE(dev_info, vdm_cam_size, 32U);
6051 
6052    /* This is a proxy for 8XE. */
6053    if (PVR_HAS_FEATURE(dev_info, simple_internal_parameter_format) &&
6054        vdm_cam_size < 96U) {
6055       /* Comparisons are based on size including scratch per vertex vector. */
6056       if (uvs_vertex_vector_size_in_dwords < (14U * 4U)) {
6057          *cam_size_out = MIN2(31U, vdm_cam_size - 1U);
6058          *vs_max_instances_out = 16U;
6059       } else if (uvs_vertex_vector_size_in_dwords < (20U * 4U)) {
6060          *cam_size_out = 15U;
6061          *vs_max_instances_out = 16U;
6062       } else if (uvs_vertex_vector_size_in_dwords < (28U * 4U)) {
6063          *cam_size_out = 11U;
6064          *vs_max_instances_out = 12U;
6065       } else if (uvs_vertex_vector_size_in_dwords < (44U * 4U)) {
6066          *cam_size_out = 7U;
6067          *vs_max_instances_out = 8U;
6068       } else if (PVR_HAS_FEATURE(dev_info,
6069                                  simple_internal_parameter_format_v2) ||
6070                  uvs_vertex_vector_size_in_dwords < (64U * 4U)) {
6071          *cam_size_out = 7U;
6072          *vs_max_instances_out = 4U;
6073       } else {
6074          *cam_size_out = 3U;
6075          *vs_max_instances_out = 2U;
6076       }
6077    } else {
6078       /* Comparisons are based on size including scratch per vertex vector. */
6079       if (uvs_vertex_vector_size_in_dwords <= (32U * 4U)) {
6080          /* output size <= 27 + 5 scratch. */
6081          *cam_size_out = MIN2(95U, vdm_cam_size - 1U);
6082          *vs_max_instances_out = 0U;
6083       } else if (uvs_vertex_vector_size_in_dwords <= 48U * 4U) {
6084          /* output size <= 43 + 5 scratch */
6085          *cam_size_out = 63U;
6086          if (PVR_GET_FEATURE_VALUE(dev_info, uvs_vtx_entries, 144U) < 288U)
6087             *vs_max_instances_out = 16U;
6088          else
6089             *vs_max_instances_out = 0U;
6090       } else if (uvs_vertex_vector_size_in_dwords <= 64U * 4U) {
6091          /* output size <= 59 + 5 scratch. */
6092          *cam_size_out = 31U;
6093          if (PVR_GET_FEATURE_VALUE(dev_info, uvs_vtx_entries, 144U) < 288U)
6094             *vs_max_instances_out = 16U;
6095          else
6096             *vs_max_instances_out = 0U;
6097       } else {
6098          *cam_size_out = 15U;
6099          *vs_max_instances_out = 16U;
6100       }
6101    }
6102 }
6103 
6104 static void pvr_emit_dirty_vdm_state(struct pvr_cmd_buffer *const cmd_buffer,
6105                                      struct pvr_sub_cmd_gfx *const sub_cmd)
6106 {
6107    /* FIXME: Assume all state is dirty for the moment. */
6108    struct pvr_device_info *const dev_info =
6109       &cmd_buffer->device->pdevice->dev_info;
6110    ASSERTED const uint32_t max_user_vertex_output_components =
6111       pvr_get_max_user_vertex_output_components(dev_info);
6112    struct PVRX(VDMCTRL_VDM_STATE0)
6113       header = { pvr_cmd_header(VDMCTRL_VDM_STATE0) };
6114    struct vk_dynamic_graphics_state *const dynamic_state =
6115       &cmd_buffer->vk.dynamic_graphics_state;
6116    const struct pvr_cmd_buffer_state *const state = &cmd_buffer->state;
6117    const struct pvr_vertex_shader_state *const vertex_shader_state =
6118       &state->gfx_pipeline->shader_state.vertex;
6119    struct pvr_csb *const csb = &sub_cmd->control_stream;
6120    uint32_t vs_output_size;
6121    uint32_t max_instances;
6122    uint32_t cam_size;
6123 
6124    /* CAM calculations and HW state take the vertex size aligned to dwords. */
6125    vs_output_size =
6126       DIV_ROUND_UP(vertex_shader_state->vertex_output_size,
6127                    PVRX(VDMCTRL_VDM_STATE4_VS_OUTPUT_SIZE_UNIT_SIZE));
6128 
6129    assert(vs_output_size <= max_user_vertex_output_components);
6130 
6131    pvr_calculate_vertex_cam_size(dev_info,
6132                                  vs_output_size,
6133                                  true,
6134                                  &cam_size,
6135                                  &max_instances);
6136 
6137    pvr_csb_set_relocation_mark(csb);
6138 
6139    pvr_csb_emit (csb, VDMCTRL_VDM_STATE0, state0) {
6140       state0.cam_size = cam_size;
6141 
6142       if (dynamic_state->ia.primitive_restart_enable) {
6143          state0.cut_index_enable = true;
6144          state0.cut_index_present = true;
6145       }
6146 
6147       switch (dynamic_state->ia.primitive_topology) {
6148       case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_FAN:
6149          state0.flatshade_control = PVRX(VDMCTRL_FLATSHADE_CONTROL_VERTEX_1);
6150          break;
6151 
6152       default:
6153          state0.flatshade_control = PVRX(VDMCTRL_FLATSHADE_CONTROL_VERTEX_0);
6154          break;
6155       }
6156 
6157       /* If we've bound a different vertex buffer, or this draw-call requires
6158        * a different PDS attrib data-section from the last draw call (changed
6159        * base_instance) then we need to specify a new data section. This is
6160        * also the case if we've switched pipeline or attrib program as the
6161        * data-section layout will be different.
6162        */
6163       state0.vs_data_addr_present =
6164          state->dirty.gfx_pipeline_binding || state->dirty.vertex_bindings ||
6165          state->dirty.draw_base_instance || state->dirty.draw_variant;
6166 
6167       /* Need to specify new PDS Attrib program if we've bound a different
6168        * pipeline or we needed a different PDS Attrib variant for this
6169        * draw-call.
6170        */
6171       state0.vs_other_present = state->dirty.gfx_pipeline_binding ||
6172                                 state->dirty.draw_variant;
6173 
6174       /* UVB_SCRATCH_SELECT_ONE with no rasterization is only valid when
6175        * stream output is enabled. We use UVB_SCRATCH_SELECT_FIVE because
6176        * Vulkan doesn't support stream output and the vertex position is
6177        * always emitted to the UVB.
6178        */
6179       state0.uvs_scratch_size_select =
6180          PVRX(VDMCTRL_UVS_SCRATCH_SIZE_SELECT_FIVE);
6181 
6182       header = state0;
6183    }
6184 
6185    if (header.cut_index_present) {
6186       pvr_csb_emit (csb, VDMCTRL_VDM_STATE1, state1) {
6187          switch (state->index_buffer_binding.type) {
6188          case VK_INDEX_TYPE_UINT32:
6189             /* FIXME: Defines for these? These values seem to come from
6190              * the Vulkan spec for VkPipelineInputAssemblyStateCreateInfo
6191              * primitiveRestartEnable.
6192              */
6193             state1.cut_index = 0xFFFFFFFF;
6194             break;
6195 
6196          case VK_INDEX_TYPE_UINT16:
6197             state1.cut_index = 0xFFFF;
6198             break;
6199 
6200          default:
6201             unreachable("Invalid index type");
6202          }
6203       }
6204    }
6205 
6206    if (header.vs_data_addr_present) {
6207       pvr_csb_emit (csb, VDMCTRL_VDM_STATE2, state2) {
6208          state2.vs_pds_data_base_addr =
6209             PVR_DEV_ADDR(state->pds_vertex_attrib_offset);
6210       }
6211    }
6212 
6213    if (header.vs_other_present) {
6214       const uint32_t usc_unified_store_size_in_bytes =
6215          vertex_shader_state->vertex_input_size << 2;
6216 
6217       pvr_csb_emit (csb, VDMCTRL_VDM_STATE3, state3) {
6218          state3.vs_pds_code_base_addr =
6219             PVR_DEV_ADDR(state->pds_shader.code_offset);
6220       }
6221 
6222       pvr_csb_emit (csb, VDMCTRL_VDM_STATE4, state4) {
6223          state4.vs_output_size = vs_output_size;
6224       }
6225 
6226       pvr_csb_emit (csb, VDMCTRL_VDM_STATE5, state5) {
6227          state5.vs_max_instances = max_instances;
6228          state5.vs_usc_common_size = 0U;
6229          state5.vs_usc_unified_size = DIV_ROUND_UP(
6230             usc_unified_store_size_in_bytes,
6231             PVRX(VDMCTRL_VDM_STATE5_VS_USC_UNIFIED_SIZE_UNIT_SIZE));
6232          state5.vs_pds_temp_size =
6233             DIV_ROUND_UP(state->pds_shader.info->temps_required << 2,
6234                          PVRX(VDMCTRL_VDM_STATE5_VS_PDS_TEMP_SIZE_UNIT_SIZE));
6235          state5.vs_pds_data_size = DIV_ROUND_UP(
6236             PVR_DW_TO_BYTES(state->pds_shader.info->data_size_in_dwords),
6237             PVRX(VDMCTRL_VDM_STATE5_VS_PDS_DATA_SIZE_UNIT_SIZE));
6238       }
6239    }
6240 
6241    pvr_csb_clear_relocation_mark(csb);
6242 }
6243 
6244 static VkResult pvr_validate_draw_state(struct pvr_cmd_buffer *cmd_buffer)
6245 {
6246    struct pvr_cmd_buffer_state *const state = &cmd_buffer->state;
6247    struct vk_dynamic_graphics_state *const dynamic_state =
6248       &cmd_buffer->vk.dynamic_graphics_state;
6249    const struct pvr_graphics_pipeline *const gfx_pipeline = state->gfx_pipeline;
6250    const struct pvr_pipeline_stage_state *const fragment_state =
6251       &gfx_pipeline->shader_state.fragment.stage_state;
6252    const struct pvr_pipeline_stage_state *const vertex_state =
6253       &gfx_pipeline->shader_state.vertex.stage_state;
6254    const struct pvr_pipeline_layout *const pipeline_layout =
6255       gfx_pipeline->base.layout;
6256    struct pvr_sub_cmd_gfx *sub_cmd;
6257    bool fstencil_writemask_zero;
6258    bool bstencil_writemask_zero;
6259    bool fstencil_keep;
6260    bool bstencil_keep;
6261    VkResult result;
6262 
6263    pvr_cmd_buffer_start_sub_cmd(cmd_buffer, PVR_SUB_CMD_TYPE_GRAPHICS);
6264 
6265    sub_cmd = &state->current_sub_cmd->gfx;
6266    sub_cmd->empty_cmd = false;
6267 
6268    /* Determine pipeline depth/stencil usage. If a pipeline uses depth or
6269     * stencil testing, those attachments are using their loaded values, and
6270     * the loadOps cannot be optimized out.
6271     */
6272    /* Pipeline uses depth testing. */
6273    if (sub_cmd->depth_usage == PVR_DEPTH_STENCIL_USAGE_UNDEFINED &&
6274        dynamic_state->ds.depth.compare_op != VK_COMPARE_OP_ALWAYS) {
6275       sub_cmd->depth_usage = PVR_DEPTH_STENCIL_USAGE_NEEDED;
6276    }
6277 
6278    /* Pipeline uses stencil testing. */
6279    if (sub_cmd->stencil_usage == PVR_DEPTH_STENCIL_USAGE_UNDEFINED &&
6280        (dynamic_state->ds.stencil.front.op.compare != VK_COMPARE_OP_ALWAYS ||
6281         dynamic_state->ds.stencil.back.op.compare != VK_COMPARE_OP_ALWAYS)) {
6282       sub_cmd->stencil_usage = PVR_DEPTH_STENCIL_USAGE_NEEDED;
6283    }
6284 
6285    if (PVR_HAS_FEATURE(&cmd_buffer->device->pdevice->dev_info,
6286                        compute_overlap)) {
6287       uint32_t coefficient_size =
6288          DIV_ROUND_UP(fragment_state->coefficient_size,
6289                       PVRX(TA_STATE_PDS_SIZEINFO1_USC_VARYINGSIZE_UNIT_SIZE));
6290 
6291       if (coefficient_size >
6292           PVRX(TA_STATE_PDS_SIZEINFO1_USC_VARYINGSIZE_MAX_SIZE))
6293          sub_cmd->disable_compute_overlap = true;
6294    }
6295 
6296    sub_cmd->frag_uses_atomic_ops |= fragment_state->uses_atomic_ops;
6297    sub_cmd->frag_has_side_effects |= fragment_state->has_side_effects;
6298    sub_cmd->frag_uses_texture_rw |= fragment_state->uses_texture_rw;
6299    sub_cmd->vertex_uses_texture_rw |= vertex_state->uses_texture_rw;
6300 
6301    sub_cmd->job.get_vis_results = state->vis_test_enabled;
6302 
6303    fstencil_keep =
6304       (dynamic_state->ds.stencil.front.op.fail == VK_STENCIL_OP_KEEP) &&
6305       (dynamic_state->ds.stencil.front.op.pass == VK_STENCIL_OP_KEEP);
6306    bstencil_keep =
6307       (dynamic_state->ds.stencil.back.op.fail == VK_STENCIL_OP_KEEP) &&
6308       (dynamic_state->ds.stencil.back.op.pass == VK_STENCIL_OP_KEEP);
6309    fstencil_writemask_zero = (dynamic_state->ds.stencil.front.write_mask == 0);
6310    bstencil_writemask_zero = (dynamic_state->ds.stencil.back.write_mask == 0);
6311 
6312    /* Set the stencil modified flag if:
6313     * - the front and back-facing fail_op/pass_op are not both KEEP, and
6314     * - the front and back-facing write_masks are not both zero.
6315     */
6316    if (!(fstencil_keep && bstencil_keep) &&
6317        !(fstencil_writemask_zero && bstencil_writemask_zero)) {
6318       sub_cmd->modifies_stencil = true;
6319    }
6320 
6321    /* Set depth modified flag if depth write is enabled. */
6322    if (dynamic_state->ds.depth.write_enable)
6323       sub_cmd->modifies_depth = true;
6324 
6325    /* If either the data or code changes for pds vertex attribs, regenerate the
6326     * data segment.
6327     */
6328    if (state->dirty.vertex_bindings || state->dirty.gfx_pipeline_binding ||
6329        state->dirty.draw_variant || state->dirty.draw_base_instance) {
6330       enum pvr_pds_vertex_attrib_program_type prog_type;
6331       const struct pvr_pds_attrib_program *program;
6332 
6333       if (state->draw_state.draw_indirect)
6334          prog_type = PVR_PDS_VERTEX_ATTRIB_PROGRAM_DRAW_INDIRECT;
6335       else if (state->draw_state.base_instance)
6336          prog_type = PVR_PDS_VERTEX_ATTRIB_PROGRAM_BASE_INSTANCE;
6337       else
6338          prog_type = PVR_PDS_VERTEX_ATTRIB_PROGRAM_BASIC;
6339 
6340       program =
6341          &gfx_pipeline->shader_state.vertex.pds_attrib_programs[prog_type];
6342       state->pds_shader.info = &program->info;
6343       state->pds_shader.code_offset = program->program.code_offset;
6344 
6345       state->max_shared_regs =
6346          MAX2(state->max_shared_regs, pvr_calc_shared_regs_count(gfx_pipeline));
6347 
6348       pvr_setup_vertex_buffers(cmd_buffer, gfx_pipeline);
6349    }
6350 
6351    if (state->push_constants.dirty_stages & VK_SHADER_STAGE_ALL_GRAPHICS) {
6352       result = pvr_cmd_upload_push_consts(cmd_buffer);
6353       if (result != VK_SUCCESS)
6354          return result;
6355    }
6356 
6357    state->dirty.vertex_descriptors = state->dirty.gfx_pipeline_binding;
6358    state->dirty.fragment_descriptors = state->dirty.vertex_descriptors;
6359 
6360    /* Account for dirty descriptor set. */
6361    state->dirty.vertex_descriptors |=
6362       state->dirty.gfx_desc_dirty &&
6363       pipeline_layout
6364          ->per_stage_descriptor_masks[PVR_STAGE_ALLOCATION_VERTEX_GEOMETRY];
6365    state->dirty.fragment_descriptors |=
6366       state->dirty.gfx_desc_dirty &&
6367       pipeline_layout->per_stage_descriptor_masks[PVR_STAGE_ALLOCATION_FRAGMENT];
6368 
6369    if (BITSET_TEST(dynamic_state->dirty, MESA_VK_DYNAMIC_CB_BLEND_CONSTANTS))
6370       state->dirty.fragment_descriptors = true;
6371 
6372    state->dirty.vertex_descriptors |=
6373       state->push_constants.dirty_stages &
6374       (VK_SHADER_STAGE_ALL_GRAPHICS & ~VK_SHADER_STAGE_FRAGMENT_BIT);
6375    state->dirty.fragment_descriptors |= state->push_constants.dirty_stages &
6376                                         VK_SHADER_STAGE_FRAGMENT_BIT;
6377 
6378    if (state->dirty.fragment_descriptors) {
6379       result = pvr_setup_descriptor_mappings(
6380          cmd_buffer,
6381          PVR_STAGE_ALLOCATION_FRAGMENT,
6382          &state->gfx_pipeline->shader_state.fragment.descriptor_state,
6383          NULL,
6384          &state->pds_fragment_descriptor_data_offset);
6385       if (result != VK_SUCCESS) {
6386          mesa_loge("Could not setup fragment descriptor mappings.");
6387          return result;
6388       }
6389    }
6390 
6391    if (state->dirty.vertex_descriptors) {
6392       uint32_t pds_vertex_descriptor_data_offset;
6393 
6394       result = pvr_setup_descriptor_mappings(
6395          cmd_buffer,
6396          PVR_STAGE_ALLOCATION_VERTEX_GEOMETRY,
6397          &state->gfx_pipeline->shader_state.vertex.descriptor_state,
6398          NULL,
6399          &pds_vertex_descriptor_data_offset);
6400       if (result != VK_SUCCESS) {
6401          mesa_loge("Could not setup vertex descriptor mappings.");
6402          return result;
6403       }
6404 
6405       pvr_emit_dirty_pds_state(cmd_buffer,
6406                                sub_cmd,
6407                                pds_vertex_descriptor_data_offset);
6408    }
6409 
6410    pvr_emit_dirty_ppp_state(cmd_buffer, sub_cmd);
6411    pvr_emit_dirty_vdm_state(cmd_buffer, sub_cmd);
6412 
6413    vk_dynamic_graphics_state_clear_dirty(dynamic_state);
6414    state->dirty.gfx_desc_dirty = false;
6415    state->dirty.draw_base_instance = false;
6416    state->dirty.draw_variant = false;
6417    state->dirty.fragment_descriptors = false;
6418    state->dirty.gfx_pipeline_binding = false;
6419    state->dirty.isp_userpass = false;
6420    state->dirty.vertex_bindings = false;
6421    state->dirty.vis_test = false;
6422 
6423    state->push_constants.dirty_stages &= ~VK_SHADER_STAGE_ALL_GRAPHICS;
6424 
6425    return VK_SUCCESS;
6426 }
6427 
6428 static uint32_t pvr_get_hw_primitive_topology(VkPrimitiveTopology topology)
6429 {
6430    switch (topology) {
6431    case VK_PRIMITIVE_TOPOLOGY_POINT_LIST:
6432       return PVRX(VDMCTRL_PRIMITIVE_TOPOLOGY_POINT_LIST);
6433    case VK_PRIMITIVE_TOPOLOGY_LINE_LIST:
6434       return PVRX(VDMCTRL_PRIMITIVE_TOPOLOGY_LINE_LIST);
6435    case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP:
6436       return PVRX(VDMCTRL_PRIMITIVE_TOPOLOGY_LINE_STRIP);
6437    case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST:
6438       return PVRX(VDMCTRL_PRIMITIVE_TOPOLOGY_TRI_LIST);
6439    case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP:
6440       return PVRX(VDMCTRL_PRIMITIVE_TOPOLOGY_TRI_STRIP);
6441    case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_FAN:
6442       return PVRX(VDMCTRL_PRIMITIVE_TOPOLOGY_TRI_FAN);
6443    case VK_PRIMITIVE_TOPOLOGY_LINE_LIST_WITH_ADJACENCY:
6444       return PVRX(VDMCTRL_PRIMITIVE_TOPOLOGY_LINE_LIST_ADJ);
6445    case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP_WITH_ADJACENCY:
6446       return PVRX(VDMCTRL_PRIMITIVE_TOPOLOGY_LINE_STRIP_ADJ);
6447    case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST_WITH_ADJACENCY:
6448       return PVRX(VDMCTRL_PRIMITIVE_TOPOLOGY_TRI_LIST_ADJ);
6449    case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP_WITH_ADJACENCY:
6450       return PVRX(VDMCTRL_PRIMITIVE_TOPOLOGY_TRI_STRIP_ADJ);
6451    case VK_PRIMITIVE_TOPOLOGY_PATCH_LIST:
6452       return PVRX(VDMCTRL_PRIMITIVE_TOPOLOGY_PATCH_LIST);
6453    default:
6454       unreachable("Undefined primitive topology");
6455    }
6456 }
6457 
6458 /* TODO: Rewrite this in terms of ALIGN_POT() and pvr_cmd_length(). */
6459 /* Aligned to 128 bits for PDS loads / stores. */
6460 #define DUMMY_VDM_CONTROL_STREAM_BLOCK_SIZE 8
6461 
6462 static VkResult
6463 pvr_write_draw_indirect_vdm_stream(struct pvr_cmd_buffer *cmd_buffer,
6464                                    struct pvr_csb *const csb,
6465                                    pvr_dev_addr_t idx_buffer_addr,
6466                                    uint32_t idx_stride,
6467                                    struct PVRX(VDMCTRL_INDEX_LIST0) * list_hdr,
6468                                    struct pvr_buffer *buffer,
6469                                    VkDeviceSize offset,
6470                                    uint32_t count,
6471                                    uint32_t stride)
6472 {
6473    struct pvr_pds_drawindirect_program pds_prog = { 0 };
6474    uint32_t word0;
6475 
6476    /* Draw indirect always has index offset and instance count. */
6477    list_hdr->index_offset_present = true;
6478    list_hdr->index_instance_count_present = true;
6479 
6480    pvr_cmd_pack(VDMCTRL_INDEX_LIST0)(&word0, list_hdr);
6481 
6482    pds_prog.support_base_instance = true;
6483    pds_prog.arg_buffer = buffer->dev_addr.addr + offset;
6484    pds_prog.index_buffer = idx_buffer_addr.addr;
6485    pds_prog.index_block_header = word0;
6486    pds_prog.index_stride = idx_stride;
6487    pds_prog.num_views = 1U;
6488 
6489    /* TODO: See if we can pre-upload the code section of all the pds programs
6490     * and reuse them here.
6491     */
6492    /* Generate and upload the PDS programs (code + data). */
6493    for (uint32_t i = 0U; i < count; i++) {
6494       const struct pvr_device_info *dev_info =
6495          &cmd_buffer->device->pdevice->dev_info;
6496       struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
6497       struct pvr_suballoc_bo *dummy_bo;
6498       struct pvr_suballoc_bo *pds_bo;
6499       uint32_t *dummy_stream;
6500       uint32_t *pds_base;
6501       uint32_t pds_size;
6502       VkResult result;
6503 
6504       /* TODO: Move this outside the loop and allocate all of them in one go? */
6505       result = pvr_cmd_buffer_alloc_mem(cmd_buffer,
6506                                         cmd_buffer->device->heaps.general_heap,
6507                                         DUMMY_VDM_CONTROL_STREAM_BLOCK_SIZE,
6508                                         &dummy_bo);
6509       if (result != VK_SUCCESS)
6510          return result;
6511 
6512       pds_prog.increment_draw_id = (i != 0);
6513       pds_prog.index_list_addr_buffer = dummy_bo->dev_addr.addr;
6514 
6515       if (state->draw_state.draw_indexed) {
6516          pvr_pds_generate_draw_elements_indirect(&pds_prog,
6517                                                  0,
6518                                                  PDS_GENERATE_SIZES,
6519                                                  dev_info);
6520       } else {
6521          pvr_pds_generate_draw_arrays_indirect(&pds_prog,
6522                                                0,
6523                                                PDS_GENERATE_SIZES,
6524                                                dev_info);
6525       }
6526 
6527       pds_size = PVR_DW_TO_BYTES(pds_prog.program.data_size_aligned +
6528                                  pds_prog.program.code_size_aligned);
6529 
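      /* The code and data segments share a single suballocation: the code is
       * copied to the start and the data segment is generated in place right
       * after it (hence the code_size_aligned offsets below).
       */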
6530       result = pvr_cmd_buffer_alloc_mem(cmd_buffer,
6531                                         cmd_buffer->device->heaps.pds_heap,
6532                                         pds_size,
6533                                         &pds_bo);
6534       if (result != VK_SUCCESS)
6535          return result;
6536 
6537       pds_base = pvr_bo_suballoc_get_map_addr(pds_bo);
6538       memcpy(pds_base,
6539              pds_prog.program.code,
6540              PVR_DW_TO_BYTES(pds_prog.program.code_size_aligned));
6541 
6542       if (state->draw_state.draw_indexed) {
6543          pvr_pds_generate_draw_elements_indirect(
6544             &pds_prog,
6545             pds_base + pds_prog.program.code_size_aligned,
6546             PDS_GENERATE_DATA_SEGMENT,
6547             dev_info);
6548       } else {
6549          pvr_pds_generate_draw_arrays_indirect(
6550             &pds_prog,
6551             pds_base + pds_prog.program.code_size_aligned,
6552             PDS_GENERATE_DATA_SEGMENT,
6553             dev_info);
6554       }
6555 
6556       pvr_csb_set_relocation_mark(csb);
6557 
6558       pvr_csb_emit (csb, VDMCTRL_PDS_STATE0, state0) {
6559          state0.usc_target = PVRX(VDMCTRL_USC_TARGET_ANY);
6560 
6561          state0.pds_temp_size =
6562             DIV_ROUND_UP(PVR_DW_TO_BYTES(pds_prog.program.temp_size_aligned),
6563                          PVRX(VDMCTRL_PDS_STATE0_PDS_TEMP_SIZE_UNIT_SIZE));
6564 
6565          state0.pds_data_size =
6566             DIV_ROUND_UP(PVR_DW_TO_BYTES(pds_prog.program.data_size_aligned),
6567                          PVRX(VDMCTRL_PDS_STATE0_PDS_DATA_SIZE_UNIT_SIZE));
6568       }
6569 
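      /* The pds_data_addr/pds_code_addr fields are programmed relative to the
       * PDS heap base, hence the subtraction of pds_heap->base_addr below.
       */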
6570       pvr_csb_emit (csb, VDMCTRL_PDS_STATE1, state1) {
6571          const uint32_t data_offset =
6572             pds_bo->dev_addr.addr +
6573             PVR_DW_TO_BYTES(pds_prog.program.code_size_aligned) -
6574             cmd_buffer->device->heaps.pds_heap->base_addr.addr;
6575 
6576          state1.pds_data_addr = PVR_DEV_ADDR(data_offset);
6577          state1.sd_type = PVRX(VDMCTRL_SD_TYPE_PDS);
6578          state1.sd_next_type = PVRX(VDMCTRL_SD_TYPE_NONE);
6579       }
6580 
6581       pvr_csb_emit (csb, VDMCTRL_PDS_STATE2, state2) {
6582          const uint32_t code_offset =
6583             pds_bo->dev_addr.addr -
6584             cmd_buffer->device->heaps.pds_heap->base_addr.addr;
6585 
6586          state2.pds_code_addr = PVR_DEV_ADDR(code_offset);
6587       }
6588 
6589       pvr_csb_clear_relocation_mark(csb);
6590 
6591       /* We don't really need to set the relocation mark since the following
6592        * state update is just one emit but let's be nice and use it.
6593        */
6594       pvr_csb_set_relocation_mark(csb);
6595 
6596       /* Sync task to ensure the VDM doesn't start reading the dummy blocks
6597        * before they are ready.
6598        */
6599       pvr_csb_emit (csb, VDMCTRL_INDEX_LIST0, list0) {
6600          list0.primitive_topology = PVRX(VDMCTRL_PRIMITIVE_TOPOLOGY_TRI_LIST);
6601       }
6602 
6603       pvr_csb_clear_relocation_mark(csb);
6604 
6605       dummy_stream = pvr_bo_suballoc_get_map_addr(dummy_bo);
6606 
6607       /* For non-indexed draw cmds fill in the dummy's header here (it won't
6608        * change based on the indirect args) and advance by the in-use size of
6609        * each dummy block.
6610        */
6611       if (!state->draw_state.draw_indexed) {
6612          dummy_stream[0] = word0;
6613          dummy_stream += 4;
6614       } else {
6615          dummy_stream += 5;
6616       }
6617 
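      /* Terminate the scratch block with a STREAM_RETURN so the VDM jumps back
       * to the main control stream once it has consumed the patched words.
       */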
6618       /* clang-format off */
6619       pvr_csb_pack (dummy_stream, VDMCTRL_STREAM_RETURN, word);
6620       /* clang-format on */
6621 
6622       pvr_csb_set_relocation_mark(csb);
6623 
6624       /* Stream link to the first dummy which forces the VDM to discard any
6625        * prefetched (dummy) control stream.
6626        */
6627       pvr_csb_emit (csb, VDMCTRL_STREAM_LINK0, link) {
6628          link.with_return = true;
6629          link.link_addrmsb = dummy_bo->dev_addr;
6630       }
6631 
6632       pvr_csb_emit (csb, VDMCTRL_STREAM_LINK1, link) {
6633          link.link_addrlsb = dummy_bo->dev_addr;
6634       }
6635 
6636       pvr_csb_clear_relocation_mark(csb);
6637 
6638       /* Advance the PDS program to the next set of indirect arguments; a
6639        * fresh VDM dummy buffer is allocated at the top of the next iteration.
6640        */
6641       pds_prog.arg_buffer += stride;
6642    }
6643 
6644    return VK_SUCCESS;
6645 }
6646 
6647 #undef DUMMY_VDM_CONTROL_STREAM_BLOCK_SIZE
6648 
6649 static void pvr_emit_vdm_index_list(struct pvr_cmd_buffer *cmd_buffer,
6650                                     struct pvr_sub_cmd_gfx *const sub_cmd,
6651                                     VkPrimitiveTopology topology,
6652                                     uint32_t index_offset,
6653                                     uint32_t first_index,
6654                                     uint32_t index_count,
6655                                     uint32_t instance_count,
6656                                     struct pvr_buffer *buffer,
6657                                     VkDeviceSize offset,
6658                                     uint32_t count,
6659                                     uint32_t stride)
6660 {
6661    struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
6662    const bool vertex_shader_has_side_effects =
6663       state->gfx_pipeline->shader_state.vertex.stage_state.has_side_effects;
6664    struct PVRX(VDMCTRL_INDEX_LIST0)
6665       list_hdr = { pvr_cmd_header(VDMCTRL_INDEX_LIST0) };
6666    pvr_dev_addr_t index_buffer_addr = PVR_DEV_ADDR_INVALID;
6667    struct pvr_csb *const csb = &sub_cmd->control_stream;
6668    unsigned int index_stride = 0;
6669 
6670    list_hdr.primitive_topology = pvr_get_hw_primitive_topology(topology);
6671 
6672    /* firstInstance is not handled here in the VDM state, it's implemented as
6673     * an addition in the PDS vertex fetch using
6674     * PVR_PDS_CONST_MAP_ENTRY_TYPE_BASE_INSTANCE entry type.
6675     */
6676 
6677    list_hdr.index_count_present = true;
6678 
6679    if (instance_count > 1)
6680       list_hdr.index_instance_count_present = true;
6681 
6682    if (index_offset)
6683       list_hdr.index_offset_present = true;
6684 
6685    if (state->draw_state.draw_indexed) {
6686       switch (state->index_buffer_binding.type) {
6687       case VK_INDEX_TYPE_UINT32:
6688          list_hdr.index_size = PVRX(VDMCTRL_INDEX_SIZE_B32);
6689          index_stride = 4;
6690          break;
6691 
6692       case VK_INDEX_TYPE_UINT16:
6693          list_hdr.index_size = PVRX(VDMCTRL_INDEX_SIZE_B16);
6694          index_stride = 2;
6695          break;
6696 
6697       default:
6698          unreachable("Invalid index type");
6699       }
6700 
6701       index_buffer_addr = PVR_DEV_ADDR_OFFSET(
6702          state->index_buffer_binding.buffer->dev_addr,
6703          state->index_buffer_binding.offset + first_index * index_stride);
6704 
6705       list_hdr.index_addr_present = true;
6706       list_hdr.index_base_addrmsb = index_buffer_addr;
6707    }
6708 
6709    list_hdr.degen_cull_enable =
6710       PVR_HAS_FEATURE(&cmd_buffer->device->pdevice->dev_info,
6711                       vdm_degenerate_culling) &&
6712       !vertex_shader_has_side_effects;
6713 
6714    if (state->draw_state.draw_indirect) {
6715       assert(buffer);
6716       pvr_write_draw_indirect_vdm_stream(cmd_buffer,
6717                                          csb,
6718                                          index_buffer_addr,
6719                                          index_stride,
6720                                          &list_hdr,
6721                                          buffer,
6722                                          offset,
6723                                          count,
6724                                          stride);
6725       return;
6726    }
6727 
6728    pvr_csb_set_relocation_mark(csb);
6729 
6730    pvr_csb_emit (csb, VDMCTRL_INDEX_LIST0, list0) {
6731       list0 = list_hdr;
6732    }
6733 
6734    if (list_hdr.index_addr_present) {
6735       pvr_csb_emit (csb, VDMCTRL_INDEX_LIST1, list1) {
6736          list1.index_base_addrlsb = index_buffer_addr;
6737       }
6738    }
6739 
6740    if (list_hdr.index_count_present) {
6741       pvr_csb_emit (csb, VDMCTRL_INDEX_LIST2, list2) {
6742          list2.index_count = index_count;
6743       }
6744    }
6745 
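   /* Note: the instance count is encoded minus one; when the word is omitted
    * (instance_count <= 1, see the check above) a single instance is
    * presumably implied.
    */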
6746    if (list_hdr.index_instance_count_present) {
6747       pvr_csb_emit (csb, VDMCTRL_INDEX_LIST3, list3) {
6748          list3.instance_count = instance_count - 1;
6749       }
6750    }
6751 
6752    if (list_hdr.index_offset_present) {
6753       pvr_csb_emit (csb, VDMCTRL_INDEX_LIST4, list4) {
6754          list4.index_offset = index_offset;
6755       }
6756    }
6757 
6758    pvr_csb_clear_relocation_mark(csb);
6759 }
6760 
6761 void pvr_CmdDraw(VkCommandBuffer commandBuffer,
6762                  uint32_t vertexCount,
6763                  uint32_t instanceCount,
6764                  uint32_t firstVertex,
6765                  uint32_t firstInstance)
6766 {
6767    const struct pvr_cmd_buffer_draw_state draw_state = {
6768       .base_vertex = firstVertex,
6769       .base_instance = firstInstance,
6770    };
6771 
6772    PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
6773    struct vk_dynamic_graphics_state *const dynamic_state =
6774       &cmd_buffer->vk.dynamic_graphics_state;
6775    struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
6776    VkResult result;
6777 
6778    PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer);
6779 
6780    pvr_update_draw_state(state, &draw_state);
6781 
6782    result = pvr_validate_draw_state(cmd_buffer);
6783    if (result != VK_SUCCESS)
6784       return;
6785 
6786    /* Write the VDM control stream for the primitive. */
6787    pvr_emit_vdm_index_list(cmd_buffer,
6788                            &state->current_sub_cmd->gfx,
6789                            dynamic_state->ia.primitive_topology,
6790                            firstVertex,
6791                            0U,
6792                            vertexCount,
6793                            instanceCount,
6794                            NULL,
6795                            0U,
6796                            0U,
6797                            0U);
6798 }
6799 
6800 void pvr_CmdDrawIndexed(VkCommandBuffer commandBuffer,
6801                         uint32_t indexCount,
6802                         uint32_t instanceCount,
6803                         uint32_t firstIndex,
6804                         int32_t vertexOffset,
6805                         uint32_t firstInstance)
6806 {
6807    const struct pvr_cmd_buffer_draw_state draw_state = {
6808       .base_vertex = vertexOffset,
6809       .base_instance = firstInstance,
6810       .draw_indexed = true,
6811    };
6812 
6813    PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
6814    struct vk_dynamic_graphics_state *const dynamic_state =
6815       &cmd_buffer->vk.dynamic_graphics_state;
6816    struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
6817    VkResult result;
6818 
6819    PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer);
6820 
6821    pvr_update_draw_state(state, &draw_state);
6822 
6823    result = pvr_validate_draw_state(cmd_buffer);
6824    if (result != VK_SUCCESS)
6825       return;
6826 
6827    /* Write the VDM control stream for the primitive. */
6828    pvr_emit_vdm_index_list(cmd_buffer,
6829                            &state->current_sub_cmd->gfx,
6830                            dynamic_state->ia.primitive_topology,
6831                            vertexOffset,
6832                            firstIndex,
6833                            indexCount,
6834                            instanceCount,
6835                            NULL,
6836                            0U,
6837                            0U,
6838                            0U);
6839 }
6840 
6841 void pvr_CmdDrawIndexedIndirect(VkCommandBuffer commandBuffer,
6842                                 VkBuffer _buffer,
6843                                 VkDeviceSize offset,
6844                                 uint32_t drawCount,
6845                                 uint32_t stride)
6846 {
6847    const struct pvr_cmd_buffer_draw_state draw_state = {
6848       .draw_indirect = true,
6849       .draw_indexed = true,
6850    };
6851 
6852    PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
6853    struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
6854    struct vk_dynamic_graphics_state *const dynamic_state =
6855       &cmd_buffer->vk.dynamic_graphics_state;
6856    PVR_FROM_HANDLE(pvr_buffer, buffer, _buffer);
6857    VkResult result;
6858 
6859    PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer);
6860 
6861    pvr_update_draw_state(state, &draw_state);
6862 
6863    result = pvr_validate_draw_state(cmd_buffer);
6864    if (result != VK_SUCCESS)
6865       return;
6866 
6867    /* Write the VDM control stream for the primitive. */
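   /* Index count, instance count and offsets are passed as zero here; the PDS
    * indirect program built in pvr_write_draw_indirect_vdm_stream() supplies
    * the real values from the argument buffer at kick time.
    */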
6868    pvr_emit_vdm_index_list(cmd_buffer,
6869                            &state->current_sub_cmd->gfx,
6870                            dynamic_state->ia.primitive_topology,
6871                            0U,
6872                            0U,
6873                            0U,
6874                            0U,
6875                            buffer,
6876                            offset,
6877                            drawCount,
6878                            stride);
6879 }
6880 
6881 void pvr_CmdDrawIndirect(VkCommandBuffer commandBuffer,
6882                          VkBuffer _buffer,
6883                          VkDeviceSize offset,
6884                          uint32_t drawCount,
6885                          uint32_t stride)
6886 {
6887    const struct pvr_cmd_buffer_draw_state draw_state = {
6888       .draw_indirect = true,
6889    };
6890 
6891    PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
6892    struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
6893    PVR_FROM_HANDLE(pvr_buffer, buffer, _buffer);
6894    struct vk_dynamic_graphics_state *const dynamic_state =
6895       &cmd_buffer->vk.dynamic_graphics_state;
6896    VkResult result;
6897 
6898    PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer);
6899 
6900    pvr_update_draw_state(state, &draw_state);
6901 
6902    result = pvr_validate_draw_state(cmd_buffer);
6903    if (result != VK_SUCCESS)
6904       return;
6905 
6906    /* Write the VDM control stream for the primitive. */
6907    pvr_emit_vdm_index_list(cmd_buffer,
6908                            &state->current_sub_cmd->gfx,
6909                            dynamic_state->ia.primitive_topology,
6910                            0U,
6911                            0U,
6912                            0U,
6913                            0U,
6914                            buffer,
6915                            offset,
6916                            drawCount,
6917                            stride);
6918 }
6919 
6920 static VkResult
6921 pvr_resolve_unemitted_resolve_attachments(struct pvr_cmd_buffer *cmd_buffer,
6922                                           struct pvr_render_pass_info *info)
6923 {
6924    struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
6925    const struct pvr_renderpass_hwsetup_render *hw_render =
6926       &state->render_pass_info.pass->hw_setup->renders[info->current_hw_subpass];
6927 
6928    for (uint32_t i = 0U; i < hw_render->eot_surface_count; i++) {
6929       const struct pvr_renderpass_hwsetup_eot_surface *surface =
6930          &hw_render->eot_surfaces[i];
6931       const uint32_t color_attach_idx = surface->src_attachment_idx;
6932       const uint32_t resolve_attach_idx = surface->attachment_idx;
6933       VkImageSubresourceLayers src_subresource;
6934       VkImageSubresourceLayers dst_subresource;
6935       struct pvr_image_view *dst_view;
6936       struct pvr_image_view *src_view;
6937       VkFormat src_format;
6938       VkFormat dst_format;
6939       VkImageCopy2 region;
6940       VkResult result;
6941 
6942       if (!surface->need_resolve ||
6943           surface->resolve_type != PVR_RESOLVE_TYPE_TRANSFER)
6944          continue;
6945 
6946       dst_view = info->attachments[resolve_attach_idx];
6947       src_view = info->attachments[color_attach_idx];
6948 
6949       src_subresource.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT;
6950       src_subresource.mipLevel = src_view->vk.base_mip_level;
6951       src_subresource.baseArrayLayer = src_view->vk.base_array_layer;
6952       src_subresource.layerCount = src_view->vk.layer_count;
6953 
6954       dst_subresource.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT;
6955       dst_subresource.mipLevel = dst_view->vk.base_mip_level;
6956       dst_subresource.baseArrayLayer = dst_view->vk.base_array_layer;
6957       dst_subresource.layerCount = dst_view->vk.layer_count;
6958 
6959       region.srcOffset = (VkOffset3D){ info->render_area.offset.x,
6960                                        info->render_area.offset.y,
6961                                        0 };
6962       region.dstOffset = (VkOffset3D){ info->render_area.offset.x,
6963                                        info->render_area.offset.y,
6964                                        0 };
6965       region.extent = (VkExtent3D){ info->render_area.extent.width,
6966                                     info->render_area.extent.height,
6967                                     1 };
6968 
6969       region.srcSubresource = src_subresource;
6970       region.dstSubresource = dst_subresource;
6971 
6972       /* TODO: If ERN_46863 is supported, depth and stencil are sampled
6973        * separately from images with combined depth+stencil formats. Add logic
6974        * here to handle that using the appropriate format from the image view.
6975        */
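      /* Temporarily override the image formats with the view formats so the
       * copy/resolve below interprets the pixels the way the attachment views
       * do; the original formats are restored straight after the copy.
       */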
6976       src_format = src_view->vk.image->format;
6977       dst_format = dst_view->vk.image->format;
6978       src_view->vk.image->format = src_view->vk.format;
6979       dst_view->vk.image->format = dst_view->vk.format;
6980 
6981       result = pvr_copy_or_resolve_color_image_region(
6982          cmd_buffer,
6983          vk_to_pvr_image(src_view->vk.image),
6984          vk_to_pvr_image(dst_view->vk.image),
6985          &region);
6986 
6987       src_view->vk.image->format = src_format;
6988       dst_view->vk.image->format = dst_format;
6989 
6990       state->current_sub_cmd->transfer.serialize_with_frag = true;
6991 
6992       if (result != VK_SUCCESS)
6993          return result;
6994    }
6995 
6996    return pvr_cmd_buffer_end_sub_cmd(cmd_buffer);
6997 }
6998 
6999 void pvr_CmdEndRenderPass2(VkCommandBuffer commandBuffer,
7000                            const VkSubpassEndInfo *pSubpassEndInfo)
7001 {
7002    PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
7003    struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
7004    struct pvr_image_view **attachments;
7005    VkClearValue *clear_values;
7006    VkResult result;
7007 
7008    PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer);
7009 
7010    assert(state->render_pass_info.pass);
7011    assert(state->render_pass_info.framebuffer);
7012 
7013    result = pvr_cmd_buffer_end_sub_cmd(cmd_buffer);
7014    if (result != VK_SUCCESS)
7015       return;
7016 
7017    result = pvr_resolve_unemitted_resolve_attachments(cmd_buffer,
7018                                                       &state->render_pass_info);
7019    if (result != VK_SUCCESS)
7020       return;
7021 
7022    /* Save the required fields before clearing render_pass_info struct. */
7023    attachments = state->render_pass_info.attachments;
7024    clear_values = state->render_pass_info.clear_values;
7025 
7026    memset(&state->render_pass_info, 0, sizeof(state->render_pass_info));
7027 
7028    state->render_pass_info.attachments = attachments;
7029    state->render_pass_info.clear_values = clear_values;
7030 }
7031 
7032 static VkResult
7033 pvr_execute_deferred_cmd_buffer(struct pvr_cmd_buffer *cmd_buffer,
7034                                 const struct pvr_cmd_buffer *sec_cmd_buffer)
7035 {
7036    struct vk_dynamic_graphics_state *const dynamic_state =
7037       &cmd_buffer->vk.dynamic_graphics_state;
7038    const uint32_t prim_db_elems =
7039       util_dynarray_num_elements(&cmd_buffer->depth_bias_array,
7040                                  struct pvr_depth_bias_state);
7041    const uint32_t prim_scissor_elems =
7042       util_dynarray_num_elements(&cmd_buffer->scissor_array,
7043                                  struct pvr_scissor_words);
7044 
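   /* Replay the secondary's deferred depth-bias/scissor (DBSC) commands,
    * biasing each recorded index by the primary's current element counts; the
    * secondary's depth-bias and scissor arrays are appended further down.
    */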
7045    util_dynarray_foreach (&sec_cmd_buffer->deferred_csb_commands,
7046                           struct pvr_deferred_cs_command,
7047                           cmd) {
7048       switch (cmd->type) {
7049       case PVR_DEFERRED_CS_COMMAND_TYPE_DBSC: {
7050          const uint32_t scissor_idx =
7051             prim_scissor_elems + cmd->dbsc.state.scissor_index;
7052          const uint32_t db_idx =
7053             prim_db_elems + cmd->dbsc.state.depthbias_index;
7054          const uint32_t num_dwords =
7055             pvr_cmd_length(TA_STATE_HEADER) + pvr_cmd_length(TA_STATE_ISPDBSC);
7056          struct pvr_suballoc_bo *suballoc_bo;
7057          uint32_t ppp_state[num_dwords];
7058          VkResult result;
7059 
7060          pvr_csb_pack (&ppp_state[0], TA_STATE_HEADER, header) {
7061             header.pres_ispctl_dbsc = true;
7062          }
7063 
7064          pvr_csb_pack (&ppp_state[1], TA_STATE_ISPDBSC, ispdbsc) {
7065             ispdbsc.dbindex = db_idx;
7066             ispdbsc.scindex = scissor_idx;
7067          }
7068 
7069          result = pvr_cmd_buffer_upload_general(cmd_buffer,
7070                                                 &ppp_state[0],
7071                                                 sizeof(ppp_state),
7072                                                 &suballoc_bo);
7073          if (result != VK_SUCCESS)
7074             return result;
7075 
7076          pvr_csb_pack (&cmd->dbsc.vdm_state[0], VDMCTRL_PPP_STATE0, state) {
7077             state.word_count = num_dwords;
7078             state.addrmsb = suballoc_bo->dev_addr;
7079          }
7080 
7081          pvr_csb_pack (&cmd->dbsc.vdm_state[1], VDMCTRL_PPP_STATE1, state) {
7082             state.addrlsb = suballoc_bo->dev_addr;
7083          }
7084 
7085          break;
7086       }
7087 
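      /* For DBSC2 the PPP words already live in a control stream buffer owned
       * by the secondary, so patch the ISPDBSC word in place at the recorded
       * offset instead of uploading fresh state.
       */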
7088       case PVR_DEFERRED_CS_COMMAND_TYPE_DBSC2: {
7089          const uint32_t scissor_idx =
7090             prim_scissor_elems + cmd->dbsc2.state.scissor_index;
7091          const uint32_t db_idx =
7092             prim_db_elems + cmd->dbsc2.state.depthbias_index;
7093 
7094          uint32_t *const addr =
7095             (uint32_t *)pvr_bo_suballoc_get_map_addr(cmd->dbsc2.ppp_cs_bo) +
7096             cmd->dbsc2.patch_offset;
7097 
7098          assert(pvr_bo_suballoc_get_map_addr(cmd->dbsc2.ppp_cs_bo));
7099 
7100          pvr_csb_pack (addr, TA_STATE_ISPDBSC, ispdbsc) {
7101             ispdbsc.dbindex = db_idx;
7102             ispdbsc.scindex = scissor_idx;
7103          }
7104 
7105          break;
7106       }
7107 
7108       default:
7109          unreachable("Invalid deferred control stream command type.");
7110          break;
7111       }
7112    }
7113 
7114    util_dynarray_append_dynarray(&cmd_buffer->depth_bias_array,
7115                                  &sec_cmd_buffer->depth_bias_array);
7116 
7117    util_dynarray_append_dynarray(&cmd_buffer->scissor_array,
7118                                  &sec_cmd_buffer->scissor_array);
7119 
7120    BITSET_SET(dynamic_state->dirty, MESA_VK_DYNAMIC_RS_DEPTH_BIAS_FACTORS);
7121    cmd_buffer->scissor_words = (struct pvr_scissor_words){ 0 };
7122 
7123    return VK_SUCCESS;
7124 }
7125 
7126 /* The caller must make sure the current sub_cmd has been ended. This function
7127  * only creates a copy of sec_sub_cmd and links it into the cmd_buffer's
7128  * sub_cmd list.
7129  */
7130 static VkResult pvr_execute_sub_cmd(struct pvr_cmd_buffer *cmd_buffer,
7131                                     struct pvr_sub_cmd *sec_sub_cmd)
7132 {
7133    struct pvr_sub_cmd *primary_sub_cmd =
7134       vk_zalloc(&cmd_buffer->vk.pool->alloc,
7135                 sizeof(*primary_sub_cmd),
7136                 8,
7137                 VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
7138    if (!primary_sub_cmd) {
7139       return vk_command_buffer_set_error(&cmd_buffer->vk,
7140                                          VK_ERROR_OUT_OF_HOST_MEMORY);
7141    }
7142 
7143    primary_sub_cmd->type = sec_sub_cmd->type;
7144    primary_sub_cmd->owned = false;
7145 
7146    list_addtail(&primary_sub_cmd->link, &cmd_buffer->sub_cmds);
7147 
7148    switch (sec_sub_cmd->type) {
7149    case PVR_SUB_CMD_TYPE_GRAPHICS:
7150       primary_sub_cmd->gfx = sec_sub_cmd->gfx;
7151       break;
7152 
7153    case PVR_SUB_CMD_TYPE_OCCLUSION_QUERY:
7154    case PVR_SUB_CMD_TYPE_COMPUTE:
7155       primary_sub_cmd->compute = sec_sub_cmd->compute;
7156       break;
7157 
7158    case PVR_SUB_CMD_TYPE_TRANSFER:
7159       primary_sub_cmd->transfer = sec_sub_cmd->transfer;
7160       break;
7161 
7162    case PVR_SUB_CMD_TYPE_EVENT:
7163       primary_sub_cmd->event = sec_sub_cmd->event;
7164       break;
7165 
7166    default:
7167       unreachable("Unsupported sub-command type");
7168    }
7169 
7170    return VK_SUCCESS;
7171 }
7172 
7173 static VkResult
7174 pvr_execute_graphics_cmd_buffer(struct pvr_cmd_buffer *cmd_buffer,
7175                                 const struct pvr_cmd_buffer *sec_cmd_buffer)
7176 {
7177    const struct pvr_device_info *dev_info =
7178       &cmd_buffer->device->pdevice->dev_info;
7179    struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
7180    struct pvr_sub_cmd *primary_sub_cmd = state->current_sub_cmd;
7181    struct pvr_sub_cmd *first_sec_cmd;
7182    VkResult result;
7183 
7184    /* Inherited queries are not supported. */
7185    assert(!state->vis_test_enabled);
7186 
7187    if (list_is_empty(&sec_cmd_buffer->sub_cmds))
7188       return VK_SUCCESS;
7189 
7190    first_sec_cmd =
7191       list_first_entry(&sec_cmd_buffer->sub_cmds, struct pvr_sub_cmd, link);
7192 
7193    /* Kick a render if the secondary uses a different query pool (new base address). */
7194    if (primary_sub_cmd->gfx.query_pool && first_sec_cmd->gfx.query_pool &&
7195        primary_sub_cmd->gfx.query_pool != first_sec_cmd->gfx.query_pool) {
7196       state->current_sub_cmd->gfx.barrier_store = true;
7197 
7198       result = pvr_cmd_buffer_end_sub_cmd(cmd_buffer);
7199       if (result != VK_SUCCESS)
7200          return result;
7201 
7202       result =
7203          pvr_cmd_buffer_start_sub_cmd(cmd_buffer, PVR_SUB_CMD_TYPE_GRAPHICS);
7204       if (result != VK_SUCCESS)
7205          return result;
7206 
7207       primary_sub_cmd = state->current_sub_cmd;
7208 
7209       /* Use existing render setup, but load color attachments from HW
7210        * Background object.
7211        */
7212       primary_sub_cmd->gfx.barrier_load = true;
7213       primary_sub_cmd->gfx.barrier_store = false;
7214    }
7215 
7216    list_for_each_entry (struct pvr_sub_cmd,
7217                         sec_sub_cmd,
7218                         &sec_cmd_buffer->sub_cmds,
7219                         link) {
7220       /* Only graphics secondary execution supported within a renderpass. */
7221       assert(sec_sub_cmd->type == PVR_SUB_CMD_TYPE_GRAPHICS);
7222 
7223       if (!sec_sub_cmd->gfx.empty_cmd)
7224          primary_sub_cmd->gfx.empty_cmd = false;
7225 
7226       if (sec_sub_cmd->gfx.query_pool) {
7227          primary_sub_cmd->gfx.query_pool = sec_sub_cmd->gfx.query_pool;
7228 
7229          util_dynarray_append_dynarray(&state->query_indices,
7230                                        &sec_sub_cmd->gfx.sec_query_indices);
7231       }
7232 
7233       if (pvr_cmd_uses_deferred_cs_cmds(sec_cmd_buffer)) {
7234          /* TODO: When the secondary buffer is created with
7235           * VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT, we patch the stream
7236           * and copy it into the primary stream using pvr_csb_copy below.
7237           * This will need locking if the same secondary command buffer is
7238           * executed in multiple primary buffers at the same time.
7239           */
7240          result = pvr_execute_deferred_cmd_buffer(cmd_buffer, sec_cmd_buffer);
7241          if (result != VK_SUCCESS)
7242             return result;
7243 
7244          result = pvr_csb_copy(&primary_sub_cmd->gfx.control_stream,
7245                                &sec_sub_cmd->gfx.control_stream);
7246          if (result != VK_SUCCESS)
7247             return pvr_cmd_buffer_set_error_unwarned(cmd_buffer, result);
7248       } else {
7249          result = pvr_execute_deferred_cmd_buffer(cmd_buffer, sec_cmd_buffer);
7250          if (result != VK_SUCCESS)
7251             return result;
7252 
7253          pvr_csb_emit_link(
7254             &primary_sub_cmd->gfx.control_stream,
7255             pvr_csb_get_start_address(&sec_sub_cmd->gfx.control_stream),
7256             true);
7257       }
7258 
7259       if (PVR_HAS_FEATURE(&cmd_buffer->device->pdevice->dev_info,
7260                           compute_overlap)) {
7261          primary_sub_cmd->gfx.job.disable_compute_overlap |=
7262             sec_sub_cmd->gfx.job.disable_compute_overlap;
7263       }
7264 
7265       primary_sub_cmd->gfx.max_tiles_in_flight =
7266          MIN2(primary_sub_cmd->gfx.max_tiles_in_flight,
7267               sec_sub_cmd->gfx.max_tiles_in_flight);
7268 
7269       /* Pass loaded depth/stencil usage from secondary command buffer. */
7270       if (sec_sub_cmd->gfx.depth_usage == PVR_DEPTH_STENCIL_USAGE_NEEDED)
7271          primary_sub_cmd->gfx.depth_usage = PVR_DEPTH_STENCIL_USAGE_NEEDED;
7272 
7273       if (sec_sub_cmd->gfx.stencil_usage == PVR_DEPTH_STENCIL_USAGE_NEEDED)
7274          primary_sub_cmd->gfx.stencil_usage = PVR_DEPTH_STENCIL_USAGE_NEEDED;
7275 
7276       /* Pass depth/stencil modification state from secondary command buffer. */
7277       if (sec_sub_cmd->gfx.modifies_depth)
7278          primary_sub_cmd->gfx.modifies_depth = true;
7279 
7280       if (sec_sub_cmd->gfx.modifies_stencil)
7281          primary_sub_cmd->gfx.modifies_stencil = true;
7282 
7283       if (sec_sub_cmd->gfx.barrier_store) {
7284          struct pvr_sub_cmd *sec_next =
7285             list_entry(sec_sub_cmd->link.next, struct pvr_sub_cmd, link);
7286 
7287          /* This shouldn't be the last sub cmd. There should be a barrier load
7288           * subsequent to the barrier store.
7289           */
7290          assert(list_last_entry(&sec_cmd_buffer->sub_cmds,
7291                                 struct pvr_sub_cmd,
7292                                 link) != sec_sub_cmd);
7293 
7294          /* Kick render to store stencil. */
7295          state->current_sub_cmd->gfx.barrier_store = true;
7296          state->current_sub_cmd->gfx.empty_cmd = false;
7297 
7298          result = pvr_cmd_buffer_end_sub_cmd(cmd_buffer);
7299          if (result != VK_SUCCESS)
7300             return result;
7301 
7302          result =
7303             pvr_cmd_buffer_start_sub_cmd(cmd_buffer, PVR_SUB_CMD_TYPE_GRAPHICS);
7304          if (result != VK_SUCCESS)
7305             return result;
7306 
7307          primary_sub_cmd = state->current_sub_cmd;
7308 
7309          /* Use existing render setup, but load color attachments from HW
7310           * Background object.
7311           */
7312          primary_sub_cmd->gfx.barrier_load = sec_next->gfx.barrier_load;
7313          primary_sub_cmd->gfx.barrier_store = sec_next->gfx.barrier_store;
7314          primary_sub_cmd->gfx.empty_cmd = false;
7315       }
7316 
7317       if (!PVR_HAS_FEATURE(dev_info, gs_rta_support)) {
7318          util_dynarray_append_dynarray(&cmd_buffer->deferred_clears,
7319                                        &sec_cmd_buffer->deferred_clears);
7320       }
7321    }
7322 
7323    return VK_SUCCESS;
7324 }
7325 
7326 void pvr_CmdExecuteCommands(VkCommandBuffer commandBuffer,
7327                             uint32_t commandBufferCount,
7328                             const VkCommandBuffer *pCommandBuffers)
7329 {
7330    PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
7331    struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
7332    struct pvr_cmd_buffer *last_cmd_buffer;
7333    VkResult result;
7334 
7335    PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer);
7336 
7337    assert(cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_PRIMARY);
7338 
7339    /* Reset the CPU copy of the most recent PPP state of the primary command
7340     * buffer.
7341     *
7342     * The next draw call in the primary after CmdExecuteCommands may send
7343     * redundant state, if it all goes in the same geom job.
7344     *
7345     * Can't just copy state from the secondary because the recording state of
7346     * the secondary command buffers would have been deleted at this point.
7347     */
7348    pvr_reset_graphics_dirty_state(cmd_buffer, false);
7349 
7350    if (state->current_sub_cmd &&
7351        state->current_sub_cmd->type == PVR_SUB_CMD_TYPE_GRAPHICS) {
7352       for (uint32_t i = 0; i < commandBufferCount; i++) {
7353          PVR_FROM_HANDLE(pvr_cmd_buffer, sec_cmd_buffer, pCommandBuffers[i]);
7354 
7355          assert(sec_cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY);
7356 
7357          result = pvr_execute_graphics_cmd_buffer(cmd_buffer, sec_cmd_buffer);
7358          if (result != VK_SUCCESS)
7359             return;
7360       }
7361 
7362       last_cmd_buffer =
7363          pvr_cmd_buffer_from_handle(pCommandBuffers[commandBufferCount - 1]);
7364 
7365       /* Set barriers from the final secondary command buffer. */
7366       for (uint32_t i = 0; i != PVR_NUM_SYNC_PIPELINE_STAGES; i++) {
7367          state->barriers_needed[i] |=
7368             last_cmd_buffer->state.barriers_needed[i] &
7369             PVR_PIPELINE_STAGE_ALL_GRAPHICS_BITS;
7370       }
7371    } else {
7372       for (uint32_t i = 0; i < commandBufferCount; i++) {
7373          PVR_FROM_HANDLE(pvr_cmd_buffer, sec_cmd_buffer, pCommandBuffers[i]);
7374 
7375          assert(sec_cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY);
7376 
7377          result = pvr_cmd_buffer_end_sub_cmd(cmd_buffer);
7378          if (result != VK_SUCCESS)
7379             return;
7380 
7381          list_for_each_entry_safe (struct pvr_sub_cmd,
7382                                    sec_sub_cmd,
7383                                    &sec_cmd_buffer->sub_cmds,
7384                                    link) {
7385             result = pvr_execute_sub_cmd(cmd_buffer, sec_sub_cmd);
7386             if (result != VK_SUCCESS)
7387                return;
7388          }
7389       }
7390 
7391       last_cmd_buffer =
7392          pvr_cmd_buffer_from_handle(pCommandBuffers[commandBufferCount - 1]);
7393 
7394       memcpy(state->barriers_needed,
7395              last_cmd_buffer->state.barriers_needed,
7396              sizeof(state->barriers_needed));
7397    }
7398 }
7399 
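/* Emit a full-screen transparent object built from the static clear PPP
 * template and the device's NOP PDS program.  Callers use this to flush
 * accumulated tag/fragment state (see pvr_CmdNextSubpass2() and
 * pvr_CmdPipelineBarrier2()).
 */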
7400 static void pvr_insert_transparent_obj(struct pvr_cmd_buffer *const cmd_buffer,
7401                                        struct pvr_sub_cmd_gfx *const sub_cmd)
7402 {
7403    struct pvr_device *const device = cmd_buffer->device;
7404    /* Yes we want a copy. The user could be recording multiple command buffers
7405     * in parallel so writing the template in place could cause problems.
7406     */
7407    struct pvr_static_clear_ppp_template clear =
7408       device->static_clear_state.ppp_templates[VK_IMAGE_ASPECT_COLOR_BIT];
7409    uint32_t pds_state[PVR_STATIC_CLEAR_PDS_STATE_COUNT] = { 0 };
7410    struct pvr_csb *csb = &sub_cmd->control_stream;
7411    struct pvr_suballoc_bo *ppp_bo;
7412 
7413    assert(clear.requires_pds_state);
7414 
7415    /* Patch the template. */
7416 
7417    pvr_csb_pack (&pds_state[PVR_STATIC_CLEAR_PPP_PDS_TYPE_SHADERBASE],
7418                  TA_STATE_PDS_SHADERBASE,
7419                  shaderbase) {
7420       shaderbase.addr = PVR_DEV_ADDR(device->nop_program.pds.data_offset);
7421    }
7422 
7423    clear.config.pds_state = &pds_state;
7424 
7425    clear.config.ispctl.upass = cmd_buffer->state.render_pass_info.isp_userpass;
7426 
7427    /* Emit PPP state from template. */
7428 
7429    pvr_emit_ppp_from_template(csb, &clear, &ppp_bo);
7430    list_add(&ppp_bo->link, &cmd_buffer->bo_list);
7431 
7432    /* Emit VDM state. */
7433 
7434    pvr_emit_clear_words(cmd_buffer, sub_cmd);
7435 
7436    /* Reset graphics state. */
7437    pvr_reset_graphics_dirty_state(cmd_buffer, false);
7438 }
7439 
7440 static inline struct pvr_render_subpass *
7441 pvr_get_current_subpass(const struct pvr_cmd_buffer_state *const state)
7442 {
7443    const uint32_t subpass_idx = state->render_pass_info.subpass_idx;
7444 
7445    return &state->render_pass_info.pass->subpasses[subpass_idx];
7446 }
7447 
7448 void pvr_CmdNextSubpass2(VkCommandBuffer commandBuffer,
7449                          const VkSubpassBeginInfo *pSubpassBeginInfo,
7450                          const VkSubpassEndInfo *pSubpassEndInfo)
7451 {
7452    PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
7453    struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
7454    struct pvr_render_pass_info *rp_info = &state->render_pass_info;
7455    const struct pvr_renderpass_hwsetup_subpass *hw_subpass;
7456    struct pvr_renderpass_hwsetup_render *next_hw_render;
7457    const struct pvr_render_pass *pass = rp_info->pass;
7458    const struct pvr_renderpass_hw_map *current_map;
7459    const struct pvr_renderpass_hw_map *next_map;
7460    struct pvr_load_op *hw_subpass_load_op;
7461    VkResult result;
7462 
7463    PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer);
7464 
7465    current_map = &pass->hw_setup->subpass_map[rp_info->subpass_idx];
7466    next_map = &pass->hw_setup->subpass_map[rp_info->subpass_idx + 1];
7467    next_hw_render = &pass->hw_setup->renders[next_map->render];
7468 
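   /* A change of hw_render index means the next subpass was assigned to a
    * different HW render at renderpass setup time, so the current render must
    * be kicked and a new graphics sub command started.
    */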
7469    if (current_map->render != next_map->render) {
7470       result = pvr_cmd_buffer_end_sub_cmd(cmd_buffer);
7471       if (result != VK_SUCCESS)
7472          return;
7473 
7474       result = pvr_resolve_unemitted_resolve_attachments(cmd_buffer, rp_info);
7475       if (result != VK_SUCCESS)
7476          return;
7477 
7478       rp_info->current_hw_subpass = next_map->render;
7479 
7480       result =
7481          pvr_cmd_buffer_start_sub_cmd(cmd_buffer, PVR_SUB_CMD_TYPE_GRAPHICS);
7482       if (result != VK_SUCCESS)
7483          return;
7484 
7485       rp_info->enable_bg_tag = false;
7486       rp_info->process_empty_tiles = false;
7487 
7488       /* If this subpass contains any load ops the HW Background Object must be
7489        * run to do the clears/loads.
7490        */
7491       if (next_hw_render->color_init_count > 0) {
7492          rp_info->enable_bg_tag = true;
7493 
7494          for (uint32_t i = 0; i < next_hw_render->color_init_count; i++) {
7495             /* Empty tiles need to be cleared too. */
7496             if (next_hw_render->color_init[i].op ==
7497                 VK_ATTACHMENT_LOAD_OP_CLEAR) {
7498                rp_info->process_empty_tiles = true;
7499                break;
7500             }
7501          }
7502       }
7503 
7504       /* Set isp_userpass to zero for new hw_render. This will be used to set
7505        * ROGUE_CR_ISP_CTL::upass_start.
7506        */
7507       rp_info->isp_userpass = 0;
7508    }
7509 
7510    hw_subpass = &next_hw_render->subpasses[next_map->subpass];
7511    hw_subpass_load_op = hw_subpass->load_op;
7512 
7513    if (hw_subpass_load_op) {
7514       result = pvr_cs_write_load_op(cmd_buffer,
7515                                     &state->current_sub_cmd->gfx,
7516                                     hw_subpass_load_op,
7517                                     rp_info->isp_userpass);
7518    }
7519 
7520    /* Pipelines are created for a particular subpass so unbind but leave the
7521     * vertex and descriptor bindings intact as they are orthogonal to the
7522     * subpass.
7523     */
7524    state->gfx_pipeline = NULL;
7525 
7526    /* The user-pass spawn value is only 4 bits, so when the driver has to wrap
7527     * it, it emits a full-screen transparent object to flush all tags up to
7528     * this point; the user-pass spawn value then implicitly resets to 0 because
7529     * pvr_render_subpass::isp_userpass values are stored ANDed with
7530     * ROGUE_CR_ISP_CTL_UPASS_START_SIZE_MAX.
7531     */
7532    /* If hw_subpass_load_op is valid then pvr_write_load_op_control_stream
7533     * has already emitted a full-screen transparent object.
7534     */
7535    if (rp_info->isp_userpass == PVRX(CR_ISP_CTL_UPASS_START_SIZE_MAX) &&
7536        !hw_subpass_load_op) {
7537       pvr_insert_transparent_obj(cmd_buffer, &state->current_sub_cmd->gfx);
7538    }
7539 
7540    rp_info->subpass_idx++;
7541 
7542    rp_info->isp_userpass = pass->subpasses[rp_info->subpass_idx].isp_userpass;
7543    state->dirty.isp_userpass = true;
7544 
7545    rp_info->pipeline_bind_point =
7546       pass->subpasses[rp_info->subpass_idx].pipeline_bind_point;
7547 
7548    pvr_stash_depth_format(state, &state->current_sub_cmd->gfx);
7549 }
7550 
7551 static bool
7552 pvr_stencil_has_self_dependency(const struct pvr_cmd_buffer_state *const state)
7553 {
7554    const struct pvr_render_subpass *const current_subpass =
7555       pvr_get_current_subpass(state);
7556    const uint32_t *const input_attachments = current_subpass->input_attachments;
7557 
7558    if (current_subpass->depth_stencil_attachment == VK_ATTACHMENT_UNUSED)
7559       return false;
7560 
7561    /* We only need to check the current software subpass as we don't support
7562     * merging to/from a subpass with self-dep stencil.
7563     */
7564 
7565    for (uint32_t i = 0; i < current_subpass->input_count; i++) {
7566       if (input_attachments[i] == current_subpass->depth_stencil_attachment)
7567          return true;
7568    }
7569 
7570    return false;
7571 }
7572 
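/* Work out whether a barrier issued inside a render pass expresses a
 * stencil-write -> input-attachment-read self dependency, in which case the
 * stencil has to be stored and reloaded around a mid-fragment barrier.
 */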
7573 static bool pvr_is_stencil_store_load_needed(
7574    const struct pvr_cmd_buffer *const cmd_buffer,
7575    VkPipelineStageFlags2 vk_src_stage_mask,
7576    VkPipelineStageFlags2 vk_dst_stage_mask,
7577    uint32_t memory_barrier_count,
7578    const VkMemoryBarrier2 *const memory_barriers,
7579    uint32_t image_barrier_count,
7580    const VkImageMemoryBarrier2 *const image_barriers)
7581 {
7582    const struct pvr_cmd_buffer_state *const state = &cmd_buffer->state;
7583    const uint32_t fragment_test_stages =
7584       VK_PIPELINE_STAGE_EARLY_FRAGMENT_TESTS_BIT |
7585       VK_PIPELINE_STAGE_LATE_FRAGMENT_TESTS_BIT;
7586    const struct pvr_render_pass *const pass = state->render_pass_info.pass;
7587    const struct pvr_renderpass_hwsetup_render *hw_render;
7588    struct pvr_image_view **const attachments =
7589       state->render_pass_info.attachments;
7590    const struct pvr_image_view *attachment;
7591    uint32_t hw_render_idx;
7592 
7593    if (!pass)
7594       return false;
7595 
7596    hw_render_idx = state->current_sub_cmd->gfx.hw_render_idx;
7597    hw_render = &pass->hw_setup->renders[hw_render_idx];
7598 
7599    if (hw_render->ds_attach_idx == VK_ATTACHMENT_UNUSED)
7600       return false;
7601 
7602    if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_PRIMARY) {
7603       attachment = attachments[hw_render->ds_attach_idx];
7604    } else {
7605       assert(!attachments);
7606       attachment = NULL;
7607    }
7608 
7609    if (!(vk_src_stage_mask & fragment_test_stages) &&
7610        vk_dst_stage_mask & VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT)
7611       return false;
7612 
7613    for (uint32_t i = 0; i < memory_barrier_count; i++) {
7614       const uint32_t stencil_write_bit =
7615          VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT;
7616       const uint32_t input_attachment_read_bit =
7617          VK_ACCESS_INPUT_ATTACHMENT_READ_BIT;
7618 
7619       if (!(memory_barriers[i].srcAccessMask & stencil_write_bit))
7620          continue;
7621 
7622       if (!(memory_barriers[i].dstAccessMask & input_attachment_read_bit))
7623          continue;
7624 
7625       return pvr_stencil_has_self_dependency(state);
7626    }
7627 
7628    for (uint32_t i = 0; i < image_barrier_count; i++) {
7629       PVR_FROM_HANDLE(pvr_image, image, image_barriers[i].image);
7630       const uint32_t stencil_bit = VK_IMAGE_ASPECT_STENCIL_BIT;
7631 
7632       if (!(image_barriers[i].subresourceRange.aspectMask & stencil_bit))
7633          continue;
7634 
7635       if (attachment && image != vk_to_pvr_image(attachment->vk.image))
7636          continue;
7637 
7638       if (!vk_format_has_stencil(image->vk.format))
7639          continue;
7640 
7641       return pvr_stencil_has_self_dependency(state);
7642    }
7643 
7644    return false;
7645 }
7646 
7647 static VkResult
7648 pvr_cmd_buffer_insert_mid_frag_barrier_event(struct pvr_cmd_buffer *cmd_buffer,
7649                                              uint32_t src_stage_mask,
7650                                              uint32_t dst_stage_mask)
7651 {
7652    VkResult result;
7653 
7654    assert(cmd_buffer->state.current_sub_cmd->type == PVR_SUB_CMD_TYPE_GRAPHICS);
7655 
7656    cmd_buffer->state.current_sub_cmd->gfx.empty_cmd = false;
7657 
7658    /* Submit graphics job to store stencil. */
7659    cmd_buffer->state.current_sub_cmd->gfx.barrier_store = true;
7660 
7661    pvr_cmd_buffer_end_sub_cmd(cmd_buffer);
7662    result = pvr_cmd_buffer_start_sub_cmd(cmd_buffer, PVR_SUB_CMD_TYPE_EVENT);
7663    if (result != VK_SUCCESS)
7664       return result;
7665 
7666    cmd_buffer->state.current_sub_cmd->event = (struct pvr_sub_cmd_event){
7667       .type = PVR_EVENT_TYPE_BARRIER,
7668       .barrier = {
7669          .in_render_pass = true,
7670          .wait_for_stage_mask = src_stage_mask,
7671          .wait_at_stage_mask = dst_stage_mask,
7672       },
7673    };
7674 
7675    pvr_cmd_buffer_end_sub_cmd(cmd_buffer);
7676    pvr_cmd_buffer_start_sub_cmd(cmd_buffer, PVR_SUB_CMD_TYPE_GRAPHICS);
7677 
7678    /* Use existing render setup, but load color attachments from HW BGOBJ */
7679    cmd_buffer->state.current_sub_cmd->gfx.barrier_load = true;
7680    cmd_buffer->state.current_sub_cmd->gfx.barrier_store = false;
7681 
7682    return VK_SUCCESS;
7683 }
7684 
7685 static VkResult
7686 pvr_cmd_buffer_insert_barrier_event(struct pvr_cmd_buffer *cmd_buffer,
7687                                     uint32_t src_stage_mask,
7688                                     uint32_t dst_stage_mask)
7689 {
7690    VkResult result;
7691 
7692    result = pvr_cmd_buffer_start_sub_cmd(cmd_buffer, PVR_SUB_CMD_TYPE_EVENT);
7693    if (result != VK_SUCCESS)
7694       return result;
7695 
7696    cmd_buffer->state.current_sub_cmd->event = (struct pvr_sub_cmd_event){
7697       .type = PVR_EVENT_TYPE_BARRIER,
7698       .barrier = {
7699          .wait_for_stage_mask = src_stage_mask,
7700          .wait_at_stage_mask = dst_stage_mask,
7701       },
7702    };
7703 
7704    return pvr_cmd_buffer_end_sub_cmd(cmd_buffer);
7705 }
7706 
7707 /* This is just enough to handle vkCmdPipelineBarrier().
7708  * TODO: Complete?
7709  */
7710 void pvr_CmdPipelineBarrier2(VkCommandBuffer commandBuffer,
7711                              const VkDependencyInfo *pDependencyInfo)
7712 {
7713    PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
7714    struct pvr_cmd_buffer_state *const state = &cmd_buffer->state;
7715    const struct pvr_render_pass *const render_pass =
7716       state->render_pass_info.pass;
7717    VkPipelineStageFlags vk_src_stage_mask = 0U;
7718    VkPipelineStageFlags vk_dst_stage_mask = 0U;
7719    bool is_stencil_store_load_needed;
7720    uint32_t required_stage_mask = 0U;
7721    uint32_t src_stage_mask;
7722    uint32_t dst_stage_mask;
7723    bool is_barrier_needed;
7724 
7725    PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer);
7726 
7727    for (uint32_t i = 0; i < pDependencyInfo->memoryBarrierCount; i++) {
7728       vk_src_stage_mask |= pDependencyInfo->pMemoryBarriers[i].srcStageMask;
7729       vk_dst_stage_mask |= pDependencyInfo->pMemoryBarriers[i].dstStageMask;
7730    }
7731 
7732    for (uint32_t i = 0; i < pDependencyInfo->bufferMemoryBarrierCount; i++) {
7733       vk_src_stage_mask |=
7734          pDependencyInfo->pBufferMemoryBarriers[i].srcStageMask;
7735       vk_dst_stage_mask |=
7736          pDependencyInfo->pBufferMemoryBarriers[i].dstStageMask;
7737    }
7738 
7739    for (uint32_t i = 0; i < pDependencyInfo->imageMemoryBarrierCount; i++) {
7740       vk_src_stage_mask |=
7741          pDependencyInfo->pImageMemoryBarriers[i].srcStageMask;
7742       vk_dst_stage_mask |=
7743          pDependencyInfo->pImageMemoryBarriers[i].dstStageMask;
7744    }
7745 
7746    src_stage_mask = pvr_stage_mask_src(vk_src_stage_mask);
7747    dst_stage_mask = pvr_stage_mask_dst(vk_dst_stage_mask);
7748 
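   /* Gather the source stages that earlier work flagged as needing a wait
    * before any of the destination stages, then drop the stages this barrier
    * waits on from the bookkeeping.
    */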
7749    for (uint32_t stage = 0U; stage != PVR_NUM_SYNC_PIPELINE_STAGES; stage++) {
7750       if (!(dst_stage_mask & BITFIELD_BIT(stage)))
7751          continue;
7752 
7753       required_stage_mask |= state->barriers_needed[stage];
7754    }
7755 
7756    src_stage_mask &= required_stage_mask;
7757    for (uint32_t stage = 0U; stage != PVR_NUM_SYNC_PIPELINE_STAGES; stage++) {
7758       if (!(dst_stage_mask & BITFIELD_BIT(stage)))
7759          continue;
7760 
7761       state->barriers_needed[stage] &= ~src_stage_mask;
7762    }
7763 
7764    if (src_stage_mask == 0 || dst_stage_mask == 0) {
7765       is_barrier_needed = false;
7766    } else if (src_stage_mask == PVR_PIPELINE_STAGE_GEOM_BIT &&
7767               dst_stage_mask == PVR_PIPELINE_STAGE_FRAG_BIT) {
7768       /* The geom-to-frag dependency is implicit, so no barrier is needed. */
7769       is_barrier_needed = false;
7770    } else if (src_stage_mask == dst_stage_mask &&
7771               util_bitcount(src_stage_mask) == 1) {
7772       struct pvr_sub_cmd *const current_sub_cmd = state->current_sub_cmd;
7773 
7774       switch (src_stage_mask) {
7775       case PVR_PIPELINE_STAGE_FRAG_BIT:
7776          is_barrier_needed = !render_pass;
7777 
7778          if (is_barrier_needed)
7779             break;
7780 
7781          assert(current_sub_cmd->type == PVR_SUB_CMD_TYPE_GRAPHICS);
7782 
7783          /* Flush all fragment work up to this point. */
7784          pvr_insert_transparent_obj(cmd_buffer, &current_sub_cmd->gfx);
7785          break;
7786 
7787       case PVR_PIPELINE_STAGE_COMPUTE_BIT:
7788          is_barrier_needed = false;
7789 
7790          if (!current_sub_cmd ||
7791              current_sub_cmd->type != PVR_SUB_CMD_TYPE_COMPUTE) {
7792             break;
7793          }
7794 
7795          /* Multiple dispatches can be merged into a single job. When back to
7796           * back dispatches have a sequential dependency (Compute -> compute
7797           * pipeline barrier) we need to do the following.
7798           *   - Dispatch a kernel which fences all previous memory writes and
7799           *     flushes the MADD cache.
7800           *   - Issue a compute fence which ensures all previous tasks emitted
7801           *     by the compute data master are completed before starting
7802           *     anything new.
7803           */
7804 
7805          /* Issue Data Fence, Wait for Data Fence (IDFWDF) makes the PDS wait
7806           * for data.
7807           */
7808          pvr_compute_generate_idfwdf(cmd_buffer, &current_sub_cmd->compute);
7809 
7810          pvr_compute_generate_fence(cmd_buffer,
7811                                     &current_sub_cmd->compute,
7812                                     false);
7813          break;
7814 
7815       default:
7816          is_barrier_needed = false;
7817          break;
7818       }
7819    } else {
7820       is_barrier_needed = true;
7821    }
7822 
   is_stencil_store_load_needed =
      pvr_is_stencil_store_load_needed(cmd_buffer,
                                       vk_src_stage_mask,
                                       vk_dst_stage_mask,
                                       pDependencyInfo->memoryBarrierCount,
                                       pDependencyInfo->pMemoryBarriers,
                                       pDependencyInfo->imageMemoryBarrierCount,
                                       pDependencyInfo->pImageMemoryBarriers);

   if (is_stencil_store_load_needed) {
      VkResult result;

      result = pvr_cmd_buffer_insert_mid_frag_barrier_event(cmd_buffer,
                                                            src_stage_mask,
                                                            dst_stage_mask);
      if (result != VK_SUCCESS)
         mesa_loge("Failed to insert mid frag barrier event.");
   } else {
      if (is_barrier_needed) {
         VkResult result;

         result = pvr_cmd_buffer_insert_barrier_event(cmd_buffer,
                                                      src_stage_mask,
                                                      dst_stage_mask);
         if (result != VK_SUCCESS)
            mesa_loge("Failed to insert pipeline barrier event.");
      }
   }
}

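/* vkCmdResetEvent2: recorded as an event sub-command that resets the event
 * once the stages in stageMask (mapped to PVR pipeline stages) have completed.
 */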
void pvr_CmdResetEvent2(VkCommandBuffer commandBuffer,
                        VkEvent _event,
                        VkPipelineStageFlags2 stageMask)
{
   PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
   PVR_FROM_HANDLE(pvr_event, event, _event);
   VkResult result;

   PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer);

   result = pvr_cmd_buffer_start_sub_cmd(cmd_buffer, PVR_SUB_CMD_TYPE_EVENT);
   if (result != VK_SUCCESS)
      return;

   cmd_buffer->state.current_sub_cmd->event = (struct pvr_sub_cmd_event){
      .type = PVR_EVENT_TYPE_RESET,
      .set_reset = {
         .event = event,
         .wait_for_stage_mask = pvr_stage_mask_src(stageMask),
      },
   };

   pvr_cmd_buffer_end_sub_cmd(cmd_buffer);
}
7877 
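/* vkCmdSetEvent2: the source stages of every barrier in pDependencyInfo are
 * accumulated and recorded as an event sub-command that sets the event once
 * those stages (mapped to PVR pipeline stages) have completed.
 */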
void pvr_CmdSetEvent2(VkCommandBuffer commandBuffer,
                      VkEvent _event,
                      const VkDependencyInfo *pDependencyInfo)
{
   PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
   PVR_FROM_HANDLE(pvr_event, event, _event);
   VkPipelineStageFlags2 stage_mask = 0;
   VkResult result;

   PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer);

   result = pvr_cmd_buffer_start_sub_cmd(cmd_buffer, PVR_SUB_CMD_TYPE_EVENT);
   if (result != VK_SUCCESS)
      return;

   for (uint32_t i = 0; i < pDependencyInfo->memoryBarrierCount; i++)
      stage_mask |= pDependencyInfo->pMemoryBarriers[i].srcStageMask;

   for (uint32_t i = 0; i < pDependencyInfo->bufferMemoryBarrierCount; i++)
      stage_mask |= pDependencyInfo->pBufferMemoryBarriers[i].srcStageMask;

   for (uint32_t i = 0; i < pDependencyInfo->imageMemoryBarrierCount; i++)
      stage_mask |= pDependencyInfo->pImageMemoryBarriers[i].srcStageMask;

   cmd_buffer->state.current_sub_cmd->event = (struct pvr_sub_cmd_event){
      .type = PVR_EVENT_TYPE_SET,
      .set_reset = {
         .event = event,
         .wait_for_stage_mask = pvr_stage_mask_dst(stage_mask),
      },
   };

   pvr_cmd_buffer_end_sub_cmd(cmd_buffer);
}
7912 
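/* vkCmdWaitEvents2: copies the event handles and the per-event destination
 * stage masks (gathered from each VkDependencyInfo) into pool-allocated arrays
 * owned by the recorded wait sub-command.
 */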
void pvr_CmdWaitEvents2(VkCommandBuffer commandBuffer,
                        uint32_t eventCount,
                        const VkEvent *pEvents,
                        const VkDependencyInfo *pDependencyInfos)
{
   PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
   struct pvr_event **events_array;
   uint32_t *stage_masks;
   VkResult result;

   PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer);

   VK_MULTIALLOC(ma);
   vk_multialloc_add(&ma, &events_array, __typeof__(*events_array), eventCount);
   vk_multialloc_add(&ma, &stage_masks, __typeof__(*stage_masks), eventCount);

   if (!vk_multialloc_alloc(&ma,
                            &cmd_buffer->vk.pool->alloc,
                            VK_SYSTEM_ALLOCATION_SCOPE_OBJECT)) {
      vk_command_buffer_set_error(&cmd_buffer->vk, VK_ERROR_OUT_OF_HOST_MEMORY);
      return;
   }

   result = pvr_cmd_buffer_start_sub_cmd(cmd_buffer, PVR_SUB_CMD_TYPE_EVENT);
   if (result != VK_SUCCESS) {
      vk_free(&cmd_buffer->vk.pool->alloc, events_array);
      return;
   }

   memcpy(events_array, pEvents, sizeof(*events_array) * eventCount);

   for (uint32_t i = 0; i < eventCount; i++) {
      const VkDependencyInfo *info = &pDependencyInfos[i];
      VkPipelineStageFlags2 mask = 0;

      for (uint32_t j = 0; j < info->memoryBarrierCount; j++)
         mask |= info->pMemoryBarriers[j].dstStageMask;

      for (uint32_t j = 0; j < info->bufferMemoryBarrierCount; j++)
         mask |= info->pBufferMemoryBarriers[j].dstStageMask;

      for (uint32_t j = 0; j < info->imageMemoryBarrierCount; j++)
         mask |= info->pImageMemoryBarriers[j].dstStageMask;

      stage_masks[i] = pvr_stage_mask_dst(mask);
   }

   cmd_buffer->state.current_sub_cmd->event = (struct pvr_sub_cmd_event){
      .type = PVR_EVENT_TYPE_WAIT,
      .wait = {
         .count = eventCount,
         .events = events_array,
         .wait_at_stage_masks = stage_masks,
      },
   };

   pvr_cmd_buffer_end_sub_cmd(cmd_buffer);
}
7971 
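/* Timestamp queries are not supported by this driver, so this entry point is
 * never expected to be reached.
 */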
void pvr_CmdWriteTimestamp2(VkCommandBuffer commandBuffer,
                            VkPipelineStageFlags2 stage,
                            VkQueryPool queryPool,
                            uint32_t query)
{
   unreachable("Timestamp queries are not supported.");
}
7979 
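/* vkEndCommandBuffer: closes the last open sub-command and finishes recording.
 * If the command buffer already holds an error, that error is returned without
 * doing any further work.
 */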
VkResult pvr_EndCommandBuffer(VkCommandBuffer commandBuffer)
{
   PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
   struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
   VkResult result;

   if (vk_command_buffer_has_error(&cmd_buffer->vk))
      return vk_command_buffer_end(&cmd_buffer->vk);

   /* TODO: We should be freeing all the resources allocated for recording
    * here.
    */
   util_dynarray_fini(&state->query_indices);

   result = pvr_cmd_buffer_end_sub_cmd(cmd_buffer);
   if (result != VK_SUCCESS)
      pvr_cmd_buffer_set_error_unwarned(cmd_buffer, result);

   return vk_command_buffer_end(&cmd_buffer->vk);
}