1 /*
2  * Copyright © 2022 Imagination Technologies Ltd.
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a copy
5  * of this software and associated documentation files (the "Software"), to deal
6  * in the Software without restriction, including without limitation the rights
7  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
8  * copies of the Software, and to permit persons to whom the Software is
9  * furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21  * SOFTWARE.
22  */
23 
24 #include <assert.h>
25 #include <limits.h>
26 #include <stdbool.h>
27 #include <stddef.h>
28 #include <stdint.h>
29 #include <string.h>
30 #include <vulkan/vulkan.h>
31 
32 #include "hwdef/rogue_hw_defs.h"
33 #include "hwdef/rogue_hw_utils.h"
34 #include "pvr_blit.h"
35 #include "pvr_bo.h"
36 #include "pvr_clear.h"
37 #include "pvr_common.h"
38 #include "pvr_csb.h"
39 #include "pvr_csb_enum_helpers.h"
40 #include "pvr_device_info.h"
41 #include "pvr_formats.h"
42 #include "pvr_hardcode.h"
43 #include "pvr_hw_pass.h"
44 #include "pvr_job_common.h"
45 #include "pvr_job_render.h"
46 #include "pvr_limits.h"
47 #include "pvr_pds.h"
48 #include "pvr_private.h"
49 #include "pvr_tex_state.h"
50 #include "pvr_types.h"
51 #include "usc/pvr_uscgen.h"
52 #include "pvr_winsys.h"
53 #include "util/bitscan.h"
54 #include "util/bitset.h"
55 #include "util/compiler.h"
56 #include "util/list.h"
57 #include "util/macros.h"
58 #include "util/u_dynarray.h"
59 #include "util/u_math.h"
60 #include "util/u_pack_color.h"
61 #include "vk_alloc.h"
62 #include "vk_command_buffer.h"
63 #include "vk_command_pool.h"
64 #include "vk_common_entrypoints.h"
65 #include "vk_format.h"
66 #include "vk_graphics_state.h"
67 #include "vk_log.h"
68 #include "vk_object.h"
69 #include "vk_util.h"
70 
71 /* Structure used to pass data into the
72  * pvr_compute_generate_control_stream() function.
73  */
74 struct pvr_compute_kernel_info {
75    pvr_dev_addr_t indirect_buffer_addr;
76    bool global_offsets_present;
77    uint32_t usc_common_size;
78    uint32_t usc_unified_size;
79    uint32_t pds_temp_size;
80    uint32_t pds_data_size;
81    enum ROGUE_CDMCTRL_USC_TARGET usc_target;
82    bool is_fence;
83    uint32_t pds_data_offset;
84    uint32_t pds_code_offset;
85    enum ROGUE_CDMCTRL_SD_TYPE sd_type;
86    bool usc_common_shared;
87    uint32_t global_size[PVR_WORKGROUP_DIMENSIONS];
88    uint32_t local_size[PVR_WORKGROUP_DIMENSIONS];
89    uint32_t max_instances;
90 };
91 
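/* If the sub-command owns its resources, free them (control streams,
 * suballocated BOs, transfer command lists or event waits, depending on its
 * type), then unlink the sub-command from the command buffer and free it.
 */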
92 static void pvr_cmd_buffer_free_sub_cmd(struct pvr_cmd_buffer *cmd_buffer,
93                                         struct pvr_sub_cmd *sub_cmd)
94 {
95    if (sub_cmd->owned) {
96       switch (sub_cmd->type) {
97       case PVR_SUB_CMD_TYPE_GRAPHICS:
98          util_dynarray_fini(&sub_cmd->gfx.sec_query_indices);
99          pvr_csb_finish(&sub_cmd->gfx.control_stream);
100          pvr_bo_free(cmd_buffer->device, sub_cmd->gfx.terminate_ctrl_stream);
101          pvr_bo_suballoc_free(sub_cmd->gfx.depth_bias_bo);
102          pvr_bo_suballoc_free(sub_cmd->gfx.scissor_bo);
103          break;
104 
105       case PVR_SUB_CMD_TYPE_COMPUTE:
106       case PVR_SUB_CMD_TYPE_OCCLUSION_QUERY:
107          pvr_csb_finish(&sub_cmd->compute.control_stream);
108          break;
109 
110       case PVR_SUB_CMD_TYPE_TRANSFER:
111          list_for_each_entry_safe (struct pvr_transfer_cmd,
112                                    transfer_cmd,
113                                    sub_cmd->transfer.transfer_cmds,
114                                    link) {
115             list_del(&transfer_cmd->link);
116             if (!transfer_cmd->is_deferred_clear)
117                vk_free(&cmd_buffer->vk.pool->alloc, transfer_cmd);
118          }
119          break;
120 
121       case PVR_SUB_CMD_TYPE_EVENT:
122          if (sub_cmd->event.type == PVR_EVENT_TYPE_WAIT)
123             vk_free(&cmd_buffer->vk.pool->alloc, sub_cmd->event.wait.events);
124          break;
125 
126       default:
127          unreachable("Unsupported sub-command type");
128       }
129    }
130 
131    list_del(&sub_cmd->link);
132    vk_free(&cmd_buffer->vk.pool->alloc, sub_cmd);
133 }
134 
135 static void pvr_cmd_buffer_free_sub_cmds(struct pvr_cmd_buffer *cmd_buffer)
136 {
137    list_for_each_entry_safe (struct pvr_sub_cmd,
138                              sub_cmd,
139                              &cmd_buffer->sub_cmds,
140                              link) {
141       pvr_cmd_buffer_free_sub_cmd(cmd_buffer, sub_cmd);
142    }
143 }
144 
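/* Release everything accumulated on the command buffer: render pass
 * attachment and clear-value arrays, query indices, all sub-commands, the
 * tracked suballocated BOs and the scissor/depth-bias/deferred dynamic
 * arrays.
 */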
145 static void pvr_cmd_buffer_free_resources(struct pvr_cmd_buffer *cmd_buffer)
146 {
147    vk_free(&cmd_buffer->vk.pool->alloc,
148            cmd_buffer->state.render_pass_info.attachments);
149    vk_free(&cmd_buffer->vk.pool->alloc,
150            cmd_buffer->state.render_pass_info.clear_values);
151 
152    util_dynarray_fini(&cmd_buffer->state.query_indices);
153 
154    pvr_cmd_buffer_free_sub_cmds(cmd_buffer);
155 
156    list_for_each_entry_safe (struct pvr_suballoc_bo,
157                              suballoc_bo,
158                              &cmd_buffer->bo_list,
159                              link) {
160       list_del(&suballoc_bo->link);
161       pvr_bo_suballoc_free(suballoc_bo);
162    }
163 
164    util_dynarray_fini(&cmd_buffer->deferred_clears);
165    util_dynarray_fini(&cmd_buffer->deferred_csb_commands);
166    util_dynarray_fini(&cmd_buffer->scissor_array);
167    util_dynarray_fini(&cmd_buffer->depth_bias_array);
168 }
169 
170 static void pvr_cmd_buffer_reset(struct vk_command_buffer *vk_cmd_buffer,
171                                  VkCommandBufferResetFlags flags)
172 {
173    struct pvr_cmd_buffer *cmd_buffer =
174       container_of(vk_cmd_buffer, struct pvr_cmd_buffer, vk);
175 
176    /* FIXME: For now we always free all resources as if
177     * VK_COMMAND_BUFFER_RESET_RELEASE_RESOURCES_BIT was set.
178     */
179    pvr_cmd_buffer_free_resources(cmd_buffer);
180 
181    vk_command_buffer_reset(&cmd_buffer->vk);
182 
183    memset(&cmd_buffer->state, 0, sizeof(cmd_buffer->state));
184    memset(&cmd_buffer->scissor_words, 0, sizeof(cmd_buffer->scissor_words));
185 
186    cmd_buffer->usage_flags = 0;
187 }
188 
189 static void pvr_cmd_buffer_destroy(struct vk_command_buffer *vk_cmd_buffer)
190 {
191    struct pvr_cmd_buffer *cmd_buffer =
192       container_of(vk_cmd_buffer, struct pvr_cmd_buffer, vk);
193 
194    pvr_cmd_buffer_free_resources(cmd_buffer);
195    vk_command_buffer_finish(&cmd_buffer->vk);
196    vk_free(&cmd_buffer->vk.pool->alloc, cmd_buffer);
197 }
198 
199 static const struct vk_command_buffer_ops cmd_buffer_ops = {
200    .reset = pvr_cmd_buffer_reset,
201    .destroy = pvr_cmd_buffer_destroy,
202 };
203 
204 static VkResult pvr_cmd_buffer_create(struct pvr_device *device,
205                                       struct vk_command_pool *pool,
206                                       VkCommandBufferLevel level,
207                                       VkCommandBuffer *pCommandBuffer)
208 {
209    struct pvr_cmd_buffer *cmd_buffer;
210    VkResult result;
211 
212    cmd_buffer = vk_zalloc(&pool->alloc,
213                           sizeof(*cmd_buffer),
214                           8U,
215                           VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
216    if (!cmd_buffer)
217       return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
218 
219    result =
220       vk_command_buffer_init(pool, &cmd_buffer->vk, &cmd_buffer_ops, level);
221    if (result != VK_SUCCESS) {
222       vk_free(&pool->alloc, cmd_buffer);
223       return result;
224    }
225 
226    cmd_buffer->device = device;
227 
228    util_dynarray_init(&cmd_buffer->depth_bias_array, NULL);
229    util_dynarray_init(&cmd_buffer->scissor_array, NULL);
230    util_dynarray_init(&cmd_buffer->deferred_csb_commands, NULL);
231    util_dynarray_init(&cmd_buffer->deferred_clears, NULL);
232 
233    list_inithead(&cmd_buffer->sub_cmds);
234    list_inithead(&cmd_buffer->bo_list);
235 
236    *pCommandBuffer = pvr_cmd_buffer_to_handle(cmd_buffer);
237 
238    return VK_SUCCESS;
239 }
240 
241 VkResult
242 pvr_AllocateCommandBuffers(VkDevice _device,
243                            const VkCommandBufferAllocateInfo *pAllocateInfo,
244                            VkCommandBuffer *pCommandBuffers)
245 {
246    VK_FROM_HANDLE(vk_command_pool, pool, pAllocateInfo->commandPool);
247    PVR_FROM_HANDLE(pvr_device, device, _device);
248    VkResult result = VK_SUCCESS;
249    uint32_t i;
250 
251    for (i = 0; i < pAllocateInfo->commandBufferCount; i++) {
252       result = pvr_cmd_buffer_create(device,
253                                      pool,
254                                      pAllocateInfo->level,
255                                      &pCommandBuffers[i]);
256       if (result != VK_SUCCESS)
257          break;
258    }
259 
260    if (result != VK_SUCCESS) {
261       while (i--) {
262          VK_FROM_HANDLE(vk_command_buffer, cmd_buffer, pCommandBuffers[i]);
263          pvr_cmd_buffer_destroy(cmd_buffer);
264       }
265 
266       for (i = 0; i < pAllocateInfo->commandBufferCount; i++)
267          pCommandBuffers[i] = VK_NULL_HANDLE;
268    }
269 
270    return result;
271 }
272 
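/* Record the pipeline stages used by the sub-command type just recorded in
 * every barriers_needed slot, so later barrier handling can tell which stages
 * may still need syncing.
 */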
273 static void pvr_cmd_buffer_update_barriers(struct pvr_cmd_buffer *cmd_buffer,
274                                            enum pvr_sub_cmd_type type)
275 {
276    struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
277    uint32_t barriers;
278 
279    switch (type) {
280    case PVR_SUB_CMD_TYPE_GRAPHICS:
281       barriers = PVR_PIPELINE_STAGE_GEOM_BIT | PVR_PIPELINE_STAGE_FRAG_BIT;
282       break;
283 
284    case PVR_SUB_CMD_TYPE_COMPUTE:
285       barriers = PVR_PIPELINE_STAGE_COMPUTE_BIT;
286       break;
287 
288    case PVR_SUB_CMD_TYPE_OCCLUSION_QUERY:
289    case PVR_SUB_CMD_TYPE_TRANSFER:
290       /* Compute jobs are used for occlusion queries, but to copy the results
291        * we have to sync with transfer jobs, because vkCmdCopyQueryPoolResults()
292        * is deemed a transfer operation by the spec.
293        */
294       barriers = PVR_PIPELINE_STAGE_TRANSFER_BIT;
295       break;
296 
297    case PVR_SUB_CMD_TYPE_EVENT:
298       barriers = 0;
299       break;
300 
301    default:
302       unreachable("Unsupported sub-command type");
303    }
304 
305    for (uint32_t i = 0; i < ARRAY_SIZE(state->barriers_needed); i++)
306       state->barriers_needed[i] |= barriers;
307 }
308 
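/* Upload the depth bias and scissor tables accumulated on the command buffer
 * into per-sub-command GPU buffers, then clear the host-side arrays.
 */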
309 static VkResult
310 pvr_cmd_buffer_upload_tables(struct pvr_device *device,
311                              struct pvr_cmd_buffer *cmd_buffer,
312                              struct pvr_sub_cmd_gfx *const sub_cmd)
313 {
314    const uint32_t cache_line_size =
315       rogue_get_slc_cache_line_size(&device->pdevice->dev_info);
316    VkResult result;
317 
318    assert(!sub_cmd->depth_bias_bo && !sub_cmd->scissor_bo);
319 
320    if (cmd_buffer->depth_bias_array.size > 0) {
321       result =
322          pvr_gpu_upload(device,
323                         device->heaps.general_heap,
324                         util_dynarray_begin(&cmd_buffer->depth_bias_array),
325                         cmd_buffer->depth_bias_array.size,
326                         cache_line_size,
327                         &sub_cmd->depth_bias_bo);
328       if (result != VK_SUCCESS)
329          return result;
330    }
331 
332    if (cmd_buffer->scissor_array.size > 0) {
333       result = pvr_gpu_upload(device,
334                               device->heaps.general_heap,
335                               util_dynarray_begin(&cmd_buffer->scissor_array),
336                               cmd_buffer->scissor_array.size,
337                               cache_line_size,
338                               &sub_cmd->scissor_bo);
339       if (result != VK_SUCCESS)
340          goto err_free_depth_bias_bo;
341    }
342 
343    util_dynarray_clear(&cmd_buffer->depth_bias_array);
344    util_dynarray_clear(&cmd_buffer->scissor_array);
345 
346    return VK_SUCCESS;
347 
348 err_free_depth_bias_bo:
349    pvr_bo_suballoc_free(sub_cmd->depth_bias_bo);
350    sub_cmd->depth_bias_bo = NULL;
351 
352    return result;
353 }
354 
355 static VkResult
356 pvr_cmd_buffer_emit_ppp_state(const struct pvr_cmd_buffer *const cmd_buffer,
357                               struct pvr_csb *const csb)
358 {
359    const struct pvr_framebuffer *const framebuffer =
360       cmd_buffer->state.render_pass_info.framebuffer;
361 
362    assert(csb->stream_type == PVR_CMD_STREAM_TYPE_GRAPHICS ||
363           csb->stream_type == PVR_CMD_STREAM_TYPE_GRAPHICS_DEFERRED);
364 
365    pvr_csb_set_relocation_mark(csb);
366 
367    pvr_csb_emit (csb, VDMCTRL_PPP_STATE0, state0) {
368       state0.addrmsb = framebuffer->ppp_state_bo->dev_addr;
369       state0.word_count = framebuffer->ppp_state_size;
370    }
371 
372    pvr_csb_emit (csb, VDMCTRL_PPP_STATE1, state1) {
373       state1.addrlsb = framebuffer->ppp_state_bo->dev_addr;
374    }
375 
376    pvr_csb_clear_relocation_mark(csb);
377 
378    return csb->status;
379 }
380 
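/* Upload arbitrary data into the general heap and track the suballocation on
 * the command buffer's BO list so it is freed when the command buffer is
 * reset or destroyed.
 */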
381 VkResult
382 pvr_cmd_buffer_upload_general(struct pvr_cmd_buffer *const cmd_buffer,
383                               const void *const data,
384                               const size_t size,
385                               struct pvr_suballoc_bo **const pvr_bo_out)
386 {
387    struct pvr_device *const device = cmd_buffer->device;
388    const uint32_t cache_line_size =
389       rogue_get_slc_cache_line_size(&device->pdevice->dev_info);
390    struct pvr_suballoc_bo *suballoc_bo;
391    VkResult result;
392 
393    result = pvr_gpu_upload(device,
394                            device->heaps.general_heap,
395                            data,
396                            size,
397                            cache_line_size,
398                            &suballoc_bo);
399    if (result != VK_SUCCESS)
400       return pvr_cmd_buffer_set_error_unwarned(cmd_buffer, result);
401 
402    list_add(&suballoc_bo->link, &cmd_buffer->bo_list);
403 
404    *pvr_bo_out = suballoc_bo;
405 
406    return VK_SUCCESS;
407 }
408 
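/* Upload a USC shader binary, aligned to at least the SLC cache line size,
 * and track the suballocation on the command buffer's BO list.
 */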
409 static VkResult
410 pvr_cmd_buffer_upload_usc(struct pvr_cmd_buffer *const cmd_buffer,
411                           const void *const code,
412                           const size_t code_size,
413                           uint64_t code_alignment,
414                           struct pvr_suballoc_bo **const pvr_bo_out)
415 {
416    struct pvr_device *const device = cmd_buffer->device;
417    const uint32_t cache_line_size =
418       rogue_get_slc_cache_line_size(&device->pdevice->dev_info);
419    struct pvr_suballoc_bo *suballoc_bo;
420    VkResult result;
421 
422    code_alignment = MAX2(code_alignment, cache_line_size);
423 
424    result =
425       pvr_gpu_upload_usc(device, code, code_size, code_alignment, &suballoc_bo);
426    if (result != VK_SUCCESS)
427       return pvr_cmd_buffer_set_error_unwarned(cmd_buffer, result);
428 
429    list_add(&suballoc_bo->link, &cmd_buffer->bo_list);
430 
431    *pvr_bo_out = suballoc_bo;
432 
433    return VK_SUCCESS;
434 }
435 
436 VkResult pvr_cmd_buffer_upload_pds(struct pvr_cmd_buffer *const cmd_buffer,
437                                    const uint32_t *data,
438                                    uint32_t data_size_dwords,
439                                    uint32_t data_alignment,
440                                    const uint32_t *code,
441                                    uint32_t code_size_dwords,
442                                    uint32_t code_alignment,
443                                    uint64_t min_alignment,
444                                    struct pvr_pds_upload *const pds_upload_out)
445 {
446    struct pvr_device *const device = cmd_buffer->device;
447    VkResult result;
448 
449    result = pvr_gpu_upload_pds(device,
450                                data,
451                                data_size_dwords,
452                                data_alignment,
453                                code,
454                                code_size_dwords,
455                                code_alignment,
456                                min_alignment,
457                                pds_upload_out);
458    if (result != VK_SUCCESS)
459       return pvr_cmd_buffer_set_error_unwarned(cmd_buffer, result);
460 
461    list_add(&pds_upload_out->pvr_bo->link, &cmd_buffer->bo_list);
462 
463    return VK_SUCCESS;
464 }
465 
466 static inline VkResult
467 pvr_cmd_buffer_upload_pds_data(struct pvr_cmd_buffer *const cmd_buffer,
468                                const uint32_t *data,
469                                uint32_t data_size_dwords,
470                                uint32_t data_alignment,
471                                struct pvr_pds_upload *const pds_upload_out)
472 {
473    return pvr_cmd_buffer_upload_pds(cmd_buffer,
474                                     data,
475                                     data_size_dwords,
476                                     data_alignment,
477                                     NULL,
478                                     0,
479                                     0,
480                                     data_alignment,
481                                     pds_upload_out);
482 }
483 
484 /* pbe_cs_words must be an array of emit_count elements, each containing
485  * ROGUE_NUM_PBESTATE_STATE_WORDS entries.
486  */
487 static VkResult pvr_sub_cmd_gfx_per_job_fragment_programs_create_and_upload(
488    struct pvr_cmd_buffer *const cmd_buffer,
489    const uint32_t emit_count,
490    const uint32_t *pbe_cs_words,
491    struct pvr_pds_upload *const pds_upload_out)
492 {
493    struct pvr_pds_event_program pixel_event_program = {
494       /* No data to DMA, just a DOUTU needed. */
495       .num_emit_word_pairs = 0,
496    };
497    const uint32_t staging_buffer_size =
498       PVR_DW_TO_BYTES(cmd_buffer->device->pixel_event_data_size_in_dwords);
499    const VkAllocationCallbacks *const allocator = &cmd_buffer->vk.pool->alloc;
500    struct pvr_device *const device = cmd_buffer->device;
501    struct pvr_suballoc_bo *usc_eot_program = NULL;
502    struct util_dynarray eot_program_bin;
503    uint32_t *staging_buffer;
504    uint32_t usc_temp_count;
505    VkResult result;
506 
507    assert(emit_count > 0);
508 
509    pvr_uscgen_eot("per-job EOT",
510                   emit_count,
511                   pbe_cs_words,
512                   &usc_temp_count,
513                   &eot_program_bin);
514 
515    result = pvr_cmd_buffer_upload_usc(cmd_buffer,
516                                       eot_program_bin.data,
517                                       eot_program_bin.size,
518                                       4,
519                                       &usc_eot_program);
520 
521    util_dynarray_fini(&eot_program_bin);
522 
523    if (result != VK_SUCCESS)
524       return result;
525 
526    pvr_pds_setup_doutu(&pixel_event_program.task_control,
527                        usc_eot_program->dev_addr.addr,
528                        usc_temp_count,
529                        ROGUE_PDSINST_DOUTU_SAMPLE_RATE_INSTANCE,
530                        false);
531 
532    /* TODO: We could skip allocating this and generate directly into the
533     * device buffer, removing one allocation and memcpy() per job. Would
534     * this noticeably speed things up?
535     */
536    staging_buffer = vk_alloc(allocator,
537                              staging_buffer_size,
538                              8,
539                              VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
540    if (!staging_buffer) {
541       result = vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
542       goto err_free_usc_pixel_program;
543    }
544 
545    /* Generate the data segment. The code segment was uploaded earlier when
546     * setting up the PDS static heap data.
547     */
548    pvr_pds_generate_pixel_event_data_segment(&pixel_event_program,
549                                              staging_buffer,
550                                              &device->pdevice->dev_info);
551 
552    result = pvr_cmd_buffer_upload_pds_data(
553       cmd_buffer,
554       staging_buffer,
555       cmd_buffer->device->pixel_event_data_size_in_dwords,
556       4,
557       pds_upload_out);
558    if (result != VK_SUCCESS)
559       goto err_free_pixel_event_staging_buffer;
560 
561    vk_free(allocator, staging_buffer);
562 
563    return VK_SUCCESS;
564 
565 err_free_pixel_event_staging_buffer:
566    vk_free(allocator, staging_buffer);
567 
568 err_free_usc_pixel_program:
569    list_del(&usc_eot_program->link);
570    pvr_bo_suballoc_free(usc_eot_program);
571 
572    return result;
573 }
574 
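/* Build a minimal control stream that re-emits the framebuffer's PPP state
 * and then terminates. The stream is expected to fit in a single BO.
 */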
575 static VkResult pvr_sub_cmd_gfx_build_terminate_ctrl_stream(
576    struct pvr_device *const device,
577    const struct pvr_cmd_buffer *const cmd_buffer,
578    struct pvr_sub_cmd_gfx *const gfx_sub_cmd)
579 {
580    struct list_head bo_list;
581    struct pvr_csb csb;
582    VkResult result;
583 
584    pvr_csb_init(device, PVR_CMD_STREAM_TYPE_GRAPHICS, &csb);
585 
586    result = pvr_cmd_buffer_emit_ppp_state(cmd_buffer, &csb);
587    if (result != VK_SUCCESS)
588       goto err_csb_finish;
589 
590    result = pvr_csb_emit_terminate(&csb);
591    if (result != VK_SUCCESS)
592       goto err_csb_finish;
593 
594    result = pvr_csb_bake(&csb, &bo_list);
595    if (result != VK_SUCCESS)
596       goto err_csb_finish;
597 
598    /* This is a trivial control stream; there's no reason it should ever
599     * require more memory than a single bo can provide.
600     */
601    assert(list_is_singular(&bo_list));
602    gfx_sub_cmd->terminate_ctrl_stream =
603       list_first_entry(&bo_list, struct pvr_bo, link);
604 
605    return VK_SUCCESS;
606 
607 err_csb_finish:
608    pvr_csb_finish(&csb);
609 
610    return result;
611 }
612 
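/* Fill a combined image + sampler descriptor for sampling an attachment in a
 * load op: the image state is packed from the image view, and the sampler is
 * point-sampled, clamp-to-edge, with non-normalized coordinates.
 */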
613 static VkResult pvr_setup_texture_state_words(
614    struct pvr_device *device,
615    struct pvr_combined_image_sampler_descriptor *descriptor,
616    const struct pvr_image_view *image_view)
617 {
618    const struct pvr_image *image = vk_to_pvr_image(image_view->vk.image);
619    struct pvr_texture_state_info info = {
620       .format = image_view->vk.format,
621       .mem_layout = image->memlayout,
622       .type = image_view->vk.view_type,
623       .is_cube = image_view->vk.view_type == VK_IMAGE_VIEW_TYPE_CUBE ||
624                  image_view->vk.view_type == VK_IMAGE_VIEW_TYPE_CUBE_ARRAY,
625       .tex_state_type = PVR_TEXTURE_STATE_SAMPLE,
626       .extent = image_view->vk.extent,
627       .mip_levels = 1,
628       .sample_count = image_view->vk.image->samples,
629       .stride = image->physical_extent.width,
630       .addr = image->dev_addr,
631    };
632    const uint8_t *const swizzle = pvr_get_format_swizzle(info.format);
633    VkResult result;
634 
635    memcpy(&info.swizzle, swizzle, sizeof(info.swizzle));
636 
637    /* TODO: Can we use image_view->texture_state instead of generating here? */
638    result = pvr_pack_tex_state(device, &info, descriptor->image);
639    if (result != VK_SUCCESS)
640       return result;
641 
642    descriptor->sampler = (union pvr_sampler_descriptor){ 0 };
643 
644    pvr_csb_pack (&descriptor->sampler.data.sampler_word,
645                  TEXSTATE_SAMPLER,
646                  sampler) {
647       sampler.non_normalized_coords = true;
648       sampler.addrmode_v = ROGUE_TEXSTATE_ADDRMODE_CLAMP_TO_EDGE;
649       sampler.addrmode_u = ROGUE_TEXSTATE_ADDRMODE_CLAMP_TO_EDGE;
650       sampler.minfilter = ROGUE_TEXSTATE_FILTER_POINT;
651       sampler.magfilter = ROGUE_TEXSTATE_FILTER_POINT;
652       sampler.dadjust = ROGUE_TEXSTATE_DADJUST_ZERO_UINT;
653    }
654 
655    return VK_SUCCESS;
656 }
657 
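/* Gather texture state words for attachments that are loaded and packed HW
 * clear colours (plus an optional depth clear value) for attachments that are
 * cleared, then upload the clear constants and return their device address.
 */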
658 static VkResult
659 pvr_load_op_constants_create_and_upload(struct pvr_cmd_buffer *cmd_buffer,
660                                         const struct pvr_load_op *load_op,
661                                         pvr_dev_addr_t *const addr_out)
662 {
663    const struct pvr_render_pass_info *render_pass_info =
664       &cmd_buffer->state.render_pass_info;
665    const struct pvr_render_pass *pass = render_pass_info->pass;
666    const struct pvr_renderpass_hwsetup_render *hw_render = load_op->hw_render;
667    const struct pvr_renderpass_colorinit *color_init =
668       &hw_render->color_init[0];
669    const VkClearValue *clear_value =
670       &render_pass_info->clear_values[color_init->index];
671    struct pvr_suballoc_bo *clear_bo;
672    uint32_t attachment_count;
673    bool has_depth_clear;
674    bool has_depth_load;
675    VkResult result;
676 
677    /* These are currently only set up and never used. They will need to be
678     * uploaded into a buffer based on some compiler info.
679     */
680    /* TODO: Remove the above comment once the compiler is hooked up and we're
681     * setting up + uploading the buffer.
682     */
683    struct pvr_combined_image_sampler_descriptor
684       texture_states[PVR_LOAD_OP_CLEARS_LOADS_MAX_RTS];
685    uint32_t texture_count = 0;
686    uint32_t hw_clear_value[PVR_LOAD_OP_CLEARS_LOADS_MAX_RTS *
687                            PVR_CLEAR_COLOR_ARRAY_SIZE];
688    uint32_t next_clear_consts = 0;
689 
690    if (load_op->is_hw_object)
691       attachment_count = load_op->hw_render->color_init_count;
692    else
693       attachment_count = load_op->subpass->color_count;
694 
695    for (uint32_t i = 0; i < attachment_count; i++) {
696       struct pvr_image_view *image_view;
697       uint32_t attachment_idx;
698 
699       if (load_op->is_hw_object)
700          attachment_idx = load_op->hw_render->color_init[i].index;
701       else
702          attachment_idx = load_op->subpass->color_attachments[i];
703 
704       image_view = render_pass_info->attachments[attachment_idx];
705 
706       assert((load_op->clears_loads_state.rt_load_mask &
707               load_op->clears_loads_state.rt_clear_mask) == 0);
708       if (load_op->clears_loads_state.rt_load_mask & BITFIELD_BIT(i)) {
709          result = pvr_setup_texture_state_words(cmd_buffer->device,
710                                                 &texture_states[texture_count],
711                                                 image_view);
712          if (result != VK_SUCCESS)
713             return result;
714 
715          texture_count++;
716       } else if (load_op->clears_loads_state.rt_clear_mask & BITFIELD_BIT(i)) {
717          const uint32_t accum_fmt_size =
718             pvr_get_pbe_accum_format_size_in_bytes(image_view->vk.format);
719 
720          assert(next_clear_consts +
721                    vk_format_get_blocksize(image_view->vk.format) <=
722                 ARRAY_SIZE(hw_clear_value));
723 
724          /* FIXME: do this at the point we store the clear values? */
725          pvr_get_hw_clear_color(image_view->vk.format,
726                                 clear_value->color,
727                                 &hw_clear_value[next_clear_consts]);
728 
729          next_clear_consts += DIV_ROUND_UP(accum_fmt_size, sizeof(uint32_t));
730       }
731    }
732 
733    has_depth_load = false;
734    for (uint32_t i = 0;
735         i < ARRAY_SIZE(load_op->clears_loads_state.dest_vk_format);
736         i++) {
737       if (load_op->clears_loads_state.dest_vk_format[i] ==
738           VK_FORMAT_D32_SFLOAT) {
739          has_depth_load = true;
740          break;
741       }
742    }
743 
744    has_depth_clear = load_op->clears_loads_state.depth_clear_to_reg != -1;
745 
746    assert(!(has_depth_clear && has_depth_load));
747 
748    if (has_depth_load) {
749       const struct pvr_render_pass_attachment *attachment;
750       const struct pvr_image_view *image_view;
751 
752       assert(load_op->subpass->depth_stencil_attachment !=
753              VK_ATTACHMENT_UNUSED);
754       assert(!load_op->is_hw_object);
755       attachment =
756          &pass->attachments[load_op->subpass->depth_stencil_attachment];
757 
758       image_view = render_pass_info->attachments[attachment->index];
759 
760       result = pvr_setup_texture_state_words(cmd_buffer->device,
761                                              &texture_states[texture_count],
762                                              image_view);
763       if (result != VK_SUCCESS)
764          return result;
765 
766       texture_count++;
767    } else if (has_depth_clear) {
768       const struct pvr_render_pass_attachment *attachment;
769       VkClearValue clear_value;
770 
771       assert(load_op->subpass->depth_stencil_attachment !=
772              VK_ATTACHMENT_UNUSED);
773       attachment =
774          &pass->attachments[load_op->subpass->depth_stencil_attachment];
775 
776       clear_value = render_pass_info->clear_values[attachment->index];
777 
778       assert(next_clear_consts < ARRAY_SIZE(hw_clear_value));
779       hw_clear_value[next_clear_consts++] = fui(clear_value.depthStencil.depth);
780    }
781 
782    result = pvr_cmd_buffer_upload_general(cmd_buffer,
783                                           &hw_clear_value[0],
784                                           sizeof(hw_clear_value),
785                                           &clear_bo);
786    if (result != VK_SUCCESS)
787       return result;
788 
789    *addr_out = clear_bo->dev_addr;
790 
791    return VK_SUCCESS;
792 }
793 
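/* Build and upload the PDS data segment that DMAs the load-op constants into
 * the USC common store (a single DOUTD kick).
 */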
794 static VkResult pvr_load_op_pds_data_create_and_upload(
795    struct pvr_cmd_buffer *cmd_buffer,
796    const struct pvr_load_op *load_op,
797    pvr_dev_addr_t constants_addr,
798    struct pvr_pds_upload *const pds_upload_out)
799 {
800    struct pvr_device *device = cmd_buffer->device;
801    const struct pvr_device_info *dev_info = &device->pdevice->dev_info;
802    struct pvr_pds_pixel_shader_sa_program program = { 0 };
803    uint32_t staging_buffer_size;
804    uint32_t *staging_buffer;
805    VkResult result;
806 
807    program.num_texture_dma_kicks = 1;
808 
809    pvr_csb_pack (&program.texture_dma_address[0],
810                  PDSINST_DOUT_FIELDS_DOUTD_SRC0,
811                  value) {
812       value.sbase = constants_addr;
813    }
814 
815    pvr_csb_pack (&program.texture_dma_control[0],
816                  PDSINST_DOUT_FIELDS_DOUTD_SRC1,
817                  value) {
818       value.dest = ROGUE_PDSINST_DOUTD_DEST_COMMON_STORE;
819       value.a0 = load_op->shareds_dest_offset;
820       value.bsize = load_op->shareds_count;
821    }
822 
823    pvr_pds_set_sizes_pixel_shader_sa_texture_data(&program, dev_info);
824 
825    staging_buffer_size = PVR_DW_TO_BYTES(program.data_size);
826 
827    staging_buffer = vk_alloc(&cmd_buffer->vk.pool->alloc,
828                              staging_buffer_size,
829                              8,
830                              VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
831    if (!staging_buffer)
832       return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
833 
834    pvr_pds_generate_pixel_shader_sa_texture_state_data(&program,
835                                                        staging_buffer,
836                                                        dev_info);
837 
838    result = pvr_cmd_buffer_upload_pds_data(cmd_buffer,
839                                            staging_buffer,
840                                            program.data_size,
841                                            1,
842                                            pds_upload_out);
843    if (result != VK_SUCCESS) {
844       vk_free(&cmd_buffer->vk.pool->alloc, staging_buffer);
845       return result;
846    }
847 
848    vk_free(&cmd_buffer->vk.pool->alloc, staging_buffer);
849 
850    return VK_SUCCESS;
851 }
852 
853 /* FIXME: Should this function be specific to the HW background object, in
854  * which case its name should be changed, or should it have the load op
855  * structure passed in?
856  */
857 static VkResult
858 pvr_load_op_data_create_and_upload(struct pvr_cmd_buffer *cmd_buffer,
859                                    const struct pvr_load_op *load_op,
860                                    struct pvr_pds_upload *const pds_upload_out)
861 {
862    pvr_dev_addr_t constants_addr;
863    VkResult result;
864 
865    result = pvr_load_op_constants_create_and_upload(cmd_buffer,
866                                                     load_op,
867                                                     &constants_addr);
868    if (result != VK_SUCCESS)
869       return result;
870 
871    return pvr_load_op_pds_data_create_and_upload(cmd_buffer,
872                                                  load_op,
873                                                  constants_addr,
874                                                  pds_upload_out);
875 }
876 
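/* Pack the CR_PDS_BGRND0/1/3 register values describing the load-op PDS
 * programs: shader and texture-state code addresses, texture data address,
 * and the shared/texture-state/temp size fields.
 */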
877 static void pvr_pds_bgnd_pack_state(
878    const struct pvr_load_op *load_op,
879    const struct pvr_pds_upload *load_op_program,
880    uint64_t pds_reg_values[static const ROGUE_NUM_CR_PDS_BGRND_WORDS])
881 {
882    pvr_csb_pack (&pds_reg_values[0], CR_PDS_BGRND0_BASE, value) {
883       value.shader_addr = PVR_DEV_ADDR(load_op->pds_frag_prog.data_offset);
884       value.texunicode_addr =
885          PVR_DEV_ADDR(load_op->pds_tex_state_prog.code_offset);
886    }
887 
888    pvr_csb_pack (&pds_reg_values[1], CR_PDS_BGRND1_BASE, value) {
889       value.texturedata_addr = PVR_DEV_ADDR(load_op_program->data_offset);
890    }
891 
892    pvr_csb_pack (&pds_reg_values[2], CR_PDS_BGRND3_SIZEINFO, value) {
893       value.usc_sharedsize =
894          DIV_ROUND_UP(load_op->const_shareds_count,
895                       ROGUE_CR_PDS_BGRND3_SIZEINFO_USC_SHAREDSIZE_UNIT_SIZE);
896       value.pds_texturestatesize = DIV_ROUND_UP(
897          load_op_program->data_size,
898          ROGUE_CR_PDS_BGRND3_SIZEINFO_PDS_TEXTURESTATESIZE_UNIT_SIZE);
899       value.pds_tempsize =
900          DIV_ROUND_UP(load_op->temps_count,
901                       ROGUE_CR_PDS_BGRND3_SIZEINFO_PDS_TEMPSIZE_UNIT_SIZE);
902    }
903 }
904 
905 /**
906  * \brief Calculates the stride in pixels based on the pitch in bytes and pixel
907  * format.
908  *
909  * \param[in] pitch     Width pitch in bytes.
910  * \param[in] vk_format Vulkan image format.
911  * \return Stride in pixels.
912  */
913 static inline uint32_t pvr_stride_from_pitch(uint32_t pitch, VkFormat vk_format)
914 {
915    const unsigned int cpp = vk_format_get_blocksize(vk_format);
916 
917    assert(pitch % cpp == 0);
918 
919    return pitch / cpp;
920 }
921 
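/* Pack the PBE state words and register words for one render target: surface
 * format, address and stride, the source output register or memory offset,
 * and the render-area clip rectangle.
 */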
922 static void pvr_setup_pbe_state(
923    const struct pvr_device_info *dev_info,
924    const struct pvr_framebuffer *framebuffer,
925    uint32_t mrt_index,
926    const struct usc_mrt_resource *mrt_resource,
927    const struct pvr_image_view *const iview,
928    const VkRect2D *render_area,
929    const bool down_scale,
930    const uint32_t samples,
931    uint32_t pbe_cs_words[static const ROGUE_NUM_PBESTATE_STATE_WORDS],
932    uint64_t pbe_reg_words[static const ROGUE_NUM_PBESTATE_REG_WORDS])
933 {
934    const struct pvr_image *image = pvr_image_view_get_image(iview);
935    uint32_t level_pitch = image->mip_levels[iview->vk.base_mip_level].pitch;
936 
937    struct pvr_pbe_surf_params surface_params;
938    struct pvr_pbe_render_params render_params;
939    bool with_packed_usc_channel;
940    const uint8_t *swizzle;
941    uint32_t position;
942 
943    /* down_scale should be true when performing a resolve, in which case there
944     * should be more than one sample.
945     */
946    assert((down_scale && samples > 1U) || (!down_scale && samples == 1U));
947 
948    /* Setup surface parameters. */
949 
950    if (PVR_HAS_FEATURE(dev_info, usc_f16sop_u8)) {
951       with_packed_usc_channel = vk_format_is_unorm(iview->vk.format) ||
952                                 vk_format_is_snorm(iview->vk.format);
953    } else {
954       with_packed_usc_channel = false;
955    }
956 
957    swizzle = pvr_get_format_swizzle(iview->vk.format);
958    memcpy(surface_params.swizzle, swizzle, sizeof(surface_params.swizzle));
959 
960    pvr_pbe_get_src_format_and_gamma(iview->vk.format,
961                                     PVR_PBE_GAMMA_NONE,
962                                     with_packed_usc_channel,
963                                     &surface_params.source_format,
964                                     &surface_params.gamma);
965 
966    surface_params.is_normalized =
967       pvr_vk_format_is_fully_normalized(iview->vk.format);
968    surface_params.pbe_packmode = pvr_get_pbe_packmode(iview->vk.format);
969    surface_params.nr_components = vk_format_get_nr_components(iview->vk.format);
970 
971    /* FIXME: Should we have an inline function to return the address of a mip
972     * level?
973     */
974    surface_params.addr =
975       PVR_DEV_ADDR_OFFSET(image->vma->dev_addr,
976                           image->mip_levels[iview->vk.base_mip_level].offset);
977    surface_params.addr =
978       PVR_DEV_ADDR_OFFSET(surface_params.addr,
979                           iview->vk.base_array_layer * image->layer_size);
980 
981    surface_params.mem_layout = image->memlayout;
982    surface_params.stride = pvr_stride_from_pitch(level_pitch, iview->vk.format);
983    surface_params.depth = iview->vk.extent.depth;
984    surface_params.width = iview->vk.extent.width;
985    surface_params.height = iview->vk.extent.height;
986    surface_params.z_only_render = false;
987    surface_params.down_scale = down_scale;
988 
989    /* Setup render parameters. */
990 
991    if (mrt_resource->type == USC_MRT_RESOURCE_TYPE_MEMORY) {
992       position = mrt_resource->mem.offset_dw;
993    } else {
994       assert(mrt_resource->type == USC_MRT_RESOURCE_TYPE_OUTPUT_REG);
995       assert(mrt_resource->reg.offset == 0);
996 
997       position = mrt_resource->reg.output_reg;
998    }
999 
1000    assert(position <= 3 || PVR_HAS_FEATURE(dev_info, eight_output_registers));
1001 
1002    switch (position) {
1003    case 0:
1004    case 4:
1005       render_params.source_start = PVR_PBE_STARTPOS_BIT0;
1006       break;
1007    case 1:
1008    case 5:
1009       render_params.source_start = PVR_PBE_STARTPOS_BIT32;
1010       break;
1011    case 2:
1012    case 6:
1013       render_params.source_start = PVR_PBE_STARTPOS_BIT64;
1014       break;
1015    case 3:
1016    case 7:
1017       render_params.source_start = PVR_PBE_STARTPOS_BIT96;
1018       break;
1019    default:
1020       assert(!"Invalid output register");
1021       break;
1022    }
1023 
1024 #define PVR_DEC_IF_NOT_ZERO(_v) (((_v) > 0) ? (_v)-1 : 0)
1025 
1026    render_params.min_x_clip = MAX2(0, render_area->offset.x);
1027    render_params.min_y_clip = MAX2(0, render_area->offset.y);
1028    render_params.max_x_clip = MIN2(
1029       framebuffer->width - 1,
1030       PVR_DEC_IF_NOT_ZERO(render_area->offset.x + render_area->extent.width));
1031    render_params.max_y_clip = MIN2(
1032       framebuffer->height - 1,
1033       PVR_DEC_IF_NOT_ZERO(render_area->offset.y + render_area->extent.height));
1034 
1035 #undef PVR_DEC_IF_NOT_ZERO
1036 
1037    render_params.slice = 0;
1038    render_params.mrt_index = mrt_index;
1039 
1040    pvr_pbe_pack_state(dev_info,
1041                       &surface_params,
1042                       &render_params,
1043                       pbe_cs_words,
1044                       pbe_reg_words);
1045 }
1046 
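/* Framebuffer render targets are stored per sample count; index by log2 of
 * the HW render's sample count.
 */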
1047 static struct pvr_render_target *
1048 pvr_get_render_target(const struct pvr_render_pass *pass,
1049                       const struct pvr_framebuffer *framebuffer,
1050                       uint32_t idx)
1051 {
1052    const struct pvr_renderpass_hwsetup_render *hw_render =
1053       &pass->hw_setup->renders[idx];
1054    uint32_t rt_idx = 0;
1055 
1056    switch (hw_render->sample_count) {
1057    case 1:
1058    case 2:
1059    case 4:
1060    case 8:
1061       rt_idx = util_logbase2(hw_render->sample_count);
1062       break;
1063 
1064    default:
1065       unreachable("Unsupported sample count");
1066       break;
1067    }
1068 
1069    return &framebuffer->render_targets[rt_idx];
1070 }
1071 
1072 static uint32_t
1073 pvr_pass_get_pixel_output_width(const struct pvr_render_pass *pass,
1074                                 uint32_t idx,
1075                                 const struct pvr_device_info *dev_info)
1076 {
1077    const struct pvr_renderpass_hwsetup_render *hw_render =
1078       &pass->hw_setup->renders[idx];
1079    /* Default value based on the maximum value found in all existing cores. The
1080     * maximum is used as this is being treated as a lower bound, making it a
1081     * "safer" choice than the minimum value found in all existing cores.
1082     */
1083    const uint32_t min_output_regs =
1084       PVR_GET_FEATURE_VALUE(dev_info, usc_min_output_registers_per_pix, 2U);
1085    const uint32_t width = MAX2(hw_render->output_regs_count, min_output_regs);
1086 
1087    return util_next_power_of_two(width);
1088 }
1089 
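/* ZLS (depth/stencil load/store) is required if any depth or stencil load or
 * store is requested for the attachment.
 */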
1090 static inline bool
1091 pvr_ds_attachment_requires_zls(const struct pvr_ds_attachment *attachment)
1092 {
1093    bool zls_used;
1094 
1095    zls_used = attachment->load.d || attachment->load.s;
1096    zls_used |= attachment->store.d || attachment->store.s;
1097 
1098    return zls_used;
1099 }
1100 
1101 /**
1102  * \brief If depth and/or stencil attachment dimensions are not tile-aligned,
1103  * then we may need to insert some additional transfer subcommands.
1104  *
1105  * It's worth noting that we check whether the dimensions are smaller than a
1106  * tile here, rather than checking whether they're tile-aligned - this relies
1107  * on the assumption that we can safely use any attachment with dimensions
1108  * larger than a tile. If the attachment is twiddled, it will be over-allocated
1109  * to the nearest power-of-two (which will be tile-aligned). If the attachment
1110  * is not twiddled, we don't need to worry about tile-alignment at all.
1111  */
1112 static bool pvr_sub_cmd_gfx_requires_ds_subtile_alignment(
1113    const struct pvr_device_info *dev_info,
1114    const struct pvr_render_job *job)
1115 {
1116    const struct pvr_image *const ds_image =
1117       pvr_image_view_get_image(job->ds.iview);
1118    uint32_t zls_tile_size_x;
1119    uint32_t zls_tile_size_y;
1120 
1121    rogue_get_zls_tile_size_xy(dev_info, &zls_tile_size_x, &zls_tile_size_y);
1122 
1123    if (ds_image->physical_extent.width >= zls_tile_size_x &&
1124        ds_image->physical_extent.height >= zls_tile_size_y) {
1125       return false;
1126    }
1127 
1128    /* If we have the zls_subtile feature, we can skip the alignment iff:
1129     *  - The attachment is not multisampled, and
1130     *  - The depth and stencil attachments are the same.
1131     */
1132    if (PVR_HAS_FEATURE(dev_info, zls_subtile) &&
1133        ds_image->vk.samples == VK_SAMPLE_COUNT_1_BIT &&
1134        job->has_stencil_attachment == job->has_depth_attachment) {
1135       return false;
1136    }
1137 
1138    /* No ZLS functions enabled; nothing to do. */
1139    if ((!job->has_depth_attachment && !job->has_stencil_attachment) ||
1140        !pvr_ds_attachment_requires_zls(&job->ds)) {
1141       return false;
1142    }
1143 
1144    return true;
1145 }
1146 
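/* Work around sub-tile-sized depth/stencil attachments: copy the attachment
 * into a tile-aligned scratch buffer before the render and back out
 * afterwards, using serialized transfer sub-commands placed around the
 * graphics sub-command, and point the ZLS at the scratch buffer instead.
 */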
1147 static VkResult
1148 pvr_sub_cmd_gfx_align_ds_subtiles(struct pvr_cmd_buffer *const cmd_buffer,
1149                                   struct pvr_sub_cmd_gfx *const gfx_sub_cmd)
1150 {
1151    struct pvr_sub_cmd *const prev_sub_cmd =
1152       container_of(gfx_sub_cmd, struct pvr_sub_cmd, gfx);
1153    struct pvr_ds_attachment *const ds = &gfx_sub_cmd->job.ds;
1154    const struct pvr_image *const ds_image = pvr_image_view_get_image(ds->iview);
1155    const VkFormat copy_format = pvr_get_raw_copy_format(ds_image->vk.format);
1156 
1157    struct pvr_suballoc_bo *buffer;
1158    uint32_t buffer_layer_size;
1159    VkBufferImageCopy2 region;
1160    VkExtent2D zls_tile_size;
1161    VkExtent2D rounded_size;
1162    uint32_t buffer_size;
1163    VkExtent2D scale;
1164    VkResult result;
1165 
1166    /* The operations below assume the last command in the buffer was the target
1167     * gfx subcommand. Assert that this is the case.
1168     */
1169    assert(list_last_entry(&cmd_buffer->sub_cmds, struct pvr_sub_cmd, link) ==
1170           prev_sub_cmd);
1171    assert(prev_sub_cmd == cmd_buffer->state.current_sub_cmd);
1172 
1173    if (!pvr_ds_attachment_requires_zls(ds))
1174       return VK_SUCCESS;
1175 
1176    rogue_get_zls_tile_size_xy(&cmd_buffer->device->pdevice->dev_info,
1177                               &zls_tile_size.width,
1178                               &zls_tile_size.height);
1179    rogue_get_isp_scale_xy_from_samples(ds_image->vk.samples,
1180                                        &scale.width,
1181                                        &scale.height);
1182 
1183    rounded_size = (VkExtent2D){
1184       .width = ALIGN_POT(ds_image->physical_extent.width, zls_tile_size.width),
1185       .height =
1186          ALIGN_POT(ds_image->physical_extent.height, zls_tile_size.height),
1187    };
1188 
1189    buffer_layer_size = vk_format_get_blocksize(ds_image->vk.format) *
1190                        rounded_size.width * rounded_size.height * scale.width *
1191                        scale.height;
1192 
1193    if (ds->iview->vk.layer_count > 1)
1194       buffer_layer_size = ALIGN_POT(buffer_layer_size, ds_image->alignment);
1195 
1196    buffer_size = buffer_layer_size * ds->iview->vk.layer_count;
1197 
1198    result = pvr_cmd_buffer_alloc_mem(cmd_buffer,
1199                                      cmd_buffer->device->heaps.general_heap,
1200                                      buffer_size,
1201                                      &buffer);
1202    if (result != VK_SUCCESS)
1203       return result;
1204 
1205    region = (VkBufferImageCopy2){
1206       .sType = VK_STRUCTURE_TYPE_BUFFER_IMAGE_COPY_2,
1207       .pNext = NULL,
1208       .bufferOffset = 0,
1209       .bufferRowLength = rounded_size.width,
1210       .bufferImageHeight = 0,
1211       .imageSubresource = {
1212          .aspectMask = VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT,
1213          .mipLevel = ds->iview->vk.base_mip_level,
1214          .baseArrayLayer = ds->iview->vk.base_array_layer,
1215          .layerCount = ds->iview->vk.layer_count,
1216       },
1217       .imageOffset = { 0 },
1218       .imageExtent = {
1219          .width = ds->iview->vk.extent.width,
1220          .height = ds->iview->vk.extent.height,
1221          .depth = 1,
1222       },
1223    };
1224 
1225    if (ds->load.d || ds->load.s) {
1226       struct pvr_sub_cmd *new_sub_cmd;
1227 
1228       cmd_buffer->state.current_sub_cmd = NULL;
1229 
1230       result =
1231          pvr_cmd_buffer_start_sub_cmd(cmd_buffer, PVR_SUB_CMD_TYPE_TRANSFER);
1232       if (result != VK_SUCCESS)
1233          return result;
1234 
1235       new_sub_cmd = cmd_buffer->state.current_sub_cmd;
1236 
1237       result = pvr_copy_image_to_buffer_region_format(cmd_buffer,
1238                                                       ds_image,
1239                                                       buffer->dev_addr,
1240                                                       &region,
1241                                                       copy_format,
1242                                                       copy_format);
1243       if (result != VK_SUCCESS)
1244          return result;
1245 
1246       new_sub_cmd->transfer.serialize_with_frag = true;
1247 
1248       result = pvr_cmd_buffer_end_sub_cmd(cmd_buffer);
1249       if (result != VK_SUCCESS)
1250          return result;
1251 
1252       /* Now we have to fiddle with cmd_buffer to place this transfer command
1253        * *before* the target gfx subcommand.
1254        *
1255        * Note the doc for list_move_to() is subtly wrong - item is placed
1256        * directly *after* loc in the list, not "in front of".
1257        */
1258       list_move_to(&new_sub_cmd->link, prev_sub_cmd->link.prev);
1259 
1260       cmd_buffer->state.current_sub_cmd = prev_sub_cmd;
1261    }
1262 
1263    if (ds->store.d || ds->store.s) {
1264       cmd_buffer->state.current_sub_cmd = NULL;
1265 
1266       result =
1267          pvr_cmd_buffer_start_sub_cmd(cmd_buffer, PVR_SUB_CMD_TYPE_TRANSFER);
1268       if (result != VK_SUCCESS)
1269          return result;
1270 
1271       result = pvr_copy_buffer_to_image_region_format(cmd_buffer,
1272                                                       buffer->dev_addr,
1273                                                       ds_image,
1274                                                       &region,
1275                                                       copy_format,
1276                                                       copy_format,
1277                                                       0);
1278       if (result != VK_SUCCESS)
1279          return result;
1280 
1281       cmd_buffer->state.current_sub_cmd->transfer.serialize_with_frag = true;
1282 
1283       result = pvr_cmd_buffer_end_sub_cmd(cmd_buffer);
1284       if (result != VK_SUCCESS)
1285          return result;
1286 
1287       cmd_buffer->state.current_sub_cmd = prev_sub_cmd;
1288    }
1289 
1290    /* Finally, patch up the target graphics sub_cmd to use the correctly-strided
1291     * buffer.
1292     */
1293    ds->has_alignment_transfers = true;
1294    ds->addr = buffer->dev_addr;
1295    ds->physical_extent = rounded_size;
1296 
1297    gfx_sub_cmd->wait_on_previous_transfer = true;
1298 
1299    return VK_SUCCESS;
1300 }
1301 
1302 struct pvr_emit_state {
1303    uint32_t pbe_cs_words[PVR_MAX_COLOR_ATTACHMENTS]
1304                         [ROGUE_NUM_PBESTATE_STATE_WORDS];
1305 
1306    uint64_t pbe_reg_words[PVR_MAX_COLOR_ATTACHMENTS]
1307                          [ROGUE_NUM_PBESTATE_REG_WORDS];
1308 
1309    uint32_t emit_count;
1310 };
1311 
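/* Fill one set of PBE words per EOT surface of the HW render (output-register
 * resources first, then memory resources). If there are no EOT surfaces, a
 * single "empty tile" emit is produced instead.
 */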
1312 static void
1313 pvr_setup_emit_state(const struct pvr_device_info *dev_info,
1314                      const struct pvr_renderpass_hwsetup_render *hw_render,
1315                      struct pvr_render_pass_info *render_pass_info,
1316                      struct pvr_emit_state *emit_state)
1317 {
1318    assert(hw_render->pbe_emits <= PVR_NUM_PBE_EMIT_REGS);
1319 
1320    if (hw_render->eot_surface_count == 0) {
1321       emit_state->emit_count = 1;
1322       pvr_csb_pack (&emit_state->pbe_cs_words[0][1],
1323                     PBESTATE_STATE_WORD1,
1324                     state) {
1325          state.emptytile = true;
1326       }
1327       return;
1328    }
1329 
1330    static_assert(USC_MRT_RESOURCE_TYPE_OUTPUT_REG + 1 ==
1331                     USC_MRT_RESOURCE_TYPE_MEMORY,
1332                  "The loop below needs adjusting.");
1333 
1334    emit_state->emit_count = 0;
1335    for (uint32_t resource_type = USC_MRT_RESOURCE_TYPE_OUTPUT_REG;
1336         resource_type <= USC_MRT_RESOURCE_TYPE_MEMORY;
1337         resource_type++) {
1338       for (uint32_t i = 0; i < hw_render->eot_surface_count; i++) {
1339          const struct pvr_framebuffer *framebuffer =
1340             render_pass_info->framebuffer;
1341          const struct pvr_renderpass_hwsetup_eot_surface *surface =
1342             &hw_render->eot_surfaces[i];
1343          const struct pvr_image_view *iview =
1344             render_pass_info->attachments[surface->attachment_idx];
1345          const struct usc_mrt_resource *mrt_resource =
1346             &hw_render->eot_setup.mrt_resources[surface->mrt_idx];
1347          uint32_t samples = 1;
1348 
1349          if (mrt_resource->type != resource_type)
1350             continue;
1351 
1352          if (surface->need_resolve) {
1353             const struct pvr_image_view *resolve_src =
1354                render_pass_info->attachments[surface->src_attachment_idx];
1355 
1356             /* Attachments that are the destination of resolve operations must
1357              * be loaded before their next use.
1358              */
1359             render_pass_info->enable_bg_tag = true;
1360             render_pass_info->process_empty_tiles = true;
1361 
1362             if (surface->resolve_type != PVR_RESOLVE_TYPE_PBE)
1363                continue;
1364 
1365             samples = (uint32_t)resolve_src->vk.image->samples;
1366          }
1367 
1368          assert(emit_state->emit_count < ARRAY_SIZE(emit_state->pbe_cs_words));
1369          assert(emit_state->emit_count < ARRAY_SIZE(emit_state->pbe_reg_words));
1370 
1371          pvr_setup_pbe_state(dev_info,
1372                              framebuffer,
1373                              emit_state->emit_count,
1374                              mrt_resource,
1375                              iview,
1376                              &render_pass_info->render_area,
1377                              surface->need_resolve,
1378                              samples,
1379                              emit_state->pbe_cs_words[emit_state->emit_count],
1380                              emit_state->pbe_reg_words[emit_state->emit_count]);
1381          emit_state->emit_count += 1;
1382       }
1383    }
1384 
1385    assert(emit_state->emit_count == hw_render->pbe_emits);
1386 }
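
/* Editor's note: a minimal sketch (kept out of the build) of the emit
 * ordering implemented by pvr_setup_emit_state() above. The double loop
 * walks the MRT resource types in order, so every OUTPUT_REG surface is
 * emitted before any MEMORY surface, while surface order is preserved
 * within each type. The types and names below are illustrative stand-ins,
 * not driver API.
 */
#if 0
static unsigned example_emit_order(const unsigned *surface_types,
                                   unsigned surface_count,
                                   unsigned *emit_order_out)
{
   unsigned emit_count = 0;

   /* 0 stands in for OUTPUT_REG, 1 for MEMORY. */
   for (unsigned type = 0; type <= 1; type++) {
      for (unsigned i = 0; i < surface_count; i++) {
         if (surface_types[i] == type)
            emit_order_out[emit_count++] = i;
      }
   }

   return emit_count;
}
#endif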
1387 
1388 static inline bool
1389 pvr_is_render_area_tile_aligned(const struct pvr_cmd_buffer *cmd_buffer,
1390                                 const struct pvr_image_view *iview)
1391 {
1392    const VkRect2D *render_area =
1393       &cmd_buffer->state.render_pass_info.render_area;
1394 
1395    return render_area->offset.x == 0 && render_area->offset.y == 0 &&
1396           render_area->extent.height == iview->vk.extent.height &&
1397           render_area->extent.width == iview->vk.extent.width;
1398 }
1399 
1400 static VkResult pvr_sub_cmd_gfx_job_init(const struct pvr_device_info *dev_info,
1401                                          struct pvr_cmd_buffer *cmd_buffer,
1402                                          struct pvr_sub_cmd_gfx *sub_cmd)
1403 {
1404    static const VkClearDepthStencilValue default_ds_clear_value = {
1405       .depth = 1.0f,
1406       .stencil = 0xFFFFFFFF,
1407    };
1408 
1409    const struct vk_dynamic_graphics_state *dynamic_state =
1410       &cmd_buffer->vk.dynamic_graphics_state;
1411    struct pvr_render_pass_info *render_pass_info =
1412       &cmd_buffer->state.render_pass_info;
1413    const struct pvr_renderpass_hwsetup_render *hw_render =
1414       &render_pass_info->pass->hw_setup->renders[sub_cmd->hw_render_idx];
1415    struct pvr_render_job *job = &sub_cmd->job;
1416    struct pvr_pds_upload pds_pixel_event_program;
1417    struct pvr_framebuffer *framebuffer = render_pass_info->framebuffer;
1418    struct pvr_spm_bgobj_state *spm_bgobj_state =
1419       &framebuffer->spm_bgobj_state_per_render[sub_cmd->hw_render_idx];
1420    struct pvr_render_target *render_target;
1421    VkResult result;
1422 
1423    if (sub_cmd->barrier_store) {
1424       /* There can only ever be one frag job running on the hardware at any one
1425        * time, and a context switch is not allowed mid-tile, so instead of
1426        * allocating a new scratch buffer we can reuse the SPM scratch buffer to
1427        * perform the store.
1428        * So use the SPM EOT program with the SPM PBE reg words in order to store
1429        * the render to the SPM scratch buffer.
1430        */
1431 
1432       memcpy(job->pbe_reg_words,
1433              &framebuffer->spm_eot_state_per_render[0].pbe_reg_words,
1434              sizeof(job->pbe_reg_words));
1435       job->pds_pixel_event_data_offset =
1436          framebuffer->spm_eot_state_per_render[0]
1437             .pixel_event_program_data_offset;
1438    } else {
1439       struct pvr_emit_state emit_state = { 0 };
1440 
1441       pvr_setup_emit_state(dev_info, hw_render, render_pass_info, &emit_state);
1442 
1443       memcpy(job->pbe_reg_words,
1444              emit_state.pbe_reg_words,
1445              sizeof(job->pbe_reg_words));
1446 
1447       result = pvr_sub_cmd_gfx_per_job_fragment_programs_create_and_upload(
1448          cmd_buffer,
1449          emit_state.emit_count,
1450          emit_state.pbe_cs_words[0],
1451          &pds_pixel_event_program);
1452       if (result != VK_SUCCESS)
1453          return result;
1454 
1455       job->pds_pixel_event_data_offset = pds_pixel_event_program.data_offset;
1456    }
1457 
1458    if (sub_cmd->barrier_load) {
1459       job->enable_bg_tag = true;
1460       job->process_empty_tiles = true;
1461 
1462       /* Load the previously stored render from the SPM scratch buffer. */
1463 
1464       STATIC_ASSERT(ARRAY_SIZE(job->pds_bgnd_reg_values) ==
1465                     ARRAY_SIZE(spm_bgobj_state->pds_reg_values));
1466       typed_memcpy(job->pds_bgnd_reg_values,
1467                    spm_bgobj_state->pds_reg_values,
1468                    ARRAY_SIZE(spm_bgobj_state->pds_reg_values));
1469    } else if (hw_render->load_op) {
1470       const struct pvr_load_op *load_op = hw_render->load_op;
1471       struct pvr_pds_upload load_op_program;
1472 
1473       /* Recalculate Background Object(s). */
1474 
1475       /* FIXME: Should we free the PDS pixel event data or let it be freed
1476        * when the pool gets emptied?
1477        */
1478       result = pvr_load_op_data_create_and_upload(cmd_buffer,
1479                                                   load_op,
1480                                                   &load_op_program);
1481       if (result != VK_SUCCESS)
1482          return result;
1483 
1484       job->enable_bg_tag = render_pass_info->enable_bg_tag;
1485       job->process_empty_tiles = render_pass_info->process_empty_tiles;
1486 
1487       pvr_pds_bgnd_pack_state(load_op,
1488                               &load_op_program,
1489                               job->pds_bgnd_reg_values);
1490    }
1491 
1492    /* TODO: In some cases a PR can be removed by storing to the color attachment
1493     * and having the background object load directly from it instead of using the
1494     * scratch buffer. In those cases we can also set this to "false" and avoid
1495     * extra fw overhead.
1496     */
1497    /* The scratch buffer is always needed and allocated to avoid data loss in
1498     * case SPM is hit, so set the flag unconditionally.
1499     */
1500    job->requires_spm_scratch_buffer = true;
1501 
1502    memcpy(job->pr_pbe_reg_words,
1503           &framebuffer->spm_eot_state_per_render[0].pbe_reg_words,
1504           sizeof(job->pbe_reg_words));
1505    job->pr_pds_pixel_event_data_offset =
1506       framebuffer->spm_eot_state_per_render[0].pixel_event_program_data_offset;
1507 
1508    STATIC_ASSERT(ARRAY_SIZE(job->pds_pr_bgnd_reg_values) ==
1509                  ARRAY_SIZE(spm_bgobj_state->pds_reg_values));
1510    typed_memcpy(job->pds_pr_bgnd_reg_values,
1511                 spm_bgobj_state->pds_reg_values,
1512                 ARRAY_SIZE(spm_bgobj_state->pds_reg_values));
1513 
1514    render_target = pvr_get_render_target(render_pass_info->pass,
1515                                          framebuffer,
1516                                          sub_cmd->hw_render_idx);
1517    job->rt_dataset = render_target->rt_dataset;
1518 
1519    job->ctrl_stream_addr = pvr_csb_get_start_address(&sub_cmd->control_stream);
1520 
1521    if (sub_cmd->depth_bias_bo)
1522       job->depth_bias_table_addr = sub_cmd->depth_bias_bo->dev_addr;
1523    else
1524       job->depth_bias_table_addr = PVR_DEV_ADDR_INVALID;
1525 
1526    if (sub_cmd->scissor_bo)
1527       job->scissor_table_addr = sub_cmd->scissor_bo->dev_addr;
1528    else
1529       job->scissor_table_addr = PVR_DEV_ADDR_INVALID;
1530 
1531    job->pixel_output_width =
1532       pvr_pass_get_pixel_output_width(render_pass_info->pass,
1533                                       sub_cmd->hw_render_idx,
1534                                       dev_info);
1535 
1536    /* Setup depth/stencil job information. */
1537    if (hw_render->ds_attach_idx != VK_ATTACHMENT_UNUSED) {
1538       struct pvr_image_view *ds_iview =
1539          render_pass_info->attachments[hw_render->ds_attach_idx];
1540       const struct pvr_image *ds_image = pvr_image_view_get_image(ds_iview);
1541 
1542       job->has_depth_attachment = vk_format_has_depth(ds_image->vk.format);
1543       job->has_stencil_attachment = vk_format_has_stencil(ds_image->vk.format);
1544 
1545       if (job->has_depth_attachment || job->has_stencil_attachment) {
1546          uint32_t level_pitch =
1547             ds_image->mip_levels[ds_iview->vk.base_mip_level].pitch;
1548          const bool render_area_is_tile_aligned =
1549             pvr_is_render_area_tile_aligned(cmd_buffer, ds_iview);
1550          bool store_was_optimised_out = false;
1551          bool d_store = false, s_store = false;
1552          bool d_load = false, s_load = false;
1553 
1554          job->ds.iview = ds_iview;
1555          job->ds.addr = ds_image->dev_addr;
1556 
1557          job->ds.stride =
1558             pvr_stride_from_pitch(level_pitch, ds_iview->vk.format);
1559          job->ds.height = ds_iview->vk.extent.height;
1560          job->ds.physical_extent = (VkExtent2D){
1561             .width = u_minify(ds_image->physical_extent.width,
1562                               ds_iview->vk.base_mip_level),
1563             .height = u_minify(ds_image->physical_extent.height,
1564                                ds_iview->vk.base_mip_level),
1565          };
1566          job->ds.layer_size = ds_image->layer_size;
1567 
1568          job->ds_clear_value = default_ds_clear_value;
1569 
1570          if (hw_render->ds_attach_idx < render_pass_info->clear_value_count) {
1571             const VkClearDepthStencilValue *const clear_values =
1572                &render_pass_info->clear_values[hw_render->ds_attach_idx]
1573                    .depthStencil;
1574 
1575             if (job->has_depth_attachment)
1576                job->ds_clear_value.depth = clear_values->depth;
1577 
1578             if (job->has_stencil_attachment)
1579                job->ds_clear_value.stencil = clear_values->stencil;
1580          }
1581 
1582          switch (ds_iview->vk.format) {
1583          case VK_FORMAT_D16_UNORM:
1584             job->ds.zls_format = ROGUE_CR_ZLS_FORMAT_TYPE_16BITINT;
1585             break;
1586 
1587          case VK_FORMAT_S8_UINT:
1588          case VK_FORMAT_D32_SFLOAT:
1589             job->ds.zls_format = ROGUE_CR_ZLS_FORMAT_TYPE_F32Z;
1590             break;
1591 
1592          case VK_FORMAT_D24_UNORM_S8_UINT:
1593             job->ds.zls_format = ROGUE_CR_ZLS_FORMAT_TYPE_24BITINT;
1594             break;
1595 
1596          default:
1597             unreachable("Unsupported depth stencil format");
1598          }
1599 
1600          job->ds.memlayout = ds_image->memlayout;
1601 
1602          if (job->has_depth_attachment) {
1603             if (hw_render->depth_store || sub_cmd->barrier_store) {
1604                const bool depth_init_is_clear = hw_render->depth_init ==
1605                                                 VK_ATTACHMENT_LOAD_OP_CLEAR;
1606 
1607                d_store = true;
1608 
1609                if (hw_render->depth_store && render_area_is_tile_aligned &&
1610                    !(sub_cmd->modifies_depth || depth_init_is_clear)) {
1611                   d_store = false;
1612                   store_was_optimised_out = true;
1613                }
1614             }
1615 
1616             if (d_store && !render_area_is_tile_aligned) {
1617                d_load = true;
1618             } else if (hw_render->depth_init == VK_ATTACHMENT_LOAD_OP_LOAD) {
1619                enum pvr_depth_stencil_usage depth_usage = sub_cmd->depth_usage;
1620 
1621                assert(depth_usage != PVR_DEPTH_STENCIL_USAGE_UNDEFINED);
1622                d_load = (depth_usage != PVR_DEPTH_STENCIL_USAGE_NEVER);
1623             } else {
1624                d_load = sub_cmd->barrier_load;
1625             }
1626          }
1627 
1628          if (job->has_stencil_attachment) {
1629             if (hw_render->stencil_store || sub_cmd->barrier_store) {
1630                const bool stencil_init_is_clear = hw_render->stencil_init ==
1631                                                   VK_ATTACHMENT_LOAD_OP_CLEAR;
1632 
1633                s_store = true;
1634 
1635                if (hw_render->stencil_store && render_area_is_tile_aligned &&
1636                    !(sub_cmd->modifies_stencil || stencil_init_is_clear)) {
1637                   s_store = false;
1638                   store_was_optimised_out = true;
1639                }
1640             }
1641 
1642             if (s_store && !render_area_is_tile_aligned) {
1643                s_load = true;
1644             } else if (hw_render->stencil_init == VK_ATTACHMENT_LOAD_OP_LOAD) {
1645                enum pvr_depth_stencil_usage stencil_usage =
1646                   sub_cmd->stencil_usage;
1647 
1648                assert(stencil_usage != PVR_DEPTH_STENCIL_USAGE_UNDEFINED);
1649                s_load = (stencil_usage != PVR_DEPTH_STENCIL_USAGE_NEVER);
1650             } else {
1651                s_load = sub_cmd->barrier_load;
1652             }
1653          }
1654 
1655          job->ds.load.d = d_load;
1656          job->ds.load.s = s_load;
1657          job->ds.store.d = d_store;
1658          job->ds.store.s = s_store;
1659 
1660          /* ZLS can't do masked writes for packed depth stencil formats, so if
1661           * we store anything, we have to store everything.
1662           */
1663          if ((job->ds.store.d || job->ds.store.s) &&
1664              pvr_zls_format_type_is_packed(job->ds.zls_format)) {
1665             job->ds.store.d = true;
1666             job->ds.store.s = true;
1667 
1668             /* In case we are only operating on one aspect of the attachment, we
1669              * need to load the unused one in order to preserve its contents, as
1670              * the forced store might otherwise corrupt it.
1671              */
1672             if (hw_render->depth_init != VK_ATTACHMENT_LOAD_OP_CLEAR)
1673                job->ds.load.d = true;
1674 
1675             if (hw_render->stencil_init != VK_ATTACHMENT_LOAD_OP_CLEAR)
1676                job->ds.load.s = true;
1677          }
1678 
1679          if (pvr_ds_attachment_requires_zls(&job->ds) ||
1680              store_was_optimised_out) {
1681             job->process_empty_tiles = true;
1682          }
1683 
1684          if (pvr_sub_cmd_gfx_requires_ds_subtile_alignment(dev_info, job)) {
1685             result = pvr_sub_cmd_gfx_align_ds_subtiles(cmd_buffer, sub_cmd);
1686             if (result != VK_SUCCESS)
1687                return result;
1688          }
1689       }
1690    } else {
1691       job->has_depth_attachment = false;
1692       job->has_stencil_attachment = false;
1693       job->ds_clear_value = default_ds_clear_value;
1694    }
1695 
1696    if (hw_render->ds_attach_idx != VK_ATTACHMENT_UNUSED) {
1697       struct pvr_image_view *iview =
1698          render_pass_info->attachments[hw_render->ds_attach_idx];
1699       const struct pvr_image *image = pvr_image_view_get_image(iview);
1700 
1701       /* If the HW render pass has a valid depth/stencil surface, determine the
1702        * sample count from the attachment's image.
1703        */
1704       job->samples = image->vk.samples;
1705    } else if (hw_render->output_regs_count) {
1706       /* If the HW render pass has output registers, we have color attachments
1707        * to write to, so determine the sample count from the count specified for
1708        * every color attachment in this render.
1709        */
1710       job->samples = hw_render->sample_count;
1711    } else if (cmd_buffer->state.gfx_pipeline) {
1712       /* If the HW render pass has no color or depth/stencil attachments, we
1713        * determine the sample count from the count given during pipeline
1714        * creation.
1715        */
1716       job->samples = dynamic_state->ms.rasterization_samples;
1717    } else if (render_pass_info->pass->attachment_count > 0) {
1718       /* If we get here, we have a render pass with subpasses containing no
1719        * attachments. The next best thing is the largest of the sample counts
1720        * specified by the render pass attachment descriptions.
1721        */
1722       job->samples = render_pass_info->pass->max_sample_count;
1723    } else {
1724       /* No appropriate framebuffer attachment is available. */
1725       mesa_logw("Defaulting render job sample count to 1.");
1726       job->samples = VK_SAMPLE_COUNT_1_BIT;
1727    }
1728 
1729    if (sub_cmd->max_tiles_in_flight ==
1730        PVR_GET_FEATURE_VALUE(dev_info, isp_max_tiles_in_flight, 1U)) {
1731       /* Use the default limit based on the partition store. */
1732       job->max_tiles_in_flight = 0U;
1733    } else {
1734       job->max_tiles_in_flight = sub_cmd->max_tiles_in_flight;
1735    }
1736 
1737    job->frag_uses_atomic_ops = sub_cmd->frag_uses_atomic_ops;
1738    job->disable_compute_overlap = false;
1739    job->max_shared_registers = cmd_buffer->state.max_shared_regs;
1740    job->run_frag = true;
1741    job->geometry_terminate = true;
1742 
1743    /* TODO: Enable pixel merging when it's safe to do. */
1744    job->disable_pixel_merging = true;
1745 
1746    return VK_SUCCESS;
1747 }
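
/* Editor's note: a minimal sketch (kept out of the build) of the packed
 * depth/stencil rule applied in pvr_sub_cmd_gfx_job_init() above. For
 * packed ZLS formats a store always writes both aspects, so the aspect
 * that is not being cleared must also be loaded to keep its contents
 * intact. The helper name is hypothetical.
 */
#if 0
static void example_packed_ds_store_rule(bool packed_format,
                                         bool depth_init_is_clear,
                                         bool stencil_init_is_clear,
                                         bool *store_d, bool *store_s,
                                         bool *load_d, bool *load_s)
{
   if (!packed_format || !(*store_d || *store_s))
      return;

   /* Masked writes aren't possible: store both aspects. */
   *store_d = true;
   *store_s = true;

   /* Preserve whichever aspect isn't being cleared. */
   if (!depth_init_is_clear)
      *load_d = true;
   if (!stencil_init_is_clear)
      *load_s = true;
}
#endif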
1748 
1749 static void
1750 pvr_sub_cmd_compute_job_init(const struct pvr_physical_device *pdevice,
1751                              struct pvr_cmd_buffer *cmd_buffer,
1752                              struct pvr_sub_cmd_compute *sub_cmd)
1753 {
1754    sub_cmd->num_shared_regs = MAX2(cmd_buffer->device->idfwdf_state.usc_shareds,
1755                                    cmd_buffer->state.max_shared_regs);
1756 
1757    cmd_buffer->state.max_shared_regs = 0U;
1758 }
1759 
1760 #define PIXEL_ALLOCATION_SIZE_MAX_IN_BLOCKS \
1761    (1024 / ROGUE_CDMCTRL_KERNEL0_USC_COMMON_SIZE_UNIT_SIZE)
1762 
1763 static uint32_t
1764 pvr_compute_flat_slot_size(const struct pvr_physical_device *pdevice,
1765                            uint32_t coeff_regs_count,
1766                            bool use_barrier,
1767                            uint32_t total_workitems)
1768 {
1769    const struct pvr_device_runtime_info *dev_runtime_info =
1770       &pdevice->dev_runtime_info;
1771    const struct pvr_device_info *dev_info = &pdevice->dev_info;
1772    uint32_t max_workgroups_per_task = ROGUE_CDM_MAX_PACKED_WORKGROUPS_PER_TASK;
1773    uint32_t max_avail_coeff_regs =
1774       dev_runtime_info->cdm_max_local_mem_size_regs;
1775    uint32_t localstore_chunks_count =
1776       DIV_ROUND_UP(PVR_DW_TO_BYTES(coeff_regs_count),
1777                    ROGUE_CDMCTRL_KERNEL0_USC_COMMON_SIZE_UNIT_SIZE);
1778 
1779    /* Ensure that we cannot have more workgroups in a slot than the available
1780     * number of coefficients allows us to have.
1781     */
1782    if (coeff_regs_count > 0U) {
1783       /* If the geometry or fragment jobs can overlap with the compute job, or
1784        * if there is a vertex shader already running, then we need to consider
1785        * this when calculating the max allowed work-groups.
1786        */
1787       if (PVR_HAS_QUIRK(dev_info, 52354) &&
1788           (PVR_HAS_FEATURE(dev_info, compute_overlap) ||
1789            PVR_HAS_FEATURE(dev_info, gs_rta_support))) {
1790          /* Solve for n (number of work-groups per task). All values are in
1791           * size of common store alloc blocks:
1792           *
1793           * n + (2n + 7) * (local_memory_size_max - 1) =
1794           * 	(coefficient_memory_pool_size) - (7 * pixel_allocation_size_max)
1795           * ==>
1796           * n + 2n * (local_memory_size_max - 1) =
1797           * 	(coefficient_memory_pool_size) - (7 * pixel_allocation_size_max)
1798           * 	- (7 * (local_memory_size_max - 1))
1799           * ==>
1800           * n * (1 + 2 * (local_memory_size_max - 1)) =
1801           * 	(coefficient_memory_pool_size) - (7 * pixel_allocation_size_max)
1802           * 	- (7 * (local_memory_size_max - 1))
1803           * ==>
1804           * n = ((coefficient_memory_pool_size) -
1805           *    (7 * pixel_allocation_size_max) -
1806           *    (7 * (local_memory_size_max - 1))) /
1807           *    (1 + 2 * (local_memory_size_max - 1))
1808           */
1809          uint32_t max_common_store_blocks =
1810             DIV_ROUND_UP(max_avail_coeff_regs * 4U,
1811                          ROGUE_CDMCTRL_KERNEL0_USC_COMMON_SIZE_UNIT_SIZE);
1812 
1813          /* (coefficient_memory_pool_size) - (7 * pixel_allocation_size_max)
1814           */
1815          max_common_store_blocks -= ROGUE_MAX_OVERLAPPED_PIXEL_TASK_INSTANCES *
1816                                     PIXEL_ALLOCATION_SIZE_MAX_IN_BLOCKS;
1817 
1818          /* - (7 * (local_memory_size_max - 1)) */
1819          max_common_store_blocks -= (ROGUE_MAX_OVERLAPPED_PIXEL_TASK_INSTANCES *
1820                                      (localstore_chunks_count - 1U));
1821 
1822          /* Divide by (1 + 2 * (local_memory_size_max - 1)) */
1823          max_workgroups_per_task = max_common_store_blocks /
1824                                    (1U + 2U * (localstore_chunks_count - 1U));
1825 
1826          max_workgroups_per_task =
1827             MIN2(max_workgroups_per_task,
1828                  ROGUE_CDM_MAX_PACKED_WORKGROUPS_PER_TASK);
1829 
1830       } else {
1831          max_workgroups_per_task =
1832             MIN2((max_avail_coeff_regs / coeff_regs_count),
1833                  max_workgroups_per_task);
1834       }
1835    }
1836 
1837    /* max_workgroups_per_task should at least be one. */
1838    assert(max_workgroups_per_task >= 1U);
1839 
1840    if (total_workitems >= ROGUE_MAX_INSTANCES_PER_TASK) {
1841       /* In this case, the work group size will have been padded up to the
1842        * next ROGUE_MAX_INSTANCES_PER_TASK so we just set max instances to be
1843        * ROGUE_MAX_INSTANCES_PER_TASK.
1844        */
1845       return ROGUE_MAX_INSTANCES_PER_TASK;
1846    }
1847 
1848    /* In this case, the number of instances in the slot must be clamped to
1849     * accommodate whole work-groups only.
1850     */
1851    if (PVR_HAS_QUIRK(dev_info, 49032) || use_barrier) {
1852       max_workgroups_per_task =
1853          MIN2(max_workgroups_per_task,
1854               ROGUE_MAX_INSTANCES_PER_TASK / total_workitems);
1855       return total_workitems * max_workgroups_per_task;
1856    }
1857 
1858    return MIN2(total_workitems * max_workgroups_per_task,
1859                ROGUE_MAX_INSTANCES_PER_TASK);
1860 }
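
/* Editor's note: a minimal sketch (kept out of the build) of the
 * quirk-52354 arithmetic in pvr_compute_flat_slot_size() above, solving
 * for n work-groups per task with all quantities in common store blocks.
 * The parameter names are illustrative stand-ins; "overlapped_tasks" is
 * the constant written as 7 in the derivation comment.
 */
#if 0
static uint32_t example_workgroups_per_task(uint32_t pool_size_blocks,
                                            uint32_t pixel_alloc_max_blocks,
                                            uint32_t local_mem_max_blocks,
                                            uint32_t overlapped_tasks)
{
   /* (coefficient_memory_pool_size) - (7 * pixel_allocation_size_max)
    *                                - (7 * (local_memory_size_max - 1))
    */
   const uint32_t blocks = pool_size_blocks -
                           overlapped_tasks * pixel_alloc_max_blocks -
                           overlapped_tasks * (local_mem_max_blocks - 1);

   /* n = blocks / (1 + 2 * (local_memory_size_max - 1)) */
   return blocks / (1 + 2 * (local_mem_max_blocks - 1));
}
#endif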
1861 
1862 static void
1863 pvr_compute_generate_control_stream(struct pvr_csb *csb,
1864                                     struct pvr_sub_cmd_compute *sub_cmd,
1865                                     const struct pvr_compute_kernel_info *info)
1866 {
1867    pvr_csb_set_relocation_mark(csb);
1868 
1869    /* Compute kernel 0. */
1870    pvr_csb_emit (csb, CDMCTRL_KERNEL0, kernel0) {
1871       kernel0.indirect_present = !!info->indirect_buffer_addr.addr;
1872       kernel0.global_offsets_present = info->global_offsets_present;
1873       kernel0.usc_common_size = info->usc_common_size;
1874       kernel0.usc_unified_size = info->usc_unified_size;
1875       kernel0.pds_temp_size = info->pds_temp_size;
1876       kernel0.pds_data_size = info->pds_data_size;
1877       kernel0.usc_target = info->usc_target;
1878       kernel0.fence = info->is_fence;
1879    }
1880 
1881    /* Compute kernel 1. */
1882    pvr_csb_emit (csb, CDMCTRL_KERNEL1, kernel1) {
1883       kernel1.data_addr = PVR_DEV_ADDR(info->pds_data_offset);
1884       kernel1.sd_type = info->sd_type;
1885       kernel1.usc_common_shared = info->usc_common_shared;
1886    }
1887 
1888    /* Compute kernel 2. */
1889    pvr_csb_emit (csb, CDMCTRL_KERNEL2, kernel2) {
1890       kernel2.code_addr = PVR_DEV_ADDR(info->pds_code_offset);
1891    }
1892 
1893    if (info->indirect_buffer_addr.addr) {
1894       /* Compute kernel 6. */
1895       pvr_csb_emit (csb, CDMCTRL_KERNEL6, kernel6) {
1896          kernel6.indirect_addrmsb = info->indirect_buffer_addr;
1897       }
1898 
1899       /* Compute kernel 7. */
1900       pvr_csb_emit (csb, CDMCTRL_KERNEL7, kernel7) {
1901          kernel7.indirect_addrlsb = info->indirect_buffer_addr;
1902       }
1903    } else {
1904       /* Compute kernel 3. */
1905       pvr_csb_emit (csb, CDMCTRL_KERNEL3, kernel3) {
1906          assert(info->global_size[0U] > 0U);
1907          kernel3.workgroup_x = info->global_size[0U] - 1U;
1908       }
1909 
1910       /* Compute kernel 4. */
1911       pvr_csb_emit (csb, CDMCTRL_KERNEL4, kernel4) {
1912          assert(info->global_size[1U] > 0U);
1913          kernel4.workgroup_y = info->global_size[1U] - 1U;
1914       }
1915 
1916       /* Compute kernel 5. */
1917       pvr_csb_emit (csb, CDMCTRL_KERNEL5, kernel5) {
1918          assert(info->global_size[2U] > 0U);
1919          kernel5.workgroup_z = info->global_size[2U] - 1U;
1920       }
1921    }
1922 
1923    /* Compute kernel 8. */
1924    pvr_csb_emit (csb, CDMCTRL_KERNEL8, kernel8) {
1925       if (info->max_instances == ROGUE_MAX_INSTANCES_PER_TASK)
1926          kernel8.max_instances = 0U;
1927       else
1928          kernel8.max_instances = info->max_instances;
1929 
1930       assert(info->local_size[0U] > 0U);
1931       kernel8.workgroup_size_x = info->local_size[0U] - 1U;
1932       assert(info->local_size[1U] > 0U);
1933       kernel8.workgroup_size_y = info->local_size[1U] - 1U;
1934       assert(info->local_size[2U] > 0U);
1935       kernel8.workgroup_size_z = info->local_size[2U] - 1U;
1936    }
1937 
1938    pvr_csb_clear_relocation_mark(csb);
1939 
1940    /* Track the highest shared register usage in this dispatch. This is used
1941     * by the FW for context switching, so it must be large enough to contain
1942     * all the shared registers that might be in use for this compute job.
1943     * Coefficients don't need to be included, as a context switch will not
1944     * happen within the execution of a single workgroup, so nothing needs to
1945     * be preserved.
1946     */
1947    if (info->usc_common_shared) {
1948       sub_cmd->num_shared_regs =
1949          MAX2(sub_cmd->num_shared_regs, info->usc_common_size);
1950    }
1951 }
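
/* Editor's note: a minimal sketch (kept out of the build) of the field
 * encoding used by pvr_compute_generate_control_stream() above: work-group
 * counts and sizes are programmed minus one, and a slot that holds the
 * maximum number of instances is encoded as 0. The helper names are
 * hypothetical.
 */
#if 0
static uint32_t example_encode_count_minus_one(uint32_t count)
{
   assert(count > 0U);
   return count - 1U; /* e.g. a 64-wide dimension becomes field value 63 */
}

static uint32_t example_encode_max_instances(uint32_t max_instances)
{
   return max_instances == ROGUE_MAX_INSTANCES_PER_TASK ? 0U : max_instances;
}
#endif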
1952 
1953 /* TODO: This can be pre-packed and uploaded directly. Would that provide any
1954  * speed up?
1955  */
1956 static void
1957 pvr_compute_generate_idfwdf(struct pvr_cmd_buffer *cmd_buffer,
1958                             struct pvr_sub_cmd_compute *const sub_cmd)
1959 {
1960    struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
1961    bool *const is_sw_barrier_required =
1962       &state->current_sub_cmd->compute.pds_sw_barrier_requires_clearing;
1963    const struct pvr_physical_device *pdevice = cmd_buffer->device->pdevice;
1964    struct pvr_csb *csb = &sub_cmd->control_stream;
1965    const struct pvr_pds_upload *program;
1966 
1967    if (PVR_NEED_SW_COMPUTE_PDS_BARRIER(&pdevice->dev_info) &&
1968        *is_sw_barrier_required) {
1969       *is_sw_barrier_required = false;
1970       program = &cmd_buffer->device->idfwdf_state.sw_compute_barrier_pds;
1971    } else {
1972       program = &cmd_buffer->device->idfwdf_state.pds;
1973    }
1974 
1975    struct pvr_compute_kernel_info info = {
1976       .indirect_buffer_addr = PVR_DEV_ADDR_INVALID,
1977       .global_offsets_present = false,
1978       .usc_common_size = DIV_ROUND_UP(
1979          PVR_DW_TO_BYTES(cmd_buffer->device->idfwdf_state.usc_shareds),
1980          ROGUE_CDMCTRL_KERNEL0_USC_COMMON_SIZE_UNIT_SIZE),
1981       .usc_unified_size = 0U,
1982       .pds_temp_size = 0U,
1983       .pds_data_size =
1984          DIV_ROUND_UP(PVR_DW_TO_BYTES(program->data_size),
1985                       ROGUE_CDMCTRL_KERNEL0_PDS_DATA_SIZE_UNIT_SIZE),
1986       .usc_target = ROGUE_CDMCTRL_USC_TARGET_ALL,
1987       .is_fence = false,
1988       .pds_data_offset = program->data_offset,
1989       .sd_type = ROGUE_CDMCTRL_SD_TYPE_USC,
1990       .usc_common_shared = true,
1991       .pds_code_offset = program->code_offset,
1992       .global_size = { 1U, 1U, 1U },
1993       .local_size = { 1U, 1U, 1U },
1994    };
1995 
1996    /* We don't need to pad work-group size for this case. */
1997 
1998    info.max_instances =
1999       pvr_compute_flat_slot_size(pdevice,
2000                                  cmd_buffer->device->idfwdf_state.usc_shareds,
2001                                  false,
2002                                  1U);
2003 
2004    pvr_compute_generate_control_stream(csb, sub_cmd, &info);
2005 }
2006 
2007 void pvr_compute_generate_fence(struct pvr_cmd_buffer *cmd_buffer,
2008                                 struct pvr_sub_cmd_compute *const sub_cmd,
2009                                 bool deallocate_shareds)
2010 {
2011    const struct pvr_pds_upload *program =
2012       &cmd_buffer->device->pds_compute_fence_program;
2013    const struct pvr_physical_device *pdevice = cmd_buffer->device->pdevice;
2014    struct pvr_csb *csb = &sub_cmd->control_stream;
2015 
2016    struct pvr_compute_kernel_info info = {
2017       .indirect_buffer_addr = PVR_DEV_ADDR_INVALID,
2018       .global_offsets_present = false,
2019       .usc_common_size = 0U,
2020       .usc_unified_size = 0U,
2021       .pds_temp_size = 0U,
2022       .pds_data_size =
2023          DIV_ROUND_UP(PVR_DW_TO_BYTES(program->data_size),
2024                       ROGUE_CDMCTRL_KERNEL0_PDS_DATA_SIZE_UNIT_SIZE),
2025       .usc_target = ROGUE_CDMCTRL_USC_TARGET_ANY,
2026       .is_fence = true,
2027       .pds_data_offset = program->data_offset,
2028       .sd_type = ROGUE_CDMCTRL_SD_TYPE_PDS,
2029       .usc_common_shared = deallocate_shareds,
2030       .pds_code_offset = program->code_offset,
2031       .global_size = { 1U, 1U, 1U },
2032       .local_size = { 1U, 1U, 1U },
2033    };
2034 
2035    /* We don't need to pad work-group size for this case. */
2036    /* Here we calculate the slot size. This can depend on the use of barriers,
2037     * local memory, BRNs, or other factors.
2038     */
2039    info.max_instances = pvr_compute_flat_slot_size(pdevice, 0U, false, 1U);
2040 
2041    pvr_compute_generate_control_stream(csb, sub_cmd, &info);
2042 }
2043 
2044 static VkResult
2045 pvr_cmd_buffer_process_deferred_clears(struct pvr_cmd_buffer *cmd_buffer)
2046 {
2047    util_dynarray_foreach (&cmd_buffer->deferred_clears,
2048                           struct pvr_transfer_cmd,
2049                           transfer_cmd) {
2050       VkResult result;
2051 
2052       result = pvr_cmd_buffer_add_transfer_cmd(cmd_buffer, transfer_cmd);
2053       if (result != VK_SUCCESS)
2054          return result;
2055 
2056       cmd_buffer->state.current_sub_cmd->transfer.serialize_with_frag = true;
2057    }
2058 
2059    return VK_SUCCESS;
2060 }
2061 
2062 VkResult pvr_cmd_buffer_end_sub_cmd(struct pvr_cmd_buffer *cmd_buffer)
2063 {
2064    struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
2065    struct pvr_sub_cmd *sub_cmd = state->current_sub_cmd;
2066    struct pvr_device *device = cmd_buffer->device;
2067    const struct pvr_query_pool *query_pool = NULL;
2068    struct pvr_suballoc_bo *query_bo = NULL;
2069    size_t query_indices_size = 0;
2070    VkResult result;
2071 
2072    /* FIXME: Is this NULL check required because this function is called from
2073     * pvr_resolve_unemitted_resolve_attachments()? See comment about this
2074     * function being called twice in a row in pvr_CmdEndRenderPass().
2075     */
2076    if (!sub_cmd)
2077       return VK_SUCCESS;
2078 
2079    if (!sub_cmd->owned) {
2080       state->current_sub_cmd = NULL;
2081       return VK_SUCCESS;
2082    }
2083 
2084    switch (sub_cmd->type) {
2085    case PVR_SUB_CMD_TYPE_GRAPHICS: {
2086       struct pvr_sub_cmd_gfx *const gfx_sub_cmd = &sub_cmd->gfx;
2087 
2088       query_indices_size =
2089          util_dynarray_num_elements(&state->query_indices, char);
2090 
2091       if (query_indices_size > 0) {
2092          const bool secondary_cont =
2093             cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY &&
2094             cmd_buffer->usage_flags &
2095                VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT;
2096 
2097          assert(gfx_sub_cmd->query_pool);
2098 
2099          if (secondary_cont) {
2100             util_dynarray_append_dynarray(&gfx_sub_cmd->sec_query_indices,
2101                                           &state->query_indices);
2102          } else {
2103             const void *data = util_dynarray_begin(&state->query_indices);
2104 
2105             result = pvr_cmd_buffer_upload_general(cmd_buffer,
2106                                                    data,
2107                                                    query_indices_size,
2108                                                    &query_bo);
2109             if (result != VK_SUCCESS)
2110                return pvr_cmd_buffer_set_error_unwarned(cmd_buffer, result);
2111 
2112             query_pool = gfx_sub_cmd->query_pool;
2113          }
2114 
2115          gfx_sub_cmd->has_occlusion_query = true;
2116 
2117          util_dynarray_clear(&state->query_indices);
2118       }
2119 
2120       if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY) {
2121          result = pvr_csb_emit_return(&gfx_sub_cmd->control_stream);
2122          if (result != VK_SUCCESS)
2123             return pvr_cmd_buffer_set_error_unwarned(cmd_buffer, result);
2124 
2125          break;
2126       }
2127 
2128       /* TODO: Check if the sub_cmd can be skipped based on
2129        * sub_cmd->gfx.empty_cmd flag.
2130        */
2131 
2132       /* TODO: Set the state in the functions called with the command buffer
2133        * instead of here.
2134        */
2135 
2136       result = pvr_cmd_buffer_upload_tables(device, cmd_buffer, gfx_sub_cmd);
2137       if (result != VK_SUCCESS)
2138          return pvr_cmd_buffer_set_error_unwarned(cmd_buffer, result);
2139 
2140       result = pvr_cmd_buffer_emit_ppp_state(cmd_buffer,
2141                                              &gfx_sub_cmd->control_stream);
2142       if (result != VK_SUCCESS)
2143          return pvr_cmd_buffer_set_error_unwarned(cmd_buffer, result);
2144 
2145       result = pvr_csb_emit_terminate(&gfx_sub_cmd->control_stream);
2146       if (result != VK_SUCCESS)
2147          return pvr_cmd_buffer_set_error_unwarned(cmd_buffer, result);
2148 
2149       result = pvr_sub_cmd_gfx_job_init(&device->pdevice->dev_info,
2150                                         cmd_buffer,
2151                                         gfx_sub_cmd);
2152       if (result != VK_SUCCESS)
2153          return pvr_cmd_buffer_set_error_unwarned(cmd_buffer, result);
2154 
2155       if (pvr_sub_cmd_gfx_requires_split_submit(gfx_sub_cmd)) {
2156          result = pvr_sub_cmd_gfx_build_terminate_ctrl_stream(device,
2157                                                               cmd_buffer,
2158                                                               gfx_sub_cmd);
2159          if (result != VK_SUCCESS)
2160             return pvr_cmd_buffer_set_error_unwarned(cmd_buffer, result);
2161       }
2162 
2163       break;
2164    }
2165 
2166    case PVR_SUB_CMD_TYPE_OCCLUSION_QUERY:
2167    case PVR_SUB_CMD_TYPE_COMPUTE: {
2168       struct pvr_sub_cmd_compute *const compute_sub_cmd = &sub_cmd->compute;
2169 
2170       pvr_compute_generate_fence(cmd_buffer, compute_sub_cmd, true);
2171 
2172       result = pvr_csb_emit_terminate(&compute_sub_cmd->control_stream);
2173       if (result != VK_SUCCESS)
2174          return pvr_cmd_buffer_set_error_unwarned(cmd_buffer, result);
2175 
2176       pvr_sub_cmd_compute_job_init(device->pdevice,
2177                                    cmd_buffer,
2178                                    compute_sub_cmd);
2179       break;
2180    }
2181 
2182    case PVR_SUB_CMD_TYPE_TRANSFER:
2183       break;
2184 
2185    case PVR_SUB_CMD_TYPE_EVENT:
2186       break;
2187 
2188    default:
2189       unreachable("Unsupported sub-command type");
2190    }
2191 
2192    state->current_sub_cmd = NULL;
2193 
2194    /* pvr_cmd_buffer_process_deferred_clears() must be called with a NULL
2195     * current_sub_cmd.
2196     *
2197     * We can start a sub_cmd of a different type from the current sub_cmd only
2198     * after having ended the current sub_cmd. However, we can't end the current
2199     * sub_cmd if it depends on starting sub_cmd(s) of a different type. Hence,
2200     * don't try to start transfer sub_cmd(s) with
2201     * pvr_cmd_buffer_process_deferred_clears() until the current one has ended.
2202     * Failing to do so would cause a circular dependency between
2203     * pvr_cmd_buffer_{end,start}_sub_cmd() and blow the stack.
2204     */
2205    if (sub_cmd->type == PVR_SUB_CMD_TYPE_GRAPHICS) {
2206       result = pvr_cmd_buffer_process_deferred_clears(cmd_buffer);
2207       if (result != VK_SUCCESS)
2208          return pvr_cmd_buffer_set_error_unwarned(cmd_buffer, result);
2209    }
2210 
2211    if (query_pool) {
2212       struct pvr_query_info query_info;
2213 
2214       assert(query_bo);
2215       assert(query_indices_size);
2216 
2217       query_info.type = PVR_QUERY_TYPE_AVAILABILITY_WRITE;
2218 
2219       /* sizeof(uint32_t) is the size of a single query. */
2220       query_info.availability_write.num_query_indices =
2221          query_indices_size / sizeof(uint32_t);
2222       query_info.availability_write.index_bo = query_bo;
2223 
2224       query_info.availability_write.num_queries = query_pool->query_count;
2225       query_info.availability_write.availability_bo =
2226          query_pool->availability_buffer;
2227 
2228       /* Insert a barrier after the graphics sub command and before the
2229        * query sub command so that the availability write program waits for the
2230        * fragment shader to complete.
2231        */
2232 
2233       result = pvr_cmd_buffer_start_sub_cmd(cmd_buffer, PVR_SUB_CMD_TYPE_EVENT);
2234       if (result != VK_SUCCESS)
2235          return result;
2236 
2237       cmd_buffer->state.current_sub_cmd->event = (struct pvr_sub_cmd_event){
2238          .type = PVR_EVENT_TYPE_BARRIER,
2239          .barrier = {
2240             .wait_for_stage_mask = PVR_PIPELINE_STAGE_FRAG_BIT,
2241             .wait_at_stage_mask = PVR_PIPELINE_STAGE_OCCLUSION_QUERY_BIT,
2242          },
2243       };
2244 
2245       return pvr_add_query_program(cmd_buffer, &query_info);
2246    }
2247 
2248    return VK_SUCCESS;
2249 }
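
/* Editor's note: a minimal sketch (kept out of the build) of the
 * availability-write sizing in pvr_cmd_buffer_end_sub_cmd() above: the
 * uploaded buffer is a packed array of 32-bit query indices, so the index
 * count is simply the byte size divided by sizeof(uint32_t). The helper
 * name is hypothetical.
 */
#if 0
static uint32_t example_num_query_indices(size_t query_indices_size_bytes)
{
   return query_indices_size_bytes / sizeof(uint32_t);
}
#endif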
2250 
2251 void pvr_reset_graphics_dirty_state(struct pvr_cmd_buffer *const cmd_buffer,
2252                                     bool start_geom)
2253 {
2254    struct vk_dynamic_graphics_state *const dynamic_state =
2255       &cmd_buffer->vk.dynamic_graphics_state;
2256 
2257    if (start_geom) {
2258       /*
2259        * Initial geometry phase state.
2260        * It's the driver's responsibility to ensure that the state of the
2261        * hardware is correctly initialized at the start of every geometry
2262        * phase. This is required to prevent stale state from a previous
2263        * geometry phase erroneously affecting the next geometry phase.
2264        *
2265        * If a geometry phase does not contain any geometry, this restriction
2266        * can be ignored. If the first draw call in a geometry phase will only
2267        * update the depth or stencil buffers i.e. ISP_TAGWRITEDISABLE is set
2268        * in the ISP State Control Word, the PDS State Pointers
2269        * (TA_PRES_PDSSTATEPTR*) in the first PPP State Update do not need to
2270        * be supplied, since they will never reach the PDS in the fragment
2271        * phase.
2272        */
2273 
2274       cmd_buffer->state.emit_header = (struct ROGUE_TA_STATE_HEADER){
2275          .pres_stream_out_size = true,
2276          .pres_ppp_ctrl = true,
2277          .pres_varying_word2 = true,
2278          .pres_varying_word1 = true,
2279          .pres_varying_word0 = true,
2280          .pres_outselects = true,
2281          .pres_wclamp = true,
2282          .pres_viewport = true,
2283          .pres_region_clip = true,
2284          .pres_pds_state_ptr0 = true,
2285          .pres_ispctl_fb = true,
2286          .pres_ispctl = true,
2287       };
2288    } else {
2289       struct ROGUE_TA_STATE_HEADER *const emit_header =
2290          &cmd_buffer->state.emit_header;
2291 
2292       emit_header->pres_ppp_ctrl = true;
2293       emit_header->pres_varying_word1 = true;
2294       emit_header->pres_varying_word0 = true;
2295       emit_header->pres_outselects = true;
2296       emit_header->pres_viewport = true;
2297       emit_header->pres_region_clip = true;
2298       emit_header->pres_pds_state_ptr0 = true;
2299       emit_header->pres_ispctl_fb = true;
2300       emit_header->pres_ispctl = true;
2301    }
2302 
2303    memset(&cmd_buffer->state.ppp_state,
2304           0U,
2305           sizeof(cmd_buffer->state.ppp_state));
2306 
2307    cmd_buffer->state.dirty.vertex_bindings = true;
2308    cmd_buffer->state.dirty.gfx_pipeline_binding = true;
2309 
2310    BITSET_SET(dynamic_state->dirty, MESA_VK_DYNAMIC_VP_VIEWPORTS);
2311    BITSET_SET(dynamic_state->dirty, MESA_VK_DYNAMIC_VP_VIEWPORT_COUNT);
2312 }
2313 
2314 static inline bool
2315 pvr_cmd_uses_deferred_cs_cmds(const struct pvr_cmd_buffer *const cmd_buffer)
2316 {
2317    const VkCommandBufferUsageFlags deferred_control_stream_flags =
2318       VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT |
2319       VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT;
2320 
2321    return cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY &&
2322           (cmd_buffer->usage_flags & deferred_control_stream_flags) ==
2323              deferred_control_stream_flags;
2324 }
2325 
2326 VkResult pvr_cmd_buffer_start_sub_cmd(struct pvr_cmd_buffer *cmd_buffer,
2327                                       enum pvr_sub_cmd_type type)
2328 {
2329    struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
2330    struct pvr_device *device = cmd_buffer->device;
2331    struct pvr_sub_cmd *sub_cmd;
2332    VkResult result;
2333 
2334    /* Check the current status of the buffer. */
2335    if (vk_command_buffer_has_error(&cmd_buffer->vk))
2336       return vk_command_buffer_get_record_result(&cmd_buffer->vk);
2337 
2338    pvr_cmd_buffer_update_barriers(cmd_buffer, type);
2339 
2340    /* TODO: Add proper support for joining consecutive event sub_cmd? */
2341    if (state->current_sub_cmd) {
2342       if (state->current_sub_cmd->type == type) {
2343          /* Continue adding to the current sub command. */
2344          return VK_SUCCESS;
2345       }
2346 
2347       /* End the current sub command. */
2348       result = pvr_cmd_buffer_end_sub_cmd(cmd_buffer);
2349       if (result != VK_SUCCESS)
2350          return result;
2351    }
2352 
2353    sub_cmd = vk_zalloc(&cmd_buffer->vk.pool->alloc,
2354                        sizeof(*sub_cmd),
2355                        8,
2356                        VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
2357    if (!sub_cmd) {
2358       return vk_command_buffer_set_error(&cmd_buffer->vk,
2359                                          VK_ERROR_OUT_OF_HOST_MEMORY);
2360    }
2361 
2362    sub_cmd->type = type;
2363    sub_cmd->owned = true;
2364 
2365    switch (type) {
2366    case PVR_SUB_CMD_TYPE_GRAPHICS:
2367       sub_cmd->gfx.depth_usage = PVR_DEPTH_STENCIL_USAGE_UNDEFINED;
2368       sub_cmd->gfx.stencil_usage = PVR_DEPTH_STENCIL_USAGE_UNDEFINED;
2369       sub_cmd->gfx.modifies_depth = false;
2370       sub_cmd->gfx.modifies_stencil = false;
2371       sub_cmd->gfx.max_tiles_in_flight =
2372          PVR_GET_FEATURE_VALUE(&device->pdevice->dev_info,
2373                                isp_max_tiles_in_flight,
2374                                1);
2375       sub_cmd->gfx.hw_render_idx = state->render_pass_info.current_hw_subpass;
2376       sub_cmd->gfx.framebuffer = state->render_pass_info.framebuffer;
2377       sub_cmd->gfx.empty_cmd = true;
2378 
2379       if (state->vis_test_enabled)
2380          sub_cmd->gfx.query_pool = state->query_pool;
2381 
2382       pvr_reset_graphics_dirty_state(cmd_buffer, true);
2383 
2384       if (pvr_cmd_uses_deferred_cs_cmds(cmd_buffer)) {
2385          pvr_csb_init(device,
2386                       PVR_CMD_STREAM_TYPE_GRAPHICS_DEFERRED,
2387                       &sub_cmd->gfx.control_stream);
2388       } else {
2389          pvr_csb_init(device,
2390                       PVR_CMD_STREAM_TYPE_GRAPHICS,
2391                       &sub_cmd->gfx.control_stream);
2392       }
2393 
2394       util_dynarray_init(&sub_cmd->gfx.sec_query_indices, NULL);
2395       break;
2396 
2397    case PVR_SUB_CMD_TYPE_OCCLUSION_QUERY:
2398    case PVR_SUB_CMD_TYPE_COMPUTE:
2399       pvr_csb_init(device,
2400                    PVR_CMD_STREAM_TYPE_COMPUTE,
2401                    &sub_cmd->compute.control_stream);
2402       break;
2403 
2404    case PVR_SUB_CMD_TYPE_TRANSFER:
2405       sub_cmd->transfer.transfer_cmds = &sub_cmd->transfer.transfer_cmds_priv;
2406       list_inithead(sub_cmd->transfer.transfer_cmds);
2407       break;
2408 
2409    case PVR_SUB_CMD_TYPE_EVENT:
2410       break;
2411 
2412    default:
2413       unreachable("Unsupported sub-command type");
2414    }
2415 
2416    list_addtail(&sub_cmd->link, &cmd_buffer->sub_cmds);
2417    state->current_sub_cmd = sub_cmd;
2418 
2419    return VK_SUCCESS;
2420 }
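
/* Editor's note: a minimal sketch (kept out of the build) of the usual
 * pattern for recording an event sub-command with the helper above,
 * mirroring the barrier inserted before the availability-write program in
 * pvr_cmd_buffer_end_sub_cmd(). The function name is hypothetical.
 */
#if 0
static VkResult
example_insert_frag_to_query_barrier(struct pvr_cmd_buffer *cmd_buffer)
{
   VkResult result =
      pvr_cmd_buffer_start_sub_cmd(cmd_buffer, PVR_SUB_CMD_TYPE_EVENT);
   if (result != VK_SUCCESS)
      return result;

   cmd_buffer->state.current_sub_cmd->event = (struct pvr_sub_cmd_event){
      .type = PVR_EVENT_TYPE_BARRIER,
      .barrier = {
         .wait_for_stage_mask = PVR_PIPELINE_STAGE_FRAG_BIT,
         .wait_at_stage_mask = PVR_PIPELINE_STAGE_OCCLUSION_QUERY_BIT,
      },
   };

   return VK_SUCCESS;
}
#endif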
2421 
2422 VkResult pvr_cmd_buffer_alloc_mem(struct pvr_cmd_buffer *cmd_buffer,
2423                                   struct pvr_winsys_heap *heap,
2424                                   uint64_t size,
2425                                   struct pvr_suballoc_bo **const pvr_bo_out)
2426 {
2427    const uint32_t cache_line_size =
2428       rogue_get_slc_cache_line_size(&cmd_buffer->device->pdevice->dev_info);
2429    struct pvr_suballoc_bo *suballoc_bo;
2430    struct pvr_suballocator *allocator;
2431    VkResult result;
2432 
2433    if (heap == cmd_buffer->device->heaps.general_heap)
2434       allocator = &cmd_buffer->device->suballoc_general;
2435    else if (heap == cmd_buffer->device->heaps.pds_heap)
2436       allocator = &cmd_buffer->device->suballoc_pds;
2437    else if (heap == cmd_buffer->device->heaps.transfer_frag_heap)
2438       allocator = &cmd_buffer->device->suballoc_transfer;
2439    else if (heap == cmd_buffer->device->heaps.usc_heap)
2440       allocator = &cmd_buffer->device->suballoc_usc;
2441    else
2442       unreachable("Unknown heap type");
2443 
2444    result =
2445       pvr_bo_suballoc(allocator, size, cache_line_size, false, &suballoc_bo);
2446    if (result != VK_SUCCESS)
2447       return pvr_cmd_buffer_set_error_unwarned(cmd_buffer, result);
2448 
2449    list_add(&suballoc_bo->link, &cmd_buffer->bo_list);
2450 
2451    *pvr_bo_out = suballoc_bo;
2452 
2453    return VK_SUCCESS;
2454 }
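
/* Editor's note: a minimal sketch (kept out of the build) of how the
 * helper above is typically called; the heap selects the suballocator and
 * the allocation is tracked on the command buffer's bo_list. The function
 * name and the 1024-byte size are illustrative.
 */
#if 0
static VkResult example_alloc_general_mem(struct pvr_cmd_buffer *cmd_buffer,
                                          struct pvr_suballoc_bo **bo_out)
{
   return pvr_cmd_buffer_alloc_mem(cmd_buffer,
                                   cmd_buffer->device->heaps.general_heap,
                                   1024U /* bytes */,
                                   bo_out);
}
#endif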
2455 
2456 static void pvr_cmd_bind_compute_pipeline(
2457    const struct pvr_compute_pipeline *const compute_pipeline,
2458    struct pvr_cmd_buffer *const cmd_buffer)
2459 {
2460    cmd_buffer->state.compute_pipeline = compute_pipeline;
2461    cmd_buffer->state.dirty.compute_pipeline_binding = true;
2462 }
2463 
2464 static void pvr_cmd_bind_graphics_pipeline(
2465    const struct pvr_graphics_pipeline *const gfx_pipeline,
2466    struct pvr_cmd_buffer *const cmd_buffer)
2467 {
2468    cmd_buffer->state.gfx_pipeline = gfx_pipeline;
2469    cmd_buffer->state.dirty.gfx_pipeline_binding = true;
2470 
2471    vk_cmd_set_dynamic_graphics_state(&cmd_buffer->vk,
2472                                      &gfx_pipeline->dynamic_state);
2473 }
2474 
2475 void pvr_CmdBindPipeline(VkCommandBuffer commandBuffer,
2476                          VkPipelineBindPoint pipelineBindPoint,
2477                          VkPipeline _pipeline)
2478 {
2479    PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
2480    PVR_FROM_HANDLE(pvr_pipeline, pipeline, _pipeline);
2481 
2482    switch (pipelineBindPoint) {
2483    case VK_PIPELINE_BIND_POINT_COMPUTE:
2484       pvr_cmd_bind_compute_pipeline(to_pvr_compute_pipeline(pipeline),
2485                                     cmd_buffer);
2486       break;
2487 
2488    case VK_PIPELINE_BIND_POINT_GRAPHICS:
2489       pvr_cmd_bind_graphics_pipeline(to_pvr_graphics_pipeline(pipeline),
2490                                      cmd_buffer);
2491       break;
2492 
2493    default:
2494       unreachable("Invalid bind point.");
2495       break;
2496    }
2497 }
2498 
2499 #if MESA_DEBUG
2500 static void check_viewport_quirk_70165(const struct pvr_device *device,
2501                                        const VkViewport *pViewport)
2502 {
2503    const struct pvr_device_info *dev_info = &device->pdevice->dev_info;
2504    float min_vertex_x, max_vertex_x, min_vertex_y, max_vertex_y;
2505    float min_screen_space_value, max_screen_space_value;
2506    float sign_to_unsigned_offset, fixed_point_max;
2507    float guardband_width, guardband_height;
2508 
2509    if (PVR_HAS_FEATURE(dev_info, simple_internal_parameter_format)) {
2510       /* Max representable value in 13.4 fixed point format.
2511        * Round-down to avoid precision issues.
2512        * Calculated as (2 ** 13) - 2*(2 ** -4)
2513        */
2514       fixed_point_max = 8192.0f - 2.0f / 16.0f;
2515 
2516       if (PVR_HAS_FEATURE(dev_info, screen_size8K)) {
2517          if (pViewport->width <= 4096 && pViewport->height <= 4096) {
2518             guardband_width = pViewport->width / 4.0f;
2519             guardband_height = pViewport->height / 4.0f;
2520 
2521             /* 2k of the range is negative */
2522             sign_to_unsigned_offset = 2048.0f;
2523          } else {
2524             guardband_width = 0.0f;
2525             guardband_height = 0.0f;
2526 
2527             /* For > 4k renders, the entire range is positive */
2528             sign_to_unsigned_offset = 0.0f;
2529          }
2530       } else {
2531          guardband_width = pViewport->width / 4.0f;
2532          guardband_height = pViewport->height / 4.0f;
2533 
2534          /* 2k of the range is negative */
2535          sign_to_unsigned_offset = 2048.0f;
2536       }
2537    } else {
2538       /* Max representable value in 16.8 fixed point format
2539        * Calculated as (2 ** 16) - (2 ** -8)
2540        */
2541       fixed_point_max = 65535.99609375f;
2542       guardband_width = pViewport->width / 4.0f;
2543       guardband_height = pViewport->height / 4.0f;
2544 
2545       /* 4k/20k of the range is negative */
2546       sign_to_unsigned_offset = (float)PVR_MAX_NEG_OFFSCREEN_OFFSET;
2547    }
2548 
2549    min_screen_space_value = -sign_to_unsigned_offset;
2550    max_screen_space_value = fixed_point_max - sign_to_unsigned_offset;
2551 
2552    min_vertex_x = pViewport->x - guardband_width;
2553    max_vertex_x = pViewport->x + pViewport->width + guardband_width;
2554    min_vertex_y = pViewport->y - guardband_height;
2555    max_vertex_y = pViewport->y + pViewport->height + guardband_height;
2556    if (min_vertex_x < min_screen_space_value ||
2557        max_vertex_x > max_screen_space_value ||
2558        min_vertex_y < min_screen_space_value ||
2559        max_vertex_y > max_screen_space_value) {
2560       mesa_logw("Viewport is affected by BRN70165, geometry outside "
2561                 "the viewport could be corrupted");
2562    }
2563 }
2564 #endif
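
/* Editor's note: a minimal sketch (kept out of the build) of the
 * fixed-point limits used in check_viewport_quirk_70165() above: an
 * unsigned u.f format tops out at 2^u - 2^-f, e.g. 16.8 gives
 * 65535.99609375. Note that the 13.4 path above intentionally rounds down
 * by one extra LSB (8192 - 2/16) to avoid precision issues. The helper
 * name is hypothetical.
 */
#if 0
static float example_fixed_point_max(unsigned int_bits, unsigned frac_bits)
{
   return (float)(1u << int_bits) - 1.0f / (float)(1u << frac_bits);
}
#endif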
2565 
2566 void pvr_CmdSetViewport(VkCommandBuffer commandBuffer,
2567                         uint32_t firstViewport,
2568                         uint32_t viewportCount,
2569                         const VkViewport *pViewports)
2570 {
2571    PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
2572    const uint32_t total_count = firstViewport + viewportCount;
2573 
2574    assert(firstViewport < PVR_MAX_VIEWPORTS && viewportCount > 0);
2575    assert(total_count >= 1 && total_count <= PVR_MAX_VIEWPORTS);
2576 
2577    PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer);
2578 
2579 #if MESA_DEBUG
2580    if (PVR_HAS_QUIRK(&cmd_buffer->device->pdevice->dev_info, 70165)) {
2581       for (uint32_t viewport = 0; viewport < viewportCount; viewport++) {
2582          check_viewport_quirk_70165(cmd_buffer->device, &pViewports[viewport]);
2583       }
2584    }
2585 #endif
2586 
2587    vk_common_CmdSetViewport(commandBuffer,
2588                             firstViewport,
2589                             viewportCount,
2590                             pViewports);
2591 }
2592 
2593 void pvr_CmdSetDepthBounds(VkCommandBuffer commandBuffer,
2594                            float minDepthBounds,
2595                            float maxDepthBounds)
2596 {
2597    mesa_logd("No support for depth bounds testing.");
2598 }
2599 
2600 void pvr_CmdBindDescriptorSets(VkCommandBuffer commandBuffer,
2601                                VkPipelineBindPoint pipelineBindPoint,
2602                                VkPipelineLayout _layout,
2603                                uint32_t firstSet,
2604                                uint32_t descriptorSetCount,
2605                                const VkDescriptorSet *pDescriptorSets,
2606                                uint32_t dynamicOffsetCount,
2607                                const uint32_t *pDynamicOffsets)
2608 {
2609    PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
2610    struct pvr_descriptor_state *descriptor_state;
2611 
2612    assert(firstSet + descriptorSetCount <= PVR_MAX_DESCRIPTOR_SETS);
2613 
2614    PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer);
2615 
2616    switch (pipelineBindPoint) {
2617    case VK_PIPELINE_BIND_POINT_GRAPHICS:
2618    case VK_PIPELINE_BIND_POINT_COMPUTE:
2619       break;
2620 
2621    default:
2622       unreachable("Unsupported bind point.");
2623       break;
2624    }
2625 
2626    if (pipelineBindPoint == VK_PIPELINE_BIND_POINT_GRAPHICS) {
2627       descriptor_state = &cmd_buffer->state.gfx_desc_state;
2628       cmd_buffer->state.dirty.gfx_desc_dirty = true;
2629    } else {
2630       descriptor_state = &cmd_buffer->state.compute_desc_state;
2631       cmd_buffer->state.dirty.compute_desc_dirty = true;
2632    }
2633 
2634    for (uint32_t i = 0; i < descriptorSetCount; i++) {
2635       PVR_FROM_HANDLE(pvr_descriptor_set, set, pDescriptorSets[i]);
2636       uint32_t index = firstSet + i;
2637 
2638       if (descriptor_state->descriptor_sets[index] != set) {
2639          descriptor_state->descriptor_sets[index] = set;
2640          descriptor_state->valid_mask |= (1u << index);
2641       }
2642    }
2643 
2644    if (dynamicOffsetCount > 0) {
2645       PVR_FROM_HANDLE(pvr_pipeline_layout, pipeline_layout, _layout);
2646       uint32_t set_offset = 0;
2647 
2648       for (uint32_t set = 0; set < firstSet; set++)
2649          set_offset += pipeline_layout->set_layout[set]->dynamic_buffer_count;
2650 
2651       assert(set_offset + dynamicOffsetCount <=
2652              ARRAY_SIZE(descriptor_state->dynamic_offsets));
2653 
2654       /* From the Vulkan 1.3.238 spec. :
2655        *
2656        *    "If any of the sets being bound include dynamic uniform or storage
2657        *    buffers, then pDynamicOffsets includes one element for each array
2658        *    element in each dynamic descriptor type binding in each set."
2659        *
2660        */
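      /* Example with hypothetical numbers: if set 0 declares two dynamic
       * buffers and this call binds sets starting at firstSet == 1, then
       * set_offset is 2, so pDynamicOffsets[0] lands in dynamic_offsets[2],
       * pDynamicOffsets[1] in dynamic_offsets[3], and so on.
       */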
2661       for (uint32_t i = 0; i < dynamicOffsetCount; i++)
2662          descriptor_state->dynamic_offsets[set_offset + i] = pDynamicOffsets[i];
2663    }
2664 }
2665 
2666 void pvr_CmdBindVertexBuffers(VkCommandBuffer commandBuffer,
2667                               uint32_t firstBinding,
2668                               uint32_t bindingCount,
2669                               const VkBuffer *pBuffers,
2670                               const VkDeviceSize *pOffsets)
2671 {
2672    PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
2673    struct pvr_vertex_binding *const vb = cmd_buffer->state.vertex_bindings;
2674 
2675    /* We have to defer setting up the vertex buffers since we need the buffer
2676     * stride from the pipeline.
2677     */
2678 
2679    assert(firstBinding < PVR_MAX_VERTEX_INPUT_BINDINGS &&
2680           bindingCount <= PVR_MAX_VERTEX_INPUT_BINDINGS);
2681 
2682    PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer);
2683 
2684    for (uint32_t i = 0; i < bindingCount; i++) {
2685       vb[firstBinding + i].buffer = pvr_buffer_from_handle(pBuffers[i]);
2686       vb[firstBinding + i].offset = pOffsets[i];
2687    }
2688 
2689    cmd_buffer->state.dirty.vertex_bindings = true;
2690 }
2691 
2692 void pvr_CmdBindIndexBuffer(VkCommandBuffer commandBuffer,
2693                             VkBuffer buffer,
2694                             VkDeviceSize offset,
2695                             VkIndexType indexType)
2696 {
2697    PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
2698    PVR_FROM_HANDLE(pvr_buffer, index_buffer, buffer);
2699    struct pvr_cmd_buffer_state *const state = &cmd_buffer->state;
2700 
2701    assert(offset < index_buffer->vk.size);
2702    assert(indexType == VK_INDEX_TYPE_UINT32 ||
2703           indexType == VK_INDEX_TYPE_UINT16 ||
2704           indexType == VK_INDEX_TYPE_UINT8_KHR);
2705 
2706    PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer);
2707 
2708    state->index_buffer_binding.buffer = index_buffer;
2709    state->index_buffer_binding.offset = offset;
2710    state->index_buffer_binding.type = indexType;
2711    state->dirty.index_buffer_binding = true;
2712 }
2713 
2714 void pvr_CmdPushConstants(VkCommandBuffer commandBuffer,
2715                           VkPipelineLayout layout,
2716                           VkShaderStageFlags stageFlags,
2717                           uint32_t offset,
2718                           uint32_t size,
2719                           const void *pValues)
2720 {
2721 #if MESA_DEBUG
2722    const uint64_t ending = (uint64_t)offset + (uint64_t)size;
2723 #endif
2724 
2725    PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
2726    struct pvr_cmd_buffer_state *const state = &cmd_buffer->state;
2727 
2728    PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer);
2729 
2730    pvr_assert(ending <= PVR_MAX_PUSH_CONSTANTS_SIZE);
2731 
2732    memcpy(&state->push_constants.data[offset], pValues, size);
2733 
2734    state->push_constants.dirty_stages |= stageFlags;
2735    state->push_constants.uploaded = false;
2736 }
2737 
2738 static VkResult
2739 pvr_cmd_buffer_setup_attachments(struct pvr_cmd_buffer *cmd_buffer,
2740                                  const struct pvr_render_pass *pass,
2741                                  const struct pvr_framebuffer *framebuffer)
2742 {
2743    struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
2744    struct pvr_render_pass_info *info = &state->render_pass_info;
2745 
2746    assert(pass->attachment_count == framebuffer->attachment_count);
2747 
2748    /* Free any previously allocated attachments. */
2749    vk_free(&cmd_buffer->vk.pool->alloc, state->render_pass_info.attachments);
2750 
2751    if (pass->attachment_count == 0) {
2752       info->attachments = NULL;
2753       return VK_SUCCESS;
2754    }
2755 
2756    info->attachments =
2757       vk_zalloc(&cmd_buffer->vk.pool->alloc,
2758                 pass->attachment_count * sizeof(*info->attachments),
2759                 8,
2760                 VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
2761    if (!info->attachments) {
2762       return vk_command_buffer_set_error(&cmd_buffer->vk,
2763                                          VK_ERROR_OUT_OF_HOST_MEMORY);
2764    }
2765 
2766    for (uint32_t i = 0; i < pass->attachment_count; i++)
2767       info->attachments[i] = framebuffer->attachments[i];
2768 
2769    return VK_SUCCESS;
2770 }
2771 
2772 static VkResult pvr_init_render_targets(struct pvr_device *device,
2773                                         struct pvr_render_pass *pass,
2774                                         struct pvr_framebuffer *framebuffer)
2775 {
2776    for (uint32_t i = 0; i < pass->hw_setup->render_count; i++) {
2777       struct pvr_render_target *render_target =
2778          pvr_get_render_target(pass, framebuffer, i);
2779 
2780       pthread_mutex_lock(&render_target->mutex);
2781 
2782       if (!render_target->valid) {
2783          const struct pvr_renderpass_hwsetup_render *hw_render =
2784             &pass->hw_setup->renders[i];
2785          VkResult result;
2786 
2787          result = pvr_render_target_dataset_create(device,
2788                                                    framebuffer->width,
2789                                                    framebuffer->height,
2790                                                    hw_render->sample_count,
2791                                                    framebuffer->layers,
2792                                                    &render_target->rt_dataset);
2793          if (result != VK_SUCCESS) {
2794             pthread_mutex_unlock(&render_target->mutex);
2795             return result;
2796          }
2797 
2798          render_target->valid = true;
2799       }
2800 
2801       pthread_mutex_unlock(&render_target->mutex);
2802    }
2803 
2804    return VK_SUCCESS;
2805 }
2806 
2807 const struct pvr_renderpass_hwsetup_subpass *
2808 pvr_get_hw_subpass(const struct pvr_render_pass *pass, const uint32_t subpass)
2809 {
2810    const struct pvr_renderpass_hw_map *map =
2811       &pass->hw_setup->subpass_map[subpass];
2812 
2813    return &pass->hw_setup->renders[map->render].subpasses[map->subpass];
2814 }
2815 
2816 static void pvr_perform_start_of_render_attachment_clear(
2817    struct pvr_cmd_buffer *cmd_buffer,
2818    const struct pvr_framebuffer *framebuffer,
2819    uint32_t index,
2820    bool is_depth_stencil,
2821    uint32_t *index_list_clear_mask)
2822 {
2823    ASSERTED static const VkImageAspectFlags dsc_aspect_flags =
2824       VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT |
2825       VK_IMAGE_ASPECT_COLOR_BIT;
2826    struct pvr_render_pass_info *info = &cmd_buffer->state.render_pass_info;
2827    const struct pvr_render_pass *pass = info->pass;
2828    const struct pvr_renderpass_hwsetup *hw_setup = pass->hw_setup;
2829    const struct pvr_renderpass_hwsetup_render *hw_render =
2830       &hw_setup->renders[hw_setup->subpass_map[info->subpass_idx].render];
2831    VkImageAspectFlags image_aspect;
2832    struct pvr_image_view *iview;
2833    uint32_t view_idx;
2834 
2835    if (is_depth_stencil) {
2836       bool stencil_clear;
2837       bool depth_clear;
2838       bool is_stencil;
2839       bool is_depth;
2840 
2841       assert(hw_render->ds_attach_idx != VK_ATTACHMENT_UNUSED);
2842       assert(index == 0);
2843 
2844       view_idx = hw_render->ds_attach_idx;
2845 
2846       is_depth = vk_format_has_depth(pass->attachments[view_idx].vk_format);
2847       is_stencil = vk_format_has_stencil(pass->attachments[view_idx].vk_format);
2848       depth_clear = hw_render->depth_init == VK_ATTACHMENT_LOAD_OP_CLEAR;
2849       stencil_clear = hw_render->stencil_init == VK_ATTACHMENT_LOAD_OP_CLEAR;
2850 
2851       /* Attempt to clear the ds attachment. Do not erroneously discard an
2852        * attachment that has no depth clear but does have a stencil clear.
2853        */
2854       /* Skip unless (is_depth && depth_clear) || (is_stencil && stencil_clear). */
2855       if (!((is_depth && depth_clear) || (is_stencil && stencil_clear)))
2856          return;
2857    } else if (hw_render->color_init[index].op != VK_ATTACHMENT_LOAD_OP_CLEAR) {
2858       return;
2859    } else {
2860       view_idx = hw_render->color_init[index].index;
2861    }
2862 
2863    iview = info->attachments[view_idx];
2864 
2865    /* FIXME: It would be nice if this function and pvr_sub_cmd_gfx_job_init()
2866     * were doing the same check (even if it's just an assert) to determine if a
2867     * clear is needed.
2868     */
2869    /* If the render area is tile-aligned and the framebuffer is single-layer,
2870     * the clear has already been handled in pvr_sub_cmd_gfx_job_init().
2871     */
2872    if (pvr_is_render_area_tile_aligned(cmd_buffer, iview) &&
2873        framebuffer->layers == 1) {
2874       return;
2875    }
2876 
2877    image_aspect = vk_format_aspects(pass->attachments[view_idx].vk_format);
2878    assert((image_aspect & ~dsc_aspect_flags) == 0);
2879 
2880    if (image_aspect & VK_IMAGE_ASPECT_DEPTH_BIT &&
2881        hw_render->depth_init != VK_ATTACHMENT_LOAD_OP_CLEAR) {
2882       image_aspect &= ~VK_IMAGE_ASPECT_DEPTH_BIT;
2883    }
2884 
2885    if (image_aspect & VK_IMAGE_ASPECT_STENCIL_BIT &&
2886        hw_render->stencil_init != VK_ATTACHMENT_LOAD_OP_CLEAR) {
2887       image_aspect &= ~VK_IMAGE_ASPECT_STENCIL_BIT;
2888    }
2889 
2890    if (image_aspect != VK_IMAGE_ASPECT_NONE) {
2891       VkClearAttachment clear_attachment = {
2892          .aspectMask = image_aspect,
2893          .colorAttachment = index,
2894          .clearValue = info->clear_values[view_idx],
2895       };
2896       VkClearRect rect = {
2897          .rect = info->render_area,
2898          .baseArrayLayer = 0,
2899          .layerCount = info->framebuffer->layers,
2900       };
2901 
2902       assert(view_idx < info->clear_value_count);
2903 
2904       pvr_clear_attachments_render_init(cmd_buffer, &clear_attachment, &rect);
2905 
2906       *index_list_clear_mask |= (1 << index);
2907    }
2908 }
2909 
2910 static void
2911 pvr_perform_start_of_render_clears(struct pvr_cmd_buffer *cmd_buffer)
2912 {
2913    struct pvr_render_pass_info *info = &cmd_buffer->state.render_pass_info;
2914    const struct pvr_framebuffer *framebuffer = info->framebuffer;
2915    const struct pvr_render_pass *pass = info->pass;
2916    const struct pvr_renderpass_hwsetup *hw_setup = pass->hw_setup;
2917    const struct pvr_renderpass_hwsetup_render *hw_render =
2918       &hw_setup->renders[hw_setup->subpass_map[info->subpass_idx].render];
2919 
2920    /* Mask of attachments that are cleared using index lists instead of the
2921     * background object.
2922     */
2923    uint32_t index_list_clear_mask = 0;
2924 
2925    for (uint32_t i = 0; i < hw_render->color_init_count; i++) {
2926       pvr_perform_start_of_render_attachment_clear(cmd_buffer,
2927                                                    framebuffer,
2928                                                    i,
2929                                                    false,
2930                                                    &index_list_clear_mask);
2931    }
2932 
2933    info->enable_bg_tag = !!hw_render->color_init_count;
2934 
2935    /* If we're not using index lists for all clears/loads then we need to
2936     * run the background object on empty tiles.
2937     */
2938    if (hw_render->color_init_count &&
2939        index_list_clear_mask != ((1u << hw_render->color_init_count) - 1u)) {
2940       info->process_empty_tiles = true;
2941    } else {
2942       info->process_empty_tiles = false;
2943    }
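   /* e.g. with color_init_count == 3 and index_list_clear_mask == 0b011
    * (hypothetical values), attachment 2 still relies on the background
    * object, so empty tiles must be processed.
    */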
2944 
2945    if (hw_render->ds_attach_idx != VK_ATTACHMENT_UNUSED) {
2946       uint32_t ds_index_list = 0;
2947 
2948       pvr_perform_start_of_render_attachment_clear(cmd_buffer,
2949                                                    framebuffer,
2950                                                    0,
2951                                                    true,
2952                                                    &ds_index_list);
2953    }
2954 
2955    if (index_list_clear_mask)
2956       pvr_finishme("Add support for generating loadops shaders!");
2957 }
2958 
2959 static void pvr_stash_depth_format(struct pvr_cmd_buffer_state *state,
2960                                    struct pvr_sub_cmd_gfx *const sub_cmd)
2961 {
2962    const struct pvr_render_pass *pass = state->render_pass_info.pass;
2963    const struct pvr_renderpass_hwsetup_render *hw_render =
2964       &pass->hw_setup->renders[sub_cmd->hw_render_idx];
2965 
2966    if (hw_render->ds_attach_idx != VK_ATTACHMENT_UNUSED) {
2967       struct pvr_image_view **iviews = state->render_pass_info.attachments;
2968 
2969       state->depth_format = iviews[hw_render->ds_attach_idx]->vk.format;
2970    }
2971 }
2972 
2973 static bool pvr_loadops_contain_clear(struct pvr_renderpass_hwsetup *hw_setup)
2974 {
2975    for (uint32_t i = 0; i < hw_setup->render_count; i++) {
2976       struct pvr_renderpass_hwsetup_render *hw_render = &hw_setup->renders[i];
2977       uint32_t render_targets_count = hw_render->init_setup.num_render_targets;
2978 
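      /* Check every per-render-target colour init entry of this render for a
       * clear op.
       */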
2979       for (uint32_t j = 0;
2980            j < (hw_render->color_init_count * render_targets_count);
2981            j += render_targets_count) {
2982          for (uint32_t k = 0; k < hw_render->init_setup.num_render_targets;
2983               k++) {
2984             if (hw_render->color_init[j + k].op ==
2985                 VK_ATTACHMENT_LOAD_OP_CLEAR) {
2986                return true;
2987             }
2988          }
2989       }
2990       if (hw_render->depth_init == VK_ATTACHMENT_LOAD_OP_CLEAR ||
2991           hw_render->stencil_init == VK_ATTACHMENT_LOAD_OP_CLEAR) {
2992          return true;
2993       }
2994    }
2995 
2996    return false;
2997 }
2998 
2999 static VkResult
3000 pvr_cmd_buffer_set_clear_values(struct pvr_cmd_buffer *cmd_buffer,
3001                                 const VkRenderPassBeginInfo *pRenderPassBegin)
3002 {
3003    struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
3004 
3005    /* Free any previously allocated clear values. */
3006    vk_free(&cmd_buffer->vk.pool->alloc, state->render_pass_info.clear_values);
3007 
3008    if (pRenderPassBegin->clearValueCount) {
3009       const size_t size = pRenderPassBegin->clearValueCount *
3010                           sizeof(*state->render_pass_info.clear_values);
3011 
3012       state->render_pass_info.clear_values =
3013          vk_zalloc(&cmd_buffer->vk.pool->alloc,
3014                    size,
3015                    8,
3016                    VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
3017       if (!state->render_pass_info.clear_values) {
3018          return vk_command_buffer_set_error(&cmd_buffer->vk,
3019                                             VK_ERROR_OUT_OF_HOST_MEMORY);
3020       }
3021 
3022       memcpy(state->render_pass_info.clear_values,
3023              pRenderPassBegin->pClearValues,
3024              size);
3025    } else {
3026       state->render_pass_info.clear_values = NULL;
3027    }
3028 
3029    state->render_pass_info.clear_value_count =
3030       pRenderPassBegin->clearValueCount;
3031 
3032    return VK_SUCCESS;
3033 }
3034 
3035 /**
3036  * \brief Indicates whether to use the large or normal clear state words.
3037  *
3038  * If the current render area can fit within a quarter of the max framebuffer
3039  * that the device is capable of, we can use the normal clear state words,
3040  * otherwise the large clear state words are needed.
3041  *
3042  * The requirement of a quarter of the max framebuffer comes from the index
3043  * count used in the normal clear state words and the vertices uploaded at
3044  * device creation.
3045  *
3046  * \param[in] cmd_buffer The command buffer for the clear.
3047  * \return true if large clear state words are required.
3048  */
3049 static bool
3050 pvr_is_large_clear_required(const struct pvr_cmd_buffer *const cmd_buffer)
3051 {
3052    const struct pvr_device_info *const dev_info =
3053       &cmd_buffer->device->pdevice->dev_info;
3054    const VkRect2D render_area = cmd_buffer->state.render_pass_info.render_area;
3055    const uint32_t vf_max_x = rogue_get_param_vf_max_x(dev_info);
3056    const uint32_t vf_max_y = rogue_get_param_vf_max_x(dev_info);
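   /* Note: the Y limit above is also read from the VF max X parameter. If the
    * hardware exposes a separate Y limit this may be unintentional; otherwise
    * it relies on the two limits being equal.
    */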
3057 
3058    return (render_area.extent.width > (vf_max_x / 2) - 1) ||
3059           (render_area.extent.height > (vf_max_y / 2) - 1);
3060 }
3061 
3062 static void pvr_emit_clear_words(struct pvr_cmd_buffer *const cmd_buffer,
3063                                  struct pvr_sub_cmd_gfx *const sub_cmd)
3064 {
3065    struct pvr_device *device = cmd_buffer->device;
3066    struct pvr_csb *csb = &sub_cmd->control_stream;
3067    uint32_t vdm_state_size_in_dw;
3068    const uint32_t *vdm_state;
3069    uint32_t *stream;
3070 
3071    vdm_state_size_in_dw =
3072       pvr_clear_vdm_state_get_size_in_dw(&device->pdevice->dev_info, 1);
3073 
3074    pvr_csb_set_relocation_mark(csb);
3075 
3076    stream = pvr_csb_alloc_dwords(csb, vdm_state_size_in_dw);
3077    if (!stream) {
3078       pvr_cmd_buffer_set_error_unwarned(cmd_buffer, csb->status);
3079       return;
3080    }
3081 
3082    if (pvr_is_large_clear_required(cmd_buffer))
3083       vdm_state = device->static_clear_state.large_clear_vdm_words;
3084    else
3085       vdm_state = device->static_clear_state.vdm_words;
3086 
3087    memcpy(stream, vdm_state, PVR_DW_TO_BYTES(vdm_state_size_in_dw));
3088 
3089    pvr_csb_clear_relocation_mark(csb);
3090 }
3091 
3092 static VkResult pvr_cs_write_load_op(struct pvr_cmd_buffer *cmd_buffer,
3093                                      struct pvr_sub_cmd_gfx *sub_cmd,
3094                                      struct pvr_load_op *load_op,
3095                                      uint32_t isp_userpass)
3096 {
3097    const struct pvr_device *device = cmd_buffer->device;
3098    struct pvr_static_clear_ppp_template template =
3099       device->static_clear_state.ppp_templates[VK_IMAGE_ASPECT_COLOR_BIT];
3100    uint32_t pds_state[PVR_STATIC_CLEAR_PDS_STATE_COUNT];
3101    struct pvr_pds_upload shareds_update_program;
3102    struct pvr_suballoc_bo *pvr_bo;
3103    VkResult result;
3104 
3105    result = pvr_load_op_data_create_and_upload(cmd_buffer,
3106                                                load_op,
3107                                                &shareds_update_program);
3108    if (result != VK_SUCCESS)
3109       return result;
3110 
3111    template.config.ispctl.upass = isp_userpass;
3112 
3113    /* It might look odd that we aren't specifying the code segment's
3114     * address anywhere. This is because the hardware always assumes that the
3115     * data size is two 128-bit words and the code segment starts after that.
3116     */
3117    pvr_csb_pack (&pds_state[PVR_STATIC_CLEAR_PPP_PDS_TYPE_SHADERBASE],
3118                  TA_STATE_PDS_SHADERBASE,
3119                  shaderbase) {
3120       shaderbase.addr = PVR_DEV_ADDR(load_op->pds_frag_prog.data_offset);
3121    }
3122 
3123    pvr_csb_pack (&pds_state[PVR_STATIC_CLEAR_PPP_PDS_TYPE_TEXUNICODEBASE],
3124                  TA_STATE_PDS_TEXUNICODEBASE,
3125                  texunicodebase) {
3126       texunicodebase.addr =
3127          PVR_DEV_ADDR(load_op->pds_tex_state_prog.code_offset);
3128    }
3129 
3130    pvr_csb_pack (&pds_state[PVR_STATIC_CLEAR_PPP_PDS_TYPE_SIZEINFO1],
3131                  TA_STATE_PDS_SIZEINFO1,
3132                  sizeinfo1) {
3133       /* Dummy coefficient loading program. */
3134       sizeinfo1.pds_varyingsize = 0;
3135 
3136       sizeinfo1.pds_texturestatesize = DIV_ROUND_UP(
3137          shareds_update_program.data_size,
3138          ROGUE_TA_STATE_PDS_SIZEINFO1_PDS_TEXTURESTATESIZE_UNIT_SIZE);
3139 
3140       sizeinfo1.pds_tempsize =
3141          DIV_ROUND_UP(load_op->temps_count,
3142                       ROGUE_TA_STATE_PDS_SIZEINFO1_PDS_TEMPSIZE_UNIT_SIZE);
3143    }
3144 
3145    pvr_csb_pack (&pds_state[PVR_STATIC_CLEAR_PPP_PDS_TYPE_SIZEINFO2],
3146                  TA_STATE_PDS_SIZEINFO2,
3147                  sizeinfo2) {
3148       sizeinfo2.usc_sharedsize =
3149          DIV_ROUND_UP(load_op->const_shareds_count,
3150                       ROGUE_TA_STATE_PDS_SIZEINFO2_USC_SHAREDSIZE_UNIT_SIZE);
3151    }
3152 
3153    /* Dummy coefficient loading program. */
3154    pds_state[PVR_STATIC_CLEAR_PPP_PDS_TYPE_VARYINGBASE] = 0;
3155 
3156    pvr_csb_pack (&pds_state[PVR_STATIC_CLEAR_PPP_PDS_TYPE_TEXTUREDATABASE],
3157                  TA_STATE_PDS_TEXTUREDATABASE,
3158                  texturedatabase) {
3159       texturedatabase.addr = PVR_DEV_ADDR(shareds_update_program.data_offset);
3160    }
3161 
3162    template.config.pds_state = &pds_state;
3163 
3164    pvr_emit_ppp_from_template(&sub_cmd->control_stream, &template, &pvr_bo);
3165    list_add(&pvr_bo->link, &cmd_buffer->bo_list);
3166 
3167    pvr_emit_clear_words(cmd_buffer, sub_cmd);
3168 
3169    pvr_reset_graphics_dirty_state(cmd_buffer, false);
3170 
3171    return VK_SUCCESS;
3172 }
3173 
3174 void pvr_CmdBeginRenderPass2(VkCommandBuffer commandBuffer,
3175                              const VkRenderPassBeginInfo *pRenderPassBeginInfo,
3176                              const VkSubpassBeginInfo *pSubpassBeginInfo)
3177 {
3178    PVR_FROM_HANDLE(pvr_framebuffer,
3179                    framebuffer,
3180                    pRenderPassBeginInfo->framebuffer);
3181    PVR_FROM_HANDLE(pvr_render_pass, pass, pRenderPassBeginInfo->renderPass);
3182    PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
3183    const struct pvr_renderpass_hwsetup_subpass *hw_subpass;
3184    struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
3185    VkResult result;
3186 
3187    PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer);
3188 
3189    assert(!state->render_pass_info.pass);
3190    assert(cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_PRIMARY);
3191 
3192    /* FIXME: Create a separate function for everything using pass->subpasses,
3193     * look at cmd_buffer_begin_subpass() for example. */
3194    state->render_pass_info.pass = pass;
3195    state->render_pass_info.framebuffer = framebuffer;
3196    state->render_pass_info.subpass_idx = 0;
3197    state->render_pass_info.render_area = pRenderPassBeginInfo->renderArea;
3198    state->render_pass_info.current_hw_subpass = 0;
3199    state->render_pass_info.pipeline_bind_point =
3200       pass->subpasses[0].pipeline_bind_point;
3201    state->render_pass_info.isp_userpass = pass->subpasses[0].isp_userpass;
3202    state->dirty.isp_userpass = true;
3203 
3204    result = pvr_cmd_buffer_setup_attachments(cmd_buffer, pass, framebuffer);
3205    if (result != VK_SUCCESS)
3206       return;
3207 
3208    result = pvr_init_render_targets(cmd_buffer->device, pass, framebuffer);
3209    if (result != VK_SUCCESS) {
3210       pvr_cmd_buffer_set_error_unwarned(cmd_buffer, result);
3211       return;
3212    }
3213 
3214    result = pvr_cmd_buffer_set_clear_values(cmd_buffer, pRenderPassBeginInfo);
3215    if (result != VK_SUCCESS)
3216       return;
3217 
3218    assert(pass->subpasses[0].pipeline_bind_point ==
3219           VK_PIPELINE_BIND_POINT_GRAPHICS);
3220 
3221    result = pvr_cmd_buffer_start_sub_cmd(cmd_buffer, PVR_SUB_CMD_TYPE_GRAPHICS);
3222    if (result != VK_SUCCESS)
3223       return;
3224 
3225    /* Run subpass 0 "soft" background object after the actual background
3226     * object.
3227     */
3228    hw_subpass = pvr_get_hw_subpass(pass, 0);
3229    if (hw_subpass->load_op) {
3230       result = pvr_cs_write_load_op(cmd_buffer,
3231                                     &cmd_buffer->state.current_sub_cmd->gfx,
3232                                     hw_subpass->load_op,
3233                                     0);
3234       if (result != VK_SUCCESS)
3235          return;
3236    }
3237 
3238    pvr_perform_start_of_render_clears(cmd_buffer);
3239    pvr_stash_depth_format(&cmd_buffer->state,
3240                           &cmd_buffer->state.current_sub_cmd->gfx);
3241 }
3242 
3243 VkResult pvr_BeginCommandBuffer(VkCommandBuffer commandBuffer,
3244                                 const VkCommandBufferBeginInfo *pBeginInfo)
3245 {
3246    PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
3247    struct pvr_cmd_buffer_state *state;
3248    VkResult result;
3249 
3250    vk_command_buffer_begin(&cmd_buffer->vk, pBeginInfo);
3251 
3252    cmd_buffer->usage_flags = pBeginInfo->flags;
3253    state = &cmd_buffer->state;
3254 
3255    /* VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT must be ignored for
3256     * primary level command buffers.
3257     *
3258     * From the Vulkan 1.0 spec:
3259     *
3260     *    VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT specifies that a
3261     *    secondary command buffer is considered to be entirely inside a render
3262     *    pass. If this is a primary command buffer, then this bit is ignored.
3263     */
3264    if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_PRIMARY) {
3265       cmd_buffer->usage_flags &=
3266          ~VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT;
3267    }
3268 
3269    if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY) {
3270       if (cmd_buffer->usage_flags &
3271           VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT) {
3272          const VkCommandBufferInheritanceInfo *inheritance_info =
3273             pBeginInfo->pInheritanceInfo;
3274          struct pvr_render_pass *pass;
3275 
3276          pass = pvr_render_pass_from_handle(inheritance_info->renderPass);
3277          state->render_pass_info.pass = pass;
3278          state->render_pass_info.framebuffer =
3279             pvr_framebuffer_from_handle(inheritance_info->framebuffer);
3280          state->render_pass_info.subpass_idx = inheritance_info->subpass;
3281          state->render_pass_info.isp_userpass =
3282             pass->subpasses[inheritance_info->subpass].isp_userpass;
3283 
3284          result =
3285             pvr_cmd_buffer_start_sub_cmd(cmd_buffer, PVR_SUB_CMD_TYPE_GRAPHICS);
3286          if (result != VK_SUCCESS)
3287             return result;
3288 
3289          state->vis_test_enabled = inheritance_info->occlusionQueryEnable;
3290       }
3291 
3292       state->dirty.isp_userpass = true;
3293    }
3294 
3295    util_dynarray_init(&state->query_indices, NULL);
3296 
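   /* Conservatively flag every barrier as needed at the start of recording. */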
3297    memset(state->barriers_needed,
3298           0xFF,
3299           sizeof(*state->barriers_needed) * ARRAY_SIZE(state->barriers_needed));
3300 
3301    return VK_SUCCESS;
3302 }
3303 
3304 VkResult pvr_cmd_buffer_add_transfer_cmd(struct pvr_cmd_buffer *cmd_buffer,
3305                                          struct pvr_transfer_cmd *transfer_cmd)
3306 {
3307    struct pvr_sub_cmd_transfer *sub_cmd;
3308    VkResult result;
3309 
3310    result = pvr_cmd_buffer_start_sub_cmd(cmd_buffer, PVR_SUB_CMD_TYPE_TRANSFER);
3311    if (result != VK_SUCCESS)
3312       return result;
3313 
3314    sub_cmd = &cmd_buffer->state.current_sub_cmd->transfer;
3315 
3316    list_addtail(&transfer_cmd->link, sub_cmd->transfer_cmds);
3317 
3318    return VK_SUCCESS;
3319 }
3320 
3321 static VkResult
3322 pvr_setup_vertex_buffers(struct pvr_cmd_buffer *cmd_buffer,
3323                          const struct pvr_graphics_pipeline *const gfx_pipeline)
3324 {
3325    const struct pvr_vertex_shader_state *const vertex_state =
3326       &gfx_pipeline->shader_state.vertex;
3327    struct pvr_cmd_buffer_state *const state = &cmd_buffer->state;
3328    const struct pvr_pds_info *const pds_info = state->pds_shader.info;
3329    struct pvr_suballoc_bo *pvr_bo;
3330    const uint8_t *entries;
3331    uint32_t *dword_buffer;
3332    uint64_t *qword_buffer;
3333    VkResult result;
3334 
3335    result =
3336       pvr_cmd_buffer_alloc_mem(cmd_buffer,
3337                                cmd_buffer->device->heaps.pds_heap,
3338                                PVR_DW_TO_BYTES(pds_info->data_size_in_dwords),
3339                                &pvr_bo);
3340    if (result != VK_SUCCESS)
3341       return result;
3342 
3343    dword_buffer = (uint32_t *)pvr_bo_suballoc_get_map_addr(pvr_bo);
3344    qword_buffer = (uint64_t *)pvr_bo_suballoc_get_map_addr(pvr_bo);
3345 
3346    entries = (uint8_t *)pds_info->entries;
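   /* The const map entries are variable-length: each case below writes the
    * corresponding value into the PDS data section and advances 'entries' by
    * the size of the entry it consumed.
    */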
3347 
3348    for (uint32_t i = 0; i < pds_info->entry_count; i++) {
3349       const struct pvr_const_map_entry *const entry_header =
3350          (struct pvr_const_map_entry *)entries;
3351 
3352       switch (entry_header->type) {
3353       case PVR_PDS_CONST_MAP_ENTRY_TYPE_LITERAL32: {
3354          const struct pvr_const_map_entry_literal32 *const literal =
3355             (struct pvr_const_map_entry_literal32 *)entries;
3356 
3357          PVR_WRITE(dword_buffer,
3358                    literal->literal_value,
3359                    literal->const_offset,
3360                    pds_info->data_size_in_dwords);
3361 
3362          entries += sizeof(*literal);
3363          break;
3364       }
3365 
3366       case PVR_PDS_CONST_MAP_ENTRY_TYPE_DOUTU_ADDRESS: {
3367          const struct pvr_const_map_entry_doutu_address *const doutu_addr =
3368             (struct pvr_const_map_entry_doutu_address *)entries;
3369 
3370          const pco_data *const vs_data = &state->gfx_pipeline->vs_data;
3371          const pvr_dev_addr_t exec_addr =
3372             PVR_DEV_ADDR_OFFSET(vertex_state->bo->dev_addr,
3373                                 vs_data->common.entry_offset);
3374          uint64_t addr = 0ULL;
3375 
3376          pvr_set_usc_execution_address64(&addr, exec_addr.addr);
3377 
3378          PVR_WRITE(qword_buffer,
3379                    addr | doutu_addr->doutu_control,
3380                    doutu_addr->const_offset,
3381                    pds_info->data_size_in_dwords);
3382 
3383          entries += sizeof(*doutu_addr);
3384          break;
3385       }
3386 
3387       case PVR_PDS_CONST_MAP_ENTRY_TYPE_BASE_INSTANCE: {
3388          const struct pvr_const_map_entry_base_instance *const base_instance =
3389             (struct pvr_const_map_entry_base_instance *)entries;
3390 
3391          PVR_WRITE(dword_buffer,
3392                    state->draw_state.base_instance,
3393                    base_instance->const_offset,
3394                    pds_info->data_size_in_dwords);
3395 
3396          entries += sizeof(*base_instance);
3397          break;
3398       }
3399 
3400       case PVR_PDS_CONST_MAP_ENTRY_TYPE_BASE_VERTEX: {
3401          const struct pvr_const_map_entry_base_instance *const base_instance =
3402             (struct pvr_const_map_entry_base_instance *)entries;
3403 
3404          PVR_WRITE(dword_buffer,
3405                    state->draw_state.base_vertex,
3406                    base_instance->const_offset,
3407                    pds_info->data_size_in_dwords);
3408 
3409          entries += sizeof(*base_instance);
3410          break;
3411       }
3412 
3413       case PVR_PDS_CONST_MAP_ENTRY_TYPE_VERTEX_ATTRIBUTE_ADDRESS: {
3414          const struct pvr_const_map_entry_vertex_attribute_address
3415             *const attribute =
3416                (struct pvr_const_map_entry_vertex_attribute_address *)entries;
3417          const struct pvr_vertex_binding *const binding =
3418             &state->vertex_bindings[attribute->binding_index];
3419          /* In relation to the Vulkan spec. 22.4. Vertex Input Address
3420           * Calculation:
3421           *    Adding binding->offset corresponds to calculating the
3422           *    `bufferBindingAddress`. Adding attribute->offset corresponds to
3423           *    adding the `attribDesc.offset`. The `effectiveVertexOffset` is
3424           *    taken care by the PDS program itself with a DDMAD which will
3425           *    multiply the vertex/instance idx with the binding's stride and
3426           *    add that to the address provided here.
3427           */
3428          const pvr_dev_addr_t addr =
3429             PVR_DEV_ADDR_OFFSET(binding->buffer->dev_addr,
3430                                 binding->offset + attribute->offset);
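         /* Illustratively: addr = bufferBindingAddress + attribDesc.offset,
          * with the vertex/instance index times stride folded in later by the
          * PDS DDMAD as described above.
          */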
3431 
3432          PVR_WRITE(qword_buffer,
3433                    addr.addr,
3434                    attribute->const_offset,
3435                    pds_info->data_size_in_dwords);
3436 
3437          entries += sizeof(*attribute);
3438          break;
3439       }
3440 
3441       case PVR_PDS_CONST_MAP_ENTRY_TYPE_ROBUST_VERTEX_ATTRIBUTE_ADDRESS: {
3442          const struct pvr_const_map_entry_robust_vertex_attribute_address
3443             *const attribute =
3444                (struct pvr_const_map_entry_robust_vertex_attribute_address *)
3445                   entries;
3446          const struct pvr_vertex_binding *const binding =
3447             &state->vertex_bindings[attribute->binding_index];
3448          pvr_dev_addr_t addr;
3449 
3450          if (binding->buffer->vk.size <
3451              (attribute->offset + attribute->component_size_in_bytes)) {
3452             /* Replace with load from robustness buffer when no attribute is in
3453              * range
3454              */
3455             addr = PVR_DEV_ADDR_OFFSET(
3456                cmd_buffer->device->robustness_buffer->vma->dev_addr,
3457                attribute->robustness_buffer_offset);
3458          } else {
3459             addr = PVR_DEV_ADDR_OFFSET(binding->buffer->dev_addr,
3460                                        binding->offset + attribute->offset);
3461          }
3462 
3463          PVR_WRITE(qword_buffer,
3464                    addr.addr,
3465                    attribute->const_offset,
3466                    pds_info->data_size_in_dwords);
3467 
3468          entries += sizeof(*attribute);
3469          break;
3470       }
3471 
3472       case PVR_PDS_CONST_MAP_ENTRY_TYPE_VERTEX_ATTRIBUTE_MAX_INDEX: {
3473          const struct pvr_const_map_entry_vertex_attribute_max_index *attribute =
3474             (struct pvr_const_map_entry_vertex_attribute_max_index *)entries;
3475          const struct pvr_vertex_binding *const binding =
3476             &state->vertex_bindings[attribute->binding_index];
3477          const uint64_t bound_size = binding->buffer->vk.size - binding->offset;
3478          const uint32_t attribute_end =
3479             attribute->offset + attribute->component_size_in_bytes;
3480          uint32_t max_index;
3481 
3482          if (PVR_HAS_FEATURE(&cmd_buffer->device->pdevice->dev_info,
3483                              pds_ddmadt)) {
3484             /* TODO: PVR_PDS_CONST_MAP_ENTRY_TYPE_VERTEX_ATTRIBUTE_MAX_INDEX
3485              * has the same define value as
3486              * PVR_PDS_CONST_MAP_ENTRY_TYPE_VERTEX_ATTR_DDMADT_OOB_BUFFER_SIZE
3487              * so maybe we want to remove one of the defines or change the
3488              * values.
3489              */
3490             pvr_finishme("Unimplemented robust buffer access with DDMADT");
3491             assert(false);
3492          }
3493 
3494          /* If the stride is 0 then all attributes use the same single element
3495           * from the binding, so the maximum index is 0.
3496           */
3497          if (bound_size < attribute_end || attribute->stride == 0) {
3498             max_index = 0;
3499          } else {
3500             max_index = (uint32_t)(bound_size / attribute->stride) - 1;
3501 
3502             /* There's one last attribute that can fit in. */
3503             if (bound_size % attribute->stride >= attribute_end)
3504                max_index++;
3505          }
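         /* Worked example (hypothetical numbers): bound_size = 100,
          * stride = 16, attribute_end = 8 gives 100 / 16 - 1 = 5, and the
          * remainder 4 is < 8, so max_index stays 5; with attribute_end = 4
          * the trailing partial element fits and max_index becomes 6.
          */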
3506 
3507          PVR_WRITE(dword_buffer,
3508                    max_index,
3509                    attribute->const_offset,
3510                    pds_info->data_size_in_dwords);
3511 
3512          entries += sizeof(*attribute);
3513          break;
3514       }
3515 
3516       default:
3517          unreachable("Unsupported data section map");
3518          break;
3519       }
3520    }
3521 
3522    state->pds_vertex_attrib_offset =
3523       pvr_bo->dev_addr.addr -
3524       cmd_buffer->device->heaps.pds_heap->base_addr.addr;
3525 
3526    return VK_SUCCESS;
3527 }
3528 
3529 static VkResult pvr_setup_descriptor_mappings_old(
3530    struct pvr_cmd_buffer *const cmd_buffer,
3531    enum pvr_stage_allocation stage,
3532    const struct pvr_stage_allocation_descriptor_state *descriptor_state,
3533    const pvr_dev_addr_t *const num_workgroups_buff_addr,
3534    uint32_t *const descriptor_data_offset_out)
3535 {
3536    const struct pvr_pds_info *const pds_info = &descriptor_state->pds_info;
3537    const struct pvr_descriptor_state *desc_state;
3538    struct pvr_suballoc_bo *pvr_bo;
3539    const uint8_t *entries;
3540    uint32_t *dword_buffer;
3541    uint64_t *qword_buffer;
3542    VkResult result;
3543 
3544    if (!pds_info->data_size_in_dwords)
3545       return VK_SUCCESS;
3546 
3547    result =
3548       pvr_cmd_buffer_alloc_mem(cmd_buffer,
3549                                cmd_buffer->device->heaps.pds_heap,
3550                                PVR_DW_TO_BYTES(pds_info->data_size_in_dwords),
3551                                &pvr_bo);
3552    if (result != VK_SUCCESS)
3553       return result;
3554 
3555    dword_buffer = (uint32_t *)pvr_bo_suballoc_get_map_addr(pvr_bo);
3556    qword_buffer = (uint64_t *)pvr_bo_suballoc_get_map_addr(pvr_bo);
3557 
3558    entries = (uint8_t *)pds_info->entries;
3559 
3560    switch (stage) {
3561    case PVR_STAGE_ALLOCATION_VERTEX_GEOMETRY:
3562    case PVR_STAGE_ALLOCATION_FRAGMENT:
3563       desc_state = &cmd_buffer->state.gfx_desc_state;
3564       break;
3565 
3566    case PVR_STAGE_ALLOCATION_COMPUTE:
3567       desc_state = &cmd_buffer->state.compute_desc_state;
3568       break;
3569 
3570    default:
3571       unreachable("Unsupported stage.");
3572       break;
3573    }
3574 
3575    for (uint32_t i = 0; i < pds_info->entry_count; i++) {
3576       const struct pvr_const_map_entry *const entry_header =
3577          (struct pvr_const_map_entry *)entries;
3578 
3579       switch (entry_header->type) {
3580       case PVR_PDS_CONST_MAP_ENTRY_TYPE_LITERAL32: {
3581          const struct pvr_const_map_entry_literal32 *const literal =
3582             (struct pvr_const_map_entry_literal32 *)entries;
3583 
3584          PVR_WRITE(dword_buffer,
3585                    literal->literal_value,
3586                    literal->const_offset,
3587                    pds_info->data_size_in_dwords);
3588 
3589          entries += sizeof(*literal);
3590          break;
3591       }
3592 
3593       case PVR_PDS_CONST_MAP_ENTRY_TYPE_CONSTANT_BUFFER: {
3594          const struct pvr_const_map_entry_constant_buffer *const_buffer_entry =
3595             (struct pvr_const_map_entry_constant_buffer *)entries;
3596          const uint32_t desc_set = const_buffer_entry->desc_set;
3597          const uint32_t binding = const_buffer_entry->binding;
3598          const struct pvr_descriptor_set *descriptor_set;
3599          const struct pvr_descriptor *descriptor;
3600          pvr_dev_addr_t buffer_addr;
3601 
3602          assert(desc_set < PVR_MAX_DESCRIPTOR_SETS);
3603          descriptor_set = desc_state->descriptor_sets[desc_set];
3604 
3605          /* TODO: Handle dynamic buffers. */
3606          descriptor = &descriptor_set->descriptors[binding];
3607          assert(descriptor->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER);
3608 
3609          assert(descriptor->buffer_desc_range ==
3610                 PVR_DW_TO_BYTES(const_buffer_entry->size_in_dwords));
3611          assert(descriptor->buffer_whole_range ==
3612                 PVR_DW_TO_BYTES(const_buffer_entry->size_in_dwords));
3613 
3614          buffer_addr =
3615             PVR_DEV_ADDR_OFFSET(descriptor->buffer_dev_addr,
3616                                 const_buffer_entry->offset * sizeof(uint32_t));
3617 
3618          PVR_WRITE(qword_buffer,
3619                    buffer_addr.addr,
3620                    const_buffer_entry->const_offset,
3621                    pds_info->data_size_in_dwords);
3622 
3623          entries += sizeof(*const_buffer_entry);
3624          break;
3625       }
3626 
3627       case PVR_PDS_CONST_MAP_ENTRY_TYPE_DESCRIPTOR_SET: {
3628          const struct pvr_const_map_entry_descriptor_set *desc_set_entry =
3629             (struct pvr_const_map_entry_descriptor_set *)entries;
3630          const uint32_t desc_set_num = desc_set_entry->descriptor_set;
3631          const struct pvr_descriptor_set *descriptor_set;
3632          pvr_dev_addr_t desc_set_addr;
3633          uint64_t desc_portion_offset;
3634 
3635          assert(desc_set_num < PVR_MAX_DESCRIPTOR_SETS);
3636 
3637          /* TODO: Remove this when the compiler provides us with usage info?
3638           */
3639          /* We skip DMAing unbound descriptor sets. */
3640          if (!(desc_state->valid_mask & BITFIELD_BIT(desc_set_num))) {
3641             const struct pvr_const_map_entry_literal32 *literal;
3642             uint32_t zero_literal_value;
3643 
3644             /* The code segment contains a DOUT instruction, so in the data
3645              * section we have to write a DOUTD_SRC0 and a DOUTD_SRC1.
3646              * We'll write 0 for DOUTD_SRC0 since we don't have a buffer to DMA.
3647              * We're expecting a LITERAL32 entry containing the value for
3648              * DOUTD_SRC1 next, so make sure we get it and write it
3649              * with BSIZE set to 0, disabling the DMA operation.
3650              * We don't want the LITERAL32 to be processed as normal, otherwise
3651              * we'd be DMAing from an address of 0.
3652              */
3653 
3654             entries += sizeof(*desc_set_entry);
3655             literal = (struct pvr_const_map_entry_literal32 *)entries;
3656 
3657             assert(literal->type == PVR_PDS_CONST_MAP_ENTRY_TYPE_LITERAL32);
3658 
3659             zero_literal_value =
3660                literal->literal_value &
3661                PVR_ROGUE_PDSINST_DOUT_FIELDS_DOUTD_SRC1_BSIZE_CLRMSK;
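            /* Masking with the BSIZE clear mask keeps the literal's other
             * control bits but forces the burst size to zero, so the DOUTD
             * transfers nothing.
             */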
3662 
3663             PVR_WRITE(qword_buffer,
3664                       UINT64_C(0),
3665                       desc_set_entry->const_offset,
3666                       pds_info->data_size_in_dwords);
3667 
3668             PVR_WRITE(dword_buffer,
3669                       zero_literal_value,
3670                       desc_set_entry->const_offset,
3671                       pds_info->data_size_in_dwords);
3672 
3673             entries += sizeof(*literal);
3674             i++;
3675             continue;
3676          }
3677 
3678          descriptor_set = desc_state->descriptor_sets[desc_set_num];
3679 
3680          desc_set_addr = descriptor_set->pvr_bo->dev_addr;
3681 
3682          if (desc_set_entry->primary) {
3683             desc_portion_offset =
3684                descriptor_set->layout->memory_layout_in_dwords_per_stage[stage]
3685                   .primary_offset;
3686          } else {
3687             desc_portion_offset =
3688                descriptor_set->layout->memory_layout_in_dwords_per_stage[stage]
3689                   .secondary_offset;
3690          }
3691          desc_portion_offset = PVR_DW_TO_BYTES(desc_portion_offset);
3692 
3693          desc_set_addr =
3694             PVR_DEV_ADDR_OFFSET(desc_set_addr, desc_portion_offset);
3695 
3696          desc_set_addr = PVR_DEV_ADDR_OFFSET(
3697             desc_set_addr,
3698             PVR_DW_TO_BYTES((uint64_t)desc_set_entry->offset_in_dwords));
3699 
3700          PVR_WRITE(qword_buffer,
3701                    desc_set_addr.addr,
3702                    desc_set_entry->const_offset,
3703                    pds_info->data_size_in_dwords);
3704 
3705          entries += sizeof(*desc_set_entry);
3706          break;
3707       }
3708 
3709       case PVR_PDS_CONST_MAP_ENTRY_TYPE_SPECIAL_BUFFER: {
3710          const struct pvr_const_map_entry_special_buffer *special_buff_entry =
3711             (struct pvr_const_map_entry_special_buffer *)entries;
3712 
3713          switch (special_buff_entry->buffer_type) {
3714          case PVR_BUFFER_TYPE_COMPILE_TIME: {
3715             uint64_t addr = descriptor_state->static_consts->dev_addr.addr;
3716 
3717             PVR_WRITE(qword_buffer,
3718                       addr,
3719                       special_buff_entry->const_offset,
3720                       pds_info->data_size_in_dwords);
3721             break;
3722          }
3723 
3724          case PVR_BUFFER_TYPE_BLEND_CONSTS:
3725             /* TODO: See if instead of reusing the blend constant buffer type
3726              * entry, we can setup a new buffer type specifically for
3727              * num_workgroups or other built-in variables. The mappings are
3728              * setup at pipeline creation when creating the descriptor program.
3729              */
3730             if (stage == PVR_STAGE_ALLOCATION_COMPUTE) {
3731                assert(num_workgroups_buff_addr->addr);
3732 
3733                /* TODO: Check if we need to offset this (e.g. for just y and z),
3734                 * or cope with any reordering?
3735                 */
3736                PVR_WRITE(qword_buffer,
3737                          num_workgroups_buff_addr->addr,
3738                          special_buff_entry->const_offset,
3739                          pds_info->data_size_in_dwords);
3740             } else {
3741                pvr_finishme("Add blend constants support.");
3742             }
3743             break;
3744 
3745          default:
3746             unreachable("Unsupported special buffer type.");
3747          }
3748 
3749          entries += sizeof(*special_buff_entry);
3750          break;
3751       }
3752 
3753       default:
3754          unreachable("Unsupported map entry type.");
3755       }
3756    }
3757 
3758    *descriptor_data_offset_out =
3759       pvr_bo->dev_addr.addr -
3760       cmd_buffer->device->heaps.pds_heap->base_addr.addr;
3761 
3762    return VK_SUCCESS;
3763 }
3764 
3765 /* Note that the descriptor set doesn't have any space for dynamic buffer
3766  * descriptors so this works on the assumption that you have a buffer with space
3767  * for them at the end.
3768  */
3769 static uint16_t pvr_get_dynamic_descriptor_primary_offset(
3770    const struct pvr_device *device,
3771    const struct pvr_descriptor_set_layout *layout,
3772    const struct pvr_descriptor_set_layout_binding *binding,
3773    const uint32_t stage,
3774    const uint32_t desc_idx)
3775 {
3776    struct pvr_descriptor_size_info size_info;
3777    uint32_t offset;
3778 
3779    assert(vk_descriptor_type_is_dynamic(binding->type));
3780    assert(desc_idx < binding->descriptor_count);
3781 
3782    pvr_descriptor_size_info_init(device, binding->type, &size_info);
3783 
3784    offset = layout->total_size_in_dwords;
3785    offset += binding->per_stage_offset_in_dwords[stage].primary;
3786    offset += (desc_idx * size_info.primary);
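   /* i.e. the dynamic primaries are assumed to live immediately after the
    * regular set contents, laid out per stage and then per descriptor.
    */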
3787 
3788    /* Offset must fit in 16 bits. */
3789    assert(offset < UINT16_MAX);
3790 
3791    return (uint16_t)offset;
3792 }
3793 
3794 /* Note that the descriptor set doesn't have any space for dynamic buffer
3795  * descriptors so this works on the assumption that you have a buffer with space
3796  * for them at the end.
3797  */
3798 static uint16_t pvr_get_dynamic_descriptor_secondary_offset(
3799    const struct pvr_device *device,
3800    const struct pvr_descriptor_set_layout *layout,
3801    const struct pvr_descriptor_set_layout_binding *binding,
3802    const uint32_t stage,
3803    const uint32_t desc_idx)
3804 {
3805    struct pvr_descriptor_size_info size_info;
3806    uint32_t offset;
3807 
3808    assert(vk_descriptor_type_is_dynamic(binding->type));
3809    assert(desc_idx < binding->descriptor_count);
3810 
3811    pvr_descriptor_size_info_init(device, binding->type, &size_info);
3812 
3813    offset = layout->total_size_in_dwords;
3814    offset +=
3815       layout->memory_layout_in_dwords_per_stage[stage].primary_dynamic_size;
3816    offset += binding->per_stage_offset_in_dwords[stage].secondary;
3817    offset += (desc_idx * size_info.secondary);
3818 
3819    /* Offset must fit in 16 bits. */
3820    assert(offset < UINT16_MAX);
3821 
3822    return (uint16_t)offset;
3823 }
3824 
3825 /**
3826  * \brief Upload a copy of the descriptor set with dynamic buffer offsets
3827  * applied.
3828  */
3829 /* TODO: We should probably make the compiler aware of the dynamic descriptors.
3830  * We could use push constants like Anv seems to do. This would avoid having to
3831  * duplicate all sets containing dynamic descriptors each time the offsets are
3832  * updated.
3833  */
3834 static VkResult pvr_cmd_buffer_upload_patched_desc_set(
3835    struct pvr_cmd_buffer *cmd_buffer,
3836    const struct pvr_descriptor_set *desc_set,
3837    const uint32_t *dynamic_offsets,
3838    struct pvr_suballoc_bo **const bo_out)
3839 {
3840    const struct pvr_descriptor_set_layout *layout = desc_set->layout;
3841    const uint64_t normal_desc_set_size =
3842       PVR_DW_TO_BYTES(layout->total_size_in_dwords);
3843    const uint64_t dynamic_descs_size =
3844       PVR_DW_TO_BYTES(layout->total_dynamic_size_in_dwords);
3845    struct pvr_descriptor_size_info dynamic_uniform_buffer_size_info;
3846    struct pvr_descriptor_size_info dynamic_storage_buffer_size_info;
3847    struct pvr_device *device = cmd_buffer->device;
3848    struct pvr_suballoc_bo *patched_desc_set_bo;
3849    uint32_t *src_mem_ptr, *dst_mem_ptr;
3850    uint32_t desc_idx_offset = 0;
3851    VkResult result;
3852 
3853    assert(desc_set->layout->dynamic_buffer_count > 0);
3854 
3855    pvr_descriptor_size_info_init(device,
3856                                  VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC,
3857                                  &dynamic_uniform_buffer_size_info);
3858    pvr_descriptor_size_info_init(device,
3859                                  VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC,
3860                                  &dynamic_storage_buffer_size_info);
3861 
3862    /* TODO: In the descriptor set we don't account for dynamic buffer
3863     * descriptors and take care of them in the pipeline layout. The pipeline
3864     * layout allocates them at the beginning but let's put them at the end just
3865     * because it makes things a bit easier. Ideally we should be using the
3866     * pipeline layout and use the offsets from the pipeline layout to patch
3867     * descriptors.
3868     */
3869    result = pvr_cmd_buffer_alloc_mem(cmd_buffer,
3870                                      cmd_buffer->device->heaps.general_heap,
3871                                      normal_desc_set_size + dynamic_descs_size,
3872                                      &patched_desc_set_bo);
3873    if (result != VK_SUCCESS)
3874       return result;
3875 
3876    src_mem_ptr = (uint32_t *)pvr_bo_suballoc_get_map_addr(desc_set->pvr_bo);
3877    dst_mem_ptr = (uint32_t *)pvr_bo_suballoc_get_map_addr(patched_desc_set_bo);
3878 
3879    memcpy(dst_mem_ptr, src_mem_ptr, normal_desc_set_size);
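   /* The static portion of the set is copied verbatim; only the dynamic
    * buffer addresses and ranges appended after it are patched below.
    */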
3880 
3881    for (uint32_t i = 0; i < desc_set->layout->binding_count; i++) {
3882       const struct pvr_descriptor_set_layout_binding *binding =
3883          &desc_set->layout->bindings[i];
3884       const struct pvr_descriptor *descriptors =
3885          &desc_set->descriptors[binding->descriptor_index];
3886       const struct pvr_descriptor_size_info *size_info;
3887 
3888       if (binding->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC)
3889          size_info = &dynamic_uniform_buffer_size_info;
3890       else if (binding->type == VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC)
3891          size_info = &dynamic_storage_buffer_size_info;
3892       else
3893          continue;
3894 
3895       for (uint32_t stage = 0; stage < PVR_STAGE_ALLOCATION_COUNT; stage++) {
3896          uint32_t primary_offset;
3897          uint32_t secondary_offset;
3898 
3899          if (!(binding->shader_stage_mask & BITFIELD_BIT(stage)))
3900             continue;
3901 
3902          /* Get the offsets for the first dynamic descriptor in the current
3903           * binding.
3904           */
3905          primary_offset =
3906             pvr_get_dynamic_descriptor_primary_offset(device,
3907                                                       desc_set->layout,
3908                                                       binding,
3909                                                       stage,
3910                                                       0);
3911          secondary_offset =
3912             pvr_get_dynamic_descriptor_secondary_offset(device,
3913                                                         desc_set->layout,
3914                                                         binding,
3915                                                         stage,
3916                                                         0);
3917 
3918          /* clang-format off */
3919          for (uint32_t desc_idx = 0;
3920               desc_idx < binding->descriptor_count;
3921               desc_idx++) {
3922             /* clang-format on */
3923             const pvr_dev_addr_t addr =
3924                PVR_DEV_ADDR_OFFSET(descriptors[desc_idx].buffer_dev_addr,
3925                                    dynamic_offsets[desc_idx + desc_idx_offset]);
3926             const VkDeviceSize range =
3927                MIN2(descriptors[desc_idx].buffer_desc_range,
3928                     descriptors[desc_idx].buffer_whole_range -
3929                        dynamic_offsets[desc_idx + desc_idx_offset]);
3930 
3931 #if MESA_DEBUG
3932             uint32_t desc_primary_offset;
3933             uint32_t desc_secondary_offset;
3934 
3935             desc_primary_offset =
3936                pvr_get_dynamic_descriptor_primary_offset(device,
3937                                                          desc_set->layout,
3938                                                          binding,
3939                                                          stage,
3940                                                          desc_idx);
3941             desc_secondary_offset =
3942                pvr_get_dynamic_descriptor_secondary_offset(device,
3943                                                            desc_set->layout,
3944                                                            binding,
3945                                                            stage,
3946                                                            desc_idx);
3947 
3948             /* Check the assumption that the descriptors within a binding, for
3949              * a particular stage, are allocated consecutively.
3950              */
3951             assert(desc_primary_offset ==
3952                    primary_offset + size_info->primary * desc_idx);
3953             assert(desc_secondary_offset ==
3954                    secondary_offset + size_info->secondary * desc_idx);
3955 #endif
3956 
3957             assert(descriptors[desc_idx].type == binding->type);
3958 
3959             memcpy(dst_mem_ptr + primary_offset + size_info->primary * desc_idx,
3960                    &addr.addr,
3961                    PVR_DW_TO_BYTES(size_info->primary));
3962             memcpy(dst_mem_ptr + secondary_offset +
3963                       size_info->secondary * desc_idx,
3964                    &range,
3965                    PVR_DW_TO_BYTES(size_info->secondary));
3966          }
3967       }
3968 
3969       desc_idx_offset += binding->descriptor_count;
3970    }
3971 
3972    *bo_out = patched_desc_set_bo;
3973 
3974    return VK_SUCCESS;
3975 }
3976 
3977 #define PVR_SELECT(_geom, _frag, _compute)         \
3978    (stage == PVR_STAGE_ALLOCATION_VERTEX_GEOMETRY) \
3979       ? (_geom)                                    \
3980       : (stage == PVR_STAGE_ALLOCATION_FRAGMENT) ? (_frag) : (_compute)
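/* For example, PVR_SELECT(a, b, c) evaluates to a for
 * PVR_STAGE_ALLOCATION_VERTEX_GEOMETRY, to b for PVR_STAGE_ALLOCATION_FRAGMENT
 * and to c otherwise (compute); it relies on a variable named `stage` being in
 * scope at the point of use.
 */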
3981 
3982 static VkResult
3983 pvr_cmd_buffer_upload_desc_set_table(struct pvr_cmd_buffer *const cmd_buffer,
3984                                      enum pvr_stage_allocation stage,
3985                                      pvr_dev_addr_t *addr_out)
3986 {
3987    uint64_t bound_desc_sets[PVR_MAX_DESCRIPTOR_SETS];
3988    const struct pvr_descriptor_state *desc_state;
3989    struct pvr_suballoc_bo *suballoc_bo;
3990    uint32_t dynamic_offset_idx = 0;
3991    VkResult result;
3992 
3993    switch (stage) {
3994    case PVR_STAGE_ALLOCATION_VERTEX_GEOMETRY:
3995    case PVR_STAGE_ALLOCATION_FRAGMENT:
3996    case PVR_STAGE_ALLOCATION_COMPUTE:
3997       break;
3998 
3999    default:
4000       unreachable("Unsupported stage.");
4001       break;
4002    }
4003 
4004    desc_state = PVR_SELECT(&cmd_buffer->state.gfx_desc_state,
4005                            &cmd_buffer->state.gfx_desc_state,
4006                            &cmd_buffer->state.compute_desc_state);
4007 
4008    for (uint32_t set = 0; set < ARRAY_SIZE(bound_desc_sets); set++)
4009       bound_desc_sets[set] = ~0;
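   /* The table uploaded below is simply one 64-bit device address per set;
    * sets that are never patched in keep the all-ones marker written above.
    */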
4010 
4011    assert(util_last_bit(desc_state->valid_mask) <= ARRAY_SIZE(bound_desc_sets));
4012    for (uint32_t set = 0; set < util_last_bit(desc_state->valid_mask); set++) {
4013       const struct pvr_descriptor_set *desc_set;
4014 
4015       if (!(desc_state->valid_mask & BITFIELD_BIT(set))) {
4016          const struct pvr_pipeline_layout *pipeline_layout =
4017             PVR_SELECT(cmd_buffer->state.gfx_pipeline->base.layout,
4018                        cmd_buffer->state.gfx_pipeline->base.layout,
4019                        cmd_buffer->state.compute_pipeline->base.layout);
4020          const struct pvr_descriptor_set_layout *set_layout;
4021 
4022          assert(set < pipeline_layout->set_count);
4023 
4024          set_layout = pipeline_layout->set_layout[set];
4025          dynamic_offset_idx += set_layout->dynamic_buffer_count;
4026 
4027          continue;
4028       }
4029 
4030       desc_set = desc_state->descriptor_sets[set];
4031 
4032       /* TODO: Is it better if we don't set the valid_mask for empty sets? */
4033       if (desc_set->layout->descriptor_count == 0)
4034          continue;
4035 
4036       if (desc_set->layout->dynamic_buffer_count > 0) {
4037          struct pvr_suballoc_bo *new_desc_set_bo;
4038 
4039          assert(dynamic_offset_idx + desc_set->layout->dynamic_buffer_count <=
4040                 ARRAY_SIZE(desc_state->dynamic_offsets));
4041 
4042          result = pvr_cmd_buffer_upload_patched_desc_set(
4043             cmd_buffer,
4044             desc_set,
4045             &desc_state->dynamic_offsets[dynamic_offset_idx],
4046             &new_desc_set_bo);
4047          if (result != VK_SUCCESS)
4048             return result;
4049 
4050          dynamic_offset_idx += desc_set->layout->dynamic_buffer_count;
4051 
4052          bound_desc_sets[set] = new_desc_set_bo->dev_addr.addr;
4053       } else {
4054          bound_desc_sets[set] = desc_set->pvr_bo->dev_addr.addr;
4055       }
4056    }
4057 
4058    result = pvr_cmd_buffer_upload_general(cmd_buffer,
4059                                           bound_desc_sets,
4060                                           sizeof(bound_desc_sets),
4061                                           &suballoc_bo);
4062    if (result != VK_SUCCESS)
4063       return result;
4064 
4065    *addr_out = suballoc_bo->dev_addr;
4066    return VK_SUCCESS;
4067 }
4068 
4069 static VkResult
4070 pvr_process_addr_literal(struct pvr_cmd_buffer *cmd_buffer,
4071                          enum pvr_pds_addr_literal_type addr_literal_type,
4072                          enum pvr_stage_allocation stage,
4073                          pvr_dev_addr_t *addr_out)
4074 {
4075    VkResult result;
4076 
4077    switch (addr_literal_type) {
4078    case PVR_PDS_ADDR_LITERAL_DESC_SET_ADDRS_TABLE: {
4079       /* TODO: Maybe we want to free pvr_bo, and only link all the BOs to
4080        * the command buffer once the data section has been written
4081        * successfully?
4082        */
4083       result =
4084          pvr_cmd_buffer_upload_desc_set_table(cmd_buffer, stage, addr_out);
4085       if (result != VK_SUCCESS)
4086          return result;
4087 
4088       break;
4089    }
4090 
4091    case PVR_PDS_ADDR_LITERAL_PUSH_CONSTS: {
4092       const struct pvr_pipeline_layout *layout =
4093          PVR_SELECT(cmd_buffer->state.gfx_pipeline->base.layout,
4094                     cmd_buffer->state.gfx_pipeline->base.layout,
4095                     cmd_buffer->state.compute_pipeline->base.layout);
4096       const uint32_t push_constants_offset =
4097          PVR_SELECT(layout->vert_push_constants_offset,
4098                     layout->frag_push_constants_offset,
4099                     layout->compute_push_constants_offset);
4100 
4101       *addr_out = PVR_DEV_ADDR_OFFSET(cmd_buffer->state.push_constants.dev_addr,
4102                                       push_constants_offset);
4103       break;
4104    }
4105 
4106    case PVR_PDS_ADDR_LITERAL_BLEND_CONSTANTS: {
4107       float *blend_consts =
4108          cmd_buffer->vk.dynamic_graphics_state.cb.blend_constants;
4109       size_t size =
4110          sizeof(cmd_buffer->vk.dynamic_graphics_state.cb.blend_constants);
4111       struct pvr_suballoc_bo *blend_consts_bo;
4112 
4113       result = pvr_cmd_buffer_upload_general(cmd_buffer,
4114                                              blend_consts,
4115                                              size,
4116                                              &blend_consts_bo);
4117       if (result != VK_SUCCESS)
4118          return result;
4119 
4120       *addr_out = blend_consts_bo->dev_addr;
4121 
4122       break;
4123    }
4124 
4125    default:
4126       unreachable("Invalid addr literal type.");
4127    }
4128 
4129    return VK_SUCCESS;
4130 }
4131 
4132 #undef PVR_SELECT
4133 
4134 static VkResult pvr_setup_descriptor_mappings_new(
4135    struct pvr_cmd_buffer *const cmd_buffer,
4136    enum pvr_stage_allocation stage,
4137    const struct pvr_stage_allocation_descriptor_state *descriptor_state,
4138    uint32_t *const descriptor_data_offset_out)
4139 {
4140    const struct pvr_pds_info *const pds_info = &descriptor_state->pds_info;
4141    struct pvr_suballoc_bo *pvr_bo;
4142    const uint8_t *entries;
4143    uint32_t *dword_buffer;
4144    uint64_t *qword_buffer;
4145    VkResult result;
4146 
4147    if (!pds_info->data_size_in_dwords)
4148       return VK_SUCCESS;
4149 
4150    result =
4151       pvr_cmd_buffer_alloc_mem(cmd_buffer,
4152                                cmd_buffer->device->heaps.pds_heap,
4153                                PVR_DW_TO_BYTES(pds_info->data_size_in_dwords),
4154                                &pvr_bo);
4155    if (result != VK_SUCCESS)
4156       return result;
4157 
4158    dword_buffer = (uint32_t *)pvr_bo_suballoc_get_map_addr(pvr_bo);
4159    qword_buffer = (uint64_t *)pvr_bo_suballoc_get_map_addr(pvr_bo);
4160 
4161    entries = (uint8_t *)pds_info->entries;
4162 
4163    switch (stage) {
4164    case PVR_STAGE_ALLOCATION_VERTEX_GEOMETRY:
4165    case PVR_STAGE_ALLOCATION_FRAGMENT:
4166    case PVR_STAGE_ALLOCATION_COMPUTE:
4167       break;
4168 
4169    default:
4170       unreachable("Unsupported stage.");
4171       break;
4172    }
4173 
4174    for (uint32_t i = 0; i < pds_info->entry_count; i++) {
4175       const struct pvr_const_map_entry *const entry_header =
4176          (struct pvr_const_map_entry *)entries;
4177 
4178       switch (entry_header->type) {
4179       case PVR_PDS_CONST_MAP_ENTRY_TYPE_LITERAL32: {
4180          const struct pvr_const_map_entry_literal32 *const literal =
4181             (struct pvr_const_map_entry_literal32 *)entries;
4182 
4183          PVR_WRITE(dword_buffer,
4184                    literal->literal_value,
4185                    literal->const_offset,
4186                    pds_info->data_size_in_dwords);
4187 
4188          entries += sizeof(*literal);
4189          break;
4190       }
4191 
4192       case PVR_PDS_CONST_MAP_ENTRY_TYPE_ADDR_LITERAL_BUFFER: {
4193          const struct pvr_pds_const_map_entry_addr_literal_buffer
4194             *const addr_literal_buffer_entry =
4195                (struct pvr_pds_const_map_entry_addr_literal_buffer *)entries;
4196          struct pvr_device *device = cmd_buffer->device;
4197          struct pvr_suballoc_bo *addr_literal_buffer_bo;
4198          uint32_t addr_literal_count = 0;
4199          uint64_t *addr_literal_buffer;
4200 
4201          result = pvr_cmd_buffer_alloc_mem(cmd_buffer,
4202                                            device->heaps.general_heap,
4203                                            addr_literal_buffer_entry->size,
4204                                            &addr_literal_buffer_bo);
4205          if (result != VK_SUCCESS)
4206             return result;
4207 
4208          addr_literal_buffer =
4209             (uint64_t *)pvr_bo_suballoc_get_map_addr(addr_literal_buffer_bo);
4210 
4211          entries += sizeof(*addr_literal_buffer_entry);
4212 
4213          PVR_WRITE(qword_buffer,
4214                    addr_literal_buffer_bo->dev_addr.addr,
4215                    addr_literal_buffer_entry->const_offset,
4216                    pds_info->data_size_in_dwords);
4217 
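         /* The PDS data itself only receives the address of the buffer written
          * above; the consecutive ADDR_LITERAL entries that follow are resolved
          * below and their device addresses are packed into that buffer.
          */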
4218          for (uint32_t j = i + 1; j < pds_info->entry_count; j++) {
4219             const struct pvr_const_map_entry *const entry_header =
4220                (struct pvr_const_map_entry *)entries;
4221             const struct pvr_pds_const_map_entry_addr_literal *addr_literal;
4222             pvr_dev_addr_t dev_addr;
4223 
4224             if (entry_header->type != PVR_PDS_CONST_MAP_ENTRY_TYPE_ADDR_LITERAL)
4225                break;
4226 
4227             addr_literal =
4228                (struct pvr_pds_const_map_entry_addr_literal *)entries;
4229 
4230             result = pvr_process_addr_literal(cmd_buffer,
4231                                               addr_literal->addr_type,
4232                                               stage,
4233                                               &dev_addr);
4234             if (result != VK_SUCCESS)
4235                return result;
4236 
4237             addr_literal_buffer[addr_literal_count++] = dev_addr.addr;
4238 
4239             entries += sizeof(*addr_literal);
4240          }
4241 
4242          assert(addr_literal_count * sizeof(uint64_t) ==
4243                 addr_literal_buffer_entry->size);
4244 
4245          i += addr_literal_count;
4246 
4247          break;
4248       }
4249 
4250       default:
4251          unreachable("Unsupported map entry type.");
4252       }
4253    }
4254 
4255    *descriptor_data_offset_out =
4256       pvr_bo->dev_addr.addr -
4257       cmd_buffer->device->heaps.pds_heap->base_addr.addr;
4258 
4259    return VK_SUCCESS;
4260 }
4261 
4262 static VkResult pvr_setup_descriptor_mappings(
4263    struct pvr_cmd_buffer *const cmd_buffer,
4264    enum pvr_stage_allocation stage,
4265    const struct pvr_stage_allocation_descriptor_state *descriptor_state,
4266    const pvr_dev_addr_t *const num_worgroups_buff_addr,
4267    uint32_t *const descriptor_data_offset_out)
4268 {
4269    const bool old_path =
4270       pvr_has_hard_coded_shaders(&cmd_buffer->device->pdevice->dev_info);
4271 
4272    if (old_path) {
4273       return pvr_setup_descriptor_mappings_old(cmd_buffer,
4274                                                stage,
4275                                                descriptor_state,
4276                                                num_worgroups_buff_addr,
4277                                                descriptor_data_offset_out);
4278    }
4279 
4280    return pvr_setup_descriptor_mappings_new(cmd_buffer,
4281                                             stage,
4282                                             descriptor_state,
4283                                             descriptor_data_offset_out);
4284 }
4285 
4286 static void pvr_compute_update_shared(struct pvr_cmd_buffer *cmd_buffer,
4287                                       struct pvr_sub_cmd_compute *const sub_cmd)
4288 {
4289    const struct pvr_device *device = cmd_buffer->device;
4290    const struct pvr_physical_device *pdevice = device->pdevice;
4291    struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
4292    struct pvr_csb *csb = &sub_cmd->control_stream;
4293    const struct pvr_compute_pipeline *pipeline = state->compute_pipeline;
4294    const uint32_t const_shared_regs =
4295       pipeline->shader_state.const_shared_reg_count;
4296    struct pvr_compute_kernel_info info;
4297 
4298    /* No shared regs, no need to use an allocation kernel. */
4299    if (!const_shared_regs)
4300       return;
4301 
4302    /* Accumulate the MAX number of shared registers across the kernels in this
4303     * dispatch. This is used by the FW for context switching, so must be large
4304     * enough to contain all the shared registers that might be in use for this
4305     * compute job. Coefficients don't need to be included as the context switch
4306     * will not happen within the execution of a single workgroup, thus nothing
4307     * needs to be preserved.
4308     */
4309    state->max_shared_regs = MAX2(state->max_shared_regs, const_shared_regs);
4310 
4311    info = (struct pvr_compute_kernel_info){
4312       .indirect_buffer_addr = PVR_DEV_ADDR_INVALID,
4313       .sd_type = ROGUE_CDMCTRL_SD_TYPE_NONE,
4314 
4315       .usc_target = ROGUE_CDMCTRL_USC_TARGET_ALL,
4316       .usc_common_shared = true,
4317       .usc_common_size =
4318          DIV_ROUND_UP(const_shared_regs,
4319                       ROGUE_CDMCTRL_KERNEL0_USC_COMMON_SIZE_UNIT_SIZE),
4320 
4321       .global_size = { 1, 1, 1 },
4322       .local_size = { 1, 1, 1 },
4323    };
4324 
4325    /* Sometimes we don't have a secondary program if there were no constants to
4326     * write, but we still need to run a PDS program to accomplish the
4327     * allocation of the local/common store shared registers. Use the
4328     * pre-uploaded empty PDS program in this instance.
4329     */
4330    if (pipeline->descriptor_state.pds_info.code_size_in_dwords) {
4331       uint32_t pds_data_size_in_dwords =
4332          pipeline->descriptor_state.pds_info.data_size_in_dwords;
4333 
4334       info.pds_data_offset = state->pds_compute_descriptor_data_offset;
4335       info.pds_data_size =
4336          DIV_ROUND_UP(PVR_DW_TO_BYTES(pds_data_size_in_dwords),
4337                       ROGUE_CDMCTRL_KERNEL0_PDS_DATA_SIZE_UNIT_SIZE);
4338 
4339       /* Check that we have uploaded the code section. */
4340       assert(pipeline->descriptor_state.pds_code.code_size);
4341       info.pds_code_offset = pipeline->descriptor_state.pds_code.code_offset;
4342    } else {
4343       const struct pvr_pds_upload *program = &device->pds_compute_empty_program;
4344 
4345       info.pds_data_offset = program->data_offset;
4346       info.pds_data_size =
4347          DIV_ROUND_UP(PVR_DW_TO_BYTES(program->data_size),
4348                       ROGUE_CDMCTRL_KERNEL0_PDS_DATA_SIZE_UNIT_SIZE);
4349       info.pds_code_offset = program->code_offset;
4350    }
4351 
4352    /* We don't need to pad the workgroup size. */
4353 
4354    info.max_instances =
4355       pvr_compute_flat_slot_size(pdevice, const_shared_regs, false, 1U);
4356 
4357    pvr_compute_generate_control_stream(csb, sub_cmd, &info);
4358 }
4359 
4360 void pvr_compute_update_shared_private(
4361    struct pvr_cmd_buffer *cmd_buffer,
4362    struct pvr_sub_cmd_compute *const sub_cmd,
4363    struct pvr_private_compute_pipeline *pipeline)
4364 {
4365    const struct pvr_physical_device *pdevice = cmd_buffer->device->pdevice;
4366    struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
4367    const uint32_t const_shared_regs = pipeline->const_shared_regs_count;
4368    struct pvr_csb *csb = &sub_cmd->control_stream;
4369    struct pvr_compute_kernel_info info;
4370 
4371    /* No shared regs, no need to use an allocation kernel. */
4372    if (!const_shared_regs)
4373       return;
4374 
4375    /* See comment in pvr_compute_update_shared() for details on this. */
4376    state->max_shared_regs = MAX2(state->max_shared_regs, const_shared_regs);
4377 
4378    info = (struct pvr_compute_kernel_info){
4379       .indirect_buffer_addr = PVR_DEV_ADDR_INVALID,
4380       .usc_common_size =
4381          DIV_ROUND_UP(const_shared_regs,
4382                       ROGUE_CDMCTRL_KERNEL0_USC_COMMON_SIZE_UNIT_SIZE),
4383       .pds_data_size =
4384          DIV_ROUND_UP(PVR_DW_TO_BYTES(pipeline->pds_shared_update_data_size_dw),
4385                       ROGUE_CDMCTRL_KERNEL0_PDS_DATA_SIZE_UNIT_SIZE),
4386       .usc_target = ROGUE_CDMCTRL_USC_TARGET_ALL,
4387       .pds_data_offset = pipeline->pds_shared_update_data_offset,
4388       .pds_code_offset = pipeline->pds_shared_update_code_offset,
4389       .sd_type = ROGUE_CDMCTRL_SD_TYPE_NONE,
4390       .usc_common_shared = true,
4391       .global_size = { 1, 1, 1 },
4392       .local_size = { 1, 1, 1 },
4393    };
4394 
4395    /* We don't need to pad the workgroup size. */
4396 
4397    info.max_instances =
4398       pvr_compute_flat_slot_size(pdevice, const_shared_regs, false, 1U);
4399 
4400    pvr_compute_generate_control_stream(csb, sub_cmd, &info);
4401 }
4402 
4403 static uint32_t
4404 pvr_compute_flat_pad_workgroup_size(const struct pvr_physical_device *pdevice,
4405                                     uint32_t workgroup_size,
4406                                     uint32_t coeff_regs_count)
4407 {
4408    const struct pvr_device_runtime_info *dev_runtime_info =
4409       &pdevice->dev_runtime_info;
4410    const struct pvr_device_info *dev_info = &pdevice->dev_info;
4411    uint32_t max_avail_coeff_regs =
4412       dev_runtime_info->cdm_max_local_mem_size_regs;
4413    uint32_t coeff_regs_count_aligned =
4414       ALIGN_POT(coeff_regs_count,
4415                 ROGUE_CDMCTRL_KERNEL0_USC_COMMON_SIZE_UNIT_SIZE >> 2U);
4416 
4417    /* If the work group size is greater than ROGUE_MAX_INSTANCES_PER_TASK, we
4418     * always pad it up to the next multiple of ROGUE_MAX_INSTANCES_PER_TASK.
4419     *
4420     * Likewise, if we use more than 1/8th of the max coefficient registers,
4421     * we round the work group size up to the next multiple of
4422     * ROGUE_MAX_INSTANCES_PER_TASK.
4423     */
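   /* For illustration only (the real value of ROGUE_MAX_INSTANCES_PER_TASK
    * comes from the HW defs): if it were 32, a 50-invocation workgroup would
    * be padded up to 64 by the ALIGN_POT() below.
    */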
4424    /* TODO: See if this can be optimized. */
4425    if (workgroup_size > ROGUE_MAX_INSTANCES_PER_TASK ||
4426        coeff_regs_count_aligned > (max_avail_coeff_regs / 8)) {
4427       assert(workgroup_size < rogue_get_compute_max_work_group_size(dev_info));
4428 
4429       return ALIGN_POT(workgroup_size, ROGUE_MAX_INSTANCES_PER_TASK);
4430    }
4431 
4432    return workgroup_size;
4433 }
4434 
4435 void pvr_compute_update_kernel_private(
4436    struct pvr_cmd_buffer *cmd_buffer,
4437    struct pvr_sub_cmd_compute *const sub_cmd,
4438    struct pvr_private_compute_pipeline *pipeline,
4439    const uint32_t global_workgroup_size[static const PVR_WORKGROUP_DIMENSIONS])
4440 {
4441    const struct pvr_physical_device *pdevice = cmd_buffer->device->pdevice;
4442    const struct pvr_device_runtime_info *dev_runtime_info =
4443       &pdevice->dev_runtime_info;
4444    struct pvr_csb *csb = &sub_cmd->control_stream;
4445 
4446    struct pvr_compute_kernel_info info = {
4447       .indirect_buffer_addr = PVR_DEV_ADDR_INVALID,
4448       .usc_target = ROGUE_CDMCTRL_USC_TARGET_ANY,
4449       .pds_temp_size =
4450          DIV_ROUND_UP(pipeline->pds_temps_used << 2U,
4451                       ROGUE_CDMCTRL_KERNEL0_PDS_TEMP_SIZE_UNIT_SIZE),
4452 
4453       .pds_data_size =
4454          DIV_ROUND_UP(PVR_DW_TO_BYTES(pipeline->pds_data_size_dw),
4455                       ROGUE_CDMCTRL_KERNEL0_PDS_DATA_SIZE_UNIT_SIZE),
4456       .pds_data_offset = pipeline->pds_data_offset,
4457       .pds_code_offset = pipeline->pds_code_offset,
4458 
4459       .sd_type = ROGUE_CDMCTRL_SD_TYPE_NONE,
4460 
4461       .usc_unified_size =
4462          DIV_ROUND_UP(pipeline->unified_store_regs_count << 2U,
4463                       ROGUE_CDMCTRL_KERNEL0_USC_UNIFIED_SIZE_UNIT_SIZE),
4464 
4465       /* clang-format off */
4466       .global_size = {
4467          global_workgroup_size[0],
4468          global_workgroup_size[1],
4469          global_workgroup_size[2]
4470       },
4471       /* clang-format on */
4472    };
4473 
4474    uint32_t work_size = pipeline->workgroup_size.width *
4475                         pipeline->workgroup_size.height *
4476                         pipeline->workgroup_size.depth;
4477    uint32_t coeff_regs;
4478 
4479    if (work_size > ROGUE_MAX_INSTANCES_PER_TASK) {
4480       /* Enforce a single workgroup per cluster through allocation starvation.
4481        */
4482       coeff_regs = dev_runtime_info->cdm_max_local_mem_size_regs;
4483    } else {
4484       coeff_regs = pipeline->coeff_regs_count;
4485    }
4486 
4487    info.usc_common_size =
4488       DIV_ROUND_UP(PVR_DW_TO_BYTES(coeff_regs),
4489                    ROGUE_CDMCTRL_KERNEL0_USC_COMMON_SIZE_UNIT_SIZE);
4490 
4491    /* Use a whole slot per workgroup. */
4492    work_size = MAX2(work_size, ROGUE_MAX_INSTANCES_PER_TASK);
4493 
4494    coeff_regs += pipeline->const_shared_regs_count;
4495 
4496    if (pipeline->const_shared_regs_count > 0)
4497       info.sd_type = ROGUE_CDMCTRL_SD_TYPE_USC;
4498 
4499    work_size =
4500       pvr_compute_flat_pad_workgroup_size(pdevice, work_size, coeff_regs);
4501 
4502    info.local_size[0] = work_size;
4503    info.local_size[1] = 1U;
4504    info.local_size[2] = 1U;
4505 
4506    info.max_instances =
4507       pvr_compute_flat_slot_size(pdevice, coeff_regs, false, work_size);
4508 
4509    pvr_compute_generate_control_stream(csb, sub_cmd, &info);
4510 }
4511 
4512 /* TODO: Wire up the base_workgroup variant program when implementing
4513  * VK_KHR_device_group. The values will also need patching into the program.
4514  */
4515 static void pvr_compute_update_kernel(
4516    struct pvr_cmd_buffer *cmd_buffer,
4517    struct pvr_sub_cmd_compute *const sub_cmd,
4518    pvr_dev_addr_t indirect_addr,
4519    const uint32_t global_workgroup_size[static const PVR_WORKGROUP_DIMENSIONS])
4520 {
4521    const struct pvr_physical_device *pdevice = cmd_buffer->device->pdevice;
4522    const struct pvr_device_runtime_info *dev_runtime_info =
4523       &pdevice->dev_runtime_info;
4524    struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
4525    struct pvr_csb *csb = &sub_cmd->control_stream;
4526    const struct pvr_compute_pipeline *pipeline = state->compute_pipeline;
4527    const struct pvr_compute_shader_state *shader_state =
4528       &pipeline->shader_state;
4529    const struct pvr_pds_info *program_info = &pipeline->primary_program_info;
4530 
4531    struct pvr_compute_kernel_info info = {
4532       .indirect_buffer_addr = indirect_addr,
4533       .usc_target = ROGUE_CDMCTRL_USC_TARGET_ANY,
4534       .pds_temp_size =
4535          DIV_ROUND_UP(program_info->temps_required << 2U,
4536                       ROGUE_CDMCTRL_KERNEL0_PDS_TEMP_SIZE_UNIT_SIZE),
4537 
4538       .pds_data_size =
4539          DIV_ROUND_UP(PVR_DW_TO_BYTES(program_info->data_size_in_dwords),
4540                       ROGUE_CDMCTRL_KERNEL0_PDS_DATA_SIZE_UNIT_SIZE),
4541       .pds_data_offset = pipeline->primary_program.data_offset,
4542       .pds_code_offset = pipeline->primary_program.code_offset,
4543 
4544       .sd_type = ROGUE_CDMCTRL_SD_TYPE_NONE,
4545 
4546       .usc_unified_size =
4547          DIV_ROUND_UP(shader_state->input_register_count << 2U,
4548                       ROGUE_CDMCTRL_KERNEL0_USC_UNIFIED_SIZE_UNIT_SIZE),
4549 
4550       /* clang-format off */
4551       .global_size = {
4552          global_workgroup_size[0],
4553          global_workgroup_size[1],
4554          global_workgroup_size[2]
4555       },
4556       /* clang-format on */
4557    };
4558 
4559    uint32_t work_size = shader_state->work_size;
4560    uint32_t coeff_regs;
4561 
4562    if (work_size > ROGUE_MAX_INSTANCES_PER_TASK) {
4563       /* Enforce a single workgroup per cluster through allocation starvation.
4564        */
4565       coeff_regs = dev_runtime_info->cdm_max_local_mem_size_regs;
4566    } else {
4567       coeff_regs = shader_state->coefficient_register_count;
4568    }
4569 
4570    info.usc_common_size =
4571       DIV_ROUND_UP(PVR_DW_TO_BYTES(coeff_regs),
4572                    ROGUE_CDMCTRL_KERNEL0_USC_COMMON_SIZE_UNIT_SIZE);
4573 
4574    /* Use a whole slot per workgroup. */
4575    work_size = MAX2(work_size, ROGUE_MAX_INSTANCES_PER_TASK);
4576 
4577    coeff_regs += shader_state->const_shared_reg_count;
4578 
4579    if (shader_state->const_shared_reg_count > 0)
4580       info.sd_type = ROGUE_CDMCTRL_SD_TYPE_USC;
4581 
4582    work_size =
4583       pvr_compute_flat_pad_workgroup_size(pdevice, work_size, coeff_regs);
4584 
4585    info.local_size[0] = work_size;
4586    info.local_size[1] = 1U;
4587    info.local_size[2] = 1U;
4588 
4589    info.max_instances =
4590       pvr_compute_flat_slot_size(pdevice, coeff_regs, false, work_size);
4591 
4592    pvr_compute_generate_control_stream(csb, sub_cmd, &info);
4593 }
4594 
4595 static VkResult pvr_cmd_upload_push_consts(struct pvr_cmd_buffer *cmd_buffer)
4596 {
4597    struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
4598    struct pvr_suballoc_bo *suballoc_bo;
4599    VkResult result;
4600 
4601    /* TODO: Here are some possible optimizations/things to consider:
4602     *
4603     *    - Currently we upload maxPushConstantsSize. The application might only
4604     *      be using a portion of that so we might end up with unused memory.
4605     *      Should we be smarter about this? If we intend to upload the push
4606     *      consts into shareds, we definitely want to avoid reserving unused
4607     *      regs.
4608     *
4609     *    - For now we have to upload to a new buffer each time since the shaders
4610     *      access the push constants from memory. If we were to reuse the same
4611     *      buffer we might update the contents out of sync with job submission,
4612     *      and the shaders would see the updated contents while the command
4613     *      buffer was still being recorded and not yet submitted.
4614     *      If we were to upload the push constants directly to shared regs we
4615     *      could reuse the same buffer (avoiding extra allocation overhead)
4616     *      since the contents will be DMAed only on job submission, when the
4617     *      control stream is processed and the PDS program is executed. This
4618     *      approach would also allow us to avoid regenerating the PDS data
4619     *      section in some cases since the buffer address would be constant.
4620     */
4621 
4622    if (cmd_buffer->state.push_constants.uploaded)
4623       return VK_SUCCESS;
4624 
4625    result = pvr_cmd_buffer_upload_general(cmd_buffer,
4626                                           state->push_constants.data,
4627                                           sizeof(state->push_constants.data),
4628                                           &suballoc_bo);
4629    if (result != VK_SUCCESS)
4630       return result;
4631 
4632    cmd_buffer->state.push_constants.dev_addr = suballoc_bo->dev_addr;
4633    cmd_buffer->state.push_constants.uploaded = true;
4634 
4635    return VK_SUCCESS;
4636 }
4637 
4638 static void pvr_cmd_dispatch(
4639    struct pvr_cmd_buffer *const cmd_buffer,
4640    const pvr_dev_addr_t indirect_addr,
4641    const uint32_t workgroup_size[static const PVR_WORKGROUP_DIMENSIONS])
4642 {
4643    struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
4644    const struct pvr_compute_pipeline *compute_pipeline =
4645       state->compute_pipeline;
4646    struct pvr_sub_cmd_compute *sub_cmd;
4647    VkResult result;
4648 
4649    pvr_cmd_buffer_start_sub_cmd(cmd_buffer, PVR_SUB_CMD_TYPE_COMPUTE);
4650 
4651    sub_cmd = &state->current_sub_cmd->compute;
4652    sub_cmd->uses_atomic_ops |= compute_pipeline->shader_state.uses_atomic_ops;
4653    sub_cmd->uses_barrier |= compute_pipeline->shader_state.uses_barrier;
4654 
4655    if (state->push_constants.dirty_stages & VK_SHADER_STAGE_COMPUTE_BIT) {
4656       result = pvr_cmd_upload_push_consts(cmd_buffer);
4657       if (result != VK_SUCCESS)
4658          return;
4659 
4660       /* Regenerate the PDS program to use the new push consts buffer. */
4661       state->dirty.compute_desc_dirty = true;
4662 
4663       state->push_constants.dirty_stages &= ~VK_SHADER_STAGE_COMPUTE_BIT;
4664    }
4665 
4666    if (compute_pipeline->shader_state.uses_num_workgroups) {
4667       pvr_dev_addr_t descriptor_data_offset_out;
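      /* Despite its name, this holds the device address of the buffer
       * containing the workgroup counts: the indirect dispatch buffer when one
       * is provided, otherwise a freshly uploaded copy of workgroup_size.
       */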
4668 
4669       if (indirect_addr.addr) {
4670          descriptor_data_offset_out = indirect_addr;
4671       } else {
4672          struct pvr_suballoc_bo *num_workgroups_bo;
4673 
4674          result = pvr_cmd_buffer_upload_general(cmd_buffer,
4675                                                 workgroup_size,
4676                                                 sizeof(*workgroup_size) *
4677                                                    PVR_WORKGROUP_DIMENSIONS,
4678                                                 &num_workgroups_bo);
4679          if (result != VK_SUCCESS)
4680             return;
4681 
4682          descriptor_data_offset_out = num_workgroups_bo->dev_addr;
4683       }
4684 
4685       result = pvr_setup_descriptor_mappings(
4686          cmd_buffer,
4687          PVR_STAGE_ALLOCATION_COMPUTE,
4688          &compute_pipeline->descriptor_state,
4689          &descriptor_data_offset_out,
4690          &state->pds_compute_descriptor_data_offset);
4691       if (result != VK_SUCCESS)
4692          return;
4693    } else if ((compute_pipeline->base.layout
4694                   ->per_stage_descriptor_masks[PVR_STAGE_ALLOCATION_COMPUTE] &&
4695                state->dirty.compute_desc_dirty) ||
4696               state->dirty.compute_pipeline_binding) {
4697       result = pvr_setup_descriptor_mappings(
4698          cmd_buffer,
4699          PVR_STAGE_ALLOCATION_COMPUTE,
4700          &compute_pipeline->descriptor_state,
4701          NULL,
4702          &state->pds_compute_descriptor_data_offset);
4703       if (result != VK_SUCCESS)
4704          return;
4705    }
4706 
4707    pvr_compute_update_shared(cmd_buffer, sub_cmd);
4708    pvr_compute_update_kernel(cmd_buffer, sub_cmd, indirect_addr, workgroup_size);
4709 }
4710 
4711 void pvr_CmdDispatch(VkCommandBuffer commandBuffer,
4712                      uint32_t groupCountX,
4713                      uint32_t groupCountY,
4714                      uint32_t groupCountZ)
4715 {
4716    PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
4717 
4718    PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer);
4719 
4720    if (!groupCountX || !groupCountY || !groupCountZ)
4721       return;
4722 
4723    pvr_cmd_dispatch(cmd_buffer,
4724                     PVR_DEV_ADDR_INVALID,
4725                     (uint32_t[]){ groupCountX, groupCountY, groupCountZ });
4726 }
4727 
4728 void pvr_CmdDispatchIndirect(VkCommandBuffer commandBuffer,
4729                              VkBuffer _buffer,
4730                              VkDeviceSize offset)
4731 {
4732    PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
4733    PVR_FROM_HANDLE(pvr_buffer, buffer, _buffer);
4734 
4735    PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer);
4736 
4737    pvr_cmd_dispatch(cmd_buffer,
4738                     PVR_DEV_ADDR_OFFSET(buffer->dev_addr, offset),
4739                     (uint32_t[]){ 1, 1, 1 });
4740 }
4741 
4742 static void
4743 pvr_update_draw_state(struct pvr_cmd_buffer_state *const state,
4744                       const struct pvr_cmd_buffer_draw_state *const draw_state)
4745 {
4746    /* We don't have a piece of state telling us whether base_instance is being
4747     * used, so it effectively acts as a boolean: 0 means we'll use a PDS
4748     * program that skips the base instance addition. If base_instance becomes
4749     * non-zero (and the last draw's base_instance was 0) then we switch to the
4750     * BASE_INSTANCE attrib program.
4751     *
4752     * If base_instance changes then we only need to update the data section.
4753     *
4754     * The only draw call state that doesn't really matter is the start vertex,
4755     * as that is handled properly in the VDM state in all cases.
4756     */
4757    if ((state->draw_state.draw_indexed != draw_state->draw_indexed) ||
4758        (state->draw_state.draw_indirect != draw_state->draw_indirect) ||
4759        (state->draw_state.base_instance == 0 &&
4760         draw_state->base_instance != 0)) {
4761       state->dirty.draw_variant = true;
4762    } else if (state->draw_state.base_instance != draw_state->base_instance) {
4763       state->dirty.draw_base_instance = true;
4764    }
4765 
4766    state->draw_state = *draw_state;
4767 }
4768 
4769 static uint32_t pvr_calc_shared_regs_count(
4770    const struct pvr_graphics_pipeline *const gfx_pipeline)
4771 {
4772    uint32_t shared_regs = gfx_pipeline->vs_data.common.shareds;
4773 
4774    if (gfx_pipeline->shader_state.fragment.bo) {
4775       uint32_t fragment_regs = gfx_pipeline->fs_data.common.shareds;
4776       shared_regs = MAX2(shared_regs, fragment_regs);
4777    }
4778 
4779    return shared_regs;
4780 }
4781 
4782 static void
4783 pvr_emit_dirty_pds_state(const struct pvr_cmd_buffer *const cmd_buffer,
4784                          struct pvr_sub_cmd_gfx *const sub_cmd,
4785                          const uint32_t pds_vertex_descriptor_data_offset)
4786 {
4787    const struct pvr_cmd_buffer_state *const state = &cmd_buffer->state;
4788    const struct pvr_stage_allocation_descriptor_state
4789       *const vertex_descriptor_state =
4790          &state->gfx_pipeline->shader_state.vertex.descriptor_state;
4791    const pco_data *const vs_data = &state->gfx_pipeline->vs_data;
4792    struct pvr_csb *const csb = &sub_cmd->control_stream;
4793 
4794    if (!vertex_descriptor_state->pds_info.code_size_in_dwords)
4795       return;
4796 
4797    pvr_csb_set_relocation_mark(csb);
4798 
4799    pvr_csb_emit (csb, VDMCTRL_PDS_STATE0, state0) {
4800       state0.usc_target = ROGUE_VDMCTRL_USC_TARGET_ALL;
4801 
4802       state0.usc_common_size =
4803          DIV_ROUND_UP(vs_data->common.shareds,
4804                       ROGUE_VDMCTRL_PDS_STATE0_USC_COMMON_SIZE_UNIT_SIZE);
4805 
4806       state0.pds_data_size = DIV_ROUND_UP(
4807          PVR_DW_TO_BYTES(vertex_descriptor_state->pds_info.data_size_in_dwords),
4808          ROGUE_VDMCTRL_PDS_STATE0_PDS_DATA_SIZE_UNIT_SIZE);
4809    }
4810 
4811    pvr_csb_emit (csb, VDMCTRL_PDS_STATE1, state1) {
4812       state1.pds_data_addr = PVR_DEV_ADDR(pds_vertex_descriptor_data_offset);
4813       state1.sd_type = ROGUE_VDMCTRL_SD_TYPE_NONE;
4814    }
4815 
4816    pvr_csb_emit (csb, VDMCTRL_PDS_STATE2, state2) {
4817       state2.pds_code_addr =
4818          PVR_DEV_ADDR(vertex_descriptor_state->pds_code.code_offset);
4819    }
4820 
4821    pvr_csb_clear_relocation_mark(csb);
4822 }
4823 
4824 static void pvr_setup_output_select(struct pvr_cmd_buffer *const cmd_buffer)
4825 {
4826    const struct pvr_graphics_pipeline *const gfx_pipeline =
4827       cmd_buffer->state.gfx_pipeline;
4828    struct vk_dynamic_graphics_state *const dynamic_state =
4829       &cmd_buffer->vk.dynamic_graphics_state;
4830    struct ROGUE_TA_STATE_HEADER *const header = &cmd_buffer->state.emit_header;
4831    struct pvr_ppp_state *const ppp_state = &cmd_buffer->state.ppp_state;
4832    const pco_data *const vs_data = &gfx_pipeline->vs_data;
4833    const pco_data *const fs_data = &gfx_pipeline->fs_data;
4834    uint32_t output_selects;
4835    uint32_t varying[2];
4836 
4837    const pco_range *varyings = vs_data->vs.varyings;
4838 
4839    const bool has_point_size = dynamic_state->ia.primitive_topology ==
4840                                   VK_PRIMITIVE_TOPOLOGY_POINT_LIST &&
4841                                varyings[VARYING_SLOT_PSIZ].count > 0;
4842 
4843    const bool has_viewport = varyings[VARYING_SLOT_VIEWPORT].count > 0;
4844 
4845    const bool has_layer = varyings[VARYING_SLOT_LAYER].count > 0;
4846 
4847    pvr_csb_pack (&output_selects, TA_OUTPUT_SEL, state) {
4848       state.rhw_pres = fs_data->fs.uses.w;
4849       state.tsp_unclamped_z_pres = fs_data->fs.uses.z;
4850 
4851       state.vtxsize = vs_data->vs.vtxouts;
4852       state.psprite_size_pres = has_point_size;
4853       state.vpt_tgt_pres = has_viewport;
4854       state.render_tgt_pres = has_layer;
4855    }
4856 
4857    if (ppp_state->output_selects != output_selects) {
4858       ppp_state->output_selects = output_selects;
4859       header->pres_outselects = true;
4860    }
4861 
4862    pvr_csb_pack (&varying[0], TA_STATE_VARYING0, varying0) {
4863       varying0.f32_linear = vs_data->vs.f32_smooth;
4864       varying0.f32_flat = vs_data->vs.f32_flat;
4865       varying0.f32_npc = vs_data->vs.f32_npc;
4866    }
4867 
4868    if (ppp_state->varying_word[0] != varying[0]) {
4869       ppp_state->varying_word[0] = varying[0];
4870       header->pres_varying_word0 = true;
4871    }
4872 
4873    pvr_csb_pack (&varying[1], TA_STATE_VARYING1, varying1) {
4874       varying1.f16_linear = vs_data->vs.f16_smooth;
4875       varying1.f16_flat = vs_data->vs.f16_flat;
4876       varying1.f16_npc = vs_data->vs.f16_npc;
4877    }
4878 
4879    if (ppp_state->varying_word[1] != varying[1]) {
4880       ppp_state->varying_word[1] = varying[1];
4881       header->pres_varying_word1 = true;
4882    }
4883 }
4884 
4885 static void
4886 pvr_setup_isp_faces_and_control(struct pvr_cmd_buffer *const cmd_buffer,
4887                                 struct ROGUE_TA_STATE_ISPA *const ispa_out)
4888 {
4889    struct ROGUE_TA_STATE_HEADER *const header = &cmd_buffer->state.emit_header;
4890    const struct pvr_fragment_shader_state *const fragment_shader_state =
4891       &cmd_buffer->state.gfx_pipeline->shader_state.fragment;
4892    const struct pvr_render_pass_info *const pass_info =
4893       &cmd_buffer->state.render_pass_info;
4894    struct vk_dynamic_graphics_state *dynamic_state =
4895       &cmd_buffer->vk.dynamic_graphics_state;
4896    struct pvr_ppp_state *const ppp_state = &cmd_buffer->state.ppp_state;
4897 
4898    const bool rasterizer_discard = dynamic_state->rs.rasterizer_discard_enable;
4899    const uint32_t subpass_idx = pass_info->subpass_idx;
4900    const uint32_t depth_stencil_attachment_idx =
4901       pass_info->pass->subpasses[subpass_idx].depth_stencil_attachment;
4902    const struct pvr_render_pass_attachment *const attachment =
4903       depth_stencil_attachment_idx != VK_ATTACHMENT_UNUSED
4904          ? &pass_info->pass->attachments[depth_stencil_attachment_idx]
4905          : NULL;
4906 
4907    const enum ROGUE_TA_OBJTYPE obj_type =
4908       pvr_ta_objtype(dynamic_state->ia.primitive_topology);
4909 
4910    const VkImageAspectFlags ds_aspects =
4911       (!rasterizer_discard && attachment)
4912          ? vk_format_aspects(attachment->vk_format) &
4913               (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT)
4914          : VK_IMAGE_ASPECT_NONE;
4915 
4916    /* This is deliberately a full copy rather than a pointer because
4917     * vk_optimize_depth_stencil_state() can only be run once against any given
4918     * instance of vk_depth_stencil_state.
4919     */
4920    struct vk_depth_stencil_state ds_state = dynamic_state->ds;
4921 
4922    uint32_t ispb_stencil_off;
4923    bool is_two_sided = false;
4924    uint32_t isp_control;
4925 
4926    uint32_t line_width;
4927    uint32_t common_a;
4928    uint32_t front_a;
4929    uint32_t front_b;
4930    uint32_t back_a;
4931    uint32_t back_b;
4932 
4933    vk_optimize_depth_stencil_state(&ds_state, ds_aspects, true);
4934 
4935    /* Convert to 4.4 fixed point format. */
4936    line_width = util_unsigned_fixed(dynamic_state->rs.line.width, 4);
4937 
4938    /* Subtract 1 to shift values from range [0=0,256=16] to [0=1/16,255=16].
4939     * If 0 it stays at 0, otherwise we subtract 1.
4940     */
4941    line_width = (!!line_width) * (line_width - 1);
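   /* E.g. a line width of 1.0 becomes 16 in 4.4 fixed point and the shift
    * above turns it into 15, which encodes 1.0 in the [1/16, 16] range.
    */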
4942 
4943    line_width = MIN2(line_width, ROGUE_TA_STATE_ISPA_POINTLINEWIDTH_SIZE_MAX);
4944 
4945    /* TODO: Part of the logic in this function is duplicated in another part
4946     * of the code. E.g. the dcmpmode, and sop1/2/3. Could we do this earlier?
4947     */
4948 
4949    pvr_csb_pack (&common_a, TA_STATE_ISPA, ispa) {
4950       ispa.pointlinewidth = line_width;
4951 
4952       ispa.dcmpmode = pvr_ta_cmpmode(ds_state.depth.compare_op);
4953       ispa.dwritedisable = !ds_state.depth.write_enable;
4954 
4955       ispa.passtype = fragment_shader_state->pass_type;
4956 
4957       ispa.objtype = obj_type;
4958 
4959       /* Return unpacked ispa structure. dcmpmode, dwritedisable, passtype and
4960        * objtype are needed by pvr_setup_triangle_merging_flag.
4961        */
4962       if (ispa_out)
4963          *ispa_out = ispa;
4964    }
4965 
4966    /* TODO: Does this actually represent the ispb control word on stencil off?
4967     * If not, rename the variable.
4968     */
4969    pvr_csb_pack (&ispb_stencil_off, TA_STATE_ISPB, ispb) {
4970       ispb.sop3 = ROGUE_TA_ISPB_STENCILOP_KEEP;
4971       ispb.sop2 = ROGUE_TA_ISPB_STENCILOP_KEEP;
4972       ispb.sop1 = ROGUE_TA_ISPB_STENCILOP_KEEP;
4973       ispb.scmpmode = ROGUE_TA_CMPMODE_ALWAYS;
4974    }
4975 
4976    /* FIXME: This logic should be redone and improved. Can we also get rid of
4977     * the front and back variants?
4978     */
4979 
4980    front_a = common_a;
4981    back_a = common_a;
4982 
4983    if (ds_state.stencil.test_enable) {
4984       uint32_t front_a_sref;
4985       uint32_t back_a_sref;
4986 
4987       pvr_csb_pack (&front_a_sref, TA_STATE_ISPA, ispa) {
4988          ispa.sref = ds_state.stencil.front.reference;
4989       }
4990       front_a |= front_a_sref;
4991 
4992       pvr_csb_pack (&back_a_sref, TA_STATE_ISPA, ispa) {
4993          ispa.sref = ds_state.stencil.back.reference;
4994       }
4995       back_a |= back_a_sref;
4996 
4997       pvr_csb_pack (&front_b, TA_STATE_ISPB, ispb) {
4998          const struct vk_stencil_test_face_state *const front =
4999             &ds_state.stencil.front;
5000 
5001          if (ds_state.stencil.write_enable)
5002             ispb.swmask = front->write_mask;
5003 
5004          ispb.scmpmask = front->compare_mask;
5005 
5006          ispb.sop3 = pvr_ta_stencilop(front->op.pass);
5007          ispb.sop2 = pvr_ta_stencilop(front->op.depth_fail);
5008          ispb.sop1 = pvr_ta_stencilop(front->op.fail);
5009          ispb.scmpmode = pvr_ta_cmpmode(front->op.compare);
5010       }
5011 
5012       pvr_csb_pack (&back_b, TA_STATE_ISPB, ispb) {
5013          const struct vk_stencil_test_face_state *const back =
5014             &ds_state.stencil.back;
5015 
5016          if (ds_state.stencil.write_enable)
5017             ispb.swmask = back->write_mask;
5018 
5019          ispb.scmpmask = back->compare_mask;
5020 
5021          ispb.sop3 = pvr_ta_stencilop(back->op.pass);
5022          ispb.sop2 = pvr_ta_stencilop(back->op.depth_fail);
5023          ispb.sop1 = pvr_ta_stencilop(back->op.fail);
5024          ispb.scmpmode = pvr_ta_cmpmode(back->op.compare);
5025       }
5026    } else {
5027       front_b = ispb_stencil_off;
5028       back_b = ispb_stencil_off;
5029    }
5030 
5031    if (front_a != back_a || front_b != back_b) {
5032       if (dynamic_state->rs.cull_mode & VK_CULL_MODE_BACK_BIT) {
5033          /* Single face, using front state. */
5034       } else if (dynamic_state->rs.cull_mode & VK_CULL_MODE_FRONT_BIT) {
5035          /* Single face, using back state. */
5036 
5037          front_a = back_a;
5038          front_b = back_b;
5039       } else {
5040          /* Both faces. */
5041 
5042          header->pres_ispctl_ba = is_two_sided = true;
5043 
5044          if (dynamic_state->rs.front_face == VK_FRONT_FACE_COUNTER_CLOCKWISE) {
5045             uint32_t tmp = front_a;
5046 
5047             front_a = back_a;
5048             back_a = tmp;
5049 
5050             tmp = front_b;
5051             front_b = back_b;
5052             back_b = tmp;
5053          }
5054 
5055          /* HW defaults to stencil off. */
5056          if (back_b != ispb_stencil_off) {
5057             header->pres_ispctl_fb = true;
5058             header->pres_ispctl_bb = true;
5059          }
5060       }
5061    }
5062 
5063    if (ds_state.stencil.test_enable && front_b != ispb_stencil_off)
5064       header->pres_ispctl_fb = true;
5065 
5066    pvr_csb_pack (&isp_control, TA_STATE_ISPCTL, ispctl) {
5067       ispctl.upass = pass_info->isp_userpass;
5068 
5069       /* TODO: is bo ever NULL? Figure out what to do. */
5070       ispctl.tagwritedisable = rasterizer_discard || !fragment_shader_state->bo;
5071 
5072       ispctl.two_sided = is_two_sided;
5073       ispctl.bpres = header->pres_ispctl_fb || header->pres_ispctl_bb;
5074 
5075       ispctl.dbenable = !rasterizer_discard &&
5076                         dynamic_state->rs.depth_bias.enable &&
5077                         obj_type == ROGUE_TA_OBJTYPE_TRIANGLE;
5078       if (!rasterizer_discard && cmd_buffer->state.vis_test_enabled) {
5079          ispctl.vistest = true;
5080          ispctl.visreg = cmd_buffer->state.vis_reg;
5081       }
5082 
5083       ispctl.scenable = !rasterizer_discard;
5084 
5085       ppp_state->isp.control_struct = ispctl;
5086    }
5087 
5088    header->pres_ispctl = true;
5089 
5090    ppp_state->isp.control = isp_control;
5091    ppp_state->isp.front_a = front_a;
5092    ppp_state->isp.front_b = front_b;
5093    ppp_state->isp.back_a = back_a;
5094    ppp_state->isp.back_b = back_b;
5095 }
5096 
5097 static float
5098 pvr_calculate_final_depth_bias_contant_factor(struct pvr_device_info *dev_info,
5099                                               VkFormat format,
5100                                               float depth_bias)
5101 {
5102    /* Information for future modifiers of these depth bias calculations.
5103     * ==================================================================
5104     * Specified depth bias equations scale the specified constant factor by a
5105     * value 'r' that is guaranteed to cause a resolvable difference in depth
5106     * across the entire range of depth values.
5107     * For floating point depth formats 'r' is calculated by taking the maximum
5108     * exponent across the triangle.
5109     * For UNORM formats 'r' is constant.
5110     * Here 'n' is the number of mantissa bits stored in the floating point
5111     * representation (23 for F32).
5112     *
5113     *    UNORM Format -> z += dbcf * r + slope
5114     *    FLOAT Format -> z += dbcf * 2^(e-n) + slope
5115     *
5116     * HW Variations.
5117     * ==============
5118     * The HW either always performs the F32 depth bias equation (exponent based
5119     * r), or in the case of HW that correctly supports the integer depth bias
5120     * equation for UNORM depth formats, we can select between both equations
5121     * using the ROGUE_CR_ISP_CTL.dbias_is_int flag - this is required to
5122     * correctly perform Vulkan UNORM depth bias (constant r).
5123     *
5124     *    if ern42307:
5125     *       if DBIAS_IS_INT_EN:
5126     *          z += dbcf + slope
5127     *       else:
5128     *          z += dbcf * 2^(e-n) + slope
5129     *    else:
5130     *       z += dbcf * 2^(e-n) + slope
5131     *
5132     */
5133 
5134    float nudge_factor;
5135 
5136    if (PVR_HAS_ERN(dev_info, 42307)) {
5137       switch (format) {
5138       case VK_FORMAT_D16_UNORM:
5139          return depth_bias / (1 << 15);
5140 
5141       case VK_FORMAT_D24_UNORM_S8_UINT:
5142       case VK_FORMAT_X8_D24_UNORM_PACK32:
5143          return depth_bias / (1 << 23);
5144 
5145       default:
5146          return depth_bias;
5147       }
5148    }
5149 
5150    /* The reason for clamping/nudging the value here is that UNORM depth
5151     * formats can have higher precision than our underlying D32F
5152     * representation for some depth ranges.
5153     *
5154     * When the HW scales the depth bias value by 2^(e-n) [The 'r' term'] a depth
5155     * bias of 1 can result in a value smaller than one F32 ULP, which will get
5156     * quantized to 0 - resulting in no bias.
5157     *
5158     * Biasing small values away from zero will ensure that small depth biases of
5159     * 1 still yield a result and overcome Z-fighting.
5160     */
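   /* Worked example: for VK_FORMAT_D16_UNORM a constant factor of 0.001 is
    * scaled to 0.512 below, which lies inside (0, 1.0) and is therefore nudged
    * up to 1.512, so the small bias isn't lost to quantization.
    */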
5161    switch (format) {
5162    case VK_FORMAT_D16_UNORM:
5163       depth_bias *= 512.0f;
5164       nudge_factor = 1.0f;
5165       break;
5166 
5167    case VK_FORMAT_D24_UNORM_S8_UINT:
5168    case VK_FORMAT_X8_D24_UNORM_PACK32:
5169       depth_bias *= 2.0f;
5170       nudge_factor = 2.0f;
5171       break;
5172 
5173    default:
5174       nudge_factor = 0.0f;
5175       break;
5176    }
5177 
5178    if (nudge_factor != 0.0f) {
5179       if (depth_bias < 0.0f && depth_bias > -nudge_factor)
5180          depth_bias -= nudge_factor;
5181       else if (depth_bias > 0.0f && depth_bias < nudge_factor)
5182          depth_bias += nudge_factor;
5183    }
5184 
5185    return depth_bias;
5186 }
5187 
5188 static void pvr_get_viewport_scissor_overlap(const VkViewport *const viewport,
5189                                              const VkRect2D *const scissor,
5190                                              VkRect2D *const rect_out)
5191 {
5192    /* TODO: See if we can remove this struct. */
5193    struct pvr_rect {
5194       int32_t x0, y0;
5195       int32_t x1, y1;
5196    };
5197 
5198    /* TODO: Worry about overflow? */
5199    const struct pvr_rect scissor_rect = {
5200       .x0 = scissor->offset.x,
5201       .y0 = scissor->offset.y,
5202       .x1 = scissor->offset.x + scissor->extent.width,
5203       .y1 = scissor->offset.y + scissor->extent.height
5204    };
5205    struct pvr_rect viewport_rect = { 0 };
5206 
5207    assert(viewport->width >= 0.0f);
5208    assert(scissor_rect.x0 >= 0);
5209    assert(scissor_rect.y0 >= 0);
5210 
5211    if (scissor->extent.width == 0 || scissor->extent.height == 0) {
5212       *rect_out = (VkRect2D){ 0 };
5213       return;
5214    }
5215 
5216    viewport_rect.x0 = (int32_t)viewport->x;
5217    viewport_rect.x1 = (int32_t)viewport->x + (int32_t)viewport->width;
5218 
5219    /* TODO: Is there a mathematical way of doing all this and then clamp at
5220     * the end?
5221     */
5222    /* We flip the y0 and y1 when height is negative. */
5223    viewport_rect.y0 = (int32_t)viewport->y + MIN2(0, (int32_t)viewport->height);
5224    viewport_rect.y1 = (int32_t)viewport->y + MAX2(0, (int32_t)viewport->height);
5225 
5226    if (scissor_rect.x1 <= viewport_rect.x0 ||
5227        scissor_rect.y1 <= viewport_rect.y0 ||
5228        scissor_rect.x0 >= viewport_rect.x1 ||
5229        scissor_rect.y0 >= viewport_rect.y1) {
5230       *rect_out = (VkRect2D){ 0 };
5231       return;
5232    }
5233 
5234    /* Determine the overlapping rectangle. */
5235    viewport_rect.x0 = MAX2(viewport_rect.x0, scissor_rect.x0);
5236    viewport_rect.y0 = MAX2(viewport_rect.y0, scissor_rect.y0);
5237    viewport_rect.x1 = MIN2(viewport_rect.x1, scissor_rect.x1);
5238    viewport_rect.y1 = MIN2(viewport_rect.y1, scissor_rect.y1);
5239 
5240    /* TODO: Is this conversion safe? Is this logic right? */
5241    rect_out->offset.x = (uint32_t)viewport_rect.x0;
5242    rect_out->offset.y = (uint32_t)viewport_rect.y0;
5243    rect_out->extent.height = (uint32_t)(viewport_rect.y1 - viewport_rect.y0);
5244    rect_out->extent.width = (uint32_t)(viewport_rect.x1 - viewport_rect.x0);
5245 }
5246 
5247 static inline uint32_t
5248 pvr_get_geom_region_clip_align_size(struct pvr_device_info *const dev_info)
5249 {
5250    /* TODO: This should come from rogue_ppp.xml. */
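        /* i.e. 16 pixels when the 16x16 tile size feature is present, 32 pixels
         * otherwise.
         */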
5251    return 16U + 16U * (!PVR_HAS_FEATURE(dev_info, tile_size_16x16));
5252 }
5253 
5254 static void
5255 pvr_setup_isp_depth_bias_scissor_state(struct pvr_cmd_buffer *const cmd_buffer)
5256 {
5257    struct ROGUE_TA_STATE_HEADER *const header = &cmd_buffer->state.emit_header;
5258    struct pvr_ppp_state *const ppp_state = &cmd_buffer->state.ppp_state;
5259    struct vk_dynamic_graphics_state *const dynamic_state =
5260       &cmd_buffer->vk.dynamic_graphics_state;
5261    const struct ROGUE_TA_STATE_ISPCTL *const ispctl =
5262       &ppp_state->isp.control_struct;
5263    struct pvr_device_info *const dev_info =
5264       &cmd_buffer->device->pdevice->dev_info;
5265 
5266    if (ispctl->dbenable &&
5267        (BITSET_TEST(dynamic_state->dirty,
5268                     MESA_VK_DYNAMIC_RS_DEPTH_BIAS_FACTORS) ||
5269         cmd_buffer->depth_bias_array.size == 0)) {
5270       struct pvr_depth_bias_state depth_bias = {
5271          .constant_factor = pvr_calculate_final_depth_bias_contant_factor(
5272             dev_info,
5273             cmd_buffer->state.depth_format,
5274             dynamic_state->rs.depth_bias.constant_factor),
5275          .slope_factor = dynamic_state->rs.depth_bias.slope_factor,
5276          .clamp = dynamic_state->rs.depth_bias.clamp,
5277       };
5278 
5279       ppp_state->depthbias_scissor_indices.depthbias_index =
5280          util_dynarray_num_elements(&cmd_buffer->depth_bias_array,
5281                                     __typeof__(depth_bias));
5282 
5283       util_dynarray_append(&cmd_buffer->depth_bias_array,
5284                            __typeof__(depth_bias),
5285                            depth_bias);
5286 
5287       header->pres_ispctl_dbsc = true;
5288    }
5289 
5290    if (ispctl->scenable) {
5291       const uint32_t region_clip_align_size =
5292          pvr_get_geom_region_clip_align_size(dev_info);
5293       const VkViewport *const viewport = &dynamic_state->vp.viewports[0];
5294       const VkRect2D *const scissor = &dynamic_state->vp.scissors[0];
5295       struct pvr_scissor_words scissor_words;
5296       VkRect2D overlap_rect;
5297       uint32_t height;
5298       uint32_t width;
5299       uint32_t x;
5300       uint32_t y;
5301 
5302       /* For region clip. */
5303       uint32_t bottom;
5304       uint32_t right;
5305       uint32_t left;
5306       uint32_t top;
5307 
5308       /* We don't support multiple viewport calculations. */
5309       assert(dynamic_state->vp.viewport_count == 1);
5310       /* We don't support multiple scissor calculations. */
5311       assert(dynamic_state->vp.scissor_count == 1);
5312 
5313       pvr_get_viewport_scissor_overlap(viewport, scissor, &overlap_rect);
5314 
5315       x = overlap_rect.offset.x;
5316       y = overlap_rect.offset.y;
5317       width = overlap_rect.extent.width;
5318       height = overlap_rect.extent.height;
5319 
5320       pvr_csb_pack (&scissor_words.w0, IPF_SCISSOR_WORD_0, word0) {
5321          word0.scw0_xmax = x + width;
5322          word0.scw0_xmin = x;
5323       }
5324 
5325       pvr_csb_pack (&scissor_words.w1, IPF_SCISSOR_WORD_1, word1) {
5326          word1.scw1_ymax = y + height;
5327          word1.scw1_ymin = y;
5328       }
5329 
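           /* If the scissor words are unchanged since the last upload there is
            * nothing new to emit.
            */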
5330       if (cmd_buffer->scissor_array.size &&
5331           cmd_buffer->scissor_words.w0 == scissor_words.w0 &&
5332           cmd_buffer->scissor_words.w1 == scissor_words.w1) {
5333          return;
5334       }
5335 
5336       cmd_buffer->scissor_words = scissor_words;
5337 
5338       /* Calculate region clip. */
5339 
5340       left = x / region_clip_align_size;
5341       top = y / region_clip_align_size;
5342 
5343       /* Guard against right underflowing to -1 when (x + width) is 0. */
5344       /* TODO: Is there a better way of doing this? */
5345       if ((x + width) != 0U)
5346          right = DIV_ROUND_UP(x + width, region_clip_align_size) - 1;
5347       else
5348          right = 0;
5349 
5350       if ((y + height) != 0U)
5351          bottom = DIV_ROUND_UP(y + height, region_clip_align_size) - 1;
5352       else
5353          bottom = 0U;
5354 
5355       /* Setup region clip to clip everything outside what was calculated. */
5356 
5357       /* FIXME: Should we mask to prevent writing over other words? */
5358       pvr_csb_pack (&ppp_state->region_clipping.word0, TA_REGION_CLIP0, word0) {
5359          word0.right = right;
5360          word0.left = left;
5361          word0.mode = ROGUE_TA_REGION_CLIP_MODE_OUTSIDE;
5362       }
5363 
5364       pvr_csb_pack (&ppp_state->region_clipping.word1, TA_REGION_CLIP1, word1) {
5365          word1.bottom = bottom;
5366          word1.top = top;
5367       }
5368 
5369       ppp_state->depthbias_scissor_indices.scissor_index =
5370          util_dynarray_num_elements(&cmd_buffer->scissor_array,
5371                                     struct pvr_scissor_words);
5372 
5373       util_dynarray_append(&cmd_buffer->scissor_array,
5374                            struct pvr_scissor_words,
5375                            cmd_buffer->scissor_words);
5376 
5377       header->pres_ispctl_dbsc = true;
5378       header->pres_region_clip = true;
5379    }
5380 }
5381 
5382 static void
5383 pvr_setup_triangle_merging_flag(struct pvr_cmd_buffer *const cmd_buffer,
5384                                 struct ROGUE_TA_STATE_ISPA *ispa)
5385 {
5386    struct ROGUE_TA_STATE_HEADER *const header = &cmd_buffer->state.emit_header;
5387    struct pvr_ppp_state *const ppp_state = &cmd_buffer->state.ppp_state;
5388    uint32_t merge_word;
5389    uint32_t mask;
5390 
5391    pvr_csb_pack (&merge_word, TA_STATE_PDS_SIZEINFO2, size_info) {
5392       /* Disable triangle merging for lines, punch-through passes, or when
5393        * depth writes are disabled (DWD) with a depth compare of ALWAYS.
5394        */
5395       if (ispa->objtype == ROGUE_TA_OBJTYPE_LINE ||
5396           ispa->passtype == ROGUE_TA_PASSTYPE_PUNCH_THROUGH ||
5397           (ispa->dwritedisable && ispa->dcmpmode == ROGUE_TA_CMPMODE_ALWAYS)) {
5398          size_info.pds_tri_merge_disable = true;
5399       }
5400    }
5401 
5402    pvr_csb_pack (&mask, TA_STATE_PDS_SIZEINFO2, size_info) {
5403       size_info.pds_tri_merge_disable = true;
5404    }
5405 
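        /* Preserve every other field of size_info2; only the tri-merge-disable
         * bit is recomputed here.
         */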
5406    merge_word |= ppp_state->pds.size_info2 & ~mask;
5407 
5408    if (merge_word != ppp_state->pds.size_info2) {
5409       ppp_state->pds.size_info2 = merge_word;
5410       header->pres_pds_state_ptr0 = true;
5411    }
5412 }
5413 
5414 static void
5415 pvr_setup_fragment_state_pointers(struct pvr_cmd_buffer *const cmd_buffer,
5416                                   struct pvr_sub_cmd_gfx *const sub_cmd)
5417 {
5418    struct pvr_cmd_buffer_state *const state = &cmd_buffer->state;
5419    const pco_data *const fs_data = &state->gfx_pipeline->fs_data;
5420 
5421    const struct pvr_fragment_shader_state *const fragment_shader_state =
5422       &state->gfx_pipeline->shader_state.fragment;
5423    const struct pvr_stage_allocation_descriptor_state *descriptor_shader_state =
5424       &fragment_shader_state->descriptor_state;
5425    const struct pvr_pipeline_stage_state *fragment_state =
5426       &fragment_shader_state->stage_state;
5427    const struct pvr_pds_upload *pds_coeff_program =
5428       &fragment_shader_state->pds_coeff_program;
5429 
5430    const struct pvr_physical_device *pdevice = cmd_buffer->device->pdevice;
5431    struct ROGUE_TA_STATE_HEADER *const header = &state->emit_header;
5432    struct pvr_ppp_state *const ppp_state = &state->ppp_state;
5433 
5434    const uint32_t pds_uniform_size =
5435       DIV_ROUND_UP(descriptor_shader_state->pds_info.data_size_in_dwords,
5436                    ROGUE_TA_STATE_PDS_SIZEINFO1_PDS_UNIFORMSIZE_UNIT_SIZE);
5437 
5438    const uint32_t pds_varying_state_size =
5439       DIV_ROUND_UP(pds_coeff_program->data_size,
5440                    ROGUE_TA_STATE_PDS_SIZEINFO1_PDS_VARYINGSIZE_UNIT_SIZE);
5441 
5442    const uint32_t usc_varying_size =
5443       DIV_ROUND_UP(fs_data->common.coeffs,
5444                    ROGUE_TA_STATE_PDS_SIZEINFO1_USC_VARYINGSIZE_UNIT_SIZE);
5445 
5446    const uint32_t pds_temp_size =
5447       DIV_ROUND_UP(fragment_state->pds_temps_count,
5448                    ROGUE_TA_STATE_PDS_SIZEINFO1_PDS_TEMPSIZE_UNIT_SIZE);
5449 
5450    const uint32_t usc_shared_size =
5451       DIV_ROUND_UP(fs_data->common.shareds,
5452                    ROGUE_TA_STATE_PDS_SIZEINFO2_USC_SHAREDSIZE_UNIT_SIZE);
5453 
5454    const uint32_t max_tiles_in_flight =
5455       pvr_calc_fscommon_size_and_tiles_in_flight(
5456          &pdevice->dev_info,
5457          &pdevice->dev_runtime_info,
5458          usc_shared_size *
5459             ROGUE_TA_STATE_PDS_SIZEINFO2_USC_SHAREDSIZE_UNIT_SIZE,
5460          1);
5461    uint32_t size_info_mask;
5462    uint32_t size_info2;
5463 
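        /* The shared register usage of this fragment shader may further limit
         * the number of tiles in flight; track the minimum across the whole
         * sub command.
         */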
5464    if (max_tiles_in_flight < sub_cmd->max_tiles_in_flight)
5465       sub_cmd->max_tiles_in_flight = max_tiles_in_flight;
5466 
5467    pvr_csb_pack (&ppp_state->pds.pixel_shader_base,
5468                  TA_STATE_PDS_SHADERBASE,
5469                  shader_base) {
5470       const struct pvr_pds_upload *const pds_upload =
5471          &fragment_shader_state->pds_fragment_program;
5472 
5473       shader_base.addr = PVR_DEV_ADDR(pds_upload->data_offset);
5474    }
5475 
5476    if (descriptor_shader_state->pds_code.pvr_bo) {
5477       pvr_csb_pack (&ppp_state->pds.texture_uniform_code_base,
5478                     TA_STATE_PDS_TEXUNICODEBASE,
5479                     tex_base) {
5480          tex_base.addr =
5481             PVR_DEV_ADDR(descriptor_shader_state->pds_code.code_offset);
5482       }
5483    } else {
5484       ppp_state->pds.texture_uniform_code_base = 0U;
5485    }
5486 
5487    pvr_csb_pack (&ppp_state->pds.size_info1, TA_STATE_PDS_SIZEINFO1, info1) {
5488       info1.pds_uniformsize = pds_uniform_size;
5489       info1.pds_texturestatesize = 0U;
5490       info1.pds_varyingsize = pds_varying_state_size;
5491       info1.usc_varyingsize = usc_varying_size;
5492       info1.pds_tempsize = pds_temp_size;
5493    }
5494 
5495    pvr_csb_pack (&size_info_mask, TA_STATE_PDS_SIZEINFO2, mask) {
5496       mask.pds_tri_merge_disable = true;
5497    }
5498 
5499    ppp_state->pds.size_info2 &= size_info_mask;
5500 
5501    pvr_csb_pack (&size_info2, TA_STATE_PDS_SIZEINFO2, info2) {
5502       info2.usc_sharedsize = usc_shared_size;
5503    }
5504 
5505    ppp_state->pds.size_info2 |= size_info2;
5506 
5507    if (pds_coeff_program->pvr_bo) {
5508       header->pres_pds_state_ptr1 = true;
5509 
5510       pvr_csb_pack (&ppp_state->pds.varying_base,
5511                     TA_STATE_PDS_VARYINGBASE,
5512                     base) {
5513          base.addr = PVR_DEV_ADDR(pds_coeff_program->data_offset);
5514       }
5515    } else {
5516       ppp_state->pds.varying_base = 0U;
5517    }
5518 
5519    pvr_csb_pack (&ppp_state->pds.uniform_state_data_base,
5520                  TA_STATE_PDS_UNIFORMDATABASE,
5521                  base) {
5522       base.addr = PVR_DEV_ADDR(state->pds_fragment_descriptor_data_offset);
5523    }
5524 
5525    header->pres_pds_state_ptr0 = true;
5526    header->pres_pds_state_ptr3 = true;
5527 }
5528 
5529 static void pvr_setup_viewport(struct pvr_cmd_buffer *const cmd_buffer)
5530 {
5531    struct pvr_cmd_buffer_state *const state = &cmd_buffer->state;
5532    struct ROGUE_TA_STATE_HEADER *const header = &state->emit_header;
5533    struct vk_dynamic_graphics_state *const dynamic_state =
5534       &cmd_buffer->vk.dynamic_graphics_state;
5535    struct pvr_ppp_state *const ppp_state = &state->ppp_state;
5536 
5537    if (ppp_state->viewport_count != dynamic_state->vp.viewport_count) {
5538       ppp_state->viewport_count = dynamic_state->vp.viewport_count;
5539       header->pres_viewport = true;
5540    }
5541 
5542    if (dynamic_state->rs.rasterizer_discard_enable) {
5543       /* We don't want to emit any viewport data as it'll just get thrown
5544        * away. This check comes after the previous one because we still want
5545        * to stash the viewport_count; it's our trigger for when rasterizer
5546        * discard gets disabled.
5547        */
5548       header->pres_viewport = false;
5549       return;
5550    }
5551 
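        /* The viewport transform is programmed as centre (aN) and scale (mN)
         * pairs stored as raw F32 bits: x/y use half the viewport extent as the
         * scale, z uses the full minDepth..maxDepth range.
         */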
5552    for (uint32_t i = 0; i < ppp_state->viewport_count; i++) {
5553       VkViewport *viewport = &dynamic_state->vp.viewports[i];
5554       uint32_t x_scale = fui(viewport->width * 0.5f);
5555       uint32_t y_scale = fui(viewport->height * 0.5f);
5556       uint32_t z_scale = fui(viewport->maxDepth - viewport->minDepth);
5557       uint32_t x_center = fui(viewport->x + viewport->width * 0.5f);
5558       uint32_t y_center = fui(viewport->y + viewport->height * 0.5f);
5559       uint32_t z_center = fui(viewport->minDepth);
5560 
5561       if (ppp_state->viewports[i].a0 != x_center ||
5562           ppp_state->viewports[i].m0 != x_scale ||
5563           ppp_state->viewports[i].a1 != y_center ||
5564           ppp_state->viewports[i].m1 != y_scale ||
5565           ppp_state->viewports[i].a2 != z_center ||
5566           ppp_state->viewports[i].m2 != z_scale) {
5567          ppp_state->viewports[i].a0 = x_center;
5568          ppp_state->viewports[i].m0 = x_scale;
5569          ppp_state->viewports[i].a1 = y_center;
5570          ppp_state->viewports[i].m1 = y_scale;
5571          ppp_state->viewports[i].a2 = z_center;
5572          ppp_state->viewports[i].m2 = z_scale;
5573 
5574          header->pres_viewport = true;
5575       }
5576    }
5577 }
5578 
5579 static void pvr_setup_ppp_control(struct pvr_cmd_buffer *const cmd_buffer)
5580 {
5581    struct vk_dynamic_graphics_state *const dynamic_state =
5582       &cmd_buffer->vk.dynamic_graphics_state;
5583    const VkPrimitiveTopology topology = dynamic_state->ia.primitive_topology;
5584    struct pvr_cmd_buffer_state *const state = &cmd_buffer->state;
5585    struct ROGUE_TA_STATE_HEADER *const header = &state->emit_header;
5586    struct pvr_ppp_state *const ppp_state = &state->ppp_state;
5587    uint32_t ppp_control;
5588 
5589    pvr_csb_pack (&ppp_control, TA_STATE_PPP_CTRL, control) {
5590       control.drawclippededges = true;
5591       control.wclampen = true;
5592 
5593       if (topology == VK_PRIMITIVE_TOPOLOGY_TRIANGLE_FAN)
5594          control.flatshade_vtx = ROGUE_TA_FLATSHADE_VTX_VERTEX_1;
5595       else
5596          control.flatshade_vtx = ROGUE_TA_FLATSHADE_VTX_VERTEX_0;
5597 
5598       if (dynamic_state->rs.depth_clamp_enable)
5599          control.clip_mode = ROGUE_TA_CLIP_MODE_NO_FRONT_OR_REAR;
5600       else
5601          control.clip_mode = ROGUE_TA_CLIP_MODE_FRONT_REAR;
5602 
5603       /* +--- FrontIsCCW?
5604        * | +--- Cull Front?
5605        * v v
5606        * 0|0 CULLMODE_CULL_CCW,
5607        * 0|1 CULLMODE_CULL_CW,
5608        * 1|0 CULLMODE_CULL_CW,
5609        * 1|1 CULLMODE_CULL_CCW,
5610        */
5611       switch (dynamic_state->rs.cull_mode) {
5612       case VK_CULL_MODE_BACK_BIT:
5613       case VK_CULL_MODE_FRONT_BIT:
5614          if ((dynamic_state->rs.front_face == VK_FRONT_FACE_COUNTER_CLOCKWISE) ^
5615              (dynamic_state->rs.cull_mode == VK_CULL_MODE_FRONT_BIT)) {
5616             control.cullmode = ROGUE_TA_CULLMODE_CULL_CW;
5617          } else {
5618             control.cullmode = ROGUE_TA_CULLMODE_CULL_CCW;
5619          }
5620 
5621          break;
5622 
5623       case VK_CULL_MODE_FRONT_AND_BACK:
5624       case VK_CULL_MODE_NONE:
5625          control.cullmode = ROGUE_TA_CULLMODE_NO_CULLING;
5626          break;
5627 
5628       default:
5629          unreachable("Unsupported cull mode!");
5630       }
5631    }
5632 
5633    if (ppp_control != ppp_state->ppp_control) {
5634       ppp_state->ppp_control = ppp_control;
5635       header->pres_ppp_ctrl = true;
5636    }
5637 }
5638 
5639 /* Largest valid PPP State update in words = 31
5640  * 1 - Header
5641  * 3 - Stream Out Config words 0, 1 and 2
5642  * 1 - PPP Control word
5643  * 3 - Varying Config words 0, 1 and 2
5644  * 1 - Output Select
5645  * 1 - WClamp
5646  * 6 - Viewport Transform words
5647  * 2 - Region Clip words
5648  * 3 - PDS State for fragment phase (PDSSTATEPTR 1-3)
5649  * 4 - PDS State for fragment phase (PDSSTATEPTR0)
5650  * 6 - ISP Control Words
5651  */
5652 #define PVR_MAX_PPP_STATE_DWORDS 31
5653 
5654 static VkResult pvr_emit_ppp_state(struct pvr_cmd_buffer *const cmd_buffer,
5655                                    struct pvr_sub_cmd_gfx *const sub_cmd)
5656 {
5657    const bool deferred_secondary = pvr_cmd_uses_deferred_cs_cmds(cmd_buffer);
5658    struct pvr_cmd_buffer_state *const state = &cmd_buffer->state;
5659    struct ROGUE_TA_STATE_HEADER *const header = &state->emit_header;
5660    struct pvr_csb *const control_stream = &sub_cmd->control_stream;
5661    struct pvr_ppp_state *const ppp_state = &state->ppp_state;
5662    uint32_t ppp_state_words[PVR_MAX_PPP_STATE_DWORDS];
5663    const bool emit_dbsc = header->pres_ispctl_dbsc;
5664    uint32_t *buffer_ptr = ppp_state_words;
5665    uint32_t dbsc_patching_offset = 0;
5666    uint32_t ppp_state_words_count;
5667    struct pvr_suballoc_bo *pvr_bo;
5668    VkResult result;
5669 
5670 #if !defined(NDEBUG)
5671    struct ROGUE_TA_STATE_HEADER emit_mask = *header;
5672    uint32_t packed_emit_mask;
5673 
5674    static_assert(pvr_cmd_length(TA_STATE_HEADER) == 1,
5675                  "EMIT_MASK_IS_CLEAR assumes 1 dword sized header.");
5676 
5677 #   define EMIT_MASK_GET(field) (emit_mask.field)
5678 #   define EMIT_MASK_SET(field, value) (emit_mask.field = (value))
5679 #   define EMIT_MASK_IS_CLEAR                                        \
5680       (pvr_cmd_pack(TA_STATE_HEADER)(&packed_emit_mask, &emit_mask), \
5681        packed_emit_mask == 0)
5682 #else
5683 #   define EMIT_MASK_GET(field)
5684 #   define EMIT_MASK_SET(field, value)
5685 #endif
5686 
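        /* view_port_count is encoded as (viewport count - 1), clamped to 0 when
         * no viewports have been set.
         */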
5687    header->view_port_count =
5688       (ppp_state->viewport_count == 0) ? 0U : (ppp_state->viewport_count - 1);
5689    header->pres_ispctl_fa = header->pres_ispctl;
5690 
5691    /* If deferred_secondary is true then we do a separate state update
5692     * which gets patched in vkCmdExecuteCommands().
5693     */
5694    header->pres_ispctl_dbsc &= !deferred_secondary;
5695 
5696    pvr_csb_write_struct(buffer_ptr, TA_STATE_HEADER, header);
5697 
5698    static_assert(pvr_cmd_length(TA_STATE_HEADER) == 1,
5699                  "Following header check assumes 1 dword sized header.");
5700    /* If the header is empty we exit early and prevent a bo alloc of 0 size. */
5701    if (ppp_state_words[0] == 0)
5702       return VK_SUCCESS;
5703 
5704    if (header->pres_ispctl) {
5705       pvr_csb_write_value(buffer_ptr, TA_STATE_ISPCTL, ppp_state->isp.control);
5706 
5707       assert(header->pres_ispctl_fa);
5708       /* This is not a mistake. FA, BA have the ISPA format, and FB, BB have the
5709        * ISPB format.
5710        */
5711       pvr_csb_write_value(buffer_ptr, TA_STATE_ISPA, ppp_state->isp.front_a);
5712       EMIT_MASK_SET(pres_ispctl_fa, false);
5713 
5714       if (header->pres_ispctl_fb) {
5715          pvr_csb_write_value(buffer_ptr, TA_STATE_ISPB, ppp_state->isp.front_b);
5716          EMIT_MASK_SET(pres_ispctl_fb, false);
5717       }
5718 
5719       if (header->pres_ispctl_ba) {
5720          pvr_csb_write_value(buffer_ptr, TA_STATE_ISPA, ppp_state->isp.back_a);
5721          EMIT_MASK_SET(pres_ispctl_ba, false);
5722       }
5723 
5724       if (header->pres_ispctl_bb) {
5725          pvr_csb_write_value(buffer_ptr, TA_STATE_ISPB, ppp_state->isp.back_b);
5726          EMIT_MASK_SET(pres_ispctl_bb, false);
5727       }
5728 
5729       EMIT_MASK_SET(pres_ispctl, false);
5730    }
5731 
5732    if (header->pres_ispctl_dbsc) {
5733       assert(!deferred_secondary);
5734 
5735       dbsc_patching_offset = buffer_ptr - ppp_state_words;
5736 
5737       pvr_csb_pack (buffer_ptr, TA_STATE_ISPDBSC, ispdbsc) {
5738          ispdbsc.dbindex = ppp_state->depthbias_scissor_indices.depthbias_index;
5739          ispdbsc.scindex = ppp_state->depthbias_scissor_indices.scissor_index;
5740       }
5741       buffer_ptr += pvr_cmd_length(TA_STATE_ISPDBSC);
5742 
5743       EMIT_MASK_SET(pres_ispctl_dbsc, false);
5744    }
5745 
5746    if (header->pres_pds_state_ptr0) {
5747       pvr_csb_write_value(buffer_ptr,
5748                           TA_STATE_PDS_SHADERBASE,
5749                           ppp_state->pds.pixel_shader_base);
5750 
5751       pvr_csb_write_value(buffer_ptr,
5752                           TA_STATE_PDS_TEXUNICODEBASE,
5753                           ppp_state->pds.texture_uniform_code_base);
5754 
5755       pvr_csb_write_value(buffer_ptr,
5756                           TA_STATE_PDS_SIZEINFO1,
5757                           ppp_state->pds.size_info1);
5758       pvr_csb_write_value(buffer_ptr,
5759                           TA_STATE_PDS_SIZEINFO2,
5760                           ppp_state->pds.size_info2);
5761 
5762       EMIT_MASK_SET(pres_pds_state_ptr0, false);
5763    }
5764 
5765    if (header->pres_pds_state_ptr1) {
5766       pvr_csb_write_value(buffer_ptr,
5767                           TA_STATE_PDS_VARYINGBASE,
5768                           ppp_state->pds.varying_base);
5769       EMIT_MASK_SET(pres_pds_state_ptr1, false);
5770    }
5771 
5772    /* We don't use the pds_state_ptr2 (texture state programs) control word,
5773     * but we don't need to zero it either: the hardware only runs the texture
5774     * state program when
5775     * ROGUE_TA_STATE_PDS_SIZEINFO1.pds_texturestatesize is non-zero.
5776     */
5777    assert(pvr_csb_unpack(&ppp_state->pds.size_info1, TA_STATE_PDS_SIZEINFO1)
5778              .pds_texturestatesize == 0);
5779 
5780    if (header->pres_pds_state_ptr3) {
5781       pvr_csb_write_value(buffer_ptr,
5782                           TA_STATE_PDS_UNIFORMDATABASE,
5783                           ppp_state->pds.uniform_state_data_base);
5784       EMIT_MASK_SET(pres_pds_state_ptr3, false);
5785    }
5786 
5787    if (header->pres_region_clip) {
5788       pvr_csb_write_value(buffer_ptr,
5789                           TA_REGION_CLIP0,
5790                           ppp_state->region_clipping.word0);
5791       pvr_csb_write_value(buffer_ptr,
5792                           TA_REGION_CLIP1,
5793                           ppp_state->region_clipping.word1);
5794 
5795       EMIT_MASK_SET(pres_region_clip, false);
5796    }
5797 
5798    if (header->pres_viewport) {
5799       const uint32_t viewports = MAX2(1, ppp_state->viewport_count);
5800       EMIT_MASK_SET(view_port_count, viewports);
5801 
5802       for (uint32_t i = 0; i < viewports; i++) {
5803          /* These don't have any definitions in the csbgen xml files and none
5804           * will be added.
5805           */
5806          *buffer_ptr++ = ppp_state->viewports[i].a0;
5807          *buffer_ptr++ = ppp_state->viewports[i].m0;
5808          *buffer_ptr++ = ppp_state->viewports[i].a1;
5809          *buffer_ptr++ = ppp_state->viewports[i].m1;
5810          *buffer_ptr++ = ppp_state->viewports[i].a2;
5811          *buffer_ptr++ = ppp_state->viewports[i].m2;
5812 
5813          EMIT_MASK_SET(view_port_count, EMIT_MASK_GET(view_port_count) - 1);
5814       }
5815 
5816       EMIT_MASK_SET(pres_viewport, false);
5817    }
5818 
5819    if (header->pres_wclamp) {
5820       pvr_csb_pack (buffer_ptr, TA_WCLAMP, wclamp) {
5821          wclamp.val = fui(0.00001f);
5822       }
5823       buffer_ptr += pvr_cmd_length(TA_WCLAMP);
5824       EMIT_MASK_SET(pres_wclamp, false);
5825    }
5826 
5827    if (header->pres_outselects) {
5828       pvr_csb_write_value(buffer_ptr, TA_OUTPUT_SEL, ppp_state->output_selects);
5829       EMIT_MASK_SET(pres_outselects, false);
5830    }
5831 
5832    if (header->pres_varying_word0) {
5833       pvr_csb_write_value(buffer_ptr,
5834                           TA_STATE_VARYING0,
5835                           ppp_state->varying_word[0]);
5836       EMIT_MASK_SET(pres_varying_word0, false);
5837    }
5838 
5839    if (header->pres_varying_word1) {
5840       pvr_csb_write_value(buffer_ptr,
5841                           TA_STATE_VARYING1,
5842                           ppp_state->varying_word[1]);
5843       EMIT_MASK_SET(pres_varying_word1, false);
5844    }
5845 
5846    /* We only emit this on the first draw of a render job to prevent us from
5847     * inheriting a non-zero value set elsewhere.
5848     */
5849    if (header->pres_varying_word2) {
5850       pvr_csb_write_value(buffer_ptr, TA_STATE_VARYING2, 0);
5851       EMIT_MASK_SET(pres_varying_word2, false);
5852    }
5853 
5854    if (header->pres_ppp_ctrl) {
5855       pvr_csb_write_value(buffer_ptr,
5856                           TA_STATE_PPP_CTRL,
5857                           ppp_state->ppp_control);
5858       EMIT_MASK_SET(pres_ppp_ctrl, false);
5859    }
5860 
5861    /* We only emit this on the first draw of a render job to prevent us from
5862     * inheriting a non-zero value set elsewhere.
5863     */
5864    if (header->pres_stream_out_size) {
5865       pvr_csb_write_value(buffer_ptr, TA_STATE_STREAM_OUT0, 0);
5866       EMIT_MASK_SET(pres_stream_out_size, false);
5867    }
5868 
5869    assert(EMIT_MASK_IS_CLEAR);
5870 
5871 #undef EMIT_MASK_GET
5872 #undef EMIT_MASK_SET
5873 #if !defined(NDEBUG)
5874 #   undef EMIT_MASK_IS_CLEAR
5875 #endif
5876 
5877    ppp_state_words_count = buffer_ptr - ppp_state_words;
5878    assert(ppp_state_words_count <= PVR_MAX_PPP_STATE_DWORDS);
5879 
5880    result = pvr_cmd_buffer_alloc_mem(cmd_buffer,
5881                                      cmd_buffer->device->heaps.general_heap,
5882                                      PVR_DW_TO_BYTES(ppp_state_words_count),
5883                                      &pvr_bo);
5884    if (result != VK_SUCCESS)
5885       return result;
5886 
5887    memcpy(pvr_bo_suballoc_get_map_addr(pvr_bo),
5888           ppp_state_words,
5889           PVR_DW_TO_BYTES(ppp_state_words_count));
5890 
5891    pvr_csb_set_relocation_mark(control_stream);
5892 
5893    /* Write the VDM state update into the VDM control stream. */
5894    pvr_csb_emit (control_stream, VDMCTRL_PPP_STATE0, state0) {
5895       state0.word_count = ppp_state_words_count;
5896       state0.addrmsb = pvr_bo->dev_addr;
5897    }
5898 
5899    pvr_csb_emit (control_stream, VDMCTRL_PPP_STATE1, state1) {
5900       state1.addrlsb = pvr_bo->dev_addr;
5901    }
5902 
5903    pvr_csb_clear_relocation_mark(control_stream);
5904 
5905    if (emit_dbsc && cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY) {
5906       struct pvr_deferred_cs_command cmd;
5907 
5908       if (deferred_secondary) {
5909          const uint32_t num_dwords = pvr_cmd_length(VDMCTRL_PPP_STATE0) +
5910                                      pvr_cmd_length(VDMCTRL_PPP_STATE1);
5911          uint32_t *vdm_state;
5912 
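              /* Reserve space for the two VDM words now; they are filled in
               * later, once the depth bias/scissor indices are known, when the
               * deferred command is processed in vkCmdExecuteCommands().
               */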
5913          pvr_csb_set_relocation_mark(control_stream);
5914 
5915          vdm_state = pvr_csb_alloc_dwords(control_stream, num_dwords);
5916          if (!vdm_state) {
5917             result = pvr_csb_get_status(control_stream);
5918             return pvr_cmd_buffer_set_error_unwarned(cmd_buffer, result);
5919          }
5920 
5921          pvr_csb_clear_relocation_mark(control_stream);
5922 
5923          cmd = (struct pvr_deferred_cs_command){
5924             .type = PVR_DEFERRED_CS_COMMAND_TYPE_DBSC,
5925             .dbsc = {
5926                .state = ppp_state->depthbias_scissor_indices,
5927                .vdm_state = vdm_state,
5928             },
5929          };
5930       } else {
5931          cmd = (struct pvr_deferred_cs_command){
5932             .type = PVR_DEFERRED_CS_COMMAND_TYPE_DBSC2,
5933             .dbsc2 = {
5934                .state = ppp_state->depthbias_scissor_indices,
5935                .ppp_cs_bo = pvr_bo,
5936                .patch_offset = dbsc_patching_offset,
5937             },
5938          };
5939       }
5940 
5941       util_dynarray_append(&cmd_buffer->deferred_csb_commands,
5942                            struct pvr_deferred_cs_command,
5943                            cmd);
5944    }
5945 
5946    state->emit_header = (struct ROGUE_TA_STATE_HEADER){ 0 };
5947 
5948    return VK_SUCCESS;
5949 }
5950 
5951 static inline bool
5952 pvr_ppp_state_update_required(const struct pvr_cmd_buffer *cmd_buffer)
5953 {
5954    const BITSET_WORD *const dynamic_dirty =
5955       cmd_buffer->vk.dynamic_graphics_state.dirty;
5956    const struct pvr_cmd_buffer_state *const state = &cmd_buffer->state;
5957    const struct ROGUE_TA_STATE_HEADER *const header = &state->emit_header;
5958 
5959    /* For push constants we only need to worry if they are updated for the
5960     * fragment stage since we're only updating the pds programs used in the
5961     * fragment stage.
5962     */
5963 
5964    return header->pres_ppp_ctrl || header->pres_ispctl ||
5965           header->pres_ispctl_fb || header->pres_ispctl_ba ||
5966           header->pres_ispctl_bb || header->pres_ispctl_dbsc ||
5967           header->pres_pds_state_ptr0 || header->pres_pds_state_ptr1 ||
5968           header->pres_pds_state_ptr2 || header->pres_pds_state_ptr3 ||
5969           header->pres_region_clip || header->pres_viewport ||
5970           header->pres_wclamp || header->pres_outselects ||
5971           header->pres_varying_word0 || header->pres_varying_word1 ||
5972           header->pres_varying_word2 || header->pres_stream_out_program ||
5973           state->dirty.fragment_descriptors || state->dirty.vis_test ||
5974           state->dirty.gfx_pipeline_binding || state->dirty.isp_userpass ||
5975           state->push_constants.dirty_stages & VK_SHADER_STAGE_FRAGMENT_BIT ||
5976           BITSET_TEST(dynamic_dirty, MESA_VK_DYNAMIC_DS_STENCIL_COMPARE_MASK) ||
5977           BITSET_TEST(dynamic_dirty, MESA_VK_DYNAMIC_DS_STENCIL_WRITE_MASK) ||
5978           BITSET_TEST(dynamic_dirty, MESA_VK_DYNAMIC_DS_STENCIL_REFERENCE) ||
5979           BITSET_TEST(dynamic_dirty, MESA_VK_DYNAMIC_RS_DEPTH_BIAS_ENABLE) ||
5980           BITSET_TEST(dynamic_dirty, MESA_VK_DYNAMIC_RS_DEPTH_BIAS_FACTORS) ||
5981           BITSET_TEST(dynamic_dirty, MESA_VK_DYNAMIC_RS_LINE_WIDTH) ||
5982           BITSET_TEST(dynamic_dirty, MESA_VK_DYNAMIC_VP_SCISSORS) ||
5983           BITSET_TEST(dynamic_dirty, MESA_VK_DYNAMIC_VP_SCISSOR_COUNT) ||
5984           BITSET_TEST(dynamic_dirty, MESA_VK_DYNAMIC_VP_VIEWPORTS) ||
5985           BITSET_TEST(dynamic_dirty, MESA_VK_DYNAMIC_VP_VIEWPORT_COUNT);
5986 }
5987 
5988 static VkResult
5989 pvr_emit_dirty_ppp_state(struct pvr_cmd_buffer *const cmd_buffer,
5990                          struct pvr_sub_cmd_gfx *const sub_cmd)
5991 {
5992    struct pvr_cmd_buffer_state *const state = &cmd_buffer->state;
5993    struct vk_dynamic_graphics_state *const dynamic_state =
5994       &cmd_buffer->vk.dynamic_graphics_state;
5995    VkResult result;
5996 
5997    /* TODO: The emit_header will be dirty only if
5998     * pvr_reset_graphics_dirty_state() was called before this (so when command
5999     * buffer begins recording or when it's reset). Otherwise it will have been
6000     * zeroed out by the previous pvr_emit_ppp_state(). We can probably set a
6001     * flag in there and check it here instead of checking the header.
6002     * Check if this is true and implement the flag.
6003     */
6004    if (!pvr_ppp_state_update_required(cmd_buffer))
6005       return VK_SUCCESS;
6006 
6007    if (state->dirty.gfx_pipeline_binding) {
6008       struct ROGUE_TA_STATE_ISPA ispa;
6009 
6010       pvr_setup_output_select(cmd_buffer);
6011       pvr_setup_isp_faces_and_control(cmd_buffer, &ispa);
6012       pvr_setup_triangle_merging_flag(cmd_buffer, &ispa);
6013    } else if (BITSET_TEST(dynamic_state->dirty,
6014                           MESA_VK_DYNAMIC_DS_STENCIL_COMPARE_MASK) ||
6015               BITSET_TEST(dynamic_state->dirty,
6016                           MESA_VK_DYNAMIC_DS_STENCIL_REFERENCE) ||
6017               BITSET_TEST(dynamic_state->dirty,
6018                           MESA_VK_DYNAMIC_DS_STENCIL_WRITE_MASK) ||
6019               BITSET_TEST(dynamic_state->dirty,
6020                           MESA_VK_DYNAMIC_RS_LINE_WIDTH) ||
6021               state->dirty.isp_userpass || state->dirty.vis_test) {
6022       pvr_setup_isp_faces_and_control(cmd_buffer, NULL);
6023    }
6024 
6025    if (!dynamic_state->rs.rasterizer_discard_enable &&
6026        state->dirty.fragment_descriptors &&
6027        state->gfx_pipeline->shader_state.fragment.bo &&
6028        !state->gfx_pipeline->fs_data.common.uses.empty) {
6029       pvr_setup_fragment_state_pointers(cmd_buffer, sub_cmd);
6030    }
6031 
6032    pvr_setup_isp_depth_bias_scissor_state(cmd_buffer);
6033 
6034    if (BITSET_TEST(dynamic_state->dirty, MESA_VK_DYNAMIC_VP_VIEWPORTS) ||
6035        BITSET_TEST(dynamic_state->dirty, MESA_VK_DYNAMIC_VP_VIEWPORT_COUNT))
6036       pvr_setup_viewport(cmd_buffer);
6037 
6038    pvr_setup_ppp_control(cmd_buffer);
6039 
6040    /* The hardware doesn't have an explicit mode for culling both faces, so
6041     * we use a negative viewport to ensure all objects are culled out early.
6042     */
6043    if (dynamic_state->rs.cull_mode == VK_CULL_MODE_FRONT_AND_BACK) {
6044       /* Shift the viewport outside the guard-band, culling everything. */
6045       const uint32_t negative_vp_val = fui(-2.0f);
6046 
6047       state->ppp_state.viewports[0].a0 = negative_vp_val;
6048       state->ppp_state.viewports[0].m0 = 0;
6049       state->ppp_state.viewports[0].a1 = negative_vp_val;
6050       state->ppp_state.viewports[0].m1 = 0;
6051       state->ppp_state.viewports[0].a2 = negative_vp_val;
6052       state->ppp_state.viewports[0].m2 = 0;
6053 
6054       state->ppp_state.viewport_count = 1;
6055 
6056       state->emit_header.pres_viewport = true;
6057    }
6058 
6059    result = pvr_emit_ppp_state(cmd_buffer, sub_cmd);
6060    if (result != VK_SUCCESS)
6061       return result;
6062 
6063    return VK_SUCCESS;
6064 }
6065 
6066 void pvr_calculate_vertex_cam_size(const struct pvr_device_info *dev_info,
6067                                    const uint32_t vs_output_size,
6068                                    const bool raster_enable,
6069                                    uint32_t *const cam_size_out,
6070                                    uint32_t *const vs_max_instances_out)
6071 {
6072    /* First work out the size of a vertex in the UVS and multiply by 4 for
6073     * column ordering.
6074     */
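        /* The extra 1 + (raster_enable ? 4 : 0) entries per vertex are scratch
         * slots; with rasterization enabled this is the "+ 5 scratch" referred
         * to in the comparisons below.
         */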
6075    const uint32_t uvs_vertex_vector_size_in_dwords =
6076       (vs_output_size + 1U + raster_enable * 4U) * 4U;
6077    const uint32_t vdm_cam_size =
6078       PVR_GET_FEATURE_VALUE(dev_info, vdm_cam_size, 32U);
6079 
6080    /* This is a proxy for 8XE. */
6081    if (PVR_HAS_FEATURE(dev_info, simple_internal_parameter_format) &&
6082        vdm_cam_size < 96U) {
6083       /* Comparisons are based on size including scratch per vertex vector. */
6084       if (uvs_vertex_vector_size_in_dwords < (14U * 4U)) {
6085          *cam_size_out = MIN2(31U, vdm_cam_size - 1U);
6086          *vs_max_instances_out = 16U;
6087       } else if (uvs_vertex_vector_size_in_dwords < (20U * 4U)) {
6088          *cam_size_out = 15U;
6089          *vs_max_instances_out = 16U;
6090       } else if (uvs_vertex_vector_size_in_dwords < (28U * 4U)) {
6091          *cam_size_out = 11U;
6092          *vs_max_instances_out = 12U;
6093       } else if (uvs_vertex_vector_size_in_dwords < (44U * 4U)) {
6094          *cam_size_out = 7U;
6095          *vs_max_instances_out = 8U;
6096       } else if (PVR_HAS_FEATURE(dev_info,
6097                                  simple_internal_parameter_format_v2) ||
6098                  uvs_vertex_vector_size_in_dwords < (64U * 4U)) {
6099          *cam_size_out = 7U;
6100          *vs_max_instances_out = 4U;
6101       } else {
6102          *cam_size_out = 3U;
6103          *vs_max_instances_out = 2U;
6104       }
6105    } else {
6106       /* Comparisons are based on size including scratch per vertex vector. */
6107       if (uvs_vertex_vector_size_in_dwords <= (32U * 4U)) {
6108          /* output size <= 27 + 5 scratch. */
6109          *cam_size_out = MIN2(95U, vdm_cam_size - 1U);
6110          *vs_max_instances_out = 0U;
6111       } else if (uvs_vertex_vector_size_in_dwords <= 48U * 4U) {
6112          /* output size <= 43 + 5 scratch */
6113          *cam_size_out = 63U;
6114          if (PVR_GET_FEATURE_VALUE(dev_info, uvs_vtx_entries, 144U) < 288U)
6115             *vs_max_instances_out = 16U;
6116          else
6117             *vs_max_instances_out = 0U;
6118       } else if (uvs_vertex_vector_size_in_dwords <= 64U * 4U) {
6119          /* output size <= 59 + 5 scratch. */
6120          *cam_size_out = 31U;
6121          if (PVR_GET_FEATURE_VALUE(dev_info, uvs_vtx_entries, 144U) < 288U)
6122             *vs_max_instances_out = 16U;
6123          else
6124             *vs_max_instances_out = 0U;
6125       } else {
6126          *cam_size_out = 15U;
6127          *vs_max_instances_out = 16U;
6128       }
6129    }
6130 }
6131 
6132 static void pvr_emit_dirty_vdm_state(struct pvr_cmd_buffer *const cmd_buffer,
6133                                      struct pvr_sub_cmd_gfx *const sub_cmd)
6134 {
6135    /* FIXME: Assume all state is dirty for the moment. */
6136    struct pvr_device_info *const dev_info =
6137       &cmd_buffer->device->pdevice->dev_info;
6138    ASSERTED const uint32_t max_user_vertex_output_components =
6139       pvr_get_max_user_vertex_output_components(dev_info);
6140    struct ROGUE_VDMCTRL_VDM_STATE0 header = { pvr_cmd_header(
6141       VDMCTRL_VDM_STATE0) };
6142    struct vk_dynamic_graphics_state *const dynamic_state =
6143       &cmd_buffer->vk.dynamic_graphics_state;
6144    const struct pvr_cmd_buffer_state *const state = &cmd_buffer->state;
6145    const pco_data *const vs_data = &state->gfx_pipeline->vs_data;
6146    struct pvr_csb *const csb = &sub_cmd->control_stream;
6147    uint32_t max_instances;
6148    uint32_t cam_size;
6149 
6150    /* CAM Calculations and HW state take vertex size aligned to DWORDS. */
6151    assert(vs_data->vs.vtxouts <= max_user_vertex_output_components);
6152 
6153    pvr_calculate_vertex_cam_size(dev_info,
6154                                  vs_data->vs.vtxouts,
6155                                  true,
6156                                  &cam_size,
6157                                  &max_instances);
6158 
6159    pvr_csb_set_relocation_mark(csb);
6160 
6161    pvr_csb_emit (csb, VDMCTRL_VDM_STATE0, state0) {
6162       state0.cam_size = cam_size;
6163 
6164       if (dynamic_state->ia.primitive_restart_enable) {
6165          state0.cut_index_enable = true;
6166          state0.cut_index_present = true;
6167       }
6168 
6169       switch (dynamic_state->ia.primitive_topology) {
6170       case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_FAN:
6171          state0.flatshade_control = ROGUE_VDMCTRL_FLATSHADE_CONTROL_VERTEX_1;
6172          break;
6173 
6174       default:
6175          state0.flatshade_control = ROGUE_VDMCTRL_FLATSHADE_CONTROL_VERTEX_0;
6176          break;
6177       }
6178 
6179       /* If we've bound a different vertex buffer, or this draw-call requires
6180        * a different PDS attrib data-section from the last draw call (changed
6181        * base_instance), then we need to specify a new data section. This is
6182        * also the case if we've switched pipeline or attrib program, as the
6183        * data-section layout will be different.
6184        */
6185       state0.vs_data_addr_present =
6186          state->dirty.gfx_pipeline_binding || state->dirty.vertex_bindings ||
6187          state->dirty.draw_base_instance || state->dirty.draw_variant;
6188 
6189       /* Need to specify new PDS Attrib program if we've bound a different
6190        * pipeline or we needed a different PDS Attrib variant for this
6191        * draw-call.
6192        */
6193       state0.vs_other_present = state->dirty.gfx_pipeline_binding ||
6194                                 state->dirty.draw_variant;
6195 
6196       /* UVB_SCRATCH_SELECT_ONE with no rasterization is only valid when
6197        * stream output is enabled. We use UVB_SCRATCH_SELECT_FIVE because
6198        * Vulkan doesn't support stream output and the vertex position is
6199        * always emitted to the UVB.
6200        */
6201       state0.uvs_scratch_size_select =
6202          ROGUE_VDMCTRL_UVS_SCRATCH_SIZE_SELECT_FIVE;
6203 
6204       header = state0;
6205    }
6206 
6207    if (header.cut_index_present) {
6208       pvr_csb_emit (csb, VDMCTRL_VDM_STATE1, state1) {
6209          state1.cut_index =
6210             vk_index_to_restart(state->index_buffer_binding.type);
6211       }
6212    }
6213 
6214    if (header.vs_data_addr_present) {
6215       pvr_csb_emit (csb, VDMCTRL_VDM_STATE2, state2) {
6216          state2.vs_pds_data_base_addr =
6217             PVR_DEV_ADDR(state->pds_vertex_attrib_offset);
6218       }
6219    }
6220 
6221    if (header.vs_other_present) {
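           /* vtxins is a count of dwords; shift by 2 to get the unified store
            * size in bytes.
            */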
6222       const uint32_t usc_unified_store_size_in_bytes = vs_data->common.vtxins
6223                                                        << 2;
6224 
6225       pvr_csb_emit (csb, VDMCTRL_VDM_STATE3, state3) {
6226          state3.vs_pds_code_base_addr =
6227             PVR_DEV_ADDR(state->pds_shader.code_offset);
6228       }
6229 
6230       pvr_csb_emit (csb, VDMCTRL_VDM_STATE4, state4) {
6231          state4.vs_output_size = vs_data->vs.vtxouts;
6232       }
6233 
6234       pvr_csb_emit (csb, VDMCTRL_VDM_STATE5, state5) {
6235          state5.vs_max_instances = max_instances;
6236          state5.vs_usc_common_size = 0U;
6237          state5.vs_usc_unified_size = DIV_ROUND_UP(
6238             usc_unified_store_size_in_bytes,
6239             ROGUE_VDMCTRL_VDM_STATE5_VS_USC_UNIFIED_SIZE_UNIT_SIZE);
6240          state5.vs_pds_temp_size =
6241             DIV_ROUND_UP(state->pds_shader.info->temps_required << 2,
6242                          ROGUE_VDMCTRL_VDM_STATE5_VS_PDS_TEMP_SIZE_UNIT_SIZE);
6243          state5.vs_pds_data_size = DIV_ROUND_UP(
6244             PVR_DW_TO_BYTES(state->pds_shader.info->data_size_in_dwords),
6245             ROGUE_VDMCTRL_VDM_STATE5_VS_PDS_DATA_SIZE_UNIT_SIZE);
6246       }
6247    }
6248 
6249    pvr_csb_clear_relocation_mark(csb);
6250 }
6251 
6252 static VkResult pvr_validate_draw_state(struct pvr_cmd_buffer *cmd_buffer)
6253 {
6254    struct pvr_cmd_buffer_state *const state = &cmd_buffer->state;
6255    struct vk_dynamic_graphics_state *const dynamic_state =
6256       &cmd_buffer->vk.dynamic_graphics_state;
6257    const struct pvr_graphics_pipeline *const gfx_pipeline = state->gfx_pipeline;
6258    const pco_data *const fs_data = &gfx_pipeline->fs_data;
6259    struct pvr_sub_cmd_gfx *sub_cmd;
6260    bool fstencil_writemask_zero;
6261    bool bstencil_writemask_zero;
6262    bool fstencil_keep;
6263    bool bstencil_keep;
6264    VkResult result;
6265 
6266    pvr_cmd_buffer_start_sub_cmd(cmd_buffer, PVR_SUB_CMD_TYPE_GRAPHICS);
6267 
6268    sub_cmd = &state->current_sub_cmd->gfx;
6269    sub_cmd->empty_cmd = false;
6270 
6271    /* Determine pipeline depth/stencil usage. If a pipeline uses depth or
6272     * stencil testing, those attachments are using their loaded values, and
6273     * the loadOps cannot be optimized out.
6274     */
6275    /* Pipeline uses depth testing. */
6276    if (sub_cmd->depth_usage == PVR_DEPTH_STENCIL_USAGE_UNDEFINED &&
6277        dynamic_state->ds.depth.compare_op != VK_COMPARE_OP_ALWAYS) {
6278       sub_cmd->depth_usage = PVR_DEPTH_STENCIL_USAGE_NEEDED;
6279    }
6280 
6281    /* Pipeline uses stencil testing. */
6282    if (sub_cmd->stencil_usage == PVR_DEPTH_STENCIL_USAGE_UNDEFINED &&
6283        (dynamic_state->ds.stencil.front.op.compare != VK_COMPARE_OP_ALWAYS ||
6284         dynamic_state->ds.stencil.back.op.compare != VK_COMPARE_OP_ALWAYS)) {
6285       sub_cmd->stencil_usage = PVR_DEPTH_STENCIL_USAGE_NEEDED;
6286    }
6287 
6288    if (PVR_HAS_FEATURE(&cmd_buffer->device->pdevice->dev_info,
6289                        compute_overlap)) {
6290       uint32_t coefficient_size =
6291          DIV_ROUND_UP(fs_data->common.coeffs,
6292                       ROGUE_TA_STATE_PDS_SIZEINFO1_USC_VARYINGSIZE_UNIT_SIZE);
6293 
6294       if (coefficient_size >
6295           ROGUE_TA_STATE_PDS_SIZEINFO1_USC_VARYINGSIZE_MAX_SIZE)
6296          sub_cmd->disable_compute_overlap = true;
6297    }
6298 
6299    sub_cmd->frag_uses_atomic_ops |= fs_data->common.uses.atomics;
6300    sub_cmd->frag_has_side_effects |= fs_data->common.uses.side_effects;
6301    sub_cmd->frag_uses_texture_rw |= false;
6302    sub_cmd->vertex_uses_texture_rw |= false;
6303 
6304    sub_cmd->job.get_vis_results = state->vis_test_enabled;
6305 
6306    fstencil_keep =
6307       (dynamic_state->ds.stencil.front.op.fail == VK_STENCIL_OP_KEEP) &&
6308       (dynamic_state->ds.stencil.front.op.pass == VK_STENCIL_OP_KEEP);
6309    bstencil_keep =
6310       (dynamic_state->ds.stencil.back.op.fail == VK_STENCIL_OP_KEEP) &&
6311       (dynamic_state->ds.stencil.back.op.pass == VK_STENCIL_OP_KEEP);
6312    fstencil_writemask_zero = (dynamic_state->ds.stencil.front.write_mask == 0);
6313    bstencil_writemask_zero = (dynamic_state->ds.stencil.back.write_mask == 0);
6314 
6315    /* Set the stencil modified flag if:
6316     * - at least one face has a stencil fail/pass op other than KEEP, and
6317     * - at least one face has a non-zero stencil write mask.
6318     */
6319    if (!(fstencil_keep && bstencil_keep) &&
6320        !(fstencil_writemask_zero && bstencil_writemask_zero)) {
6321       sub_cmd->modifies_stencil = true;
6322    }
6323 
6324    /* Set depth modified flag if depth write is enabled. */
6325    if (dynamic_state->ds.depth.write_enable)
6326       sub_cmd->modifies_depth = true;
6327 
6328    /* If either the data or code changes for pds vertex attribs, regenerate the
6329     * data segment.
6330     */
6331    if (state->dirty.vertex_bindings || state->dirty.gfx_pipeline_binding ||
6332        state->dirty.draw_variant || state->dirty.draw_base_instance) {
6333       enum pvr_pds_vertex_attrib_program_type prog_type;
6334       const struct pvr_pds_attrib_program *program;
6335 
6336       if (state->draw_state.draw_indirect)
6337          prog_type = PVR_PDS_VERTEX_ATTRIB_PROGRAM_DRAW_INDIRECT;
6338       else if (state->draw_state.base_instance)
6339          prog_type = PVR_PDS_VERTEX_ATTRIB_PROGRAM_BASE_INSTANCE;
6340       else
6341          prog_type = PVR_PDS_VERTEX_ATTRIB_PROGRAM_BASIC;
6342 
6343       program =
6344          &gfx_pipeline->shader_state.vertex.pds_attrib_programs[prog_type];
6345       state->pds_shader.info = &program->info;
6346       state->pds_shader.code_offset = program->program.code_offset;
6347 
6348       state->max_shared_regs =
6349          MAX2(state->max_shared_regs, pvr_calc_shared_regs_count(gfx_pipeline));
6350 
6351       pvr_setup_vertex_buffers(cmd_buffer, gfx_pipeline);
6352    }
6353 
6354    if (state->push_constants.dirty_stages & VK_SHADER_STAGE_ALL_GRAPHICS) {
6355       result = pvr_cmd_upload_push_consts(cmd_buffer);
6356       if (result != VK_SUCCESS)
6357          return result;
6358    }
6359 
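        /* Binding a new graphics pipeline invalidates both the vertex and
         * fragment descriptor state.
         */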
6360    state->dirty.vertex_descriptors = state->dirty.gfx_pipeline_binding;
6361    state->dirty.fragment_descriptors = state->dirty.vertex_descriptors;
6362 
6363    /* Account for dirty descriptor set. */
6364    /* TODO: It could be the case that there are no descriptors for a specific
6365     * stage, or that the update descriptors aren't active for a particular
6366     * stage. In such cases we could avoid regenerating the descriptor PDS
6367     * program.
6368     */
6369    state->dirty.vertex_descriptors |= state->dirty.gfx_desc_dirty;
6370    state->dirty.fragment_descriptors |= state->dirty.gfx_desc_dirty;
6371 
6372    if (BITSET_TEST(dynamic_state->dirty, MESA_VK_DYNAMIC_CB_BLEND_CONSTANTS))
6373       state->dirty.fragment_descriptors = true;
6374 
6375    state->dirty.vertex_descriptors |=
6376       state->push_constants.dirty_stages &
6377       (VK_SHADER_STAGE_ALL_GRAPHICS & ~VK_SHADER_STAGE_FRAGMENT_BIT);
6378    state->dirty.fragment_descriptors |= state->push_constants.dirty_stages &
6379                                         VK_SHADER_STAGE_FRAGMENT_BIT;
6380 
6381    if (state->dirty.fragment_descriptors) {
6382       result = pvr_setup_descriptor_mappings(
6383          cmd_buffer,
6384          PVR_STAGE_ALLOCATION_FRAGMENT,
6385          &state->gfx_pipeline->shader_state.fragment.descriptor_state,
6386          NULL,
6387          &state->pds_fragment_descriptor_data_offset);
6388       if (result != VK_SUCCESS) {
6389          mesa_loge("Could not setup fragment descriptor mappings.");
6390          return result;
6391       }
6392    }
6393 
6394    if (state->dirty.vertex_descriptors) {
6395       uint32_t pds_vertex_descriptor_data_offset;
6396 
6397       result = pvr_setup_descriptor_mappings(
6398          cmd_buffer,
6399          PVR_STAGE_ALLOCATION_VERTEX_GEOMETRY,
6400          &state->gfx_pipeline->shader_state.vertex.descriptor_state,
6401          NULL,
6402          &pds_vertex_descriptor_data_offset);
6403       if (result != VK_SUCCESS) {
6404          mesa_loge("Could not setup vertex descriptor mappings.");
6405          return result;
6406       }
6407 
6408       pvr_emit_dirty_pds_state(cmd_buffer,
6409                                sub_cmd,
6410                                pds_vertex_descriptor_data_offset);
6411    }
6412 
6413    pvr_emit_dirty_ppp_state(cmd_buffer, sub_cmd);
6414    pvr_emit_dirty_vdm_state(cmd_buffer, sub_cmd);
6415 
6416    vk_dynamic_graphics_state_clear_dirty(dynamic_state);
6417    state->dirty.gfx_desc_dirty = false;
6418    state->dirty.draw_base_instance = false;
6419    state->dirty.draw_variant = false;
6420    state->dirty.fragment_descriptors = false;
6421    state->dirty.gfx_pipeline_binding = false;
6422    state->dirty.isp_userpass = false;
6423    state->dirty.vertex_bindings = false;
6424    state->dirty.vis_test = false;
6425 
6426    state->push_constants.dirty_stages &= ~VK_SHADER_STAGE_ALL_GRAPHICS;
6427 
6428    return VK_SUCCESS;
6429 }
6430 
6431 static uint32_t pvr_get_hw_primitive_topology(VkPrimitiveTopology topology)
6432 {
6433    switch (topology) {
6434    case VK_PRIMITIVE_TOPOLOGY_POINT_LIST:
6435       return ROGUE_VDMCTRL_PRIMITIVE_TOPOLOGY_POINT_LIST;
6436    case VK_PRIMITIVE_TOPOLOGY_LINE_LIST:
6437       return ROGUE_VDMCTRL_PRIMITIVE_TOPOLOGY_LINE_LIST;
6438    case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP:
6439       return ROGUE_VDMCTRL_PRIMITIVE_TOPOLOGY_LINE_STRIP;
6440    case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST:
6441       return ROGUE_VDMCTRL_PRIMITIVE_TOPOLOGY_TRI_LIST;
6442    case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP:
6443       return ROGUE_VDMCTRL_PRIMITIVE_TOPOLOGY_TRI_STRIP;
6444    case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_FAN:
6445       return ROGUE_VDMCTRL_PRIMITIVE_TOPOLOGY_TRI_FAN;
6446    case VK_PRIMITIVE_TOPOLOGY_LINE_LIST_WITH_ADJACENCY:
6447       return ROGUE_VDMCTRL_PRIMITIVE_TOPOLOGY_LINE_LIST_ADJ;
6448    case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP_WITH_ADJACENCY:
6449       return ROGUE_VDMCTRL_PRIMITIVE_TOPOLOGY_LINE_STRIP_ADJ;
6450    case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST_WITH_ADJACENCY:
6451       return ROGUE_VDMCTRL_PRIMITIVE_TOPOLOGY_TRI_LIST_ADJ;
6452    case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP_WITH_ADJACENCY:
6453       return ROGUE_VDMCTRL_PRIMITIVE_TOPOLOGY_TRI_STRIP_ADJ;
6454    case VK_PRIMITIVE_TOPOLOGY_PATCH_LIST:
6455       return ROGUE_VDMCTRL_PRIMITIVE_TOPOLOGY_PATCH_LIST;
6456    default:
6457       unreachable("Undefined primitive topology");
6458    }
6459 }
6460 
6461 /* TODO: Rewrite this in terms of ALIGN_POT() and pvr_cmd_length(). */
6462 /* Aligned to 128 bit for PDS loads / stores */
6463 #define DUMMY_VDM_CONTROL_STREAM_BLOCK_SIZE 8
6464 
6465 static VkResult
6466 pvr_write_draw_indirect_vdm_stream(struct pvr_cmd_buffer *cmd_buffer,
6467                                    struct pvr_csb *const csb,
6468                                    pvr_dev_addr_t idx_buffer_addr,
6469                                    uint32_t idx_stride,
6470                                    struct ROGUE_VDMCTRL_INDEX_LIST0 *list_hdr,
6471                                    struct pvr_buffer *buffer,
6472                                    VkDeviceSize offset,
6473                                    uint32_t count,
6474                                    uint32_t stride)
6475 {
6476    struct pvr_pds_drawindirect_program pds_prog = { 0 };
6477    uint32_t word0;
6478 
6479    /* Draw indirect always has index offset and instance count. */
6480    list_hdr->index_offset_present = true;
6481    list_hdr->index_instance_count_present = true;
6482 
6483    pvr_cmd_pack(VDMCTRL_INDEX_LIST0)(&word0, list_hdr);
6484 
6485    pds_prog.support_base_instance = true;
6486    pds_prog.arg_buffer = buffer->dev_addr.addr + offset;
6487    pds_prog.index_buffer = idx_buffer_addr.addr;
6488    pds_prog.index_block_header = word0;
6489    pds_prog.index_stride = idx_stride;
6490    pds_prog.num_views = 1U;
6491 
6492    /* TODO: See if we can pre-upload the code section of all the pds programs
6493     * and reuse them here.
6494     */
6495    /* Generate and upload the PDS programs (code + data). */
6496    for (uint32_t i = 0U; i < count; i++) {
6497       const struct pvr_device_info *dev_info =
6498          &cmd_buffer->device->pdevice->dev_info;
6499       struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
6500       struct pvr_suballoc_bo *dummy_bo;
6501       struct pvr_suballoc_bo *pds_bo;
6502       uint32_t *dummy_stream;
6503       uint32_t *pds_base;
6504       uint32_t pds_size;
6505       VkResult result;
6506 
6507       /* TODO: Move this outside the loop and allocate all of them in one go? */
6508       result = pvr_cmd_buffer_alloc_mem(cmd_buffer,
6509                                         cmd_buffer->device->heaps.general_heap,
6510                                         DUMMY_VDM_CONTROL_STREAM_BLOCK_SIZE,
6511                                         &dummy_bo);
6512       if (result != VK_SUCCESS)
6513          return result;
6514 
6515       pds_prog.increment_draw_id = (i != 0);
6516       pds_prog.index_list_addr_buffer = dummy_bo->dev_addr.addr;
6517 
6518       if (state->draw_state.draw_indexed) {
6519          pvr_pds_generate_draw_elements_indirect(&pds_prog,
6520                                                  0,
6521                                                  PDS_GENERATE_SIZES,
6522                                                  dev_info);
6523       } else {
6524          pvr_pds_generate_draw_arrays_indirect(&pds_prog,
6525                                                0,
6526                                                PDS_GENERATE_SIZES,
6527                                                dev_info);
6528       }
6529 
6530       pds_size = PVR_DW_TO_BYTES(pds_prog.program.data_size_aligned +
6531                                  pds_prog.program.code_size_aligned);
6532 
6533       result = pvr_cmd_buffer_alloc_mem(cmd_buffer,
6534                                         cmd_buffer->device->heaps.pds_heap,
6535                                         pds_size,
6536                                         &pds_bo);
6537       if (result != VK_SUCCESS)
6538          return result;
6539 
6540       pds_base = pvr_bo_suballoc_get_map_addr(pds_bo);
6541       memcpy(pds_base,
6542              pds_prog.program.code,
6543              PVR_DW_TO_BYTES(pds_prog.program.code_size_aligned));
6544 
6545       if (state->draw_state.draw_indexed) {
6546          pvr_pds_generate_draw_elements_indirect(
6547             &pds_prog,
6548             pds_base + pds_prog.program.code_size_aligned,
6549             PDS_GENERATE_DATA_SEGMENT,
6550             dev_info);
6551       } else {
6552          pvr_pds_generate_draw_arrays_indirect(
6553             &pds_prog,
6554             pds_base + pds_prog.program.code_size_aligned,
6555             PDS_GENERATE_DATA_SEGMENT,
6556             dev_info);
6557       }
6558 
6559       pvr_csb_set_relocation_mark(csb);
6560 
6561       pvr_csb_emit (csb, VDMCTRL_PDS_STATE0, state0) {
6562          state0.usc_target = ROGUE_VDMCTRL_USC_TARGET_ANY;
6563 
6564          state0.pds_temp_size =
6565             DIV_ROUND_UP(PVR_DW_TO_BYTES(pds_prog.program.temp_size_aligned),
6566                          ROGUE_VDMCTRL_PDS_STATE0_PDS_TEMP_SIZE_UNIT_SIZE);
6567 
6568          state0.pds_data_size =
6569             DIV_ROUND_UP(PVR_DW_TO_BYTES(pds_prog.program.data_size_aligned),
6570                          ROGUE_VDMCTRL_PDS_STATE0_PDS_DATA_SIZE_UNIT_SIZE);
6571       }
6572 
6573       pvr_csb_emit (csb, VDMCTRL_PDS_STATE1, state1) {
6574          const uint32_t data_offset =
6575             pds_bo->dev_addr.addr +
6576             PVR_DW_TO_BYTES(pds_prog.program.code_size_aligned) -
6577             cmd_buffer->device->heaps.pds_heap->base_addr.addr;
6578 
6579          state1.pds_data_addr = PVR_DEV_ADDR(data_offset);
6580          state1.sd_type = ROGUE_VDMCTRL_SD_TYPE_PDS;
6581          state1.sd_next_type = ROGUE_VDMCTRL_SD_TYPE_NONE;
6582       }
6583 
6584       pvr_csb_emit (csb, VDMCTRL_PDS_STATE2, state2) {
6585          const uint32_t code_offset =
6586             pds_bo->dev_addr.addr -
6587             cmd_buffer->device->heaps.pds_heap->base_addr.addr;
6588 
6589          state2.pds_code_addr = PVR_DEV_ADDR(code_offset);
6590       }
6591 
6592       pvr_csb_clear_relocation_mark(csb);
6593 
6594       /* We don't really need to set the relocation mark since the following
6595        * state update is just one emit but let's be nice and use it.
6596        */
6597       pvr_csb_set_relocation_mark(csb);
6598 
6599       /* Sync task to ensure the VDM doesn't start reading the dummy blocks
6600        * before they are ready.
6601        */
6602       pvr_csb_emit (csb, VDMCTRL_INDEX_LIST0, list0) {
6603          list0.primitive_topology = ROGUE_VDMCTRL_PRIMITIVE_TOPOLOGY_TRI_LIST;
6604       }
6605 
6606       pvr_csb_clear_relocation_mark(csb);
6607 
6608       dummy_stream = pvr_bo_suballoc_get_map_addr(dummy_bo);
6609 
6610       /* For non-indexed draw cmds fill in the dummy's header (as it won't change
6611        * based on the indirect args) and increment by the in-use size of each
6612        * dummy block.
6613        */
6614       if (!state->draw_state.draw_indexed) {
6615          dummy_stream[0] = word0;
6616          dummy_stream += 4;
6617       } else {
6618          dummy_stream += 5;
6619       }
6620 
6621       /* clang-format off */
6622       pvr_csb_pack (dummy_stream, VDMCTRL_STREAM_RETURN, word);
6623       /* clang-format on */
6624 
6625       pvr_csb_set_relocation_mark(csb);
6626 
6627       /* Stream link to the first dummy which forces the VDM to discard any
6628        * prefetched (dummy) control stream.
6629        */
6630       pvr_csb_emit (csb, VDMCTRL_STREAM_LINK0, link) {
6631          link.with_return = true;
6632          link.link_addrmsb = dummy_bo->dev_addr;
6633       }
6634 
6635       pvr_csb_emit (csb, VDMCTRL_STREAM_LINK1, link) {
6636          link.link_addrlsb = dummy_bo->dev_addr;
6637       }
6638 
6639       pvr_csb_clear_relocation_mark(csb);
6640 
6641       /* Point the PDS program at the next set of indirect arguments; the
6642        * next iteration allocates and binds a fresh VDM dummy buffer.
6643        */
6644       pds_prog.arg_buffer += stride;
6645    }
6646 
6647    return VK_SUCCESS;
6648 }
6649 
6650 #undef DUMMY_VDM_CONTROL_STREAM_BLOCK_SIZE
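
/* For reference: the per-draw indirect arguments read by the PDS programs
 * above (and that pds_prog.arg_buffer is advanced over by `stride`) follow
 * the core Vulkan layouts from <vulkan/vulkan.h>:
 *
 *    typedef struct VkDrawIndirectCommand {
 *       uint32_t vertexCount;
 *       uint32_t instanceCount;
 *       uint32_t firstVertex;
 *       uint32_t firstInstance;
 *    } VkDrawIndirectCommand;
 *
 *    typedef struct VkDrawIndexedIndirectCommand {
 *       uint32_t indexCount;
 *       uint32_t instanceCount;
 *       uint32_t firstIndex;
 *       int32_t  vertexOffset;
 *       uint32_t firstInstance;
 *    } VkDrawIndexedIndirectCommand;
 *
 * Which of the two applies is selected by state->draw_state.draw_indexed in
 * pvr_write_draw_indirect_vdm_stream().
 */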
6651 
6652 static void pvr_emit_vdm_index_list(struct pvr_cmd_buffer *cmd_buffer,
6653                                     struct pvr_sub_cmd_gfx *const sub_cmd,
6654                                     VkPrimitiveTopology topology,
6655                                     uint32_t index_offset,
6656                                     uint32_t first_index,
6657                                     uint32_t index_count,
6658                                     uint32_t instance_count,
6659                                     struct pvr_buffer *buffer,
6660                                     VkDeviceSize offset,
6661                                     uint32_t count,
6662                                     uint32_t stride)
6663 {
6664    struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
6665 
6666    const pco_data *const vs_data = &state->gfx_pipeline->vs_data;
6667    struct ROGUE_VDMCTRL_INDEX_LIST0 list_hdr = { pvr_cmd_header(
6668       VDMCTRL_INDEX_LIST0) };
6669    pvr_dev_addr_t index_buffer_addr = PVR_DEV_ADDR_INVALID;
6670    struct pvr_csb *const csb = &sub_cmd->control_stream;
6671    unsigned int index_stride = 0;
6672 
6673    list_hdr.primitive_topology = pvr_get_hw_primitive_topology(topology);
6674 
6675    /* firstInstance is not handled here in the VDM state, it's implemented as
6676     * an addition in the PDS vertex fetch using
6677     * PVR_PDS_CONST_MAP_ENTRY_TYPE_BASE_INSTANCE entry type.
6678     */
6679 
6680    list_hdr.index_count_present = true;
6681 
6682    if (instance_count > 1)
6683       list_hdr.index_instance_count_present = true;
6684 
6685    if (index_offset)
6686       list_hdr.index_offset_present = true;
6687 
6688    if (state->draw_state.draw_indexed) {
6689       list_hdr.index_size =
6690          pvr_vdmctrl_index_size_from_type(state->index_buffer_binding.type);
6691       index_stride = vk_index_type_to_bytes(state->index_buffer_binding.type);
6692 
6693       index_buffer_addr = PVR_DEV_ADDR_OFFSET(
6694          state->index_buffer_binding.buffer->dev_addr,
6695          state->index_buffer_binding.offset + first_index * index_stride);
6696 
6697       list_hdr.index_addr_present = true;
6698       list_hdr.index_base_addrmsb = index_buffer_addr;
6699    }
6700 
6701    list_hdr.degen_cull_enable =
6702       PVR_HAS_FEATURE(&cmd_buffer->device->pdevice->dev_info,
6703                       vdm_degenerate_culling) &&
6704       !vs_data->common.uses.side_effects;
6705 
6706    if (state->draw_state.draw_indirect) {
6707       assert(buffer);
6708       pvr_write_draw_indirect_vdm_stream(cmd_buffer,
6709                                          csb,
6710                                          index_buffer_addr,
6711                                          index_stride,
6712                                          &list_hdr,
6713                                          buffer,
6714                                          offset,
6715                                          count,
6716                                          stride);
6717       return;
6718    }
6719 
6720    pvr_csb_set_relocation_mark(csb);
6721 
6722    pvr_csb_emit (csb, VDMCTRL_INDEX_LIST0, list0) {
6723       list0 = list_hdr;
6724    }
6725 
6726    if (list_hdr.index_addr_present) {
6727       pvr_csb_emit (csb, VDMCTRL_INDEX_LIST1, list1) {
6728          list1.index_base_addrlsb = index_buffer_addr;
6729       }
6730    }
6731 
6732    if (list_hdr.index_count_present) {
6733       pvr_csb_emit (csb, VDMCTRL_INDEX_LIST2, list2) {
6734          list2.index_count = index_count;
6735       }
6736    }
6737 
6738    if (list_hdr.index_instance_count_present) {
6739       pvr_csb_emit (csb, VDMCTRL_INDEX_LIST3, list3) {
6740          list3.instance_count = instance_count - 1;
6741       }
6742    }
6743 
6744    if (list_hdr.index_offset_present) {
6745       pvr_csb_emit (csb, VDMCTRL_INDEX_LIST4, list4) {
6746          list4.index_offset = index_offset;
6747       }
6748    }
6749 
6750    pvr_csb_clear_relocation_mark(csb);
6751 }
6752 
6753 void pvr_CmdDraw(VkCommandBuffer commandBuffer,
6754                  uint32_t vertexCount,
6755                  uint32_t instanceCount,
6756                  uint32_t firstVertex,
6757                  uint32_t firstInstance)
6758 {
6759    const struct pvr_cmd_buffer_draw_state draw_state = {
6760       .base_vertex = firstVertex,
6761       .base_instance = firstInstance,
6762    };
6763 
6764    PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
6765    struct vk_dynamic_graphics_state *const dynamic_state =
6766       &cmd_buffer->vk.dynamic_graphics_state;
6767    struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
6768    VkResult result;
6769 
6770    PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer);
6771 
6772    pvr_update_draw_state(state, &draw_state);
6773 
6774    result = pvr_validate_draw_state(cmd_buffer);
6775    if (result != VK_SUCCESS)
6776       return;
6777 
6778    /* Write the VDM control stream for the primitive. */
6779    pvr_emit_vdm_index_list(cmd_buffer,
6780                            &state->current_sub_cmd->gfx,
6781                            dynamic_state->ia.primitive_topology,
6782                            firstVertex,
6783                            0U,
6784                            vertexCount,
6785                            instanceCount,
6786                            NULL,
6787                            0U,
6788                            0U,
6789                            0U);
6790 }
6791 
6792 void pvr_CmdDrawIndexed(VkCommandBuffer commandBuffer,
6793                         uint32_t indexCount,
6794                         uint32_t instanceCount,
6795                         uint32_t firstIndex,
6796                         int32_t vertexOffset,
6797                         uint32_t firstInstance)
6798 {
6799    const struct pvr_cmd_buffer_draw_state draw_state = {
6800       .base_vertex = vertexOffset,
6801       .base_instance = firstInstance,
6802       .draw_indexed = true,
6803    };
6804 
6805    PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
6806    struct vk_dynamic_graphics_state *const dynamic_state =
6807       &cmd_buffer->vk.dynamic_graphics_state;
6808    struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
6809    VkResult result;
6810 
6811    PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer);
6812 
6813    pvr_update_draw_state(state, &draw_state);
6814 
6815    result = pvr_validate_draw_state(cmd_buffer);
6816    if (result != VK_SUCCESS)
6817       return;
6818 
6819    /* Write the VDM control stream for the primitive. */
6820    pvr_emit_vdm_index_list(cmd_buffer,
6821                            &state->current_sub_cmd->gfx,
6822                            dynamic_state->ia.primitive_topology,
6823                            vertexOffset,
6824                            firstIndex,
6825                            indexCount,
6826                            instanceCount,
6827                            NULL,
6828                            0U,
6829                            0U,
6830                            0U);
6831 }
6832 
6833 void pvr_CmdDrawIndexedIndirect(VkCommandBuffer commandBuffer,
6834                                 VkBuffer _buffer,
6835                                 VkDeviceSize offset,
6836                                 uint32_t drawCount,
6837                                 uint32_t stride)
6838 {
6839    const struct pvr_cmd_buffer_draw_state draw_state = {
6840       .draw_indirect = true,
6841       .draw_indexed = true,
6842    };
6843 
6844    PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
6845    struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
6846    struct vk_dynamic_graphics_state *const dynamic_state =
6847       &cmd_buffer->vk.dynamic_graphics_state;
6848    PVR_FROM_HANDLE(pvr_buffer, buffer, _buffer);
6849    VkResult result;
6850 
6851    PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer);
6852 
6853    pvr_update_draw_state(state, &draw_state);
6854 
6855    result = pvr_validate_draw_state(cmd_buffer);
6856    if (result != VK_SUCCESS)
6857       return;
6858 
6859    /* Write the VDM control stream for the primitive. */
6860    pvr_emit_vdm_index_list(cmd_buffer,
6861                            &state->current_sub_cmd->gfx,
6862                            dynamic_state->ia.primitive_topology,
6863                            0U,
6864                            0U,
6865                            0U,
6866                            0U,
6867                            buffer,
6868                            offset,
6869                            drawCount,
6870                            stride);
6871 }
6872 
6873 void pvr_CmdDrawIndirect(VkCommandBuffer commandBuffer,
6874                          VkBuffer _buffer,
6875                          VkDeviceSize offset,
6876                          uint32_t drawCount,
6877                          uint32_t stride)
6878 {
6879    const struct pvr_cmd_buffer_draw_state draw_state = {
6880       .draw_indirect = true,
6881    };
6882 
6883    PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
6884    struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
6885    PVR_FROM_HANDLE(pvr_buffer, buffer, _buffer);
6886    struct vk_dynamic_graphics_state *const dynamic_state =
6887       &cmd_buffer->vk.dynamic_graphics_state;
6888    VkResult result;
6889 
6890    PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer);
6891 
6892    pvr_update_draw_state(state, &draw_state);
6893 
6894    result = pvr_validate_draw_state(cmd_buffer);
6895    if (result != VK_SUCCESS)
6896       return;
6897 
6898    /* Write the VDM control stream for the primitive. */
6899    pvr_emit_vdm_index_list(cmd_buffer,
6900                            &state->current_sub_cmd->gfx,
6901                            dynamic_state->ia.primitive_topology,
6902                            0U,
6903                            0U,
6904                            0U,
6905                            0U,
6906                            buffer,
6907                            offset,
6908                            drawCount,
6909                            stride);
6910 }
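
/* Illustrative application-side usage (not part of the driver): recording a
 * multi-draw indirect call that exercises the path above.  The handles `cmd`
 * and `args_buf` and the count `draw_count` are hypothetical; `args_buf` is
 * assumed to hold `draw_count` tightly packed VkDrawIndirectCommand structs
 * starting at offset 0.
 *
 *    vkCmdDrawIndirect(cmd, args_buf, 0, draw_count,
 *                      sizeof(VkDrawIndirectCommand));
 *
 * Each of the `draw_count` argument blocks results in one iteration of the
 * per-draw loop in pvr_write_draw_indirect_vdm_stream().
 */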
6911 
6912 static VkResult
6913 pvr_resolve_unemitted_resolve_attachments(struct pvr_cmd_buffer *cmd_buffer,
6914                                           struct pvr_render_pass_info *info)
6915 {
6916    struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
6917    const struct pvr_renderpass_hwsetup_render *hw_render =
6918       &state->render_pass_info.pass->hw_setup->renders[info->current_hw_subpass];
6919 
6920    for (uint32_t i = 0U; i < hw_render->eot_surface_count; i++) {
6921       const struct pvr_renderpass_hwsetup_eot_surface *surface =
6922          &hw_render->eot_surfaces[i];
6923       const uint32_t color_attach_idx = surface->src_attachment_idx;
6924       const uint32_t resolve_attach_idx = surface->attachment_idx;
6925       VkImageSubresourceLayers src_subresource;
6926       VkImageSubresourceLayers dst_subresource;
6927       struct pvr_image_view *dst_view;
6928       struct pvr_image_view *src_view;
6929       VkFormat src_format;
6930       VkFormat dst_format;
6931       VkImageCopy2 region;
6932       VkResult result;
6933 
6934       if (!surface->need_resolve ||
6935           surface->resolve_type != PVR_RESOLVE_TYPE_TRANSFER)
6936          continue;
6937 
6938       dst_view = info->attachments[resolve_attach_idx];
6939       src_view = info->attachments[color_attach_idx];
6940 
6941       src_subresource.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT;
6942       src_subresource.mipLevel = src_view->vk.base_mip_level;
6943       src_subresource.baseArrayLayer = src_view->vk.base_array_layer;
6944       src_subresource.layerCount = src_view->vk.layer_count;
6945 
6946       dst_subresource.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT;
6947       dst_subresource.mipLevel = dst_view->vk.base_mip_level;
6948       dst_subresource.baseArrayLayer = dst_view->vk.base_array_layer;
6949       dst_subresource.layerCount = dst_view->vk.layer_count;
6950 
6951       region.srcOffset = (VkOffset3D){ info->render_area.offset.x,
6952                                        info->render_area.offset.y,
6953                                        0 };
6954       region.dstOffset = (VkOffset3D){ info->render_area.offset.x,
6955                                        info->render_area.offset.y,
6956                                        0 };
6957       region.extent = (VkExtent3D){ info->render_area.extent.width,
6958                                     info->render_area.extent.height,
6959                                     1 };
6960 
6961       region.srcSubresource = src_subresource;
6962       region.dstSubresource = dst_subresource;
6963 
6964       /* TODO: If ERN_46863 is supported, depth and stencil are sampled
6965        * separately from images with combined depth+stencil formats. Add logic
6966        * here to handle this using the appropriate format from the image view.
6967        */
6968       src_format = src_view->vk.image->format;
6969       dst_format = dst_view->vk.image->format;
6970       src_view->vk.image->format = src_view->vk.format;
6971       dst_view->vk.image->format = dst_view->vk.format;
6972 
6973       result = pvr_copy_or_resolve_color_image_region(
6974          cmd_buffer,
6975          vk_to_pvr_image(src_view->vk.image),
6976          vk_to_pvr_image(dst_view->vk.image),
6977          &region);
6978 
6979       src_view->vk.image->format = src_format;
6980       dst_view->vk.image->format = dst_format;
6981 
6982       state->current_sub_cmd->transfer.serialize_with_frag = true;
6983 
6984       if (result != VK_SUCCESS)
6985          return result;
6986    }
6987 
6988    return pvr_cmd_buffer_end_sub_cmd(cmd_buffer);
6989 }
6990 
6991 void pvr_CmdEndRenderPass2(VkCommandBuffer commandBuffer,
6992                            const VkSubpassEndInfo *pSubpassEndInfo)
6993 {
6994    PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
6995    struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
6996    struct pvr_image_view **attachments;
6997    VkClearValue *clear_values;
6998    VkResult result;
6999 
7000    PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer);
7001 
7002    assert(state->render_pass_info.pass);
7003    assert(state->render_pass_info.framebuffer);
7004 
7005    result = pvr_cmd_buffer_end_sub_cmd(cmd_buffer);
7006    if (result != VK_SUCCESS)
7007       return;
7008 
7009    result = pvr_resolve_unemitted_resolve_attachments(cmd_buffer,
7010                                                       &state->render_pass_info);
7011    if (result != VK_SUCCESS)
7012       return;
7013 
7014    /* Save the required fields before clearing render_pass_info struct. */
7015    attachments = state->render_pass_info.attachments;
7016    clear_values = state->render_pass_info.clear_values;
7017 
7018    memset(&state->render_pass_info, 0, sizeof(state->render_pass_info));
7019 
7020    state->render_pass_info.attachments = attachments;
7021    state->render_pass_info.clear_values = clear_values;
7022 }
7023 
7024 static VkResult
7025 pvr_execute_deferred_cmd_buffer(struct pvr_cmd_buffer *cmd_buffer,
7026                                 const struct pvr_cmd_buffer *sec_cmd_buffer)
7027 {
7028    struct vk_dynamic_graphics_state *const dynamic_state =
7029       &cmd_buffer->vk.dynamic_graphics_state;
7030    const uint32_t prim_db_elems =
7031       util_dynarray_num_elements(&cmd_buffer->depth_bias_array,
7032                                  struct pvr_depth_bias_state);
7033    const uint32_t prim_scissor_elems =
7034       util_dynarray_num_elements(&cmd_buffer->scissor_array,
7035                                  struct pvr_scissor_words);
7036 
7037    util_dynarray_foreach (&sec_cmd_buffer->deferred_csb_commands,
7038                           struct pvr_deferred_cs_command,
7039                           cmd) {
7040       switch (cmd->type) {
7041       case PVR_DEFERRED_CS_COMMAND_TYPE_DBSC: {
7042          const uint32_t scissor_idx =
7043             prim_scissor_elems + cmd->dbsc.state.scissor_index;
7044          const uint32_t db_idx =
7045             prim_db_elems + cmd->dbsc.state.depthbias_index;
7046          const uint32_t num_dwords =
7047             pvr_cmd_length(TA_STATE_HEADER) + pvr_cmd_length(TA_STATE_ISPDBSC);
7048          struct pvr_suballoc_bo *suballoc_bo;
7049          uint32_t ppp_state[num_dwords];
7050          VkResult result;
7051 
7052          pvr_csb_pack (&ppp_state[0], TA_STATE_HEADER, header) {
7053             header.pres_ispctl_dbsc = true;
7054          }
7055 
7056          pvr_csb_pack (&ppp_state[1], TA_STATE_ISPDBSC, ispdbsc) {
7057             ispdbsc.dbindex = db_idx;
7058             ispdbsc.scindex = scissor_idx;
7059          }
7060 
7061          result = pvr_cmd_buffer_upload_general(cmd_buffer,
7062                                                 &ppp_state[0],
7063                                                 sizeof(ppp_state),
7064                                                 &suballoc_bo);
7065          if (result != VK_SUCCESS)
7066             return result;
7067 
7068          pvr_csb_pack (&cmd->dbsc.vdm_state[0], VDMCTRL_PPP_STATE0, state) {
7069             state.word_count = num_dwords;
7070             state.addrmsb = suballoc_bo->dev_addr;
7071          }
7072 
7073          pvr_csb_pack (&cmd->dbsc.vdm_state[1], VDMCTRL_PPP_STATE1, state) {
7074             state.addrlsb = suballoc_bo->dev_addr;
7075          }
7076 
7077          break;
7078       }
7079 
7080       case PVR_DEFERRED_CS_COMMAND_TYPE_DBSC2: {
7081          const uint32_t scissor_idx =
7082             prim_scissor_elems + cmd->dbsc2.state.scissor_index;
7083          const uint32_t db_idx =
7084             prim_db_elems + cmd->dbsc2.state.depthbias_index;
7085 
7086          uint32_t *const addr =
7087             (uint32_t *)pvr_bo_suballoc_get_map_addr(cmd->dbsc2.ppp_cs_bo) +
7088             cmd->dbsc2.patch_offset;
7089 
7090          assert(pvr_bo_suballoc_get_map_addr(cmd->dbsc2.ppp_cs_bo));
7091 
7092          pvr_csb_pack (addr, TA_STATE_ISPDBSC, ispdbsc) {
7093             ispdbsc.dbindex = db_idx;
7094             ispdbsc.scindex = scissor_idx;
7095          }
7096 
7097          break;
7098       }
7099 
7100       default:
7101          unreachable("Invalid deferred control stream command type.");
7102          break;
7103       }
7104    }
7105 
7106    util_dynarray_append_dynarray(&cmd_buffer->depth_bias_array,
7107                                  &sec_cmd_buffer->depth_bias_array);
7108 
7109    util_dynarray_append_dynarray(&cmd_buffer->scissor_array,
7110                                  &sec_cmd_buffer->scissor_array);
7111 
7112    BITSET_SET(dynamic_state->dirty, MESA_VK_DYNAMIC_RS_DEPTH_BIAS_FACTORS);
7113    cmd_buffer->scissor_words = (struct pvr_scissor_words){ 0 };
7114 
7115    return VK_SUCCESS;
7116 }
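
/* A small worked example of the index rebasing above, derived from the code:
 * if the primary command buffer already holds 3 depth-bias entries and 2
 * scissor entries, a deferred DBSC command from the secondary that refers to
 * depthbias_index 1 and scissor_index 0 is patched to use dbindex 4 and
 * scindex 2, i.e. the secondary's indices offset by the primary's existing
 * element counts (prim_db_elems / prim_scissor_elems).
 */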
7117 
7118 /* Caller needs to make sure that it ends the current sub_cmd. This function
7119  * only creates a copy of sec_sub_cmd and links it to the cmd_buffer's
7120  * sub_cmd list.
7121  */
7122 static VkResult pvr_execute_sub_cmd(struct pvr_cmd_buffer *cmd_buffer,
7123                                     struct pvr_sub_cmd *sec_sub_cmd)
7124 {
7125    struct pvr_sub_cmd *primary_sub_cmd =
7126       vk_zalloc(&cmd_buffer->vk.pool->alloc,
7127                 sizeof(*primary_sub_cmd),
7128                 8,
7129                 VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
7130    if (!primary_sub_cmd) {
7131       return vk_command_buffer_set_error(&cmd_buffer->vk,
7132                                          VK_ERROR_OUT_OF_HOST_MEMORY);
7133    }
7134 
7135    primary_sub_cmd->type = sec_sub_cmd->type;
7136    primary_sub_cmd->owned = false;
7137 
7138    list_addtail(&primary_sub_cmd->link, &cmd_buffer->sub_cmds);
7139 
7140    switch (sec_sub_cmd->type) {
7141    case PVR_SUB_CMD_TYPE_GRAPHICS:
7142       primary_sub_cmd->gfx = sec_sub_cmd->gfx;
7143       break;
7144 
7145    case PVR_SUB_CMD_TYPE_OCCLUSION_QUERY:
7146    case PVR_SUB_CMD_TYPE_COMPUTE:
7147       primary_sub_cmd->compute = sec_sub_cmd->compute;
7148       break;
7149 
7150    case PVR_SUB_CMD_TYPE_TRANSFER:
7151       primary_sub_cmd->transfer = sec_sub_cmd->transfer;
7152       break;
7153 
7154    case PVR_SUB_CMD_TYPE_EVENT:
7155       primary_sub_cmd->event = sec_sub_cmd->event;
7156       break;
7157 
7158    default:
7159       unreachable("Unsupported sub-command type");
7160    }
7161 
7162    return VK_SUCCESS;
7163 }
7164 
7165 static VkResult
7166 pvr_execute_graphics_cmd_buffer(struct pvr_cmd_buffer *cmd_buffer,
7167                                 const struct pvr_cmd_buffer *sec_cmd_buffer)
7168 {
7169    const struct pvr_device_info *dev_info =
7170       &cmd_buffer->device->pdevice->dev_info;
7171    struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
7172    struct pvr_sub_cmd *primary_sub_cmd = state->current_sub_cmd;
7173    struct pvr_sub_cmd *first_sec_cmd;
7174    VkResult result;
7175 
7176    /* Inherited queries are not supported. */
7177    assert(!state->vis_test_enabled);
7178 
7179    if (list_is_empty(&sec_cmd_buffer->sub_cmds))
7180       return VK_SUCCESS;
7181 
7182    first_sec_cmd =
7183       list_first_entry(&sec_cmd_buffer->sub_cmds, struct pvr_sub_cmd, link);
7184 
7185    /* Kick a render if we have a new base address. */
7186    if (primary_sub_cmd->gfx.query_pool && first_sec_cmd->gfx.query_pool &&
7187        primary_sub_cmd->gfx.query_pool != first_sec_cmd->gfx.query_pool) {
7188       state->current_sub_cmd->gfx.barrier_store = true;
7189 
7190       result = pvr_cmd_buffer_end_sub_cmd(cmd_buffer);
7191       if (result != VK_SUCCESS)
7192          return result;
7193 
7194       result =
7195          pvr_cmd_buffer_start_sub_cmd(cmd_buffer, PVR_SUB_CMD_TYPE_GRAPHICS);
7196       if (result != VK_SUCCESS)
7197          return result;
7198 
7199       primary_sub_cmd = state->current_sub_cmd;
7200 
7201       /* Use existing render setup, but load color attachments from HW
7202        * Background object.
7203        */
7204       primary_sub_cmd->gfx.barrier_load = true;
7205       primary_sub_cmd->gfx.barrier_store = false;
7206    }
7207 
7208    list_for_each_entry (struct pvr_sub_cmd,
7209                         sec_sub_cmd,
7210                         &sec_cmd_buffer->sub_cmds,
7211                         link) {
7212       /* Only graphics secondary execution is supported within a renderpass. */
7213       assert(sec_sub_cmd->type == PVR_SUB_CMD_TYPE_GRAPHICS);
7214 
7215       if (!sec_sub_cmd->gfx.empty_cmd)
7216          primary_sub_cmd->gfx.empty_cmd = false;
7217 
7218       if (sec_sub_cmd->gfx.query_pool) {
7219          primary_sub_cmd->gfx.query_pool = sec_sub_cmd->gfx.query_pool;
7220 
7221          util_dynarray_append_dynarray(&state->query_indices,
7222                                        &sec_sub_cmd->gfx.sec_query_indices);
7223       }
7224 
7225       if (pvr_cmd_uses_deferred_cs_cmds(sec_cmd_buffer)) {
7226          /* TODO: If the secondary buffer is created with
7227           * VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT, then we patch the
7228           * stream and copy it to the primary stream using pvr_csb_copy below.
7229           * This will need locking if the same secondary command buffer is
7230           * executed in multiple primary buffers at the same time.
7231           */
7232          result = pvr_execute_deferred_cmd_buffer(cmd_buffer, sec_cmd_buffer);
7233          if (result != VK_SUCCESS)
7234             return result;
7235 
7236          result = pvr_csb_copy(&primary_sub_cmd->gfx.control_stream,
7237                                &sec_sub_cmd->gfx.control_stream);
7238          if (result != VK_SUCCESS)
7239             return pvr_cmd_buffer_set_error_unwarned(cmd_buffer, result);
7240       } else {
7241          result = pvr_execute_deferred_cmd_buffer(cmd_buffer, sec_cmd_buffer);
7242          if (result != VK_SUCCESS)
7243             return result;
7244 
7245          pvr_csb_emit_link(
7246             &primary_sub_cmd->gfx.control_stream,
7247             pvr_csb_get_start_address(&sec_sub_cmd->gfx.control_stream),
7248             true);
7249       }
7250 
7251       if (PVR_HAS_FEATURE(&cmd_buffer->device->pdevice->dev_info,
7252                           compute_overlap)) {
7253          primary_sub_cmd->gfx.job.disable_compute_overlap |=
7254             sec_sub_cmd->gfx.job.disable_compute_overlap;
7255       }
7256 
7257       primary_sub_cmd->gfx.max_tiles_in_flight =
7258          MIN2(primary_sub_cmd->gfx.max_tiles_in_flight,
7259               sec_sub_cmd->gfx.max_tiles_in_flight);
7260 
7261       /* Pass loaded depth/stencil usage from secondary command buffer. */
7262       if (sec_sub_cmd->gfx.depth_usage == PVR_DEPTH_STENCIL_USAGE_NEEDED)
7263          primary_sub_cmd->gfx.depth_usage = PVR_DEPTH_STENCIL_USAGE_NEEDED;
7264 
7265       if (sec_sub_cmd->gfx.stencil_usage == PVR_DEPTH_STENCIL_USAGE_NEEDED)
7266          primary_sub_cmd->gfx.stencil_usage = PVR_DEPTH_STENCIL_USAGE_NEEDED;
7267 
7268       /* Pass depth/stencil modification state from secondary command buffer. */
7269       if (sec_sub_cmd->gfx.modifies_depth)
7270          primary_sub_cmd->gfx.modifies_depth = true;
7271 
7272       if (sec_sub_cmd->gfx.modifies_stencil)
7273          primary_sub_cmd->gfx.modifies_stencil = true;
7274 
7275       if (sec_sub_cmd->gfx.barrier_store) {
7276          struct pvr_sub_cmd *sec_next =
7277             list_entry(sec_sub_cmd->link.next, struct pvr_sub_cmd, link);
7278 
7279          /* This shouldn't be the last sub cmd. There should be a barrier load
7280           * subsequent to the barrier store.
7281           */
7282          assert(list_last_entry(&sec_cmd_buffer->sub_cmds,
7283                                 struct pvr_sub_cmd,
7284                                 link) != sec_sub_cmd);
7285 
7286          /* Kick render to store stencil. */
7287          state->current_sub_cmd->gfx.barrier_store = true;
7288          state->current_sub_cmd->gfx.empty_cmd = false;
7289 
7290          result = pvr_cmd_buffer_end_sub_cmd(cmd_buffer);
7291          if (result != VK_SUCCESS)
7292             return result;
7293 
7294          result =
7295             pvr_cmd_buffer_start_sub_cmd(cmd_buffer, PVR_SUB_CMD_TYPE_GRAPHICS);
7296          if (result != VK_SUCCESS)
7297             return result;
7298 
7299          primary_sub_cmd = state->current_sub_cmd;
7300 
7301          /* Use existing render setup, but load color attachments from HW
7302           * Background object.
7303           */
7304          primary_sub_cmd->gfx.barrier_load = sec_next->gfx.barrier_load;
7305          primary_sub_cmd->gfx.barrier_store = sec_next->gfx.barrier_store;
7306          primary_sub_cmd->gfx.empty_cmd = false;
7307       }
7308 
7309       if (!PVR_HAS_FEATURE(dev_info, gs_rta_support)) {
7310          util_dynarray_append_dynarray(&cmd_buffer->deferred_clears,
7311                                        &sec_cmd_buffer->deferred_clears);
7312       }
7313    }
7314 
7315    return VK_SUCCESS;
7316 }
7317 
7318 void pvr_CmdExecuteCommands(VkCommandBuffer commandBuffer,
7319                             uint32_t commandBufferCount,
7320                             const VkCommandBuffer *pCommandBuffers)
7321 {
7322    PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
7323    struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
7324    struct pvr_cmd_buffer *last_cmd_buffer;
7325    VkResult result;
7326 
7327    PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer);
7328 
7329    assert(cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_PRIMARY);
7330 
7331    /* Reset the CPU copy of the most recent PPP state of the primary command
7332     * buffer.
7333     *
7334     * The next draw call in the primary after CmdExecuteCommands may send
7335     * redundant state, if it all goes in the same geom job.
7336     *
7337     * Can't just copy state from the secondary because the recording state of
7338     * the secondary command buffers would have been deleted at this point.
7339     */
7340    pvr_reset_graphics_dirty_state(cmd_buffer, false);
7341 
7342    if (state->current_sub_cmd &&
7343        state->current_sub_cmd->type == PVR_SUB_CMD_TYPE_GRAPHICS) {
7344       for (uint32_t i = 0; i < commandBufferCount; i++) {
7345          PVR_FROM_HANDLE(pvr_cmd_buffer, sec_cmd_buffer, pCommandBuffers[i]);
7346 
7347          assert(sec_cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY);
7348 
7349          result = pvr_execute_graphics_cmd_buffer(cmd_buffer, sec_cmd_buffer);
7350          if (result != VK_SUCCESS)
7351             return;
7352       }
7353 
7354       last_cmd_buffer =
7355          pvr_cmd_buffer_from_handle(pCommandBuffers[commandBufferCount - 1]);
7356 
7357       /* Set barriers from the final secondary command buffer. */
7358       for (uint32_t i = 0; i != PVR_NUM_SYNC_PIPELINE_STAGES; i++) {
7359          state->barriers_needed[i] |=
7360             last_cmd_buffer->state.barriers_needed[i] &
7361             PVR_PIPELINE_STAGE_ALL_GRAPHICS_BITS;
7362       }
7363    } else {
7364       for (uint32_t i = 0; i < commandBufferCount; i++) {
7365          PVR_FROM_HANDLE(pvr_cmd_buffer, sec_cmd_buffer, pCommandBuffers[i]);
7366 
7367          assert(sec_cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY);
7368 
7369          result = pvr_cmd_buffer_end_sub_cmd(cmd_buffer);
7370          if (result != VK_SUCCESS)
7371             return;
7372 
7373          list_for_each_entry_safe (struct pvr_sub_cmd,
7374                                    sec_sub_cmd,
7375                                    &sec_cmd_buffer->sub_cmds,
7376                                    link) {
7377             result = pvr_execute_sub_cmd(cmd_buffer, sec_sub_cmd);
7378             if (result != VK_SUCCESS)
7379                return;
7380          }
7381       }
7382 
7383       last_cmd_buffer =
7384          pvr_cmd_buffer_from_handle(pCommandBuffers[commandBufferCount - 1]);
7385 
7386       memcpy(state->barriers_needed,
7387              last_cmd_buffer->state.barriers_needed,
7388              sizeof(state->barriers_needed));
7389    }
7390 }
7391 
7392 static void pvr_insert_transparent_obj(struct pvr_cmd_buffer *const cmd_buffer,
7393                                        struct pvr_sub_cmd_gfx *const sub_cmd)
7394 {
7395    struct pvr_device *const device = cmd_buffer->device;
7396    /* Yes we want a copy. The user could be recording multiple command buffers
7397     * in parallel so writing the template in place could cause problems.
7398     */
7399    struct pvr_static_clear_ppp_template clear =
7400       device->static_clear_state.ppp_templates[VK_IMAGE_ASPECT_COLOR_BIT];
7401    uint32_t pds_state[PVR_STATIC_CLEAR_PDS_STATE_COUNT] = { 0 };
7402    struct pvr_csb *csb = &sub_cmd->control_stream;
7403    struct pvr_suballoc_bo *ppp_bo;
7404 
7405    assert(clear.requires_pds_state);
7406 
7407    /* Patch the template. */
7408 
7409    pvr_csb_pack (&pds_state[PVR_STATIC_CLEAR_PPP_PDS_TYPE_SHADERBASE],
7410                  TA_STATE_PDS_SHADERBASE,
7411                  shaderbase) {
7412       shaderbase.addr = PVR_DEV_ADDR(device->nop_program.pds.data_offset);
7413    }
7414 
7415    clear.config.pds_state = &pds_state;
7416 
7417    clear.config.ispctl.upass = cmd_buffer->state.render_pass_info.isp_userpass;
7418 
7419    /* Emit PPP state from template. */
7420 
7421    pvr_emit_ppp_from_template(csb, &clear, &ppp_bo);
7422    list_add(&ppp_bo->link, &cmd_buffer->bo_list);
7423 
7424    /* Emit VDM state. */
7425 
7426    pvr_emit_clear_words(cmd_buffer, sub_cmd);
7427 
7428    /* Reset graphics state. */
7429    pvr_reset_graphics_dirty_state(cmd_buffer, false);
7430 }
7431 
7432 static inline struct pvr_render_subpass *
7433 pvr_get_current_subpass(const struct pvr_cmd_buffer_state *const state)
7434 {
7435    const uint32_t subpass_idx = state->render_pass_info.subpass_idx;
7436 
7437    return &state->render_pass_info.pass->subpasses[subpass_idx];
7438 }
7439 
7440 void pvr_CmdNextSubpass2(VkCommandBuffer commandBuffer,
7441                          const VkSubpassBeginInfo *pSubpassBeginInfo,
7442                          const VkSubpassEndInfo *pSubpassEndInfo)
7443 {
7444    PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
7445    struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
7446    struct pvr_render_pass_info *rp_info = &state->render_pass_info;
7447    const struct pvr_renderpass_hwsetup_subpass *hw_subpass;
7448    struct pvr_renderpass_hwsetup_render *next_hw_render;
7449    const struct pvr_render_pass *pass = rp_info->pass;
7450    const struct pvr_renderpass_hw_map *current_map;
7451    const struct pvr_renderpass_hw_map *next_map;
7452    struct pvr_load_op *hw_subpass_load_op;
7453    VkResult result;
7454 
7455    PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer);
7456 
7457    current_map = &pass->hw_setup->subpass_map[rp_info->subpass_idx];
7458    next_map = &pass->hw_setup->subpass_map[rp_info->subpass_idx + 1];
7459    next_hw_render = &pass->hw_setup->renders[next_map->render];
7460 
7461    if (current_map->render != next_map->render) {
7462       result = pvr_cmd_buffer_end_sub_cmd(cmd_buffer);
7463       if (result != VK_SUCCESS)
7464          return;
7465 
7466       result = pvr_resolve_unemitted_resolve_attachments(cmd_buffer, rp_info);
7467       if (result != VK_SUCCESS)
7468          return;
7469 
7470       rp_info->current_hw_subpass = next_map->render;
7471 
7472       result =
7473          pvr_cmd_buffer_start_sub_cmd(cmd_buffer, PVR_SUB_CMD_TYPE_GRAPHICS);
7474       if (result != VK_SUCCESS)
7475          return;
7476 
7477       rp_info->enable_bg_tag = false;
7478       rp_info->process_empty_tiles = false;
7479 
7480       /* If this subpass contains any load ops the HW Background Object must be
7481        * run to do the clears/loads.
7482        */
7483       if (next_hw_render->color_init_count > 0) {
7484          rp_info->enable_bg_tag = true;
7485 
7486          for (uint32_t i = 0; i < next_hw_render->color_init_count; i++) {
7487             /* Empty tiles need to be cleared too. */
7488             if (next_hw_render->color_init[i].op ==
7489                 VK_ATTACHMENT_LOAD_OP_CLEAR) {
7490                rp_info->process_empty_tiles = true;
7491                break;
7492             }
7493          }
7494       }
7495 
7496       /* Set isp_userpass to zero for new hw_render. This will be used to set
7497        * ROGUE_CR_ISP_CTL::upass_start.
7498        */
7499       rp_info->isp_userpass = 0;
7500    }
7501 
7502    hw_subpass = &next_hw_render->subpasses[next_map->subpass];
7503    hw_subpass_load_op = hw_subpass->load_op;
7504 
7505    if (hw_subpass_load_op) {
7506       result = pvr_cs_write_load_op(cmd_buffer,
7507                                     &state->current_sub_cmd->gfx,
7508                                     hw_subpass_load_op,
7509                                     rp_info->isp_userpass);
           if (result != VK_SUCCESS)
              return;
7510    }
7511 
7512    /* Pipelines are created for a particular subpass so unbind but leave the
7513     * vertex and descriptor bindings intact as they are orthogonal to the
7514     * subpass.
7515     */
7516    state->gfx_pipeline = NULL;
7517 
7518    /* User-pass spawn is 4 bits so if the driver has to wrap it, it will emit a
7519     * full screen transparent object to flush all tags up until now, then the
7520     * user-pass spawn value will implicitly be reset to 0 because
7521     * pvr_render_subpass::isp_userpass values are stored ANDed with
7522     * ROGUE_CR_ISP_CTL_UPASS_START_SIZE_MAX.
7523     */
7524    /* If hw_subpass_load_op is valid then pvr_write_load_op_control_stream
7525     * has already done a full-screen transparent object.
7526     */
7527    if (rp_info->isp_userpass == ROGUE_CR_ISP_CTL_UPASS_START_SIZE_MAX &&
7528        !hw_subpass_load_op) {
7529       pvr_insert_transparent_obj(cmd_buffer, &state->current_sub_cmd->gfx);
7530    }
7531 
7532    rp_info->subpass_idx++;
7533 
7534    rp_info->isp_userpass = pass->subpasses[rp_info->subpass_idx].isp_userpass;
7535    state->dirty.isp_userpass = true;
7536 
7537    rp_info->pipeline_bind_point =
7538       pass->subpasses[rp_info->subpass_idx].pipeline_bind_point;
7539 
7540    pvr_stash_depth_format(state, &state->current_sub_cmd->gfx);
7541 }
7542 
7543 static bool
7544 pvr_stencil_has_self_dependency(const struct pvr_cmd_buffer_state *const state)
7545 {
7546    const struct pvr_render_subpass *const current_subpass =
7547       pvr_get_current_subpass(state);
7548    const uint32_t *const input_attachments = current_subpass->input_attachments;
7549 
7550    if (current_subpass->depth_stencil_attachment == VK_ATTACHMENT_UNUSED)
7551       return false;
7552 
7553    /* We only need to check the current software subpass as we don't support
7554     * merging to/from a subpass with self-dep stencil.
7555     */
7556 
7557    for (uint32_t i = 0; i < current_subpass->input_count; i++) {
7558       if (input_attachments[i] == current_subpass->depth_stencil_attachment)
7559          return true;
7560    }
7561 
7562    return false;
7563 }
7564 
7565 static bool pvr_is_stencil_store_load_needed(
7566    const struct pvr_cmd_buffer *const cmd_buffer,
7567    VkPipelineStageFlags2 vk_src_stage_mask,
7568    VkPipelineStageFlags2 vk_dst_stage_mask,
7569    uint32_t memory_barrier_count,
7570    const VkMemoryBarrier2 *const memory_barriers,
7571    uint32_t image_barrier_count,
7572    const VkImageMemoryBarrier2 *const image_barriers)
7573 {
7574    const struct pvr_cmd_buffer_state *const state = &cmd_buffer->state;
7575    const uint32_t fragment_test_stages =
7576       VK_PIPELINE_STAGE_EARLY_FRAGMENT_TESTS_BIT |
7577       VK_PIPELINE_STAGE_LATE_FRAGMENT_TESTS_BIT;
7578    const struct pvr_render_pass *const pass = state->render_pass_info.pass;
7579    const struct pvr_renderpass_hwsetup_render *hw_render;
7580    struct pvr_image_view **const attachments =
7581       state->render_pass_info.attachments;
7582    const struct pvr_image_view *attachment;
7583    uint32_t hw_render_idx;
7584 
7585    if (!pass)
7586       return false;
7587 
7588    hw_render_idx = state->current_sub_cmd->gfx.hw_render_idx;
7589    hw_render = &pass->hw_setup->renders[hw_render_idx];
7590 
7591    if (hw_render->ds_attach_idx == VK_ATTACHMENT_UNUSED)
7592       return false;
7593 
7594    if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_PRIMARY) {
7595       attachment = attachments[hw_render->ds_attach_idx];
7596    } else {
7597       assert(!attachments);
7598       attachment = NULL;
7599    }
7600 
7601    if (!(vk_src_stage_mask & fragment_test_stages) &&
7602        vk_dst_stage_mask & VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT)
7603       return false;
7604 
7605    for (uint32_t i = 0; i < memory_barrier_count; i++) {
7606       const uint32_t stencil_write_bit =
7607          VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT;
7608       const uint32_t input_attachment_read_bit =
7609          VK_ACCESS_INPUT_ATTACHMENT_READ_BIT;
7610 
7611       if (!(memory_barriers[i].srcAccessMask & stencil_write_bit))
7612          continue;
7613 
7614       if (!(memory_barriers[i].dstAccessMask & input_attachment_read_bit))
7615          continue;
7616 
7617       return pvr_stencil_has_self_dependency(state);
7618    }
7619 
7620    for (uint32_t i = 0; i < image_barrier_count; i++) {
7621       PVR_FROM_HANDLE(pvr_image, image, image_barriers[i].image);
7622       const uint32_t stencil_bit = VK_IMAGE_ASPECT_STENCIL_BIT;
7623 
7624       if (!(image_barriers[i].subresourceRange.aspectMask & stencil_bit))
7625          continue;
7626 
7627       if (attachment && image != vk_to_pvr_image(attachment->vk.image))
7628          continue;
7629 
7630       if (!vk_format_has_stencil(image->vk.format))
7631          continue;
7632 
7633       return pvr_stencil_has_self_dependency(state);
7634    }
7635 
7636    return false;
7637 }
7638 
7639 static VkResult
7640 pvr_cmd_buffer_insert_mid_frag_barrier_event(struct pvr_cmd_buffer *cmd_buffer,
7641                                              uint32_t src_stage_mask,
7642                                              uint32_t dst_stage_mask)
7643 {
7644    VkResult result;
7645 
7646    assert(cmd_buffer->state.current_sub_cmd->type == PVR_SUB_CMD_TYPE_GRAPHICS);
7647 
7648    cmd_buffer->state.current_sub_cmd->gfx.empty_cmd = false;
7649 
7650    /* Submit graphics job to store stencil. */
7651    cmd_buffer->state.current_sub_cmd->gfx.barrier_store = true;
7652 
7653    pvr_cmd_buffer_end_sub_cmd(cmd_buffer);
7654    result = pvr_cmd_buffer_start_sub_cmd(cmd_buffer, PVR_SUB_CMD_TYPE_EVENT);
7655    if (result != VK_SUCCESS)
7656       return result;
7657 
7658    cmd_buffer->state.current_sub_cmd->event = (struct pvr_sub_cmd_event){
7659       .type = PVR_EVENT_TYPE_BARRIER,
7660       .barrier = {
7661          .wait_for_stage_mask = src_stage_mask,
7662          .wait_at_stage_mask = dst_stage_mask,
7663       },
7664    };
7665 
7666    pvr_cmd_buffer_end_sub_cmd(cmd_buffer);
7667    pvr_cmd_buffer_start_sub_cmd(cmd_buffer, PVR_SUB_CMD_TYPE_GRAPHICS);
7668 
7669    /* Use existing render setup, but load color attachments from HW BGOBJ */
7670    cmd_buffer->state.current_sub_cmd->gfx.barrier_load = true;
7671    cmd_buffer->state.current_sub_cmd->gfx.barrier_store = false;
7672 
7673    return VK_SUCCESS;
7674 }
7675 
7676 static VkResult
7677 pvr_cmd_buffer_insert_barrier_event(struct pvr_cmd_buffer *cmd_buffer,
7678                                     uint32_t src_stage_mask,
7679                                     uint32_t dst_stage_mask)
7680 {
7681    VkResult result;
7682 
7683    result = pvr_cmd_buffer_start_sub_cmd(cmd_buffer, PVR_SUB_CMD_TYPE_EVENT);
7684    if (result != VK_SUCCESS)
7685       return result;
7686 
7687    cmd_buffer->state.current_sub_cmd->event = (struct pvr_sub_cmd_event){
7688       .type = PVR_EVENT_TYPE_BARRIER,
7689       .barrier = {
7690          .wait_for_stage_mask = src_stage_mask,
7691          .wait_at_stage_mask = dst_stage_mask,
7692       },
7693    };
7694 
7695    return pvr_cmd_buffer_end_sub_cmd(cmd_buffer);
7696 }
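
/* Illustrative application-side input (not part of the driver) for the
 * barrier handling below: a compute-to-compute dependency expressed through
 * the synchronization2 structures that pvr_CmdPipelineBarrier2() consumes.
 * The handle `cmd` is hypothetical.
 *
 *    const VkMemoryBarrier2 barrier = {
 *       .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER_2,
 *       .srcStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
 *       .srcAccessMask = VK_ACCESS_2_SHADER_WRITE_BIT,
 *       .dstStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
 *       .dstAccessMask = VK_ACCESS_2_SHADER_READ_BIT,
 *    };
 *    const VkDependencyInfo dep_info = {
 *       .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
 *       .memoryBarrierCount = 1,
 *       .pMemoryBarriers = &barrier,
 *    };
 *
 *    vkCmdPipelineBarrier2(cmd, &dep_info);
 *
 * Assuming pvr_stage_mask_src()/pvr_stage_mask_dst() reduce both masks to
 * PVR_PIPELINE_STAGE_COMPUTE_BIT and earlier compute work left that bit set
 * in state->barriers_needed, this takes the compute-to-compute branch below
 * (IDFWDF kernel plus compute fence) rather than emitting a barrier event
 * sub-command.
 */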
7697 
7698 /* This is just enough to handle vkCmdPipelineBarrier().
7699  * TODO: Complete?
7700  */
7701 void pvr_CmdPipelineBarrier2(VkCommandBuffer commandBuffer,
7702                              const VkDependencyInfo *pDependencyInfo)
7703 {
7704    PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
7705    struct pvr_cmd_buffer_state *const state = &cmd_buffer->state;
7706    const struct pvr_render_pass *const render_pass =
7707       state->render_pass_info.pass;
7708    VkPipelineStageFlags vk_src_stage_mask = 0U;
7709    VkPipelineStageFlags vk_dst_stage_mask = 0U;
7710    bool is_stencil_store_load_needed;
7711    uint32_t required_stage_mask = 0U;
7712    uint32_t src_stage_mask;
7713    uint32_t dst_stage_mask;
7714    bool is_barrier_needed;
7715    VkResult result;
7716 
7717    PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer);
7718 
7719    for (uint32_t i = 0; i < pDependencyInfo->memoryBarrierCount; i++) {
7720       vk_src_stage_mask |= pDependencyInfo->pMemoryBarriers[i].srcStageMask;
7721       vk_dst_stage_mask |= pDependencyInfo->pMemoryBarriers[i].dstStageMask;
7722    }
7723 
7724    for (uint32_t i = 0; i < pDependencyInfo->bufferMemoryBarrierCount; i++) {
7725       vk_src_stage_mask |=
7726          pDependencyInfo->pBufferMemoryBarriers[i].srcStageMask;
7727       vk_dst_stage_mask |=
7728          pDependencyInfo->pBufferMemoryBarriers[i].dstStageMask;
7729    }
7730 
7731    for (uint32_t i = 0; i < pDependencyInfo->imageMemoryBarrierCount; i++) {
7732       vk_src_stage_mask |=
7733          pDependencyInfo->pImageMemoryBarriers[i].srcStageMask;
7734       vk_dst_stage_mask |=
7735          pDependencyInfo->pImageMemoryBarriers[i].dstStageMask;
7736    }
7737 
7738    src_stage_mask = pvr_stage_mask_src(vk_src_stage_mask);
7739    dst_stage_mask = pvr_stage_mask_dst(vk_dst_stage_mask);
7740 
7741    for (uint32_t stage = 0U; stage != PVR_NUM_SYNC_PIPELINE_STAGES; stage++) {
7742       if (!(dst_stage_mask & BITFIELD_BIT(stage)))
7743          continue;
7744 
7745       required_stage_mask |= state->barriers_needed[stage];
7746    }
7747 
7748    src_stage_mask &= required_stage_mask;
7749    for (uint32_t stage = 0U; stage != PVR_NUM_SYNC_PIPELINE_STAGES; stage++) {
7750       if (!(dst_stage_mask & BITFIELD_BIT(stage)))
7751          continue;
7752 
7753       state->barriers_needed[stage] &= ~src_stage_mask;
7754    }
7755 
7756    if (src_stage_mask == 0 || dst_stage_mask == 0) {
7757       is_barrier_needed = false;
7758    } else if (src_stage_mask == PVR_PIPELINE_STAGE_GEOM_BIT &&
7759               dst_stage_mask == PVR_PIPELINE_STAGE_FRAG_BIT) {
7760       /* This is implicit so no need to barrier. */
7761       is_barrier_needed = false;
7762    } else if (src_stage_mask == dst_stage_mask &&
7763               util_bitcount(src_stage_mask) == 1) {
7764       struct pvr_sub_cmd *const current_sub_cmd = state->current_sub_cmd;
7765 
7766       switch (src_stage_mask) {
7767       case PVR_PIPELINE_STAGE_FRAG_BIT:
7768          is_barrier_needed = false;
7769 
7770          if (!render_pass)
7771             break;
7772 
7773          assert(current_sub_cmd->type == PVR_SUB_CMD_TYPE_GRAPHICS);
7774 
7775          /* Flush all fragment work up to this point. */
7776          pvr_insert_transparent_obj(cmd_buffer, &current_sub_cmd->gfx);
7777          break;
7778 
7779       case PVR_PIPELINE_STAGE_COMPUTE_BIT:
7780          is_barrier_needed = false;
7781 
7782          if (!current_sub_cmd ||
7783              current_sub_cmd->type != PVR_SUB_CMD_TYPE_COMPUTE) {
7784             break;
7785          }
7786 
7787          /* Multiple dispatches can be merged into a single job. When back to
7788           * back dispatches have a sequential dependency (Compute -> compute
7789           * pipeline barrier) we need to do the following.
7790           *   - Dispatch a kernel which fences all previous memory writes and
7791           *     flushes the MADD cache.
7792           *   - Issue a compute fence which ensures all previous tasks emitted
7793           *     by the compute data master are completed before starting
7794           *     anything new.
7795           */
7796 
7797          /* Issue Data Fence, Wait for Data Fence (IDFWDF) makes the PDS wait
7798           * for data.
7799           */
7800          pvr_compute_generate_idfwdf(cmd_buffer, &current_sub_cmd->compute);
7801 
7802          pvr_compute_generate_fence(cmd_buffer,
7803                                     &current_sub_cmd->compute,
7804                                     false);
7805          break;
7806 
7807       default:
7808          is_barrier_needed = false;
7809          break;
7810       };
7811    } else {
7812       is_barrier_needed = true;
7813    }
7814 
7815    is_stencil_store_load_needed =
7816       pvr_is_stencil_store_load_needed(cmd_buffer,
7817                                        vk_src_stage_mask,
7818                                        vk_dst_stage_mask,
7819                                        pDependencyInfo->memoryBarrierCount,
7820                                        pDependencyInfo->pMemoryBarriers,
7821                                        pDependencyInfo->imageMemoryBarrierCount,
7822                                        pDependencyInfo->pImageMemoryBarriers);
7823 
7824    if (is_stencil_store_load_needed) {
7825       assert(render_pass);
7826       result = pvr_cmd_buffer_insert_mid_frag_barrier_event(cmd_buffer,
7827                                                             src_stage_mask,
7828                                                             dst_stage_mask);
7829       if (result != VK_SUCCESS)
7830          mesa_loge("Failed to insert mid frag barrier event.");
7831    } else if (is_barrier_needed) {
7832       result = pvr_cmd_buffer_insert_barrier_event(cmd_buffer,
7833                                                    src_stage_mask,
7834                                                    dst_stage_mask);
7835       if (result != VK_SUCCESS)
7836          mesa_loge("Failed to insert pipeline barrier event.");
7837    }
7838 }
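
/* Worked example of the barrier bookkeeping above, under the assumption that
 * an earlier sub-command left the fragment-stage entry of
 * state->barriers_needed equal to
 * PVR_PIPELINE_STAGE_GEOM_BIT | PVR_PIPELINE_STAGE_COMPUTE_BIT: for a barrier
 * with dst_stage_mask = PVR_PIPELINE_STAGE_FRAG_BIT and src_stage_mask =
 * PVR_PIPELINE_STAGE_COMPUTE_BIT, required_stage_mask picks up both bits,
 * src_stage_mask &= required_stage_mask keeps the compute bit, a barrier
 * event sub-command is emitted, and the fragment-stage entry of
 * barriers_needed is reduced to PVR_PIPELINE_STAGE_GEOM_BIT.
 */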
7839 
7840 void pvr_CmdResetEvent2(VkCommandBuffer commandBuffer,
7841                         VkEvent _event,
7842                         VkPipelineStageFlags2 stageMask)
7843 {
7844    PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
7845    PVR_FROM_HANDLE(pvr_event, event, _event);
7846    VkResult result;
7847 
7848    PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer);
7849 
7850    result = pvr_cmd_buffer_start_sub_cmd(cmd_buffer, PVR_SUB_CMD_TYPE_EVENT);
7851    if (result != VK_SUCCESS)
7852       return;
7853 
7854    cmd_buffer->state.current_sub_cmd->event = (struct pvr_sub_cmd_event){
7855       .type = PVR_EVENT_TYPE_RESET,
7856       .set_reset = {
7857          .event = event,
7858          .wait_for_stage_mask = pvr_stage_mask_src(stageMask),
7859       },
7860    };
7861 
7862    pvr_cmd_buffer_end_sub_cmd(cmd_buffer);
7863 }
7864 
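/* vkCmdSetEvent2: recorded as a standalone EVENT sub-command. The srcStageMask
 * bits of every memory, buffer and image barrier in pDependencyInfo are OR-ed
 * together and converted with pvr_stage_mask_dst() before being stored as the
 * stages the set must wait for.
 */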
void pvr_CmdSetEvent2(VkCommandBuffer commandBuffer,
                      VkEvent _event,
                      const VkDependencyInfo *pDependencyInfo)
{
   PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
   PVR_FROM_HANDLE(pvr_event, event, _event);
   VkPipelineStageFlags2 stage_mask = 0;
   VkResult result;

   PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer);

   result = pvr_cmd_buffer_start_sub_cmd(cmd_buffer, PVR_SUB_CMD_TYPE_EVENT);
   if (result != VK_SUCCESS)
      return;

   for (uint32_t i = 0; i < pDependencyInfo->memoryBarrierCount; i++)
      stage_mask |= pDependencyInfo->pMemoryBarriers[i].srcStageMask;

   for (uint32_t i = 0; i < pDependencyInfo->bufferMemoryBarrierCount; i++)
      stage_mask |= pDependencyInfo->pBufferMemoryBarriers[i].srcStageMask;

   for (uint32_t i = 0; i < pDependencyInfo->imageMemoryBarrierCount; i++)
      stage_mask |= pDependencyInfo->pImageMemoryBarriers[i].srcStageMask;

   cmd_buffer->state.current_sub_cmd->event = (struct pvr_sub_cmd_event){
      .type = PVR_EVENT_TYPE_SET,
      .set_reset = {
         .event = event,
         .wait_for_stage_mask = pvr_stage_mask_dst(stage_mask),
      },
   };

   pvr_cmd_buffer_end_sub_cmd(cmd_buffer);
}

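/* vkCmdWaitEvents2: recorded as a single EVENT sub-command covering all
 * eventCount events. The events and a per-event stage mask (the OR of the
 * dstStageMask bits in the matching VkDependencyInfo, converted with
 * pvr_stage_mask_dst()) are copied into a pool allocation referenced by the
 * sub-command.
 */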
void pvr_CmdWaitEvents2(VkCommandBuffer commandBuffer,
                        uint32_t eventCount,
                        const VkEvent *pEvents,
                        const VkDependencyInfo *pDependencyInfos)
{
   PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
   struct pvr_event **events_array;
   uint32_t *stage_masks;
   VkResult result;

   PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer);

   VK_MULTIALLOC(ma);
   vk_multialloc_add(&ma, &events_array, __typeof__(*events_array), eventCount);
   vk_multialloc_add(&ma, &stage_masks, __typeof__(*stage_masks), eventCount);

   if (!vk_multialloc_alloc(&ma,
                            &cmd_buffer->vk.pool->alloc,
                            VK_SYSTEM_ALLOCATION_SCOPE_OBJECT)) {
      vk_command_buffer_set_error(&cmd_buffer->vk, VK_ERROR_OUT_OF_HOST_MEMORY);
      return;
   }

   result = pvr_cmd_buffer_start_sub_cmd(cmd_buffer, PVR_SUB_CMD_TYPE_EVENT);
   if (result != VK_SUCCESS) {
      vk_free(&cmd_buffer->vk.pool->alloc, events_array);
      return;
   }

   memcpy(events_array, pEvents, sizeof(*events_array) * eventCount);

   for (uint32_t i = 0; i < eventCount; i++) {
      const VkDependencyInfo *info = &pDependencyInfos[i];
      VkPipelineStageFlags2 mask = 0;

      for (uint32_t j = 0; j < info->memoryBarrierCount; j++)
         mask |= info->pMemoryBarriers[j].dstStageMask;

      for (uint32_t j = 0; j < info->bufferMemoryBarrierCount; j++)
         mask |= info->pBufferMemoryBarriers[j].dstStageMask;

      for (uint32_t j = 0; j < info->imageMemoryBarrierCount; j++)
         mask |= info->pImageMemoryBarriers[j].dstStageMask;

      stage_masks[i] = pvr_stage_mask_dst(mask);
   }

   cmd_buffer->state.current_sub_cmd->event = (struct pvr_sub_cmd_event){
      .type = PVR_EVENT_TYPE_WAIT,
      .wait = {
         .count = eventCount,
         .events = events_array,
         .wait_at_stage_masks = stage_masks,
      },
   };

   pvr_cmd_buffer_end_sub_cmd(cmd_buffer);
}

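/* Timestamp queries are not supported by this driver, so this entry point is
 * never expected to be reached.
 */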
void pvr_CmdWriteTimestamp2(VkCommandBuffer commandBuffer,
                            VkPipelineStageFlags2 stage,
                            VkQueryPool queryPool,
                            uint32_t query)
{
   unreachable("Timestamp queries are not supported.");
}

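/* vkEndCommandBuffer: if an error has already been recorded on the command
 * buffer it is simply propagated; otherwise the query index scratch storage
 * is released and any open sub-command is ended before handing control back
 * to the common vk_command_buffer code.
 */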
VkResult pvr_EndCommandBuffer(VkCommandBuffer commandBuffer)
{
   PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
   struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
   VkResult result;

   if (vk_command_buffer_has_error(&cmd_buffer->vk))
      return vk_command_buffer_end(&cmd_buffer->vk);

   /* TODO: We should be freeing all the resources allocated for recording
    * here.
    */
   util_dynarray_fini(&state->query_indices);

   result = pvr_cmd_buffer_end_sub_cmd(cmd_buffer);
   if (result != VK_SUCCESS)
      pvr_cmd_buffer_set_error_unwarned(cmd_buffer, result);

   return vk_command_buffer_end(&cmd_buffer->vk);
}