/*
 * Copyright © 2022 Imagination Technologies Ltd.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include <assert.h>
#include <limits.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>
#include <vulkan/vulkan.h>

#include "hwdef/rogue_hw_defs.h"
#include "hwdef/rogue_hw_utils.h"
#include "pvr_bo.h"
#include "pvr_csb.h"
#include "pvr_csb_enum_helpers.h"
#include "pvr_device_info.h"
#include "pvr_end_of_tile.h"
#include "pvr_formats.h"
#include "pvr_hw_pass.h"
#include "pvr_job_common.h"
#include "pvr_job_render.h"
#include "pvr_limits.h"
#include "pvr_pds.h"
#include "pvr_private.h"
#include "pvr_types.h"
#include "pvr_winsys.h"
#include "util/bitscan.h"
#include "util/compiler.h"
#include "util/list.h"
#include "util/macros.h"
#include "util/u_dynarray.h"
#include "util/u_pack_color.h"
#include "vk_alloc.h"
#include "vk_command_buffer.h"
#include "vk_command_pool.h"
#include "vk_format.h"
#include "vk_log.h"
#include "vk_object.h"
#include "vk_util.h"

/* Structure used to pass data into the
 * pvr_compute_generate_control_stream() function.
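 *
 * Note: kernels later in this file fill this in with designated
 * initializers; see pvr_compute_generate_fence() for a representative
 * example.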
 */
struct pvr_compute_kernel_info {
   pvr_dev_addr_t indirect_buffer_addr;
   bool global_offsets_present;
   uint32_t usc_common_size;
   uint32_t usc_unified_size;
   uint32_t pds_temp_size;
   uint32_t pds_data_size;
   enum PVRX(CDMCTRL_USC_TARGET) usc_target;
   bool is_fence;
   uint32_t pds_data_offset;
   uint32_t pds_code_offset;
   enum PVRX(CDMCTRL_SD_TYPE) sd_type;
   bool usc_common_shared;
   uint32_t local_size[PVR_WORKGROUP_DIMENSIONS];
   uint32_t global_size[PVR_WORKGROUP_DIMENSIONS];
   uint32_t max_instances;
};

static void pvr_cmd_buffer_free_sub_cmd(struct pvr_cmd_buffer *cmd_buffer,
                                        struct pvr_sub_cmd *sub_cmd)
{
   switch (sub_cmd->type) {
   case PVR_SUB_CMD_TYPE_GRAPHICS:
      pvr_csb_finish(&sub_cmd->gfx.control_stream);
      pvr_bo_free(cmd_buffer->device, sub_cmd->gfx.depth_bias_bo);
      pvr_bo_free(cmd_buffer->device, sub_cmd->gfx.scissor_bo);
      break;

   case PVR_SUB_CMD_TYPE_COMPUTE:
      pvr_csb_finish(&sub_cmd->compute.control_stream);
      break;

   case PVR_SUB_CMD_TYPE_TRANSFER:
      list_for_each_entry_safe (struct pvr_transfer_cmd,
                                transfer_cmd,
                                &sub_cmd->transfer.transfer_cmds,
                                link) {
         list_del(&transfer_cmd->link);
         vk_free(&cmd_buffer->vk.pool->alloc, transfer_cmd);
      }
      break;

   default:
      pvr_finishme("Unsupported sub-command type %d", sub_cmd->type);
      break;
   }

   list_del(&sub_cmd->link);
   vk_free(&cmd_buffer->vk.pool->alloc, sub_cmd);
}

static void pvr_cmd_buffer_free_sub_cmds(struct pvr_cmd_buffer *cmd_buffer)
{
   list_for_each_entry_safe (struct pvr_sub_cmd,
                             sub_cmd,
                             &cmd_buffer->sub_cmds,
                             link) {
      pvr_cmd_buffer_free_sub_cmd(cmd_buffer, sub_cmd);
   }
}

static void pvr_cmd_buffer_destroy(struct vk_command_buffer *vk_cmd_buffer)
{
   struct pvr_cmd_buffer *cmd_buffer =
      container_of(vk_cmd_buffer, struct pvr_cmd_buffer, vk);

   vk_free(&cmd_buffer->vk.pool->alloc,
           cmd_buffer->state.render_pass_info.attachments);
   vk_free(&cmd_buffer->vk.pool->alloc,
           cmd_buffer->state.render_pass_info.clear_values);

   pvr_cmd_buffer_free_sub_cmds(cmd_buffer);

   list_for_each_entry_safe (struct pvr_bo, bo, &cmd_buffer->bo_list, link) {
      list_del(&bo->link);
      pvr_bo_free(cmd_buffer->device, bo);
   }

   util_dynarray_fini(&cmd_buffer->scissor_array);
   util_dynarray_fini(&cmd_buffer->depth_bias_array);

   vk_command_buffer_finish(&cmd_buffer->vk);
   vk_free(&cmd_buffer->vk.pool->alloc, cmd_buffer);
}

static VkResult pvr_cmd_buffer_create(struct pvr_device *device,
                                      struct vk_command_pool *pool,
                                      VkCommandBufferLevel level,
                                      VkCommandBuffer *pCommandBuffer)
{
   struct pvr_cmd_buffer *cmd_buffer;
   VkResult result;

   cmd_buffer = vk_zalloc(&pool->alloc,
                          sizeof(*cmd_buffer),
                          8U,
                          VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
   if (!cmd_buffer)
      return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);

   result = vk_command_buffer_init(&cmd_buffer->vk, pool, level);
   if (result != VK_SUCCESS) {
      vk_free(&pool->alloc, cmd_buffer);
      return result;
   }

   cmd_buffer->vk.destroy = pvr_cmd_buffer_destroy;
   cmd_buffer->device = device;

   util_dynarray_init(&cmd_buffer->depth_bias_array, NULL);
   util_dynarray_init(&cmd_buffer->scissor_array, NULL);

   cmd_buffer->state.status = VK_SUCCESS;
   cmd_buffer->status = PVR_CMD_BUFFER_STATUS_INITIAL;

   list_inithead(&cmd_buffer->sub_cmds);
   list_inithead(&cmd_buffer->bo_list);

   *pCommandBuffer = pvr_cmd_buffer_to_handle(cmd_buffer);

   return VK_SUCCESS;
}

VkResult
pvr_AllocateCommandBuffers(VkDevice _device,
                           const VkCommandBufferAllocateInfo *pAllocateInfo,
                           VkCommandBuffer *pCommandBuffers)
{
   VK_FROM_HANDLE(vk_command_pool, pool, pAllocateInfo->commandPool);
   PVR_FROM_HANDLE(pvr_device, device, _device);
   VkResult result = VK_SUCCESS;
   uint32_t i;

   for (i = 0; i < pAllocateInfo->commandBufferCount; i++) {
      result = pvr_cmd_buffer_create(device,
                                     pool,
                                     pAllocateInfo->level,
                                     &pCommandBuffers[i]);
      if (result != VK_SUCCESS)
         break;
   }

   if (result != VK_SUCCESS) {
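      /* As required by the Vulkan spec, on failure free any command buffers
       * we did create and set all entries of pCommandBuffers to
       * VK_NULL_HANDLE.
       */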
      while (i--) {
         VK_FROM_HANDLE(vk_command_buffer, cmd_buffer, pCommandBuffers[i]);
         pvr_cmd_buffer_destroy(cmd_buffer);
      }

      for (i = 0; i < pAllocateInfo->commandBufferCount; i++)
         pCommandBuffers[i] = VK_NULL_HANDLE;
   }

   return result;
}

static void pvr_cmd_buffer_update_barriers(struct pvr_cmd_buffer *cmd_buffer,
                                           enum pvr_sub_cmd_type type)
{
   struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
   uint32_t barriers;

   switch (type) {
   case PVR_SUB_CMD_TYPE_GRAPHICS:
      barriers = PVR_PIPELINE_STAGE_GEOM_BIT | PVR_PIPELINE_STAGE_FRAG_BIT;
      break;

   case PVR_SUB_CMD_TYPE_COMPUTE:
      barriers = PVR_PIPELINE_STAGE_COMPUTE_BIT;
      break;

   case PVR_SUB_CMD_TYPE_TRANSFER:
      barriers = PVR_PIPELINE_STAGE_TRANSFER_BIT;
      break;

   default:
      barriers = 0;
      pvr_finishme("Unsupported sub-command type %d", type);
      break;
   }

   for (uint32_t i = 0; i < ARRAY_SIZE(state->barriers_needed); i++)
      state->barriers_needed[i] |= barriers;
}

static VkResult
pvr_cmd_buffer_upload_tables(struct pvr_device *device,
                             struct pvr_cmd_buffer *cmd_buffer,
                             struct pvr_sub_cmd_gfx *const sub_cmd)
{
   const uint32_t cache_line_size =
      rogue_get_slc_cache_line_size(&device->pdevice->dev_info);
   VkResult result;

   assert(!sub_cmd->depth_bias_bo && !sub_cmd->scissor_bo);

   if (cmd_buffer->depth_bias_array.size > 0) {
      result =
         pvr_gpu_upload(device,
                        device->heaps.general_heap,
                        util_dynarray_begin(&cmd_buffer->depth_bias_array),
                        cmd_buffer->depth_bias_array.size,
                        cache_line_size,
                        &sub_cmd->depth_bias_bo);
      if (result != VK_SUCCESS)
         return result;
   }

   if (cmd_buffer->scissor_array.size > 0) {
      result = pvr_gpu_upload(device,
                              device->heaps.general_heap,
                              util_dynarray_begin(&cmd_buffer->scissor_array),
                              cmd_buffer->scissor_array.size,
                              cache_line_size,
                              &sub_cmd->scissor_bo);
      if (result != VK_SUCCESS)
         goto err_free_depth_bias_bo;
   }

   util_dynarray_clear(&cmd_buffer->depth_bias_array);
   util_dynarray_clear(&cmd_buffer->scissor_array);

   return VK_SUCCESS;

err_free_depth_bias_bo:
   pvr_bo_free(device, sub_cmd->depth_bias_bo);
   sub_cmd->depth_bias_bo = NULL;

   return result;
}

static VkResult
pvr_cmd_buffer_emit_ppp_state(struct pvr_cmd_buffer *cmd_buffer,
                              struct pvr_sub_cmd_gfx *const sub_cmd)
{
   struct pvr_framebuffer *framebuffer =
      cmd_buffer->state.render_pass_info.framebuffer;

   pvr_csb_emit (&sub_cmd->control_stream, VDMCTRL_PPP_STATE0, state0) {
      state0.addrmsb = framebuffer->ppp_state_bo->vma->dev_addr;
      state0.word_count = framebuffer->ppp_state_size;
   }

   pvr_csb_emit (&sub_cmd->control_stream, VDMCTRL_PPP_STATE1, state1) {
      state1.addrlsb = framebuffer->ppp_state_bo->vma->dev_addr;
   }

   return VK_SUCCESS;
}

static VkResult
pvr_cmd_buffer_upload_general(struct pvr_cmd_buffer *const cmd_buffer,
                              const void *const data,
                              const size_t size,
                              struct pvr_bo **const pvr_bo_out)
{
   struct pvr_device *const device = cmd_buffer->device;
   const uint32_t cache_line_size =
      rogue_get_slc_cache_line_size(&device->pdevice->dev_info);
   struct pvr_bo *pvr_bo;
   VkResult result;

   result = pvr_gpu_upload(device,
                           device->heaps.general_heap,
                           data,
                           size,
                           cache_line_size,
                           &pvr_bo);
   if (result != VK_SUCCESS)
      return result;

   list_add(&pvr_bo->link, &cmd_buffer->bo_list);

   *pvr_bo_out = pvr_bo;

   return VK_SUCCESS;
}

static VkResult
pvr_cmd_buffer_upload_usc(struct pvr_cmd_buffer *const cmd_buffer,
                          const void *const code,
                          const size_t code_size,
                          uint64_t code_alignment,
                          struct pvr_bo **const pvr_bo_out)
{
   struct pvr_device *const device = cmd_buffer->device;
   const uint32_t cache_line_size =
      rogue_get_slc_cache_line_size(&device->pdevice->dev_info);
   struct pvr_bo *pvr_bo;
   VkResult result;

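   /* Align the USC code to at least the SLC cache line size. */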
   code_alignment = MAX2(code_alignment, cache_line_size);

   result =
      pvr_gpu_upload_usc(device, code, code_size, code_alignment, &pvr_bo);
   if (result != VK_SUCCESS)
      return result;

   list_add(&pvr_bo->link, &cmd_buffer->bo_list);

   *pvr_bo_out = pvr_bo;

   return VK_SUCCESS;
}

static VkResult
pvr_cmd_buffer_upload_pds(struct pvr_cmd_buffer *const cmd_buffer,
                          const uint32_t *data,
                          uint32_t data_size_dwords,
                          uint32_t data_alignment,
                          const uint32_t *code,
                          uint32_t code_size_dwords,
                          uint32_t code_alignment,
                          uint64_t min_alignment,
                          struct pvr_pds_upload *const pds_upload_out)
{
   struct pvr_device *const device = cmd_buffer->device;
   VkResult result;

   result = pvr_gpu_upload_pds(device,
                               data,
                               data_size_dwords,
                               data_alignment,
                               code,
                               code_size_dwords,
                               code_alignment,
                               min_alignment,
                               pds_upload_out);
   if (result != VK_SUCCESS)
      return result;

   list_add(&pds_upload_out->pvr_bo->link, &cmd_buffer->bo_list);

   return VK_SUCCESS;
}

static inline VkResult
pvr_cmd_buffer_upload_pds_data(struct pvr_cmd_buffer *const cmd_buffer,
                               const uint32_t *data,
                               uint32_t data_size_dwords,
                               uint32_t data_alignment,
                               struct pvr_pds_upload *const pds_upload_out)
{
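   /* Data-only PDS upload: no code segment, so pass NULL/0 for the code
    * arguments and reuse the data alignment as the minimum alignment.
    */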
   return pvr_cmd_buffer_upload_pds(cmd_buffer,
                                    data,
                                    data_size_dwords,
                                    data_alignment,
                                    NULL,
                                    0,
                                    0,
                                    data_alignment,
                                    pds_upload_out);
}

static VkResult pvr_sub_cmd_gfx_per_job_fragment_programs_create_and_upload(
   struct pvr_cmd_buffer *const cmd_buffer,
   const uint32_t pbe_cs_words[static const ROGUE_NUM_PBESTATE_STATE_WORDS],
   struct pvr_pds_upload *const pds_upload_out)
{
   struct pvr_pds_event_program pixel_event_program = {
      /* No data to DMA, just a DOUTU needed. */
      .num_emit_word_pairs = 0,
   };
   const uint32_t staging_buffer_size =
      cmd_buffer->device->pixel_event_data_size_in_dwords * sizeof(uint32_t);
   const VkAllocationCallbacks *const allocator = &cmd_buffer->vk.pool->alloc;
   struct pvr_device *const device = cmd_buffer->device;
   /* FIXME: This should come from the compiler for the USC pixel program. */
   const uint32_t usc_temp_count = 0;
   struct pvr_bo *usc_eot_program;
   uint8_t *usc_eot_program_ptr;
   uint32_t *staging_buffer;
   VkResult result;

   result = pvr_cmd_buffer_upload_usc(cmd_buffer,
                                      pvr_end_of_tile_program,
                                      sizeof(pvr_end_of_tile_program),
                                      4,
                                      &usc_eot_program);
   if (result != VK_SUCCESS)
      return result;

   assert((pbe_cs_words[1] & 0x3F) == 0x20);

   /* FIXME: Stop patching the framebuffer address (this will require the
    * end-of-tile program to be generated at run-time).
    */
   pvr_bo_cpu_map(device, usc_eot_program);
   usc_eot_program_ptr = usc_eot_program->bo->map;
   usc_eot_program_ptr[6] = (pbe_cs_words[0] >> 0) & 0xFF;
   usc_eot_program_ptr[7] = (pbe_cs_words[0] >> 8) & 0xFF;
   usc_eot_program_ptr[8] = (pbe_cs_words[0] >> 16) & 0xFF;
   usc_eot_program_ptr[9] = (pbe_cs_words[0] >> 24) & 0xFF;
   pvr_bo_cpu_unmap(device, usc_eot_program);

   pvr_pds_setup_doutu(&pixel_event_program.task_control,
                       usc_eot_program->vma->dev_addr.addr,
                       usc_temp_count,
                       PVRX(PDSINST_DOUTU_SAMPLE_RATE_INSTANCE),
                       false);

   /* TODO: We could skip allocating this and generate directly into the device
    * buffer, thus removing one allocation and memcpy() per job. Would this
    * speed things up in a noticeable way?
    */
   staging_buffer = vk_alloc(allocator,
                             staging_buffer_size,
                             8,
                             VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
   if (!staging_buffer) {
      result = vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
      goto err_free_usc_pixel_program;
   }

   /* Generate the data segment. The code segment was uploaded earlier when
    * setting up the PDS static heap data.
    */
   pvr_pds_generate_pixel_event_data_segment(&pixel_event_program,
                                             staging_buffer,
                                             &device->pdevice->dev_info);

   result = pvr_cmd_buffer_upload_pds_data(
      cmd_buffer,
      staging_buffer,
      cmd_buffer->device->pixel_event_data_size_in_dwords,
      4,
      pds_upload_out);
   if (result != VK_SUCCESS)
      goto err_free_pixel_event_staging_buffer;

   vk_free(allocator, staging_buffer);

   return VK_SUCCESS;

err_free_pixel_event_staging_buffer:
   vk_free(allocator, staging_buffer);

err_free_usc_pixel_program:
   list_del(&usc_eot_program->link);
   pvr_bo_free(device, usc_eot_program);

   return result;
}

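/* Pack a clear colour into the raw 32-bit value the hardware expects.
 * Only VK_FORMAT_B8G8R8A8_UNORM is supported so far.
 */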
static uint32_t pvr_get_hw_clear_color(VkFormat vk_format,
                                       const VkClearValue *clear_value)
{
   union util_color uc = { .ui = 0 };

   switch (vk_format) {
   case VK_FORMAT_B8G8R8A8_UNORM:
      util_pack_color(clear_value->color.float32,
                      PIPE_FORMAT_R8G8B8A8_UNORM,
                      &uc);
      break;

   default:
      assert(!"Unsupported format");
      uc.ui[0] = 0;
      break;
   }

   return uc.ui[0];
}

static VkResult
pvr_load_op_constants_create_and_upload(struct pvr_cmd_buffer *cmd_buffer,
                                        uint32_t idx,
                                        pvr_dev_addr_t *const addr_out)
{
   const struct pvr_render_pass_info *render_pass_info =
      &cmd_buffer->state.render_pass_info;
   const struct pvr_render_pass *pass = render_pass_info->pass;
   const struct pvr_renderpass_hwsetup_render *hw_render =
      &pass->hw_setup->renders[idx];
   ASSERTED const struct pvr_load_op *load_op = hw_render->client_data;
   const struct pvr_renderpass_colorinit *color_init =
      &hw_render->color_init[0];
   const struct pvr_render_pass_attachment *attachment =
      &pass->attachments[color_init->driver_id];
   const VkClearValue *clear_value =
      &render_pass_info->clear_values[color_init->driver_id];
   uint32_t hw_clear_value;
   struct pvr_bo *clear_bo;
   VkResult result;

   pvr_finishme("Add missing load op data support");

   assert(load_op->is_hw_object);
   assert(hw_render->color_init_count == 1);

   /* FIXME: add support for RENDERPASS_SURFACE_INITOP_LOAD. */
   assert(color_init->op == RENDERPASS_SURFACE_INITOP_CLEAR);

   /* FIXME: do this at the point we store the clear values? */
   hw_clear_value = pvr_get_hw_clear_color(attachment->vk_format, clear_value);

   result = pvr_cmd_buffer_upload_general(cmd_buffer,
                                          &hw_clear_value,
                                          sizeof(hw_clear_value),
                                          &clear_bo);
   if (result != VK_SUCCESS)
      return result;

   *addr_out = clear_bo->vma->dev_addr;

   return VK_SUCCESS;
}

static VkResult pvr_load_op_pds_data_create_and_upload(
   struct pvr_cmd_buffer *cmd_buffer,
   uint32_t idx,
   pvr_dev_addr_t constants_addr,
   struct pvr_pds_upload *const pds_upload_out)
{
   const struct pvr_render_pass_info *render_pass_info =
      &cmd_buffer->state.render_pass_info;
   const struct pvr_load_op *load_op =
      render_pass_info->pass->hw_setup->renders[idx].client_data;
   struct pvr_device *device = cmd_buffer->device;
   const struct pvr_device_info *dev_info = &device->pdevice->dev_info;
   struct pvr_pds_pixel_shader_sa_program program = { 0 };
   uint32_t staging_buffer_size;
   uint32_t *staging_buffer;
   VkResult result;

   program.num_texture_dma_kicks = 1;

   pvr_csb_pack (&program.texture_dma_address[0],
                 PDSINST_DOUT_FIELDS_DOUTD_SRC0,
                 value) {
      value.sbase = constants_addr;
   }

   pvr_csb_pack (&program.texture_dma_control[0],
                 PDSINST_DOUT_FIELDS_DOUTD_SRC1,
                 value) {
      value.dest = PVRX(PDSINST_DOUTD_DEST_COMMON_STORE);
      value.a0 = load_op->shareds_dest_offset;
      value.bsize = load_op->shareds_count;
   }

   pvr_pds_set_sizes_pixel_shader_sa_texture_data(&program, dev_info);

   staging_buffer_size = program.data_size * sizeof(*staging_buffer);

   staging_buffer = vk_alloc(&cmd_buffer->vk.pool->alloc,
                             staging_buffer_size,
                             8,
                             VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
   if (!staging_buffer)
      return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);

   pvr_pds_generate_pixel_shader_sa_texture_state_data(&program,
                                                       staging_buffer,
                                                       dev_info);

   result = pvr_cmd_buffer_upload_pds_data(cmd_buffer,
                                           staging_buffer,
                                           program.data_size,
                                           1,
                                           pds_upload_out);
   if (result != VK_SUCCESS) {
      vk_free(&cmd_buffer->vk.pool->alloc, staging_buffer);
      return result;
   }

   vk_free(&cmd_buffer->vk.pool->alloc, staging_buffer);

   return VK_SUCCESS;
}

/* FIXME: Should this function be specific to the HW background object, in
 * which case its name should be changed, or should it have the load op
 * structure passed in?
 */
static VkResult
pvr_load_op_data_create_and_upload(struct pvr_cmd_buffer *cmd_buffer,
                                   uint32_t idx,
                                   struct pvr_pds_upload *const pds_upload_out)
{
   pvr_dev_addr_t constants_addr;
   VkResult result;

   result =
      pvr_load_op_constants_create_and_upload(cmd_buffer, idx, &constants_addr);
   if (result != VK_SUCCESS)
      return result;

   return pvr_load_op_pds_data_create_and_upload(cmd_buffer,
                                                 idx,
                                                 constants_addr,
                                                 pds_upload_out);
}

static void pvr_pds_bgnd_pack_state(
   const struct pvr_load_op *load_op,
   const struct pvr_pds_upload *load_op_program,
   uint64_t pds_reg_values[static const ROGUE_NUM_CR_PDS_BGRND_WORDS])
{
   pvr_csb_pack (&pds_reg_values[0], CR_PDS_BGRND0_BASE, value) {
      value.shader_addr = PVR_DEV_ADDR(load_op->pds_frag_prog.data_offset);
      value.texunicode_addr =
         PVR_DEV_ADDR(load_op->pds_tex_state_prog.code_offset);
   }

   pvr_csb_pack (&pds_reg_values[1], CR_PDS_BGRND1_BASE, value) {
      value.texturedata_addr = PVR_DEV_ADDR(load_op_program->data_offset);
   }

   pvr_csb_pack (&pds_reg_values[2], CR_PDS_BGRND3_SIZEINFO, value) {
      value.usc_sharedsize =
         DIV_ROUND_UP(load_op->const_shareds_count,
                      PVRX(CR_PDS_BGRND3_SIZEINFO_USC_SHAREDSIZE_UNIT_SIZE));
      value.pds_texturestatesize = DIV_ROUND_UP(
         load_op_program->data_size,
         PVRX(CR_PDS_BGRND3_SIZEINFO_PDS_TEXTURESTATESIZE_UNIT_SIZE));
      value.pds_tempsize =
         DIV_ROUND_UP(load_op->temps_count,
                      PVRX(CR_PDS_BGRND3_SIZEINFO_PDS_TEMPSIZE_UNIT_SIZE));
   }
}

/**
 * \brief Calculates the stride in pixels based on the pitch in bytes and pixel
 * format.
 *
 * \param[in] pitch     Width pitch in bytes.
 * \param[in] vk_format Vulkan image format.
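 *
 * For example, VK_FORMAT_B8G8R8A8_UNORM has a 4-byte block size, so a pitch
 * of 4096 bytes corresponds to a stride of 1024 pixels.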
 * \return Stride in pixels.
 */
static inline uint32_t pvr_stride_from_pitch(uint32_t pitch, VkFormat vk_format)
{
   const unsigned int cpp = vk_format_get_blocksize(vk_format);

   assert(pitch % cpp == 0);

   return pitch / cpp;
}

static void pvr_setup_pbe_state(
   const struct pvr_device_info *dev_info,
   struct pvr_framebuffer *framebuffer,
   uint32_t mrt_index,
   const struct usc_mrt_resource *mrt_resource,
   const struct pvr_image_view *const iview,
   const VkRect2D *render_area,
   const bool down_scale,
   const uint32_t samples,
   uint32_t pbe_cs_words[static const ROGUE_NUM_PBESTATE_STATE_WORDS],
   uint64_t pbe_reg_words[static const ROGUE_NUM_PBESTATE_REG_WORDS])
{
   const struct pvr_image *image = iview->image;
   uint32_t level_pitch = image->mip_levels[iview->vk.base_mip_level].pitch;

   struct pvr_pbe_surf_params surface_params;
   struct pvr_pbe_render_params render_params;
   bool with_packed_usc_channel;
   const uint8_t *swizzle;
   uint32_t position;

   /* down_scale should be true when performing a resolve, in which case there
    * should be more than one sample.
    */
   assert((down_scale && samples > 1U) || (!down_scale && samples == 1U));

   /* Setup surface parameters. */

   if (PVR_HAS_FEATURE(dev_info, usc_f16sop_u8)) {
      switch (iview->vk.format) {
      case VK_FORMAT_B8G8R8A8_UNORM:
         with_packed_usc_channel = true;
         break;
      case VK_FORMAT_D32_SFLOAT:
         with_packed_usc_channel = false;
         break;
      default:
         unreachable("Unsupported Vulkan image format");
      }
   } else {
      with_packed_usc_channel = false;
   }

   swizzle = pvr_get_format_swizzle(iview->vk.format);
   memcpy(surface_params.swizzle, swizzle, sizeof(surface_params.swizzle));

   pvr_pbe_get_src_format_and_gamma(iview->vk.format,
                                    PVR_PBE_GAMMA_NONE,
                                    with_packed_usc_channel,
                                    &surface_params.source_format,
                                    &surface_params.gamma);

   surface_params.is_normalized = vk_format_is_normalized(iview->vk.format);
   surface_params.pbe_packmode = pvr_get_pbe_packmode(iview->vk.format);
   surface_params.nr_components = vk_format_get_nr_components(iview->vk.format);

   /* FIXME: Should we have an inline function to return the address of a mip
    * level?
    */
   surface_params.addr =
      PVR_DEV_ADDR_OFFSET(image->vma->dev_addr,
                          image->mip_levels[iview->vk.base_mip_level].offset);

   surface_params.mem_layout = image->memlayout;
   surface_params.stride = pvr_stride_from_pitch(level_pitch, iview->vk.format);
   surface_params.depth = iview->vk.extent.depth;
   surface_params.width = iview->vk.extent.width;
   surface_params.height = iview->vk.extent.height;
   surface_params.z_only_render = false;
   surface_params.down_scale = down_scale;
   surface_params.msaa_mode = samples;

   /* Setup render parameters. */

   if (mrt_resource->type == USC_MRT_RESOURCE_TYPE_MEMORY) {
      position = mrt_resource->u.mem.offset_in_dwords;
   } else {
      assert(mrt_resource->type == USC_MRT_RESOURCE_TYPE_OUTPUT_REGISTER);
      assert(mrt_resource->u.reg.offset == 0);

      position = mrt_resource->u.reg.out_reg;
   }

   assert(position <= 3 || PVR_HAS_FEATURE(dev_info, eight_output_registers));
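
   /* Each output register is 32 bits wide, so the position selects the bit
    * offset the PBE starts sourcing from; registers 4-7 (only present with
    * the eight_output_registers feature) alias the same start positions as
    * registers 0-3.
    */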
   switch (position) {
   case 0:
   case 4:
      render_params.source_start = PVR_PBE_STARTPOS_BIT0;
      break;
   case 1:
   case 5:
      render_params.source_start = PVR_PBE_STARTPOS_BIT32;
      break;
   case 2:
   case 6:
      render_params.source_start = PVR_PBE_STARTPOS_BIT64;
      break;
   case 3:
   case 7:
      render_params.source_start = PVR_PBE_STARTPOS_BIT96;
      break;
   default:
      assert(!"Invalid output register");
      break;
   }

   render_params.min_x_clip = MAX2(0, render_area->offset.x);
   render_params.min_y_clip = MAX2(0, render_area->offset.y);
   render_params.max_x_clip =
      MIN2(framebuffer->width,
           render_area->offset.x + render_area->extent.width) -
      1;
   render_params.max_y_clip =
      MIN2(framebuffer->height,
           render_area->offset.y + render_area->extent.height) -
      1;

   render_params.slice = 0;
   render_params.mrt_index = mrt_index;

   pvr_pbe_pack_state(dev_info,
                      &surface_params,
                      &render_params,
                      pbe_cs_words,
                      pbe_reg_words);
}

static struct pvr_render_target *
pvr_get_render_target(const struct pvr_render_pass *pass,
                      const struct pvr_framebuffer *framebuffer,
                      uint32_t idx)
{
   const struct pvr_renderpass_hwsetup_render *hw_render =
      &pass->hw_setup->renders[idx];
   uint32_t rt_idx = 0;

   switch (hw_render->sample_count) {
   case 1:
   case 2:
   case 4:
   case 8:
      rt_idx = util_logbase2(hw_render->sample_count);
      break;

   default:
      unreachable("Unsupported sample count");
      break;
   }

   return &framebuffer->render_targets[rt_idx];
}

static uint32_t
pvr_pass_get_pixel_output_width(const struct pvr_render_pass *pass,
                                uint32_t idx,
                                const struct pvr_device_info *dev_info)
{
   const struct pvr_renderpass_hwsetup_render *hw_render =
      &pass->hw_setup->renders[idx];
   /* Default value based on the maximum value found in all existing cores. The
    * maximum is used as this is being treated as a lower bound, making it a
    * "safer" choice than the minimum value found in all existing cores.
    */
   const uint32_t min_output_regs =
      PVR_GET_FEATURE_VALUE(dev_info, usc_min_output_registers_per_pix, 2U);
   const uint32_t width = MAX2(hw_render->output_regs_count, min_output_regs);
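
   /* E.g. three output registers with a minimum of two rounds up to a pixel
    * output width of four.
    */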
   return util_next_power_of_two(width);
}

static VkResult pvr_sub_cmd_gfx_job_init(const struct pvr_device_info *dev_info,
                                         struct pvr_cmd_buffer *cmd_buffer,
                                         struct pvr_sub_cmd_gfx *sub_cmd)
{
   struct pvr_render_pass_info *render_pass_info =
      &cmd_buffer->state.render_pass_info;
   const struct pvr_renderpass_hwsetup_render *hw_render =
      &render_pass_info->pass->hw_setup->renders[sub_cmd->hw_render_idx];
   struct pvr_render_job *job = &sub_cmd->job;
   struct pvr_pds_upload pds_pixel_event_program;

   uint32_t pbe_cs_words[PVR_MAX_COLOR_ATTACHMENTS]
                        [ROGUE_NUM_PBESTATE_STATE_WORDS];
   struct pvr_render_target *render_target;
   VkResult result;

   assert(hw_render->eot_surface_count < ARRAY_SIZE(pbe_cs_words));

   for (uint32_t i = 0; i < hw_render->eot_surface_count; i++) {
      const struct pvr_renderpass_hwsetup_eot_surface *surface =
         &hw_render->eot_surfaces[i];
      const struct pvr_image_view *iview =
         render_pass_info->attachments[surface->attachment_index];
      const struct usc_mrt_resource *mrt_resource =
         &hw_render->eot_setup.mrt_resources[surface->mrt_index];
      uint32_t samples = 1;

      if (surface->need_resolve)
         pvr_finishme("Set up job resolve information.");

      pvr_setup_pbe_state(dev_info,
                          render_pass_info->framebuffer,
                          surface->mrt_index,
                          mrt_resource,
                          iview,
                          &render_pass_info->render_area,
                          surface->need_resolve,
                          samples,
                          pbe_cs_words[i],
                          job->pbe_reg_words[i]);
   }

   /* FIXME: The fragment program only supports a single surface at present. */
   assert(hw_render->eot_surface_count == 1);
   result = pvr_sub_cmd_gfx_per_job_fragment_programs_create_and_upload(
      cmd_buffer,
      pbe_cs_words[0],
      &pds_pixel_event_program);
   if (result != VK_SUCCESS)
      return result;

   job->pds_pixel_event_data_offset = pds_pixel_event_program.data_offset;

   /* FIXME: Don't do this if there is a barrier load. */
   if (render_pass_info->enable_bg_tag) {
      const struct pvr_load_op *load_op = hw_render->client_data;
      struct pvr_pds_upload load_op_program;

      /* FIXME: Should we free the PDS pixel event data or let it be freed
       * when the pool gets emptied?
       */
      result = pvr_load_op_data_create_and_upload(cmd_buffer,
                                                  sub_cmd->hw_render_idx,
                                                  &load_op_program);
      if (result != VK_SUCCESS)
         return result;

      pvr_pds_bgnd_pack_state(load_op,
                              &load_op_program,
                              job->pds_bgnd_reg_values);
   }

   job->enable_bg_tag = render_pass_info->enable_bg_tag;
   job->process_empty_tiles = render_pass_info->process_empty_tiles;

   render_target = pvr_get_render_target(render_pass_info->pass,
                                         render_pass_info->framebuffer,
                                         sub_cmd->hw_render_idx);
   job->rt_dataset = render_target->rt_dataset;

   job->ctrl_stream_addr = pvr_csb_get_start_address(&sub_cmd->control_stream);

   /* FIXME: Need to set up the border color table at device creation
    * time. Set to invalid for the time being.
    */
   job->border_colour_table_addr = PVR_DEV_ADDR_INVALID;

   if (sub_cmd->depth_bias_bo)
      job->depth_bias_table_addr = sub_cmd->depth_bias_bo->vma->dev_addr;
   else
      job->depth_bias_table_addr = PVR_DEV_ADDR_INVALID;

   if (sub_cmd->scissor_bo)
      job->scissor_table_addr = sub_cmd->scissor_bo->vma->dev_addr;
   else
      job->scissor_table_addr = PVR_DEV_ADDR_INVALID;

   job->pixel_output_width =
      pvr_pass_get_pixel_output_width(render_pass_info->pass,
                                      sub_cmd->hw_render_idx,
                                      dev_info);

   if (hw_render->ds_surface_id != -1) {
      struct pvr_image_view *iview =
         render_pass_info->attachments[hw_render->ds_surface_id];
      const struct pvr_image *image = iview->image;

      if (vk_format_has_depth(image->vk.format)) {
         uint32_t level_pitch =
            image->mip_levels[iview->vk.base_mip_level].pitch;

         /* FIXME: Is this sufficient for depth buffers? */
         job->depth_addr = image->dev_addr;

         job->depth_stride =
            pvr_stride_from_pitch(level_pitch, iview->vk.format);
         job->depth_height = iview->vk.extent.height;
         job->depth_physical_width =
            u_minify(image->physical_extent.width, iview->vk.base_mip_level);
         job->depth_physical_height =
            u_minify(image->physical_extent.height, iview->vk.base_mip_level);
         job->depth_layer_size = image->layer_size;

         if (hw_render->ds_surface_id < render_pass_info->clear_value_count) {
            VkClearValue *clear_values =
               &render_pass_info->clear_values[hw_render->ds_surface_id];

            job->depth_clear_value = clear_values->depthStencil.depth;
         } else {
            job->depth_clear_value = 1.0f;
         }

         job->depth_vk_format = iview->vk.format;

         job->depth_memlayout = image->memlayout;
      } else {
         job->depth_addr = PVR_DEV_ADDR_INVALID;
         job->depth_stride = 0;
         job->depth_height = 0;
         job->depth_physical_width = 0;
         job->depth_physical_height = 0;
         job->depth_layer_size = 0;
         job->depth_clear_value = 1.0f;
         job->depth_vk_format = VK_FORMAT_UNDEFINED;
         job->depth_memlayout = PVR_MEMLAYOUT_LINEAR;
      }

      if (vk_format_has_stencil(image->vk.format)) {
         /* FIXME: Is this sufficient for stencil buffers? */
         job->stencil_addr = image->dev_addr;
      } else {
         job->stencil_addr = PVR_DEV_ADDR_INVALID;
      }

      job->samples = image->vk.samples;
   } else {
      pvr_finishme("Set up correct number of samples for render job");

      job->depth_addr = PVR_DEV_ADDR_INVALID;
      job->depth_stride = 0;
      job->depth_height = 0;
      job->depth_physical_width = 0;
      job->depth_physical_height = 0;
      job->depth_layer_size = 0;
      job->depth_clear_value = 1.0f;
      job->depth_vk_format = VK_FORMAT_UNDEFINED;
      job->depth_memlayout = PVR_MEMLAYOUT_LINEAR;

      job->stencil_addr = PVR_DEV_ADDR_INVALID;

      job->samples = 1;
   }

   if (sub_cmd->max_tiles_in_flight ==
       PVR_GET_FEATURE_VALUE(dev_info, isp_max_tiles_in_flight, 1U)) {
      /* Use the default limit based on the partition store. */
      job->max_tiles_in_flight = 0U;
   } else {
      job->max_tiles_in_flight = sub_cmd->max_tiles_in_flight;
   }

   job->frag_uses_atomic_ops = sub_cmd->frag_uses_atomic_ops;
   job->disable_compute_overlap = false;
   job->max_shared_registers = cmd_buffer->state.max_shared_regs;
   job->run_frag = true;
   job->geometry_terminate = true;

   return VK_SUCCESS;
}

/* Number of shareds used in the Issue Data Fence (IDF)/Wait Data Fence (WDF)
 * kernel.
 */
#define PVR_IDF_WDF_IN_REGISTER_CONST_COUNT 12U

static void
pvr_sub_cmd_compute_job_init(const struct pvr_physical_device *pdevice,
                             struct pvr_cmd_buffer *cmd_buffer,
                             struct pvr_sub_cmd_compute *sub_cmd)
{
   const struct pvr_device_runtime_info *dev_runtime_info =
      &pdevice->dev_runtime_info;
   const struct pvr_device_info *dev_info = &pdevice->dev_info;

   if (sub_cmd->uses_barrier)
      sub_cmd->submit_info.flags |= PVR_WINSYS_COMPUTE_FLAG_PREVENT_ALL_OVERLAP;

   pvr_csb_pack (&sub_cmd->submit_info.regs.cdm_ctrl_stream_base,
                 CR_CDM_CTRL_STREAM_BASE,
                 value) {
      value.addr = pvr_csb_get_start_address(&sub_cmd->control_stream);
   }

   /* FIXME: Need to set up the border color table at device creation
    * time. Set to invalid for the time being.
    */
   pvr_csb_pack (&sub_cmd->submit_info.regs.tpu_border_colour_table,
                 CR_TPU_BORDER_COLOUR_TABLE_CDM,
                 value) {
      value.border_colour_table_address = PVR_DEV_ADDR_INVALID;
   }

   sub_cmd->num_shared_regs = MAX2(cmd_buffer->device->idfwdf_state.usc_shareds,
                                   cmd_buffer->state.max_shared_regs);

   cmd_buffer->state.max_shared_regs = 0U;

   if (PVR_HAS_FEATURE(dev_info, compute_morton_capable))
      sub_cmd->submit_info.regs.cdm_item = 0;

   pvr_csb_pack (&sub_cmd->submit_info.regs.tpu, CR_TPU, value) {
      value.tag_cem_4k_face_packing = true;
   }

   if (PVR_HAS_FEATURE(dev_info, cluster_grouping) &&
       PVR_HAS_FEATURE(dev_info, slc_mcu_cache_controls) &&
       dev_runtime_info->num_phantoms > 1 && sub_cmd->uses_atomic_ops) {
      /* Each phantom has its own MCU, so atomicity can only be guaranteed
       * when all work items are processed on the same phantom. This means we
       * need to disable all USCs other than those of the first phantom, which
       * has 4 clusters.
       */
      pvr_csb_pack (&sub_cmd->submit_info.regs.compute_cluster,
                    CR_COMPUTE_CLUSTER,
                    value) {
         value.mask = 0xFU;
      }
   } else {
      pvr_csb_pack (&sub_cmd->submit_info.regs.compute_cluster,
                    CR_COMPUTE_CLUSTER,
                    value) {
         value.mask = 0U;
      }
   }

   if (PVR_HAS_FEATURE(dev_info, gpu_multicore_support) &&
       sub_cmd->uses_atomic_ops) {
      sub_cmd->submit_info.flags |= PVR_WINSYS_COMPUTE_FLAG_SINGLE_CORE;
   }
}

#define PIXEL_ALLOCATION_SIZE_MAX_IN_BLOCKS \
   (1024 / PVRX(CDMCTRL_KERNEL0_USC_COMMON_SIZE_UNIT_SIZE))

static uint32_t
pvr_compute_flat_slot_size(const struct pvr_physical_device *pdevice,
                           uint32_t coeff_regs_count,
                           bool use_barrier,
                           uint32_t total_workitems)
{
   const struct pvr_device_runtime_info *dev_runtime_info =
      &pdevice->dev_runtime_info;
   const struct pvr_device_info *dev_info = &pdevice->dev_info;
   uint32_t max_workgroups_per_task = ROGUE_CDM_MAX_PACKED_WORKGROUPS_PER_TASK;
   uint32_t max_avail_coeff_regs =
      dev_runtime_info->cdm_max_local_mem_size_regs;
   uint32_t localstore_chunks_count =
      DIV_ROUND_UP(coeff_regs_count << 2,
                   PVRX(CDMCTRL_KERNEL0_USC_COMMON_SIZE_UNIT_SIZE));

   /* Ensure that we cannot have more workgroups in a slot than the available
    * number of coefficients allow us to have.
    */
   if (coeff_regs_count > 0U) {
      /* If TA or 3D can overlap with CDM, or if the TA is running a geometry
       * shader then we need to consider this in calculating max allowed
       * work-groups.
       */
      if (PVR_HAS_QUIRK(dev_info, 52354) &&
          (PVR_HAS_FEATURE(dev_info, compute_overlap) ||
           PVR_HAS_FEATURE(dev_info, gs_rta_support))) {
         /* Solve for n (number of work-groups per task). All values are in
          * size of common store alloc blocks:
          *
          * n + (2n + 7) * (local_memory_size_max - 1) =
          *    (coefficient_memory_pool_size) - (7 * pixel_allocation_size_max)
          * ==>
          * n + 2n * (local_memory_size_max - 1) =
          *    (coefficient_memory_pool_size) - (7 * pixel_allocation_size_max)
          *    - (7 * (local_memory_size_max - 1))
          * ==>
          * n * (1 + 2 * (local_memory_size_max - 1)) =
          *    (coefficient_memory_pool_size) - (7 * pixel_allocation_size_max)
          *    - (7 * (local_memory_size_max - 1))
          * ==>
          * n = ((coefficient_memory_pool_size) -
          *    (7 * pixel_allocation_size_max) -
          *    (7 * (local_memory_size_max - 1))) /
          *    (1 + 2 * (local_memory_size_max - 1))
          */
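
         /* Purely illustrative example with hypothetical sizes: given 1024
          * common store blocks, PIXEL_ALLOCATION_SIZE_MAX_IN_BLOCKS = 16 and
          * localstore_chunks_count = 9, this gives
          * n = (1024 - 7 * 16 - 7 * 8) / (1 + 2 * 8) = 856 / 17 = 50
          * work-groups per task before clamping.
          */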
         uint32_t max_common_store_blocks =
            DIV_ROUND_UP(max_avail_coeff_regs * 4U,
                         PVRX(CDMCTRL_KERNEL0_USC_COMMON_SIZE_UNIT_SIZE));

         /* (coefficient_memory_pool_size) - (7 * pixel_allocation_size_max)
          */
         max_common_store_blocks -= ROGUE_MAX_OVERLAPPED_PIXEL_TASK_INSTANCES *
                                    PIXEL_ALLOCATION_SIZE_MAX_IN_BLOCKS;

         /* - (7 * (local_memory_size_max - 1)) */
         max_common_store_blocks -= (ROGUE_MAX_OVERLAPPED_PIXEL_TASK_INSTANCES *
                                     (localstore_chunks_count - 1U));

         /* Divide by (1 + 2 * (local_memory_size_max - 1)) */
         max_workgroups_per_task = max_common_store_blocks /
                                   (1U + 2U * (localstore_chunks_count - 1U));

         max_workgroups_per_task =
            MIN2(max_workgroups_per_task,
                 ROGUE_CDM_MAX_PACKED_WORKGROUPS_PER_TASK);

      } else {
         max_workgroups_per_task =
            MIN2((max_avail_coeff_regs / coeff_regs_count),
                 max_workgroups_per_task);
      }
   }

   /* max_workgroups_per_task should be at least one. */
   assert(max_workgroups_per_task >= 1U);

   if (total_workitems >= ROGUE_MAX_INSTANCES_PER_TASK) {
      /* In this case, the work group size will have been padded up to the
       * next ROGUE_MAX_INSTANCES_PER_TASK so we just set max instances to be
       * ROGUE_MAX_INSTANCES_PER_TASK.
       */
      return ROGUE_MAX_INSTANCES_PER_TASK;
   }

   /* In this case, the number of instances in the slot must be clamped to
    * accommodate whole work-groups only.
    */
   if (PVR_HAS_QUIRK(dev_info, 49032) || use_barrier) {
      max_workgroups_per_task =
         MIN2(max_workgroups_per_task,
              ROGUE_MAX_INSTANCES_PER_TASK / total_workitems);
      return total_workitems * max_workgroups_per_task;
   }

   return MIN2(total_workitems * max_workgroups_per_task,
               ROGUE_MAX_INSTANCES_PER_TASK);
}

static void
pvr_compute_generate_control_stream(struct pvr_csb *csb,
                                    struct pvr_sub_cmd_compute *sub_cmd,
                                    const struct pvr_compute_kernel_info *info)
{
   /* Compute kernel 0. */
   pvr_csb_emit (csb, CDMCTRL_KERNEL0, kernel0) {
      kernel0.indirect_present = !!info->indirect_buffer_addr.addr;
      kernel0.global_offsets_present = info->global_offsets_present;
      kernel0.usc_common_size = info->usc_common_size;
      kernel0.usc_unified_size = info->usc_unified_size;
      kernel0.pds_temp_size = info->pds_temp_size;
      kernel0.pds_data_size = info->pds_data_size;
      kernel0.usc_target = info->usc_target;
      kernel0.fence = info->is_fence;
   }

   /* Compute kernel 1. */
   pvr_csb_emit (csb, CDMCTRL_KERNEL1, kernel1) {
      kernel1.data_addr = PVR_DEV_ADDR(info->pds_data_offset);
      kernel1.sd_type = info->sd_type;
      kernel1.usc_common_shared = info->usc_common_shared;
   }

   /* Compute kernel 2. */
   pvr_csb_emit (csb, CDMCTRL_KERNEL2, kernel2) {
      kernel2.code_addr = PVR_DEV_ADDR(info->pds_code_offset);
   }

   if (info->indirect_buffer_addr.addr) {
      /* Compute kernel 6. */
      pvr_csb_emit (csb, CDMCTRL_KERNEL6, kernel6) {
         kernel6.indirect_addrmsb = info->indirect_buffer_addr;
      }

      /* Compute kernel 7. */
      pvr_csb_emit (csb, CDMCTRL_KERNEL7, kernel7) {
         kernel7.indirect_addrlsb = info->indirect_buffer_addr;
      }
   } else {
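      /* Workgroup counts and sizes are encoded biased by -1 in the control
       * stream words, hence the asserts and subtractions below.
       */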
      /* Compute kernel 3. */
      pvr_csb_emit (csb, CDMCTRL_KERNEL3, kernel3) {
         assert(info->global_size[0U] > 0U);
         kernel3.workgroup_x = info->global_size[0U] - 1U;
      }

      /* Compute kernel 4. */
      pvr_csb_emit (csb, CDMCTRL_KERNEL4, kernel4) {
         assert(info->global_size[1U] > 0U);
         kernel4.workgroup_y = info->global_size[1U] - 1U;
      }

      /* Compute kernel 5. */
      pvr_csb_emit (csb, CDMCTRL_KERNEL5, kernel5) {
         assert(info->global_size[2U] > 0U);
         kernel5.workgroup_z = info->global_size[2U] - 1U;
      }
   }

   /* Compute kernel 8. */
   pvr_csb_emit (csb, CDMCTRL_KERNEL8, kernel8) {
      if (info->max_instances == ROGUE_MAX_INSTANCES_PER_TASK)
         kernel8.max_instances = 0U;
      else
         kernel8.max_instances = info->max_instances;

      assert(info->local_size[0U] > 0U);
      kernel8.workgroup_size_x = info->local_size[0U] - 1U;
      assert(info->local_size[1U] > 0U);
      kernel8.workgroup_size_y = info->local_size[1U] - 1U;
      assert(info->local_size[2U] > 0U);
      kernel8.workgroup_size_z = info->local_size[2U] - 1U;
   }

   /* Track the highest amount of shared register usage in this dispatch.
    * This is used by the FW for context switching, so must be large enough
    * to contain all the shared registers that might be in use for this compute
    * job. Coefficients don't need to be included as the context switch will not
    * happen within the execution of a single workgroup, thus nothing needs to
    * be preserved.
    */
   if (info->usc_common_shared) {
      sub_cmd->num_shared_regs =
         MAX2(sub_cmd->num_shared_regs, info->usc_common_size);
   }
}

/* TODO: This can be pre-packed and uploaded directly. Would that provide any
 * speed up?
 */
static void
pvr_compute_generate_idfwdf(struct pvr_cmd_buffer *cmd_buffer,
                            struct pvr_sub_cmd_compute *const sub_cmd)
{
   struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
   bool *const is_sw_barrier_required =
      &state->current_sub_cmd->compute.pds_sw_barrier_requires_clearing;
   const struct pvr_physical_device *pdevice = cmd_buffer->device->pdevice;
   struct pvr_csb *csb = &sub_cmd->control_stream;
   const struct pvr_pds_upload *program;

   if (PVR_NEED_SW_COMPUTE_PDS_BARRIER(&pdevice->dev_info) &&
       *is_sw_barrier_required) {
      *is_sw_barrier_required = false;
      program = &cmd_buffer->device->idfwdf_state.sw_compute_barrier_pds;
   } else {
      program = &cmd_buffer->device->idfwdf_state.pds;
   }
1346 
1347    struct pvr_compute_kernel_info info = {
1348       .indirect_buffer_addr = PVR_DEV_ADDR_INVALID,
1349       .global_offsets_present = false,
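      /* usc_shareds counts shared registers (4 bytes each, hence the
       * << 2) and is rounded up to the hardware's common-size unit.
       */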
1350       .usc_common_size =
1351          DIV_ROUND_UP(cmd_buffer->device->idfwdf_state.usc_shareds << 2,
1352                       PVRX(CDMCTRL_KERNEL0_USC_COMMON_SIZE_UNIT_SIZE)),
1353       .usc_unified_size = 0U,
1354       .pds_temp_size = 0U,
1355       .pds_data_size =
1356          DIV_ROUND_UP(program->data_size << 2,
1357                       PVRX(CDMCTRL_KERNEL0_PDS_DATA_SIZE_UNIT_SIZE)),
1358       .usc_target = PVRX(CDMCTRL_USC_TARGET_ALL),
1359       .is_fence = false,
1360       .pds_data_offset = program->data_offset,
1361       .sd_type = PVRX(CDMCTRL_SD_TYPE_USC),
1362       .usc_common_shared = true,
1363       .pds_code_offset = program->code_offset,
1364       .global_size = { 1U, 1U, 1U },
1365       .local_size = { 1U, 1U, 1U },
1366    };
1367 
1368    /* We don't need to pad work-group size for this case. */
1369 
1370    info.max_instances =
1371       pvr_compute_flat_slot_size(pdevice,
1372                                  cmd_buffer->device->idfwdf_state.usc_shareds,
1373                                  false,
1374                                  1U);
1375 
1376    pvr_compute_generate_control_stream(csb, sub_cmd, &info);
1377 }
1378 
1379 static void
1380 pvr_compute_generate_fence(struct pvr_cmd_buffer *cmd_buffer,
1381                            struct pvr_sub_cmd_compute *const sub_cmd,
1382                            bool deallocate_shareds)
1383 {
1384    const struct pvr_pds_upload *program =
1385       &cmd_buffer->device->pds_compute_fence_program;
1386    const struct pvr_physical_device *pdevice = cmd_buffer->device->pdevice;
1387    struct pvr_csb *csb = &sub_cmd->control_stream;
1388 
1389    struct pvr_compute_kernel_info info = {
1390       .indirect_buffer_addr = PVR_DEV_ADDR_INVALID,
1391       .global_offsets_present = false,
1392       .usc_common_size = 0U,
1393       .usc_unified_size = 0U,
1394       .pds_temp_size = 0U,
1395       .pds_data_size =
1396          DIV_ROUND_UP(program->data_size << 2,
1397                       PVRX(CDMCTRL_KERNEL0_PDS_DATA_SIZE_UNIT_SIZE)),
1398       .usc_target = PVRX(CDMCTRL_USC_TARGET_ANY),
1399       .is_fence = true,
1400       .pds_data_offset = program->data_offset,
1401       .sd_type = PVRX(CDMCTRL_SD_TYPE_PDS),
1402       .usc_common_shared = deallocate_shareds,
1403       .pds_code_offset = program->code_offset,
1404       .global_size = { 1U, 1U, 1U },
1405       .local_size = { 1U, 1U, 1U },
1406    };
1407 
1408    /* We don't need to pad work-group size for this case. */
1409    /* Here we calculate the slot size. This can depend on the use of barriers,
1410     * local memory, BRNs, or other factors.
1411     */
1412    info.max_instances = pvr_compute_flat_slot_size(pdevice, 0U, false, 1U);
1413 
1414    pvr_compute_generate_control_stream(csb, sub_cmd, &info);
1415 }
1416 
1417 static VkResult pvr_cmd_buffer_end_sub_cmd(struct pvr_cmd_buffer *cmd_buffer)
1418 {
1419    struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
1420    struct pvr_sub_cmd *sub_cmd = state->current_sub_cmd;
1421    struct pvr_device *device = cmd_buffer->device;
1422    VkResult result;
1423 
1424    /* FIXME: Is this NULL check required because this function is called from
1425     * pvr_resolve_unemitted_resolve_attachments()? See comment about this
1426     * function being called twice in a row in pvr_CmdEndRenderPass().
1427     */
1428    if (!sub_cmd)
1429       return VK_SUCCESS;
1430 
1431    switch (sub_cmd->type) {
1432    case PVR_SUB_CMD_TYPE_GRAPHICS: {
1433       struct pvr_sub_cmd_gfx *const gfx_sub_cmd = &sub_cmd->gfx;
1434 
1435       if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY) {
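         /* A secondary command buffer's control stream ends in a return
          * so it can be linked into, and resume, a primary's stream.
          */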
1436          result = pvr_csb_emit_return(&gfx_sub_cmd->control_stream);
1437          if (result != VK_SUCCESS) {
1438             state->status = result;
1439             return result;
1440          }
1441 
1442          break;
1443       }
1444 
1445       /* TODO: Check if the sub_cmd can be skipped based on
1446        * sub_cmd->gfx.empty_cmd flag.
1447        */
1448 
1449       result = pvr_cmd_buffer_upload_tables(device, cmd_buffer, gfx_sub_cmd);
1450       if (result != VK_SUCCESS) {
1451          state->status = result;
1452          return result;
1453       }
1454 
1455       result = pvr_cmd_buffer_emit_ppp_state(cmd_buffer, gfx_sub_cmd);
1456       if (result != VK_SUCCESS) {
1457          state->status = result;
1458          return result;
1459       }
1460 
1461       result = pvr_csb_emit_terminate(&gfx_sub_cmd->control_stream);
1462       if (result != VK_SUCCESS) {
1463          state->status = result;
1464          return result;
1465       }
1466 
1467       result = pvr_sub_cmd_gfx_job_init(&device->pdevice->dev_info,
1468                                         cmd_buffer,
1469                                         gfx_sub_cmd);
1470       if (result != VK_SUCCESS) {
1471          state->status = result;
1472          return result;
1473       }
1474 
1475       break;
1476    }
1477 
1478    case PVR_SUB_CMD_TYPE_COMPUTE: {
1479       struct pvr_sub_cmd_compute *const compute_sub_cmd = &sub_cmd->compute;
1480 
1481       pvr_compute_generate_fence(cmd_buffer, compute_sub_cmd, true);
1482 
1483       result = pvr_csb_emit_terminate(&compute_sub_cmd->control_stream);
1484       if (result != VK_SUCCESS) {
1485          state->status = result;
1486          return result;
1487       }
1488 
1489       pvr_sub_cmd_compute_job_init(device->pdevice,
1490                                    cmd_buffer,
1491                                    compute_sub_cmd);
1492       break;
1493    }
1494 
1495    case PVR_SUB_CMD_TYPE_TRANSFER:
1496       break;
1497 
1498    default:
1499       pvr_finishme("Unsupported sub-command type %d", sub_cmd->type);
1500       break;
1501    }
1502 
1503    state->current_sub_cmd = NULL;
1504 
1505    return VK_SUCCESS;
1506 }
1507 
1508 static void pvr_reset_graphics_dirty_state(struct pvr_cmd_buffer_state *state,
1509                                            bool start_geom)
1510 {
1511    if (start_geom) {
1512       /*
1513        * Initial geometry phase state.
1514        * It's the driver's responsibility to ensure that the state of the
1515        * hardware is correctly initialized at the start of every geometry
1516        * phase. This is required to prevent stale state from a previous
1517        * geometry phase erroneously affecting the next geometry phase. The
1518        * following fields in PPP State Header, and their corresponding state
1519        * words, must be supplied in the first PPP State Update of a geometry
1520        * phase that contains any geometry (draw calls). Any field not listed
1521        * below is safe to ignore.
1522        *
1523        *	TA_PRES_STREAM_OUT_SIZE
1524        *	TA_PRES_PPPCTRL
1525        *	TA_PRES_VARYING_WORD2
1526        *	TA_PRES_VARYING_WORD1
1527        *	TA_PRES_VARYING_WORD0
1528        *	TA_PRES_OUTSELECTS
1529        *	TA_PRES_WCLAMP
1530        *	TA_VIEWPORT_COUNT
1531        *	TA_PRES_VIEWPORT
1532        *	TA_PRES_REGION_CLIP
1533        *	TA_PRES_PDSSTATEPTR0
1534        *	TA_PRES_ISPCTLFB
1535        *	TA_PRES_ISPCTLFA
1536        *	TA_PRES_ISPCTL
1537        *
1538        * If a geometry phase does not contain any geometry, this restriction
1539        * can be ignored. If the first draw call in a geometry phase will only
1540        * update the depth or stencil buffers i.e. ISP_TAGWRITEDISABLE is set
1541        * in the ISP State Control Word, the PDS State Pointers
1542        * (TA_PRES_PDSSTATEPTR*) in the first PPP State Update do not need to
1543        * be supplied, since they will never reach the PDS in the fragment
1544        * phase.
1545        */
1546 
1547       state->emit_state_bits = 0;
1548 
1549       state->emit_state.stream_out = true;
1550       state->emit_state.ppp_control = true;
1551       state->emit_state.varying_word2 = true;
1552       state->emit_state.varying_word1 = true;
1553       state->emit_state.varying_word0 = true;
1554       state->emit_state.output_selects = true;
1555       state->emit_state.wclamp = true;
1556       state->emit_state.viewport = true;
1557       state->emit_state.region_clip = true;
1558       state->emit_state.pds_fragment_stateptr0 = true;
1559       state->emit_state.isp_fb = true;
1560       state->emit_state.isp = true;
1561    } else {
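      /* Not starting a fresh geometry phase: re-emit only the state that
       * may differ between sub-commands; stream-out, W-clamp and varying
       * word 2 state carries over.
       */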
1562       state->emit_state.ppp_control = true;
1563       state->emit_state.varying_word1 = true;
1564       state->emit_state.varying_word0 = true;
1565       state->emit_state.output_selects = true;
1566       state->emit_state.viewport = true;
1567       state->emit_state.region_clip = true;
1568       state->emit_state.pds_fragment_stateptr0 = true;
1569       state->emit_state.isp_fb = true;
1570       state->emit_state.isp = true;
1571    }
1572 
1573    memset(&state->ppp_state, 0U, sizeof(state->ppp_state));
1574 
1575    state->dirty.vertex_bindings = true;
1576    state->dirty.gfx_pipeline_binding = true;
1577    state->dirty.viewport = true;
1578 }
1579 
1580 static VkResult pvr_cmd_buffer_start_sub_cmd(struct pvr_cmd_buffer *cmd_buffer,
1581                                              enum pvr_sub_cmd_type type)
1582 {
1583    struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
1584    struct pvr_device *device = cmd_buffer->device;
1585    struct pvr_sub_cmd *sub_cmd;
1586    VkResult result;
1587 
1588    /* Check the current status of the buffer. */
1589    if (state->status != VK_SUCCESS)
1590       return state->status;
1591 
1592    pvr_cmd_buffer_update_barriers(cmd_buffer, type);
1593 
1594    if (state->current_sub_cmd) {
1595       if (state->current_sub_cmd->type == type) {
1596          /* Continue adding to the current sub command. */
1597          return VK_SUCCESS;
1598       }
1599 
1600       /* End the current sub command. */
1601       result = pvr_cmd_buffer_end_sub_cmd(cmd_buffer);
1602       if (result != VK_SUCCESS)
1603          return result;
1604    }
1605 
1606    sub_cmd = vk_zalloc(&cmd_buffer->vk.pool->alloc,
1607                        sizeof(*sub_cmd),
1608                        8,
1609                        VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
1610    if (!sub_cmd) {
1611       state->status = vk_error(cmd_buffer, VK_ERROR_OUT_OF_HOST_MEMORY);
1612       return state->status;
1613    }
1614 
1615    sub_cmd->type = type;
1616 
1617    switch (type) {
1618    case PVR_SUB_CMD_TYPE_GRAPHICS:
1620       sub_cmd->gfx.depth_usage = PVR_DEPTH_STENCIL_USAGE_UNDEFINED;
1621       sub_cmd->gfx.stencil_usage = PVR_DEPTH_STENCIL_USAGE_UNDEFINED;
1622       sub_cmd->gfx.modifies_depth = false;
1623       sub_cmd->gfx.modifies_stencil = false;
1624       sub_cmd->gfx.max_tiles_in_flight =
1625          PVR_GET_FEATURE_VALUE(&device->pdevice->dev_info,
1626                                isp_max_tiles_in_flight,
1627                                1);
1628       sub_cmd->gfx.hw_render_idx = state->render_pass_info.current_hw_subpass;
1629       sub_cmd->gfx.framebuffer = state->render_pass_info.framebuffer;
1630       sub_cmd->gfx.empty_cmd = true;
1631 
1632       pvr_reset_graphics_dirty_state(state, true);
1633       pvr_csb_init(device,
1634                    PVR_CMD_STREAM_TYPE_GRAPHICS,
1635                    &sub_cmd->gfx.control_stream);
1636       break;
1637 
1638    case PVR_SUB_CMD_TYPE_COMPUTE:
1639       pvr_csb_init(device,
1640                    PVR_CMD_STREAM_TYPE_COMPUTE,
1641                    &sub_cmd->compute.control_stream);
1642       break;
1643 
1644    case PVR_SUB_CMD_TYPE_TRANSFER:
1645       list_inithead(&sub_cmd->transfer.transfer_cmds);
1646       break;
1647 
1648    default:
1649       pvr_finishme("Unsupported sub-command type %d", type);
1650       break;
1651    }
1652 
1653    list_addtail(&sub_cmd->link, &cmd_buffer->sub_cmds);
1654    state->current_sub_cmd = sub_cmd;
1655 
1656    return VK_SUCCESS;
1657 }
1658 
1659 VkResult pvr_cmd_buffer_alloc_mem(struct pvr_cmd_buffer *cmd_buffer,
1660                                   struct pvr_winsys_heap *heap,
1661                                   uint64_t size,
1662                                   uint32_t flags,
1663                                   struct pvr_bo **const pvr_bo_out)
1664 {
1665    const uint32_t cache_line_size =
1666       rogue_get_slc_cache_line_size(&cmd_buffer->device->pdevice->dev_info);
1667    struct pvr_bo *pvr_bo;
1668    VkResult result;
1669 
1670    result = pvr_bo_alloc(cmd_buffer->device,
1671                          heap,
1672                          size,
1673                          cache_line_size,
1674                          flags,
1675                          &pvr_bo);
1676    if (result != VK_SUCCESS) {
1677       cmd_buffer->state.status = result;
1678       return result;
1679    }
1680 
1681    list_add(&pvr_bo->link, &cmd_buffer->bo_list);
1682 
1683    *pvr_bo_out = pvr_bo;
1684 
1685    return VK_SUCCESS;
1686 }
1687 
1688 VkResult pvr_ResetCommandBuffer(VkCommandBuffer commandBuffer,
1689                                 VkCommandBufferResetFlags flags)
1690 {
1691    assert(!"Unimplemented");
1692    return VK_SUCCESS;
1693 }
1694 
1695 static void pvr_cmd_bind_compute_pipeline(
1696    const struct pvr_compute_pipeline *const compute_pipeline,
1697    struct pvr_cmd_buffer *const cmd_buffer)
1698 {
1699    cmd_buffer->state.compute_pipeline = compute_pipeline;
1700    cmd_buffer->state.dirty.compute_pipeline_binding = true;
1701 }
1702 
1703 static void pvr_cmd_bind_graphics_pipeline(
1704    const struct pvr_graphics_pipeline *const gfx_pipeline,
1705    struct pvr_cmd_buffer *const cmd_buffer)
1706 {
1707    struct pvr_dynamic_state *const dest_state =
1708       &cmd_buffer->state.dynamic.common;
1709    const struct pvr_dynamic_state *const src_state =
1710       &gfx_pipeline->dynamic_state;
1711    struct pvr_cmd_buffer_state *const cmd_buffer_state = &cmd_buffer->state;
1712    const uint32_t state_mask = src_state->mask;
1713 
1714    cmd_buffer_state->gfx_pipeline = gfx_pipeline;
1715    cmd_buffer_state->dirty.gfx_pipeline_binding = true;
1716 
1717    /* FIXME: Handle PVR_DYNAMIC_STATE_BIT_VIEWPORT. */
1718    if (!(state_mask & PVR_DYNAMIC_STATE_BIT_VIEWPORT)) {
1719       assert(!"Unimplemented");
1720    }
1721 
1722    /* FIXME: Handle PVR_DYNAMIC_STATE_BIT_SCISSOR. */
1723    if (!(state_mask & PVR_DYNAMIC_STATE_BIT_SCISSOR)) {
1724       assert(!"Unimplemented");
1725    }
1726 
1727    if (!(state_mask & PVR_DYNAMIC_STATE_BIT_LINE_WIDTH)) {
1728       dest_state->line_width = src_state->line_width;
1729 
1730       cmd_buffer_state->dirty.line_width = true;
1731    }
1732 
1733    if (!(state_mask & PVR_DYNAMIC_STATE_BIT_DEPTH_BIAS)) {
1734       memcpy(&dest_state->depth_bias,
1735              &src_state->depth_bias,
1736              sizeof(src_state->depth_bias));
1737 
1738       cmd_buffer_state->dirty.depth_bias = true;
1739    }
1740 
1741    if (!(state_mask & PVR_DYNAMIC_STATE_BIT_BLEND_CONSTANTS)) {
1742       STATIC_ASSERT(
1743          __same_type(dest_state->blend_constants, src_state->blend_constants));
1744 
1745       typed_memcpy(dest_state->blend_constants,
1746                    src_state->blend_constants,
1747                    ARRAY_SIZE(dest_state->blend_constants));
1748 
1749       cmd_buffer_state->dirty.blend_constants = true;
1750    }
1751 
1752    if (!(state_mask & PVR_DYNAMIC_STATE_BIT_STENCIL_COMPARE_MASK)) {
1753       dest_state->compare_mask.front = src_state->compare_mask.front;
1754       dest_state->compare_mask.back = src_state->compare_mask.back;
1755 
1756       cmd_buffer_state->dirty.compare_mask = true;
1757    }
1758 
1759    if (!(state_mask & PVR_DYNAMIC_STATE_BIT_STENCIL_WRITE_MASK)) {
1760       dest_state->write_mask.front = src_state->write_mask.front;
1761       dest_state->write_mask.back = src_state->write_mask.back;
1762 
1763       cmd_buffer_state->dirty.write_mask = true;
1764    }
1765 
1766    if (!(state_mask & PVR_DYNAMIC_STATE_BIT_STENCIL_REFERENCE)) {
1767       dest_state->reference.front = src_state->reference.front;
1768       dest_state->reference.back = src_state->reference.back;
1769 
1770       cmd_buffer_state->dirty.reference = true;
1771    }
1772 }
1773 
1774 void pvr_CmdBindPipeline(VkCommandBuffer commandBuffer,
1775                          VkPipelineBindPoint pipelineBindPoint,
1776                          VkPipeline _pipeline)
1777 {
1778    PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
1779    PVR_FROM_HANDLE(pvr_pipeline, pipeline, _pipeline);
1780 
1781    switch (pipelineBindPoint) {
1782    case VK_PIPELINE_BIND_POINT_COMPUTE:
1783       pvr_cmd_bind_compute_pipeline(to_pvr_compute_pipeline(pipeline),
1784                                     cmd_buffer);
1785       break;
1786 
1787    case VK_PIPELINE_BIND_POINT_GRAPHICS:
1788       pvr_cmd_bind_graphics_pipeline(to_pvr_graphics_pipeline(pipeline),
1789                                      cmd_buffer);
1790       break;
1791 
1792    default:
1793       unreachable("Invalid bind point.");
1794       break;
1795    }
1796 }
1797 
1798 #if defined(DEBUG)
1799 static void check_viewport_quirk_70165(const struct pvr_device *device,
1800                                        const VkViewport *pViewport)
1801 {
1802    const struct pvr_device_info *dev_info = &device->pdevice->dev_info;
1803    float min_vertex_x, max_vertex_x, min_vertex_y, max_vertex_y;
1804    float min_screen_space_value, max_screen_space_value;
1805    float sign_to_unsigned_offset, fixed_point_max;
1806    float guardband_width, guardband_height;
1807 
1808    if (PVR_HAS_FEATURE(dev_info, simple_internal_parameter_format)) {
1809       /* Max representable value in 13.4 fixed point format.
1810        * Rounded down to avoid precision issues.
1811        * Calculated as (2 ** 13) - 2*(2 ** -4)
1812        */
1813       fixed_point_max = 8192.0f - 2.0f / 16.0f;
1814 
1815       if (PVR_HAS_FEATURE(dev_info, screen_size8K)) {
1816          if (pViewport->width <= 4096 && pViewport->height <= 4096) {
1817             guardband_width = pViewport->width / 4.0f;
1818             guardband_height = pViewport->height / 4.0f;
1819 
1820             /* 2k of the range is negative */
1821             sign_to_unsigned_offset = 2048.0f;
1822          } else {
1823             guardband_width = 0.0f;
1824             guardband_height = 0.0f;
1825 
1826             /* For > 4k renders, the entire range is positive */
1827             sign_to_unsigned_offset = 0.0f;
1828          }
1829       } else {
1830          guardband_width = pViewport->width / 4.0f;
1831          guardband_height = pViewport->height / 4.0f;
1832 
1833          /* 2k of the range is negative */
1834          sign_to_unsigned_offset = 2048.0f;
1835       }
1836    } else {
1837       /* Max representable value in 16.8 fixed point format.
1838        * Calculated as (2 ** 16) - (2 ** -8)
1839        */
1840       fixed_point_max = 65535.99609375f;
1841       guardband_width = pViewport->width / 4.0f;
1842       guardband_height = pViewport->height / 4.0f;
1843 
1844       /* 4k/20k of the range is negative */
1845       sign_to_unsigned_offset = (float)PVR_MAX_NEG_OFFSCREEN_OFFSET;
1846    }
1847 
1848    min_screen_space_value = -sign_to_unsigned_offset;
1849    max_screen_space_value = fixed_point_max - sign_to_unsigned_offset;
1850 
1851    min_vertex_x = pViewport->x - guardband_width;
1852    max_vertex_x = pViewport->x + pViewport->width + guardband_width;
1853    min_vertex_y = pViewport->y - guardband_height;
1854    max_vertex_y = pViewport->y + pViewport->height + guardband_height;
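   /* Warn if the guardband-extended viewport bounds could produce vertex
    * coordinates outside the representable screen-space range.
    */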
1855    if (min_vertex_x < min_screen_space_value ||
1856        max_vertex_x > max_screen_space_value ||
1857        min_vertex_y < min_screen_space_value ||
1858        max_vertex_y > max_screen_space_value) {
1859       mesa_logw("Viewport is affected by BRN70165, geometry outside "
1860                 "the viewport could be corrupted");
1861    }
1862 }
1863 #endif
1864 
1865 void pvr_CmdSetViewport(VkCommandBuffer commandBuffer,
1866                         uint32_t firstViewport,
1867                         uint32_t viewportCount,
1868                         const VkViewport *pViewports)
1869 {
1870    PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
1871    const uint32_t total_count = firstViewport + viewportCount;
1872    struct pvr_cmd_buffer_state *const state = &cmd_buffer->state;
1873 
1874    assert(firstViewport < PVR_MAX_VIEWPORTS && viewportCount > 0);
1875    assert(total_count >= 1 && total_count <= PVR_MAX_VIEWPORTS);
1876 
1877    PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer);
1878 
1879 #if defined(DEBUG)
1880    if (PVR_HAS_QUIRK(&cmd_buffer->device->pdevice->dev_info, 70165)) {
1881       for (uint32_t viewport = 0; viewport < viewportCount; viewport++) {
1882          check_viewport_quirk_70165(cmd_buffer->device, &pViewports[viewport]);
1883       }
1884    }
1885 #endif
1886 
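   /* Keep a high-water mark of viewports set so far; entries below
    * total_count written by earlier calls stay valid.
    */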
1887    if (state->dynamic.common.viewport.count < total_count)
1888       state->dynamic.common.viewport.count = total_count;
1889 
1890    memcpy(&state->dynamic.common.viewport.viewports[firstViewport],
1891           pViewports,
1892           viewportCount * sizeof(*pViewports));
1893 
1894    state->dirty.viewport = true;
1895 }
1896 
1897 void pvr_CmdSetScissor(VkCommandBuffer commandBuffer,
1898                        uint32_t firstScissor,
1899                        uint32_t scissorCount,
1900                        const VkRect2D *pScissors)
1901 {
1902    PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
1903    const uint32_t total_count = firstScissor + scissorCount;
1904    struct pvr_cmd_buffer_state *const state = &cmd_buffer->state;
1905 
1906    assert(firstScissor < PVR_MAX_VIEWPORTS && scissorCount > 0);
1907    assert(total_count >= 1 && total_count <= PVR_MAX_VIEWPORTS);
1908 
1909    PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer);
1910 
1911    if (state->dynamic.common.scissor.count < total_count)
1912       state->dynamic.common.scissor.count = total_count;
1913 
1914    memcpy(&state->dynamic.common.scissor.scissors[firstScissor],
1915           pScissors,
1916           scissorCount * sizeof(*pScissors));
1917 
1918    state->dirty.scissor = true;
1919 }
1920 
1921 void pvr_CmdSetLineWidth(VkCommandBuffer commandBuffer, float lineWidth)
1922 {
1923    PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
1924    struct pvr_cmd_buffer_state *const state = &cmd_buffer->state;
1925 
1926    state->dynamic.common.line_width = lineWidth;
1927    state->dirty.line_width = true;
1928 }
1929 
1930 void pvr_CmdSetDepthBias(VkCommandBuffer commandBuffer,
1931                          float depthBiasConstantFactor,
1932                          float depthBiasClamp,
1933                          float depthBiasSlopeFactor)
1934 {
1935    PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
1936    struct pvr_cmd_buffer_state *const state = &cmd_buffer->state;
1937 
1938    state->dynamic.common.depth_bias.constant_factor = depthBiasConstantFactor;
1939    state->dynamic.common.depth_bias.clamp = depthBiasClamp;
1940    state->dynamic.common.depth_bias.slope_factor = depthBiasSlopeFactor;
1941    state->dirty.depth_bias = true;
1942 }
1943 
1944 void pvr_CmdSetBlendConstants(VkCommandBuffer commandBuffer,
1945                               const float blendConstants[4])
1946 {
1947    PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
1948    struct pvr_cmd_buffer_state *const state = &cmd_buffer->state;
1949 
1950    STATIC_ASSERT(ARRAY_SIZE(state->dynamic.common.blend_constants) == 4);
1951    memcpy(state->dynamic.common.blend_constants,
1952           blendConstants,
1953           sizeof(state->dynamic.common.blend_constants));
1954 
1955    state->dirty.blend_constants = true;
1956 }
1957 
1958 void pvr_CmdSetDepthBounds(VkCommandBuffer commandBuffer,
1959                            float minDepthBounds,
1960                            float maxDepthBounds)
1961 {
1962    mesa_logd("No support for depth bounds testing.");
1963 }
1964 
1965 void pvr_CmdSetStencilCompareMask(VkCommandBuffer commandBuffer,
1966                                   VkStencilFaceFlags faceMask,
1967                                   uint32_t compareMask)
1968 {
1969    PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
1970    struct pvr_cmd_buffer_state *const state = &cmd_buffer->state;
1971 
1972    if (faceMask & VK_STENCIL_FACE_FRONT_BIT)
1973       state->dynamic.common.compare_mask.front = compareMask;
1974 
1975    if (faceMask & VK_STENCIL_FACE_BACK_BIT)
1976       state->dynamic.common.compare_mask.back = compareMask;
1977 
1978    state->dirty.compare_mask = true;
1979 }
1980 
1981 void pvr_CmdSetStencilWriteMask(VkCommandBuffer commandBuffer,
1982                                 VkStencilFaceFlags faceMask,
1983                                 uint32_t writeMask)
1984 {
1985    PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
1986    struct pvr_cmd_buffer_state *const state = &cmd_buffer->state;
1987 
1988    if (faceMask & VK_STENCIL_FACE_FRONT_BIT)
1989       state->dynamic.common.write_mask.front = writeMask;
1990 
1991    if (faceMask & VK_STENCIL_FACE_BACK_BIT)
1992       state->dynamic.common.write_mask.back = writeMask;
1993 
1994    state->dirty.write_mask = true;
1995 }
1996 
1997 void pvr_CmdSetStencilReference(VkCommandBuffer commandBuffer,
1998                                 VkStencilFaceFlags faceMask,
1999                                 uint32_t reference)
2000 {
2001    PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
2002    struct pvr_cmd_buffer_state *const state = &cmd_buffer->state;
2003 
2004    if (faceMask & VK_STENCIL_FACE_FRONT_BIT)
2005       state->dynamic.common.reference.front = reference;
2006 
2007    if (faceMask & VK_STENCIL_FACE_BACK_BIT)
2008       state->dynamic.common.reference.back = reference;
2009 
2010    state->dirty.reference = true;
2011 }
2012 
2013 void pvr_CmdBindDescriptorSets(VkCommandBuffer commandBuffer,
2014                                VkPipelineBindPoint pipelineBindPoint,
2015                                VkPipelineLayout _layout,
2016                                uint32_t firstSet,
2017                                uint32_t descriptorSetCount,
2018                                const VkDescriptorSet *pDescriptorSets,
2019                                uint32_t dynamicOffsetCount,
2020                                const uint32_t *pDynamicOffsets)
2021 {
2022    PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
2023    struct pvr_descriptor_state *descriptor_state;
2024 
2025    assert(firstSet + descriptorSetCount <= PVR_MAX_DESCRIPTOR_SETS);
2026 
2027    PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer);
2028 
2029    switch (pipelineBindPoint) {
2030    case VK_PIPELINE_BIND_POINT_GRAPHICS:
2031    case VK_PIPELINE_BIND_POINT_COMPUTE:
2032       break;
2033 
2034    default:
2035       unreachable("Unsupported bind point.");
2036       break;
2037    }
2038 
2039    if (pipelineBindPoint == VK_PIPELINE_BIND_POINT_GRAPHICS) {
2040       descriptor_state = &cmd_buffer->state.gfx_desc_state;
2041       cmd_buffer->state.dirty.gfx_desc_dirty = true;
2042    } else {
2043       descriptor_state = &cmd_buffer->state.compute_desc_state;
2044       cmd_buffer->state.dirty.compute_desc_dirty = true;
2045    }
2046 
2047    for (uint32_t i = 0; i < descriptorSetCount; i++) {
2048       PVR_FROM_HANDLE(pvr_descriptor_set, set, pDescriptorSets[i]);
2049       uint32_t index = firstSet + i;
2050 
2051       if (descriptor_state->descriptor_sets[index] != set) {
2052          descriptor_state->descriptor_sets[index] = set;
2053          descriptor_state->valid_mask |= (1u << index);
2054       }
2055    }
2056 }
2057 
2058 void pvr_CmdBindVertexBuffers(VkCommandBuffer commandBuffer,
2059                               uint32_t firstBinding,
2060                               uint32_t bindingCount,
2061                               const VkBuffer *pBuffers,
2062                               const VkDeviceSize *pOffsets)
2063 {
2064    PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
2065    struct pvr_vertex_binding *const vb = cmd_buffer->state.vertex_bindings;
2066 
2067    /* We have to defer setting up the vertex buffers since we need the
2068     * buffer strides from the pipeline.
2069     */
2070 
2071    assert(firstBinding < PVR_MAX_VERTEX_INPUT_BINDINGS &&
2072           bindingCount <= PVR_MAX_VERTEX_INPUT_BINDINGS);
2073 
2074    PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer);
2075 
2076    for (uint32_t i = 0; i < bindingCount; i++) {
2077       vb[firstBinding + i].buffer = pvr_buffer_from_handle(pBuffers[i]);
2078       vb[firstBinding + i].offset = pOffsets[i];
2079    }
2080 
2081    cmd_buffer->state.dirty.vertex_bindings = true;
2082 }
2083 
2084 void pvr_CmdBindIndexBuffer(VkCommandBuffer commandBuffer,
2085                             VkBuffer buffer,
2086                             VkDeviceSize offset,
2087                             VkIndexType indexType)
2088 {
2089    PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
2090    PVR_FROM_HANDLE(pvr_buffer, index_buffer, buffer);
2091    struct pvr_cmd_buffer_state *const state = &cmd_buffer->state;
2092 
2093    assert(offset < index_buffer->vk.size);
2094    assert(indexType == VK_INDEX_TYPE_UINT32 ||
2095           indexType == VK_INDEX_TYPE_UINT16);
2096 
2097    PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer);
2098 
2099    state->index_buffer_binding.buffer = index_buffer;
2100    state->index_buffer_binding.offset = offset;
2101    state->index_buffer_binding.type = indexType;
2102    state->dirty.index_buffer_binding = true;
2103 }
2104 
2105 void pvr_CmdPushConstants(VkCommandBuffer commandBuffer,
2106                           VkPipelineLayout layout,
2107                           VkShaderStageFlags stageFlags,
2108                           uint32_t offset,
2109                           uint32_t size,
2110                           const void *pValues)
2111 {
2112 #if defined(DEBUG)
2113    const uint64_t ending = (uint64_t)offset + (uint64_t)size;
2114 #endif
2115 
2116    PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
2117    struct pvr_cmd_buffer_state *const state = &cmd_buffer->state;
2118 
2119    PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer);
2120 
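   /* "ending" only exists in DEBUG builds; the pvr_assert on it below is
    * expected to compile away otherwise.
    */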
2121    pvr_assert(ending <= PVR_MAX_PUSH_CONSTANTS_SIZE);
2122 
2123    memcpy(&state->push_constants.data[offset], pValues, size);
2124 
2125    state->push_constants.dirty_stages |= stageFlags;
2126 }
2127 
2128 static VkResult
2129 pvr_cmd_buffer_setup_attachments(struct pvr_cmd_buffer *cmd_buffer,
2130                                  const struct pvr_render_pass *pass,
2131                                  const struct pvr_framebuffer *framebuffer)
2132 {
2133    struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
2134    struct pvr_render_pass_info *info = &state->render_pass_info;
2135 
2136    assert(pass->attachment_count == framebuffer->attachment_count);
2137 
2138    /* Free any previously allocated attachments. */
2139    vk_free(&cmd_buffer->vk.pool->alloc, state->render_pass_info.attachments);
2140 
2141    if (pass->attachment_count == 0) {
2142       info->attachments = NULL;
2143       return VK_SUCCESS;
2144    }
2145 
2146    info->attachments =
2147       vk_zalloc(&cmd_buffer->vk.pool->alloc,
2148                 pass->attachment_count * sizeof(*info->attachments),
2149                 8,
2150                 VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
2151    if (!info->attachments) {
2152       /* Propagate VK_ERROR_OUT_OF_HOST_MEMORY to vkEndCommandBuffer */
2153       state->status = vk_error(cmd_buffer, VK_ERROR_OUT_OF_HOST_MEMORY);
2154       return state->status;
2155    }
2156 
2157    if (framebuffer) {
2158       for (uint32_t i = 0; i < pass->attachment_count; i++)
2159          info->attachments[i] = framebuffer->attachments[i];
2160    }
2161 
2162    return VK_SUCCESS;
2163 }
2164 
2165 static VkResult pvr_init_render_targets(struct pvr_device *device,
2166                                         struct pvr_render_pass *pass,
2167                                         struct pvr_framebuffer *framebuffer)
2168 {
2169    for (uint32_t i = 0; i < pass->hw_setup->render_count; i++) {
2170       struct pvr_render_target *render_target =
2171          pvr_get_render_target(pass, framebuffer, i);
2172 
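      /* Render-target datasets are created lazily on first use; the
       * per-target mutex presumably guards command buffers that record
       * against the same framebuffer concurrently.
       */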
2173       pthread_mutex_lock(&render_target->mutex);
2174 
2175       if (!render_target->valid) {
2176          const struct pvr_renderpass_hwsetup_render *hw_render =
2177             &pass->hw_setup->renders[i];
2178          VkResult result;
2179 
2180          result = pvr_render_target_dataset_create(device,
2181                                                    framebuffer->width,
2182                                                    framebuffer->height,
2183                                                    hw_render->sample_count,
2184                                                    framebuffer->layers,
2185                                                    &render_target->rt_dataset);
2186          if (result != VK_SUCCESS) {
2187             pthread_mutex_unlock(&render_target->mutex);
2188             return result;
2189          }
2190 
2191          render_target->valid = true;
2192       }
2193 
2194       pthread_mutex_unlock(&render_target->mutex);
2195    }
2196 
2197    return VK_SUCCESS;
2198 }
2199 
2200 static const struct pvr_renderpass_hwsetup_subpass *
2201 pvr_get_hw_subpass(const struct pvr_render_pass *pass, const uint32_t subpass)
2202 {
2203    const struct pvr_renderpass_hw_map *map =
2204       &pass->hw_setup->subpass_map[subpass];
2205 
2206    return &pass->hw_setup->renders[map->render].subpasses[map->subpass];
2207 }
2208 
2209 static void pvr_perform_start_of_render_attachment_clear(
2210    struct pvr_cmd_buffer *cmd_buffer,
2211    const struct pvr_framebuffer *framebuffer,
2212    uint32_t index,
2213    bool is_depth_stencil,
2214    uint32_t *index_list_clear_mask)
2215 {
2216    struct pvr_render_pass_info *info = &cmd_buffer->state.render_pass_info;
2217    const struct pvr_render_pass *pass = info->pass;
2218    const struct pvr_renderpass_hwsetup_render *hw_render;
2219    const struct pvr_renderpass_hwsetup *hw_setup;
2220    struct pvr_image_view *iview;
2221    uint32_t view_idx;
2222    uint32_t height;
2223    uint32_t width;
2224 
2225    hw_setup = pass->hw_setup;
2226    hw_render =
2227       &hw_setup->renders[hw_setup->subpass_map[info->subpass_idx].render];
2228 
2229    if (is_depth_stencil) {
2230       bool stencil_clear;
2231       bool depth_clear;
2232       bool is_stencil;
2233       bool is_depth;
2234 
2235       assert(hw_render->ds_surface_id != -1);
2236       assert(index == 0);
2237 
2238       view_idx = hw_render->ds_surface_id;
2239 
2240       is_depth = vk_format_has_depth(pass->attachments[view_idx].vk_format);
2241       is_stencil = vk_format_has_stencil(pass->attachments[view_idx].vk_format);
2242       depth_clear = hw_render->depth_init == RENDERPASS_SURFACE_INITOP_CLEAR;
2243       stencil_clear = hw_render->stencil_init ==
2244                       RENDERPASS_SURFACE_INITOP_CLEAR;
2245 
2246       /* Attempt to clear the ds attachment. Do not erroneously skip the
2247        * clear when there is no depth clear but there is a stencil clear.
2248        */
2249       /* Skip unless (is_depth && depth_clear) || (is_stencil && stencil_clear). */
2250       if (!((is_depth && depth_clear) || (is_stencil && stencil_clear)))
2251          return;
2252    } else if (hw_render->color_init[index].op !=
2253               RENDERPASS_SURFACE_INITOP_CLEAR) {
2254       return;
2255    } else {
2256       view_idx = hw_render->color_init[index].driver_id;
2257    }
2258 
2259    iview = info->attachments[view_idx];
2260    width = iview->vk.extent.width;
2261    height = iview->vk.extent.height;
2262 
2263    /* FIXME: It would be nice if this function and pvr_sub_cmd_gfx_job_init()
2264     * were doing the same check (even if it's just an assert) to determine if a
2265     * clear is needed.
2266     */
2267    /* If this is single-layer fullscreen, we already do the clears in
2268     * pvr_sub_cmd_gfx_job_init().
2269     */
2270    if (info->render_area.offset.x == 0 && info->render_area.offset.y == 0 &&
2271        info->render_area.extent.width == width &&
2272        info->render_area.extent.height == height && framebuffer->layers == 1) {
2273       return;
2274    }
2275 
2276    pvr_finishme("Unimplemented path!");
2277 }
2278 
2279 static void
2280 pvr_perform_start_of_render_clears(struct pvr_cmd_buffer *cmd_buffer)
2281 {
2282    struct pvr_render_pass_info *info = &cmd_buffer->state.render_pass_info;
2283    const struct pvr_framebuffer *framebuffer = info->framebuffer;
2284    const struct pvr_render_pass *pass = info->pass;
2285    const struct pvr_renderpass_hwsetup *hw_setup = pass->hw_setup;
2286    const struct pvr_renderpass_hwsetup_render *hw_render;
2287 
2288    /* Mask of attachments cleared using index lists instead of the
2289     * background object.
2290     */
2291    uint32_t index_list_clear_mask = 0;
2292 
2293    hw_render =
2294       &hw_setup->renders[hw_setup->subpass_map[info->subpass_idx].render];
2295    if (!hw_render) {
2296       info->process_empty_tiles = false;
2297       info->enable_bg_tag = false;
2298       return;
2299    }
2300 
2301    for (uint32_t i = 0; i < hw_render->color_init_count; i++) {
2302       pvr_perform_start_of_render_attachment_clear(cmd_buffer,
2303                                                    framebuffer,
2304                                                    i,
2305                                                    false,
2306                                                    &index_list_clear_mask);
2307    }
2308 
2309    info->enable_bg_tag = !!hw_render->color_init_count;
2310 
2311    /* If we're not using index lists for all clears/loads then we need to run
2312     * the background object on empty tiles.
2313     */
2314    if (hw_render->color_init_count &&
2315        index_list_clear_mask != ((1u << hw_render->color_init_count) - 1u)) {
2316       info->process_empty_tiles = true;
2317    } else {
2318       info->process_empty_tiles = false;
2319    }
2320 
2321    if (hw_render->ds_surface_id != -1) {
2322       uint32_t ds_index_list = 0;
2323 
2324       pvr_perform_start_of_render_attachment_clear(cmd_buffer,
2325                                                    framebuffer,
2326                                                    0,
2327                                                    true,
2328                                                    &ds_index_list);
2329    }
2330 
2331    if (index_list_clear_mask)
2332       pvr_finishme("Add support for generating loadops shaders!");
2333 }
2334 
2335 static void pvr_stash_depth_format(struct pvr_cmd_buffer_state *state,
2336                                    struct pvr_sub_cmd_gfx *const sub_cmd)
2337 {
2338    const struct pvr_render_pass *pass = state->render_pass_info.pass;
2339    const struct pvr_renderpass_hwsetup_render *hw_render =
2340       &pass->hw_setup->renders[sub_cmd->hw_render_idx];
2341 
2342    if (hw_render->ds_surface_id != -1) {
2343       struct pvr_image_view **iviews = state->render_pass_info.attachments;
2344 
2345       state->depth_format = iviews[hw_render->ds_surface_id]->vk.format;
2346    }
2347 }
2348 
2349 static bool pvr_loadops_contain_clear(struct pvr_renderpass_hwsetup *hw_setup)
2350 {
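   /* color_init is laid out as color_init_count groups of
    * render_targets_count entries; scan every entry for a CLEAR op.
    */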
2351    for (uint32_t i = 0; i < hw_setup->render_count; i++) {
2352       struct pvr_renderpass_hwsetup_render *hw_render = &hw_setup->renders[i];
2353       uint32_t render_targets_count =
2354          hw_render->init_setup.render_targets_count;
2355 
2356       for (uint32_t j = 0;
2357            j < (hw_render->color_init_count * render_targets_count);
2358            j += render_targets_count) {
2359          for (uint32_t k = 0; k < hw_render->init_setup.render_targets_count;
2360               k++) {
2361             if (hw_render->color_init[j + k].op ==
2362                 RENDERPASS_SURFACE_INITOP_CLEAR) {
2363                return true;
2364             }
2365          }
2366       }
2367       if (hw_render->depth_init == RENDERPASS_SURFACE_INITOP_CLEAR ||
2368           hw_render->stencil_init == RENDERPASS_SURFACE_INITOP_CLEAR) {
2369          return true;
2370       }
2371    }
2372 
2373    return false;
2374 }
2375 
2376 static VkResult
2377 pvr_cmd_buffer_set_clear_values(struct pvr_cmd_buffer *cmd_buffer,
2378                                 const VkRenderPassBeginInfo *pRenderPassBegin)
2379 {
2380    struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
2381 
2382    /* Free any previously allocated clear values. */
2383    vk_free(&cmd_buffer->vk.pool->alloc, state->render_pass_info.clear_values);
2384 
2385    if (pRenderPassBegin->clearValueCount) {
2386       const size_t size = pRenderPassBegin->clearValueCount *
2387                           sizeof(*state->render_pass_info.clear_values);
2388 
2389       state->render_pass_info.clear_values =
2390          vk_zalloc(&cmd_buffer->vk.pool->alloc,
2391                    size,
2392                    8,
2393                    VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
2394       if (!state->render_pass_info.clear_values) {
2395          state->status = vk_error(cmd_buffer, VK_ERROR_OUT_OF_HOST_MEMORY);
2396          return state->status;
2397       }
2398 
2399       memcpy(state->render_pass_info.clear_values,
2400              pRenderPassBegin->pClearValues,
2401              size);
2402    } else {
2403       state->render_pass_info.clear_values = NULL;
2404    }
2405 
2406    state->render_pass_info.clear_value_count =
2407       pRenderPassBegin->clearValueCount;
2408 
2409    return VK_SUCCESS;
2410 }
2411 
2412 void pvr_CmdBeginRenderPass2(VkCommandBuffer commandBuffer,
2413                              const VkRenderPassBeginInfo *pRenderPassBeginInfo,
2414                              const VkSubpassBeginInfo *pSubpassBeginInfo)
2415 {
2416    PVR_FROM_HANDLE(pvr_framebuffer,
2417                    framebuffer,
2418                    pRenderPassBeginInfo->framebuffer);
2419    PVR_FROM_HANDLE(pvr_render_pass, pass, pRenderPassBeginInfo->renderPass);
2420    PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
2421    const struct pvr_renderpass_hwsetup_subpass *hw_subpass;
2422    struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
2423    VkResult result;
2424 
2425    PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer);
2426 
2427    assert(!state->render_pass_info.pass);
2428    assert(cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_PRIMARY);
2429 
2430    /* FIXME: Create a separate function for everything using pass->subpasses,
2431     * look at cmd_buffer_begin_subpass() for example. */
2432    state->render_pass_info.pass = pass;
2433    state->render_pass_info.framebuffer = framebuffer;
2434    state->render_pass_info.subpass_idx = 0;
2435    state->render_pass_info.render_area = pRenderPassBeginInfo->renderArea;
2436    state->render_pass_info.current_hw_subpass = 0;
2437    state->render_pass_info.pipeline_bind_point =
2438       pass->subpasses[0].pipeline_bind_point;
2439    state->render_pass_info.userpass_spawn = pass->subpasses[0].userpass_spawn;
2440    state->dirty.userpass_spawn = true;
2441 
2442    result = pvr_cmd_buffer_setup_attachments(cmd_buffer, pass, framebuffer);
2443    if (result != VK_SUCCESS)
2444       return;
2445 
2446    state->status =
2447       pvr_init_render_targets(cmd_buffer->device, pass, framebuffer);
2448    if (state->status != VK_SUCCESS)
2449       return;
2450 
2451    result = pvr_cmd_buffer_set_clear_values(cmd_buffer, pRenderPassBeginInfo);
2452    if (result != VK_SUCCESS)
2453       return;
2454 
2455    assert(pass->subpasses[0].pipeline_bind_point ==
2456           VK_PIPELINE_BIND_POINT_GRAPHICS);
2457 
2458    result = pvr_cmd_buffer_start_sub_cmd(cmd_buffer, PVR_SUB_CMD_TYPE_GRAPHICS);
2459    if (result != VK_SUCCESS)
2460       return;
2461 
2462    /* Run subpass 0 "soft" background object after the actual background
2463     * object.
2464     */
2465    hw_subpass = pvr_get_hw_subpass(pass, 0);
2466    if (hw_subpass->client_data)
2467       pvr_finishme("Unimplemented path!");
2468 
2469    pvr_perform_start_of_render_clears(cmd_buffer);
2470    pvr_stash_depth_format(&cmd_buffer->state,
2471                           &cmd_buffer->state.current_sub_cmd->gfx);
2472 
2473    if (!pvr_loadops_contain_clear(pass->hw_setup)) {
2474       state->dynamic.scissor_accum_state = PVR_SCISSOR_ACCUM_CHECK_FOR_CLEAR;
2475       state->dynamic.scissor_accum_bounds.offset.x = 0;
2476       state->dynamic.scissor_accum_bounds.offset.y = 0;
2477       state->dynamic.scissor_accum_bounds.extent.width = 0;
2478       state->dynamic.scissor_accum_bounds.extent.height = 0;
2479    } else {
2480       state->dynamic.scissor_accum_state = PVR_SCISSOR_ACCUM_DISABLED;
2481    }
2482 }
2483 
2484 static void pvr_cmd_buffer_reset(struct pvr_cmd_buffer *cmd_buffer)
2485 {
2486    if (cmd_buffer->status != PVR_CMD_BUFFER_STATUS_INITIAL) {
2487       /* FIXME: For now we always free all resources as if
2488        * VK_COMMAND_BUFFER_RESET_RELEASE_RESOURCES_BIT was set.
2489        */
2490       pvr_cmd_buffer_free_sub_cmds(cmd_buffer);
2491 
2492       list_for_each_entry_safe (struct pvr_bo, bo, &cmd_buffer->bo_list, link) {
2493          list_del(&bo->link);
2494          pvr_bo_free(cmd_buffer->device, bo);
2495       }
2496 
2497       util_dynarray_clear(&cmd_buffer->scissor_array);
2498       util_dynarray_clear(&cmd_buffer->depth_bias_array);
2499 
2500       cmd_buffer->state.status = VK_SUCCESS;
2501       cmd_buffer->status = PVR_CMD_BUFFER_STATUS_INITIAL;
2502    }
2503 }
2504 
2505 VkResult pvr_BeginCommandBuffer(VkCommandBuffer commandBuffer,
2506                                 const VkCommandBufferBeginInfo *pBeginInfo)
2507 {
2508    PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
2509    struct pvr_cmd_buffer_state *state;
2510    VkResult result;
2511 
2512    pvr_cmd_buffer_reset(cmd_buffer);
2513 
2514    cmd_buffer->usage_flags = pBeginInfo->flags;
2515    state = &cmd_buffer->state;
2516 
2517    /* VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT must be ignored for
2518     * primary level command buffers.
2519     *
2520     * From the Vulkan 1.0 spec:
2521     *
2522     *    VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT specifies that a
2523     *    secondary command buffer is considered to be entirely inside a render
2524     *    pass. If this is a primary command buffer, then this bit is ignored.
2525     */
2526    if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_PRIMARY) {
2527       cmd_buffer->usage_flags &=
2528          ~VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT;
2529    }
2530 
2531    if (cmd_buffer->usage_flags &
2532        VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT) {
2533       const VkCommandBufferInheritanceInfo *inheritance_info =
2534          pBeginInfo->pInheritanceInfo;
2535       struct pvr_render_pass *pass;
2536 
2537       pass = pvr_render_pass_from_handle(inheritance_info->renderPass);
2538       state->render_pass_info.pass = pass;
2539       state->render_pass_info.framebuffer =
2540          pvr_framebuffer_from_handle(inheritance_info->framebuffer);
2541       state->render_pass_info.subpass_idx = inheritance_info->subpass;
2542       state->render_pass_info.userpass_spawn =
2543          pass->subpasses[inheritance_info->subpass].userpass_spawn;
2544 
2545       result =
2546          pvr_cmd_buffer_start_sub_cmd(cmd_buffer, PVR_SUB_CMD_TYPE_GRAPHICS);
2547       if (result != VK_SUCCESS)
2548          return result;
2549    }
2550 
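   /* Conservatively mark every barrier as needed at the start of
    * recording; starting sub-commands updates them later (see
    * pvr_cmd_buffer_update_barriers()).
    */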
2551    memset(state->barriers_needed,
2552           0xFF,
2553           sizeof(*state->barriers_needed) * ARRAY_SIZE(state->barriers_needed));
2554 
2555    cmd_buffer->status = PVR_CMD_BUFFER_STATUS_RECORDING;
2556 
2557    return VK_SUCCESS;
2558 }
2559 
2560 VkResult pvr_cmd_buffer_add_transfer_cmd(struct pvr_cmd_buffer *cmd_buffer,
2561                                          struct pvr_transfer_cmd *transfer_cmd)
2562 {
2563    struct pvr_sub_cmd_transfer *sub_cmd;
2564    VkResult result;
2565 
2566    result = pvr_cmd_buffer_start_sub_cmd(cmd_buffer, PVR_SUB_CMD_TYPE_TRANSFER);
2567    if (result != VK_SUCCESS)
2568       return result;
2569 
2570    sub_cmd = &cmd_buffer->state.current_sub_cmd->transfer;
2571 
2572    list_addtail(&transfer_cmd->link, &sub_cmd->transfer_cmds);
2573 
2574    return VK_SUCCESS;
2575 }
2576 
2577 static void
2578 pvr_validate_push_descriptors(struct pvr_cmd_buffer *cmd_buffer,
2579                               bool *const push_descriptors_dirty_out)
2580 {
2581    /* TODO: Implement this function, based on ValidatePushDescriptors. */
2582    pvr_finishme("Add support for push descriptors!");
2583    *push_descriptors_dirty_out = false;
2584 }
2585 
2586 #define PVR_WRITE(_buffer, _value, _offset, _max)                \
2587    do {                                                          \
2588       __typeof__(_value) __value = _value;                       \
2589       uint64_t __offset = _offset;                               \
2590       uint32_t __nr_dwords = sizeof(__value) / sizeof(uint32_t); \
2591       static_assert(__same_type(*_buffer, __value),              \
2592                     "Buffer and value type mismatch");           \
2593       assert((__offset + __nr_dwords) <= (_max));                \
2594       assert((__offset % __nr_dwords) == 0U);                    \
2595       _buffer[__offset / __nr_dwords] = __value;                 \
2596    } while (0)
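
/* Usage sketch: offsets are given in dwords and must be aligned to the
 * value's size in dwords, e.g. writing a 64-bit address into the mapped
 * PDS data section (mirrors the attribute case below):
 *
 *    PVR_WRITE(qword_buffer, addr.addr, attribute->const_offset,
 *              pds_info->data_size_in_dwords);
 */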
2597 
2598 static VkResult
2599 pvr_setup_vertex_buffers(struct pvr_cmd_buffer *cmd_buffer,
2600                          const struct pvr_graphics_pipeline *const gfx_pipeline)
2601 {
2602    const struct pvr_vertex_shader_state *const vertex_state =
2603       &gfx_pipeline->vertex_shader_state;
2604    struct pvr_cmd_buffer_state *const state = &cmd_buffer->state;
2605    const struct pvr_pds_info *const pds_info = state->pds_shader.info;
2606    const uint8_t *entries;
2607    uint32_t *dword_buffer;
2608    uint64_t *qword_buffer;
2609    struct pvr_bo *pvr_bo;
2610    VkResult result;
2611 
2612    result = pvr_cmd_buffer_alloc_mem(cmd_buffer,
2613                                      cmd_buffer->device->heaps.pds_heap,
2614                                      pds_info->data_size_in_dwords << 2,
2615                                      PVR_BO_ALLOC_FLAG_CPU_MAPPED,
2616                                      &pvr_bo);
2617    if (result != VK_SUCCESS)
2618       return result;
2619 
2620    dword_buffer = (uint32_t *)pvr_bo->bo->map;
2621    qword_buffer = (uint64_t *)pvr_bo->bo->map;
2622 
2623    entries = (uint8_t *)pds_info->entries;
2624 
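   /* pds_info->entries is a packed stream of variable-size entry structs;
    * each case below consumes one entry and advances the cursor by its
    * size.
    */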
2625    for (uint32_t i = 0; i < pds_info->entry_count; i++) {
2626       const struct pvr_const_map_entry *const entry_header =
2627          (struct pvr_const_map_entry *)entries;
2628 
2629       switch (entry_header->type) {
2630       case PVR_PDS_CONST_MAP_ENTRY_TYPE_LITERAL32: {
2631          const struct pvr_const_map_entry_literal32 *const literal =
2632             (struct pvr_const_map_entry_literal32 *)entries;
2633 
2634          PVR_WRITE(dword_buffer,
2635                    literal->literal_value,
2636                    literal->const_offset,
2637                    pds_info->data_size_in_dwords);
2638 
2639          entries += sizeof(*literal);
2640          break;
2641       }
2642 
2643       case PVR_PDS_CONST_MAP_ENTRY_TYPE_DOUTU_ADDRESS: {
2644          const struct pvr_const_map_entry_doutu_address *const doutu_addr =
2645             (struct pvr_const_map_entry_doutu_address *)entries;
2646          const pvr_dev_addr_t exec_addr =
2647             PVR_DEV_ADDR_OFFSET(vertex_state->bo->vma->dev_addr,
2648                                 vertex_state->entry_offset);
2649          uint64_t addr = 0ULL;
2650 
2651          pvr_set_usc_execution_address64(&addr, exec_addr.addr);
2652 
2653          PVR_WRITE(qword_buffer,
2654                    addr | doutu_addr->doutu_control,
2655                    doutu_addr->const_offset,
2656                    pds_info->data_size_in_dwords);
2657 
2658          entries += sizeof(*doutu_addr);
2659          break;
2660       }
2661 
2662       case PVR_PDS_CONST_MAP_ENTRY_TYPE_BASE_INSTANCE: {
2663          const struct pvr_const_map_entry_base_instance *const base_instance =
2664             (struct pvr_const_map_entry_base_instance *)entries;
2665 
2666          PVR_WRITE(dword_buffer,
2667                    state->draw_state.base_instance,
2668                    base_instance->const_offset,
2669                    pds_info->data_size_in_dwords);
2670 
2671          entries += sizeof(*base_instance);
2672          break;
2673       }
2674 
2675       case PVR_PDS_CONST_MAP_ENTRY_TYPE_VERTEX_ATTRIBUTE_ADDRESS: {
2676          const struct pvr_const_map_entry_vertex_attribute_address
2677             *const attribute =
2678                (struct pvr_const_map_entry_vertex_attribute_address *)entries;
2679          const struct pvr_vertex_binding *const binding =
2680             &state->vertex_bindings[attribute->binding_index];
2681          const pvr_dev_addr_t addr =
2682             PVR_DEV_ADDR_OFFSET(binding->buffer->dev_addr,
2683                                 binding->offset + attribute->offset);
2684 
2685          PVR_WRITE(qword_buffer,
2686                    addr.addr,
2687                    attribute->const_offset,
2688                    pds_info->data_size_in_dwords);
2689 
2690          entries += sizeof(*attribute);
2691          break;
2692       }
2693 
2694       default:
2695          unreachable("Unsupported data section map");
2696          break;
2697       }
2698    }
2699 
2700    state->pds_vertex_attrib_offset =
2701       pvr_bo->vma->dev_addr.addr -
2702       cmd_buffer->device->heaps.pds_heap->base_addr.addr;
2703 
2704    pvr_bo_cpu_unmap(cmd_buffer->device, pvr_bo);
2705 
2706    return VK_SUCCESS;
2707 }
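/* Note: pds_vertex_attrib_offset above is heap-relative; e.g. (illustrative
 * addresses) a PDS allocation at device address 0x10000040 with a pds_heap
 * base of 0x10000000 yields an offset of 0x40.
 */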
2708 
2709 static VkResult pvr_setup_descriptor_mappings(
2710    struct pvr_cmd_buffer *const cmd_buffer,
2711    enum pvr_stage_allocation stage,
2712    const struct pvr_stage_allocation_descriptor_state *descriptor_state,
2713    UNUSED const pvr_dev_addr_t *const num_workgroups_buff_addr,
2714    uint32_t *const descriptor_data_offset_out)
2715 {
2716    const struct pvr_pds_info *const pds_info = &descriptor_state->pds_info;
2717    const struct pvr_descriptor_state *desc_state;
2718    const uint8_t *entries;
2719    uint32_t *dword_buffer;
2720    uint64_t *qword_buffer;
2721    struct pvr_bo *pvr_bo;
2722    VkResult result;
2723 
2724    pvr_finishme("Handle num_workgroups_buff_addr");
2725 
2726    if (!pds_info->data_size_in_dwords)
2727       return VK_SUCCESS;
2728 
2729    result = pvr_cmd_buffer_alloc_mem(cmd_buffer,
2730                                      cmd_buffer->device->heaps.pds_heap,
2731                                      pds_info->data_size_in_dwords,
2732                                      PVR_BO_ALLOC_FLAG_CPU_MAPPED,
2733                                      &pvr_bo);
2734    if (result != VK_SUCCESS)
2735       return result;
2736 
2737    dword_buffer = (uint32_t *)pvr_bo->bo->map;
2738    qword_buffer = (uint64_t *)pvr_bo->bo->map;
2739 
2740    entries = (uint8_t *)pds_info->entries;
2741 
2742    switch (stage) {
2743    case PVR_STAGE_ALLOCATION_VERTEX_GEOMETRY:
2744    case PVR_STAGE_ALLOCATION_FRAGMENT:
2745       desc_state = &cmd_buffer->state.gfx_desc_state;
2746       break;
2747 
2748    case PVR_STAGE_ALLOCATION_COMPUTE:
2749       desc_state = &cmd_buffer->state.compute_desc_state;
2750       break;
2751 
2752    default:
2753       unreachable("Unsupported stage.");
2754       break;
2755    }
2756 
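   /* The entries blob is a packed stream of variable-size records: each
    * record begins with a struct pvr_const_map_entry header whose type
    * selects the full record layout, and the loop below advances the cursor
    * by the size of the record it just consumed. E.g. (illustrative) a
    * LITERAL32 record followed by a DESCRIPTOR_SET record is handled in two
    * iterations, each patching the dword(s) named by its const_offset.
    */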
2757    for (uint32_t i = 0; i < pds_info->entry_count; i++) {
2758       const struct pvr_const_map_entry *const entry_header =
2759          (struct pvr_const_map_entry *)entries;
2760 
2761       /* TODO: See if instead of reusing the blend constant buffer type entry,
2762        * we can set up a new buffer type specifically for num_workgroups or other
2763        * built-in variables. The mappings are setup at pipeline creation when
2764        * creating the descriptor program.
2765        */
2766       pvr_finishme("Handle blend constant reuse for compute.");
2767 
2768       switch (entry_header->type) {
2769       case PVR_PDS_CONST_MAP_ENTRY_TYPE_LITERAL32: {
2770          const struct pvr_const_map_entry_literal32 *const literal =
2771             (struct pvr_const_map_entry_literal32 *)entries;
2772 
2773          PVR_WRITE(dword_buffer,
2774                    literal->literal_value,
2775                    literal->const_offset,
2776                    pds_info->data_size_in_dwords);
2777 
2778          entries += sizeof(*literal);
2779          break;
2780       }
2781 
2782       case PVR_PDS_CONST_MAP_ENTRY_TYPE_CONSTANT_BUFFER: {
2783          const struct pvr_const_map_entry_constant_buffer *const_buffer_entry =
2784             (struct pvr_const_map_entry_constant_buffer *)entries;
2785          const uint32_t desc_set = const_buffer_entry->desc_set;
2786          const uint32_t binding = const_buffer_entry->binding;
2787          const struct pvr_descriptor_set *descriptor_set;
2788          const struct pvr_descriptor *descriptor;
2789          pvr_dev_addr_t buffer_addr;
2790 
2791          /* TODO: Handle push descriptors. */
2792 
2793          assert(desc_set < PVR_MAX_DESCRIPTOR_SETS);
2794          descriptor_set = desc_state->descriptor_sets[desc_set];
2795 
2796          /* TODO: Handle dynamic buffers. */
2797          descriptor = &descriptor_set->descriptors[binding];
2798          assert(descriptor->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER);
2799 
2800          assert(descriptor->buffer_desc_range ==
2801                 const_buffer_entry->size_in_dwords * sizeof(uint32_t));
2802          assert(descriptor->buffer_create_info_size ==
2803                 const_buffer_entry->size_in_dwords * sizeof(uint32_t));
2804 
2805          buffer_addr =
2806             PVR_DEV_ADDR_OFFSET(descriptor->buffer_dev_addr,
2807                                 const_buffer_entry->offset * sizeof(uint32_t));
2808 
2809          PVR_WRITE(qword_buffer,
2810                    buffer_addr.addr,
2811                    const_buffer_entry->const_offset,
2812                    pds_info->data_size_in_dwords);
2813 
2814          entries += sizeof(*const_buffer_entry);
2815          break;
2816       }
2817 
2818       case PVR_PDS_CONST_MAP_ENTRY_TYPE_DESCRIPTOR_SET: {
2819          const struct pvr_const_map_entry_descriptor_set *desc_set_entry =
2820             (struct pvr_const_map_entry_descriptor_set *)entries;
2821          const uint32_t desc_set_num = desc_set_entry->descriptor_set;
2822          const struct pvr_descriptor_set *descriptor_set;
2823          pvr_dev_addr_t desc_set_addr;
2824 
2825          assert(desc_set_num < PVR_MAX_DESCRIPTOR_SETS);
2826 
2827          /* TODO: Remove this when the compiler provides us with usage info?
2828           */
2829          /* We skip DMAing unbound descriptor sets. */
2830          if (!(desc_state->valid_mask & BITFIELD_BIT(desc_set_num))) {
2831             const struct pvr_const_map_entry_literal32 *literal;
2832             uint32_t zero_literal_value;
2833 
2834             entries += sizeof(*desc_set_entry);
2835             literal = (struct pvr_const_map_entry_literal32 *)entries;
2836 
2837             /* TODO: Is there any guarantee that a literal will follow the
2838              * descriptor set entry?
2839              */
2840             assert(literal->type == PVR_PDS_CONST_MAP_ENTRY_TYPE_LITERAL32);
2841 
2842             /* We zero out the DMA size so the DMA isn't performed. */
2843             zero_literal_value =
2844                literal->literal_value &
2845                PVR_ROGUE_PDSINST_DOUT_FIELDS_DOUTD_SRC1_BSIZE_CLRMSK;
2846 
2847             PVR_WRITE(qword_buffer,
2848                       UINT64_C(0),
2849                       desc_set_entry->const_offset,
2850                       pds_info->data_size_in_dwords);
2851 
2852             PVR_WRITE(dword_buffer,
2853                       zero_literal_value,
2854                       literal->const_offset,
2855                       pds_info->data_size_in_dwords);
2856 
2857             entries += sizeof(*literal);
2858             i++;
2859             continue;
2860          }
2861 
2862          descriptor_set = desc_state->descriptor_sets[desc_set_num];
2863 
2864          pvr_finishme("Handle push descriptor entry.");
2865 
2866          desc_set_addr = descriptor_set->pvr_bo->vma->dev_addr;
2867 
2868          if (desc_set_entry->primary) {
2869             desc_set_addr = PVR_DEV_ADDR_OFFSET(
2870                desc_set_addr,
2871                descriptor_set->layout->memory_layout_in_dwords_per_stage[stage]
2872                      .primary_offset
2873                   << 2U);
2874          } else {
2875             desc_set_addr = PVR_DEV_ADDR_OFFSET(
2876                desc_set_addr,
2877                descriptor_set->layout->memory_layout_in_dwords_per_stage[stage]
2878                      .secondary_offset
2879                   << 2U);
2880          }
2881 
2882          desc_set_addr = PVR_DEV_ADDR_OFFSET(
2883             desc_set_addr,
2884             (uint64_t)desc_set_entry->offset_in_dwords << 2U);
2885 
2886          PVR_WRITE(qword_buffer,
2887                    desc_set_addr.addr,
2888                    desc_set_entry->const_offset,
2889                    pds_info->data_size_in_dwords);
2890 
2891          entries += sizeof(*desc_set_entry);
2892          break;
2893       }
2894 
2895       case PVR_PDS_CONST_MAP_ENTRY_TYPE_SPECIAL_BUFFER: {
2896          const struct pvr_const_map_entry_special_buffer *special_buff_entry =
2897             (struct pvr_const_map_entry_special_buffer *)entries;
2898 
2899          switch (special_buff_entry->buffer_type) {
2900          case PVR_BUFFER_TYPES_COMPILE_TIME: {
2901             uint64_t addr = descriptor_state->static_consts->vma->dev_addr.addr;
2902 
2903             PVR_WRITE(qword_buffer,
2904                       addr,
2905                       special_buff_entry->const_offset,
2906                       pds_info->data_size_in_dwords);
2907             break;
2908          }
2909 
2910          default:
2911             unreachable("Unsupported special buffer type.");
2912          }
2913 
2914          entries += sizeof(*special_buff_entry);
2915          break;
2916       }
2917 
2918       default:
2919          unreachable("Unsupported map entry type.");
2920       }
2921    }
2922 
2923    pvr_bo_cpu_unmap(cmd_buffer->device, pvr_bo);
2924 
2925    *descriptor_data_offset_out =
2926       pvr_bo->vma->dev_addr.addr -
2927       cmd_buffer->device->heaps.pds_heap->base_addr.addr;
2928 
2929    return VK_SUCCESS;
2930 }
2931 
2932 #undef PVR_WRITE
2933 
2934 static void pvr_compute_update_shared(struct pvr_cmd_buffer *cmd_buffer,
2935                                       struct pvr_sub_cmd_compute *const sub_cmd)
2936 {
2937    const struct pvr_physical_device *pdevice = cmd_buffer->device->pdevice;
2938    struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
2939    struct pvr_csb *csb = &sub_cmd->control_stream;
2940    const struct pvr_compute_pipeline *pipeline = state->compute_pipeline;
2941    const uint32_t const_shared_reg_count =
2942       pipeline->state.shader.const_shared_reg_count;
2943    struct pvr_compute_kernel_info info;
2944 
2945    /* No shared regs, no need to use an allocation kernel. */
2946    if (!const_shared_reg_count)
2947       return;
2948 
2949    info = (struct pvr_compute_kernel_info){
2950       .indirect_buffer_addr = PVR_DEV_ADDR_INVALID,
2951       .sd_type = PVRX(CDMCTRL_SD_TYPE_NONE),
2952 
2953       .usc_target = PVRX(CDMCTRL_USC_TARGET_ALL),
2954       .usc_common_shared = true,
2955       .usc_common_size =
2956          DIV_ROUND_UP(const_shared_reg_count,
2957                       PVRX(CDMCTRL_KERNEL0_USC_COMMON_SIZE_UNIT_SIZE)),
2958 
2959       .local_size = { 1, 1, 1 },
2960       .global_size = { 1, 1, 1 },
2961    };
2962 
2963    /* Sometimes we don't have a secondary program, e.g. when there were no
2964     * constants to write, but we still need to run a PDS program to allocate
2965     * the local/common store shared registers, so we repurpose the
2966     * deallocation PDS program.
2967     */
2968    if (pipeline->state.descriptor.pds_info.code_size_in_dwords) {
2969       uint32_t pds_data_size_in_dwords =
2970          pipeline->state.descriptor.pds_info.data_size_in_dwords;
2971 
2972       info.pds_data_offset = state->pds_compute_descriptor_data_offset;
2973       info.pds_data_size =
2974          DIV_ROUND_UP(pds_data_size_in_dwords << 2U,
2975                       PVRX(CDMCTRL_KERNEL0_PDS_DATA_SIZE_UNIT_SIZE));
2976 
2977       /* Check that we have uploaded the code section. */
2978       assert(pipeline->state.descriptor.pds_code.code_size);
2979       info.pds_code_offset = pipeline->state.descriptor.pds_code.code_offset;
2980    } else {
2981       /* FIXME: There should be a deallocation pds program already uploaded
2982        * that we use at this point.
2983        */
2984       assert(!"Unimplemented");
2985    }
2986 
2987    /* We don't need to pad the workgroup size. */
2988 
2989    info.max_instances =
2990       pvr_compute_flat_slot_size(pdevice, const_shared_reg_count, false, 1U);
2991 
2992    pvr_compute_generate_control_stream(csb, sub_cmd, &info);
2993 }
2994 
2995 static uint32_t
2996 pvr_compute_flat_pad_workgroup_size(const struct pvr_physical_device *pdevice,
2997                                     uint32_t workgroup_size,
2998                                     uint32_t coeff_regs_count)
2999 {
3000    const struct pvr_device_runtime_info *dev_runtime_info =
3001       &pdevice->dev_runtime_info;
3002    const struct pvr_device_info *dev_info = &pdevice->dev_info;
3003    uint32_t max_avail_coeff_regs =
3004       dev_runtime_info->cdm_max_local_mem_size_regs;
3005    uint32_t coeff_regs_count_aligned =
3006       ALIGN_POT(coeff_regs_count,
3007                 PVRX(CDMCTRL_KERNEL0_USC_COMMON_SIZE_UNIT_SIZE) >> 2U);
3008 
3009    /* If the workgroup size is greater than ROGUE_MAX_INSTANCES_PER_TASK, we
3010     * always pad it to the next multiple of ROGUE_MAX_INSTANCES_PER_TASK.
3011     *
3012     * Likewise, if we use more than 1/8th of the max coefficient registers,
3013     * we round the workgroup size up to the next multiple of
3014     * ROGUE_MAX_INSTANCES_PER_TASK.
3015     */
3016    /* TODO: See if this can be optimized. */
3017    if (workgroup_size > ROGUE_MAX_INSTANCES_PER_TASK ||
3018        coeff_regs_count_aligned > (max_avail_coeff_regs / 8)) {
3019       assert(workgroup_size < rogue_get_compute_max_work_group_size(dev_info));
3020 
3021       return ALIGN_POT(workgroup_size, ROGUE_MAX_INSTANCES_PER_TASK);
3022    }
3023 
3024    return workgroup_size;
3025 }
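/* Worked example (assuming ROGUE_MAX_INSTANCES_PER_TASK is 32): a
 * workgroup_size of 48 is padded to ALIGN_POT(48, 32) = 64, while a
 * workgroup_size of 24 that also stays within 1/8th of the coefficient
 * register budget is returned unchanged.
 */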
3026 
3027 /* TODO: Wire up the base_workgroup variant program when implementing
3028  * VK_KHR_device_group. The values will also need patching into the program.
3029  */
3030 static void pvr_compute_update_kernel(
3031    struct pvr_cmd_buffer *cmd_buffer,
3032    struct pvr_sub_cmd_compute *const sub_cmd,
3033    const uint32_t global_workgroup_size[static const PVR_WORKGROUP_DIMENSIONS])
3034 {
3035    const struct pvr_physical_device *pdevice = cmd_buffer->device->pdevice;
3036    const struct pvr_device_runtime_info *dev_runtime_info =
3037       &pdevice->dev_runtime_info;
3038    struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
3039    struct pvr_csb *csb = &sub_cmd->control_stream;
3040    const struct pvr_compute_pipeline *pipeline = state->compute_pipeline;
3041    const struct pvr_pds_info *program_info =
3042       &pipeline->state.primary_program_info;
3043 
3044    struct pvr_compute_kernel_info info = {
3045       .indirect_buffer_addr = PVR_DEV_ADDR_INVALID,
3046       .usc_target = PVRX(CDMCTRL_USC_TARGET_ANY),
3047       .pds_temp_size =
3048          DIV_ROUND_UP(program_info->temps_required << 2U,
3049                       PVRX(CDMCTRL_KERNEL0_PDS_TEMP_SIZE_UNIT_SIZE)),
3050 
3051       .pds_data_size =
3052          DIV_ROUND_UP(program_info->data_size_in_dwords << 2U,
3053                       PVRX(CDMCTRL_KERNEL0_PDS_DATA_SIZE_UNIT_SIZE)),
3054       .pds_data_offset = pipeline->state.primary_program.data_offset,
3055       .pds_code_offset = pipeline->state.primary_program.code_offset,
3056 
3057       .sd_type = PVRX(CDMCTRL_SD_TYPE_USC),
3058 
3059       .usc_unified_size =
3060          DIV_ROUND_UP(pipeline->state.shader.input_register_count << 2U,
3061                       PVRX(CDMCTRL_KERNEL0_USC_UNIFIED_SIZE_UNIT_SIZE)),
3062 
3063       /* clang-format off */
3064       .global_size = {
3065          global_workgroup_size[0],
3066          global_workgroup_size[1],
3067          global_workgroup_size[2]
3068       },
3069       /* clang-format on */
3070    };
3071 
3072    uint32_t work_size = pipeline->state.shader.work_size;
3073    uint32_t coeff_regs;
3074 
3075    if (work_size > ROGUE_MAX_INSTANCES_PER_TASK) {
3076       /* Enforce a single workgroup per cluster through allocation starvation.
3077        */
3078       coeff_regs = dev_runtime_info->cdm_max_local_mem_size_regs;
3079    } else {
3080       coeff_regs = pipeline->state.shader.coefficient_register_count;
3081    }
3082 
3083    info.usc_common_size =
3084       DIV_ROUND_UP(coeff_regs << 2U,
3085                    PVRX(CDMCTRL_KERNEL0_USC_COMMON_SIZE_UNIT_SIZE));
3086 
3087    /* Use a whole slot per workgroup. */
3088    work_size = MAX2(work_size, ROGUE_MAX_INSTANCES_PER_TASK);
3089 
3090    coeff_regs += pipeline->state.shader.const_shared_reg_count;
3091 
3092    work_size =
3093       pvr_compute_flat_pad_workgroup_size(pdevice, work_size, coeff_regs);
3094 
3095    info.local_size[0] = work_size;
3096    info.local_size[1] = 1U;
3097    info.local_size[2] = 1U;
3098 
3099    info.max_instances =
3100       pvr_compute_flat_slot_size(pdevice, coeff_regs, false, work_size);
3101 
3102    pvr_compute_generate_control_stream(csb, sub_cmd, &info);
3103 }
3104 
3105 void pvr_CmdDispatch(VkCommandBuffer commandBuffer,
3106                      uint32_t groupCountX,
3107                      uint32_t groupCountY,
3108                      uint32_t groupCountZ)
3109 {
3110    const uint32_t workgroup_size[] = { groupCountX, groupCountY, groupCountZ };
3111    PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
3112    struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
3113    const struct pvr_compute_pipeline *compute_pipeline =
3114       state->compute_pipeline;
3115    const VkShaderStageFlags push_consts_stage_mask =
3116       compute_pipeline->base.layout->push_constants_shader_stages;
3117    bool push_descriptors_dirty;
3118    struct pvr_sub_cmd_compute *sub_cmd;
3119    VkResult result;
3120 
3121    PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer);
3122    assert(compute_pipeline);
3123 
3124    if (!groupCountX || !groupCountY || !groupCountZ)
3125       return;
3126 
3127    pvr_cmd_buffer_start_sub_cmd(cmd_buffer, PVR_SUB_CMD_TYPE_COMPUTE);
3128 
3129    sub_cmd = &state->current_sub_cmd->compute;
3130 
3131    sub_cmd->uses_atomic_ops |= compute_pipeline->state.shader.uses_atomic_ops;
3132    sub_cmd->uses_barrier |= compute_pipeline->state.shader.uses_barrier;
3133 
3134    if (push_consts_stage_mask & VK_SHADER_STAGE_COMPUTE_BIT) {
3135       /* TODO: Add a dirty push constants mask in the cmd_buffer state and
3136        * check for dirty compute stage.
3137        */
3138       pvr_finishme("Add support for push constants.");
3139    }
3140 
3141    pvr_validate_push_descriptors(cmd_buffer, &push_descriptors_dirty);
3142 
3143    if (compute_pipeline->state.shader.uses_num_workgroups) {
3144       struct pvr_bo *num_workgroups_bo;
3145 
3146       result = pvr_cmd_buffer_upload_general(cmd_buffer,
3147                                              workgroup_size,
3148                                              sizeof(workgroup_size),
3149                                              &num_workgroups_bo);
3150       if (result != VK_SUCCESS)
3151          return;
3152 
3153       result = pvr_setup_descriptor_mappings(
3154          cmd_buffer,
3155          PVR_STAGE_ALLOCATION_COMPUTE,
3156          &compute_pipeline->state.descriptor,
3157          &num_workgroups_bo->vma->dev_addr,
3158          &state->pds_compute_descriptor_data_offset);
3159       if (result != VK_SUCCESS)
3160          return;
3161    } else if ((compute_pipeline->base.layout
3162                   ->per_stage_descriptor_masks[PVR_STAGE_ALLOCATION_COMPUTE] &&
3163                state->dirty.compute_desc_dirty) ||
3164               state->dirty.compute_pipeline_binding || push_descriptors_dirty) {
3165       result = pvr_setup_descriptor_mappings(
3166          cmd_buffer,
3167          PVR_STAGE_ALLOCATION_COMPUTE,
3168          &compute_pipeline->state.descriptor,
3169          NULL,
3170          &state->pds_compute_descriptor_data_offset);
3171       if (result != VK_SUCCESS)
3172          return;
3173    }
3174 
3175    pvr_compute_update_shared(cmd_buffer, sub_cmd);
3176 
3177    pvr_compute_update_kernel(cmd_buffer, sub_cmd, workgroup_size);
3178 }
3179 
3180 void pvr_CmdDispatchIndirect(VkCommandBuffer commandBuffer,
3181                              VkBuffer _buffer,
3182                              VkDeviceSize offset)
3183 {
3184    assert(!"Unimplemented");
3185 }
3186 
3187 static void
3188 pvr_update_draw_state(struct pvr_cmd_buffer_state *const state,
3189                       const struct pvr_cmd_buffer_draw_state *const draw_state)
3190 {
3191    /* We don't have a flag to tell us that base_instance is in use, so it
3192     * doubles as a boolean: 0 means we'll use a PDS program that skips the
3193     * base instance addition. If base_instance becomes non-zero (and the last
3194     * draw's base_instance was 0) then we switch to the BASE_INSTANCE attrib
3195     * program.
3196     *
3197     * If only the base_instance value changes, we just update the data section.
3198     *
3199     * The only draw call state that doesn't really matter is the start vertex,
3200     * as that is handled properly in the VDM state in all cases.
3201     */
3202    if ((state->draw_state.draw_indexed != draw_state->draw_indexed) ||
3203        (state->draw_state.draw_indirect != draw_state->draw_indirect) ||
3204        (state->draw_state.base_instance == 0 &&
3205         draw_state->base_instance != 0)) {
3206       state->dirty.draw_variant = true;
3207    } else if (state->draw_state.base_instance != draw_state->base_instance) {
3208       state->dirty.draw_base_instance = true;
3209    }
3210 
3211    state->draw_state = *draw_state;
3212 }
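/* Example: a draw with base_instance 3 following one with base_instance 0
 * marks dirty.draw_variant (program switch); a later draw with
 * base_instance 5 only marks dirty.draw_base_instance (data-section-only
 * update).
 */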
3213 
3214 static uint32_t pvr_calc_shared_regs_count(
3215    const struct pvr_graphics_pipeline *const gfx_pipeline)
3216 {
3217    const struct pvr_pipeline_stage_state *const vertex_state =
3218       &gfx_pipeline->vertex_shader_state.stage_state;
3219    uint32_t shared_regs = vertex_state->const_shared_reg_count +
3220                           vertex_state->const_shared_reg_offset;
3221 
3222    if (gfx_pipeline->fragment_shader_state.bo) {
3223       const struct pvr_pipeline_stage_state *const fragment_state =
3224          &gfx_pipeline->fragment_shader_state.stage_state;
3225       uint32_t fragment_regs = fragment_state->const_shared_reg_count +
3226                                fragment_state->const_shared_reg_offset;
3227 
3228       shared_regs = MAX2(shared_regs, fragment_regs);
3229    }
3230 
3231    return shared_regs;
3232 }
3233 
3234 static void
3235 pvr_emit_dirty_pds_state(const struct pvr_cmd_buffer *const cmd_buffer,
3236                          struct pvr_sub_cmd_gfx *const sub_cmd,
3237                          const uint32_t pds_vertex_descriptor_data_offset)
3238 {
3239    const struct pvr_cmd_buffer_state *const state = &cmd_buffer->state;
3240    const struct pvr_stage_allocation_descriptor_state
3241       *const vertex_descriptor_state =
3242          &state->gfx_pipeline->vertex_shader_state.descriptor_state;
3243    const struct pvr_pipeline_stage_state *const vertex_stage_state =
3244       &state->gfx_pipeline->vertex_shader_state.stage_state;
3245    struct pvr_csb *const csb = &sub_cmd->control_stream;
3246 
3247    if (!vertex_descriptor_state->pds_info.code_size_in_dwords)
3248       return;
3249 
3250    pvr_csb_emit (csb, VDMCTRL_PDS_STATE0, state0) {
3251       state0.usc_target = PVRX(VDMCTRL_USC_TARGET_ALL);
3252 
3253       state0.usc_common_size =
3254          DIV_ROUND_UP(vertex_stage_state->const_shared_reg_count << 2,
3255                       PVRX(VDMCTRL_PDS_STATE0_USC_COMMON_SIZE_UNIT_SIZE));
3256 
3257       state0.pds_data_size = DIV_ROUND_UP(
3258          vertex_descriptor_state->pds_info.data_size_in_dwords << 2,
3259          PVRX(VDMCTRL_PDS_STATE0_PDS_DATA_SIZE_UNIT_SIZE));
3260    }
3261 
3262    pvr_csb_emit (csb, VDMCTRL_PDS_STATE1, state1) {
3263       state1.pds_data_addr = PVR_DEV_ADDR(pds_vertex_descriptor_data_offset);
3264       state1.sd_type = PVRX(VDMCTRL_SD_TYPE_NONE);
3265    }
3266 
3267    pvr_csb_emit (csb, VDMCTRL_PDS_STATE2, state2) {
3268       state2.pds_code_addr =
3269          PVR_DEV_ADDR(vertex_descriptor_state->pds_code.code_offset);
3270    }
3271 }
3272 
3273 static void pvr_setup_output_select(struct pvr_cmd_buffer *const cmd_buffer)
3274 {
3275    struct pvr_emit_state *const emit_state = &cmd_buffer->state.emit_state;
3276    const struct pvr_graphics_pipeline *const gfx_pipeline =
3277       cmd_buffer->state.gfx_pipeline;
3278    struct pvr_ppp_state *const ppp_state = &cmd_buffer->state.ppp_state;
3279    const struct pvr_vertex_shader_state *const vertex_state =
3280       &gfx_pipeline->vertex_shader_state;
3281    uint32_t output_selects;
3282 
3283    /* TODO: Handle vertex and fragment shader state flags. */
3284 
3285    pvr_csb_pack (&output_selects, TA_OUTPUT_SEL, state) {
3286       const VkPrimitiveTopology topology =
3287          gfx_pipeline->input_asm_state.topology;
3288 
3289       state.rhw_pres = true;
3290       state.vtxsize = DIV_ROUND_UP(vertex_state->vertex_output_size, 4U);
3291       state.psprite_size_pres = (topology == VK_PRIMITIVE_TOPOLOGY_POINT_LIST);
3292    }
3293 
3294    if (ppp_state->output_selects != output_selects) {
3295       ppp_state->output_selects = output_selects;
3296       emit_state->output_selects = true;
3297    }
3298 
3299    if (ppp_state->varying_word[0] != vertex_state->varying[0]) {
3300       ppp_state->varying_word[0] = vertex_state->varying[0];
3301       emit_state->varying_word0 = true;
3302    }
3303 
3304    if (ppp_state->varying_word[1] != vertex_state->varying[1]) {
3305       ppp_state->varying_word[1] = vertex_state->varying[1];
3306       emit_state->varying_word1 = true;
3307    }
3308 }
3309 
3310 static void
3311 pvr_setup_isp_faces_and_control(struct pvr_cmd_buffer *const cmd_buffer,
3312                                 struct PVRX(TA_STATE_ISPA) *const ispa_out)
3313 {
3314    struct pvr_emit_state *const emit_state = &cmd_buffer->state.emit_state;
3315    const struct pvr_graphics_pipeline *const gfx_pipeline =
3316       cmd_buffer->state.gfx_pipeline;
3317    struct pvr_ppp_state *const ppp_state = &cmd_buffer->state.ppp_state;
3318    const struct pvr_dynamic_state *const dynamic_state =
3319       &cmd_buffer->state.dynamic.common;
3320    const struct pvr_render_pass_info *const pass_info =
3321       &cmd_buffer->state.render_pass_info;
3322    const uint32_t subpass_idx = pass_info->subpass_idx;
3323    const uint32_t *depth_stencil_attachment_idx =
3324       pass_info->pass->subpasses[subpass_idx].depth_stencil_attachment;
3325    const struct pvr_image_view *const attachment =
3326       (!depth_stencil_attachment_idx)
3327          ? NULL
3328          : pass_info->attachments[*depth_stencil_attachment_idx];
3329 
3330    const VkCullModeFlags cull_mode = gfx_pipeline->raster_state.cull_mode;
3331    const bool raster_discard_enabled =
3332       gfx_pipeline->raster_state.discard_enable;
3333    const bool disable_all = raster_discard_enabled || !attachment;
3334 
3335    const VkPrimitiveTopology topology = gfx_pipeline->input_asm_state.topology;
3336    const enum PVRX(TA_OBJTYPE) obj_type = pvr_ta_objtype(topology);
3337 
3338    const bool disable_stencil_write = disable_all;
3339    const bool disable_stencil_test =
3340       disable_all || !vk_format_has_stencil(attachment->vk.format);
3341 
3342    const bool disable_depth_write = disable_all;
3343    const bool disable_depth_test = disable_all ||
3344                                    !vk_format_has_depth(attachment->vk.format);
3345 
3346    uint32_t ispb_stencil_off;
3347    bool is_two_sided = false;
3348    uint32_t isp_control;
3349 
3350    uint32_t line_width;
3351    uint32_t common_a;
3352    uint32_t front_a;
3353    uint32_t front_b;
3354    uint32_t back_a;
3355    uint32_t back_b;
3356 
3357    /* Convert to 4.4 fixed point format. */
3358    line_width = util_unsigned_fixed(dynamic_state->line_width, 4);
3359 
3360    /* Shift values from the range [0=0,256=16] to [0=1/16,255=16]: a value
3361     * of 0 stays at 0, otherwise we subtract 1.
3362     */
3363    line_width = (!!line_width) * (line_width - 1);
3364 
3365    line_width = MIN2(line_width, PVRX(TA_STATE_ISPA_POINTLINEWIDTH_SIZE_MAX));
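   /* Worked example: a line width of 1.0f packs to 16 in 4.4 fixed point;
    * subtracting 1 gives 15, which encodes 1.0 in the [0=1/16,255=16] range.
    */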
3366 
3367    /* TODO: Part of the logic in this function is duplicated in another part
3368     * of the code. E.g. the dcmpmode, and sop1/2/3. Could we do this earlier?
3369     */
3370 
3371    pvr_csb_pack (&common_a, TA_STATE_ISPA, ispa) {
3372       ispa.pointlinewidth = line_width;
3373 
3374       if (disable_depth_test)
3375          ispa.dcmpmode = PVRX(TA_CMPMODE_ALWAYS);
3376       else
3377          ispa.dcmpmode = pvr_ta_cmpmode(gfx_pipeline->depth_compare_op);
3378 
3379       /* FIXME: Can we just have this and remove the assignment above?
3380        * The user provides a depthTestEnable at vkCreateGraphicsPipelines()
3381        * should we be using that?
3382        */
3383       ispa.dcmpmode |= gfx_pipeline->depth_compare_op;
3384 
3385       ispa.dwritedisable = disable_depth_test || disable_depth_write;
3386       /* FIXME: Can we just have this and remove the assignment above? */
3387       ispa.dwritedisable = ispa.dwritedisable ||
3388                            gfx_pipeline->depth_write_disable;
3389 
3390       ispa.passtype = gfx_pipeline->fragment_shader_state.pass_type;
3391 
3392       ispa.objtype = obj_type;
3393 
3394       /* Return unpacked ispa structure. dcmpmode, dwritedisable, passtype and
3395        * objtype are needed by pvr_setup_triangle_merging_flag.
3396        */
3397       if (ispa_out)
3398          *ispa_out = ispa;
3399    }
3400 
3401    /* FIXME: This logic should be redone and improved. Can we also get rid of
3402     * the front and back variants?
3403     */
3404 
3405    pvr_csb_pack (&front_a, TA_STATE_ISPA, ispa) {
3406       ispa.sref = (!disable_stencil_test) * dynamic_state->reference.front;
3407    }
3408    front_a |= common_a;
3409 
3410    pvr_csb_pack (&back_a, TA_STATE_ISPA, ispa) {
3411       ispa.sref = (!disable_stencil_test) * dynamic_state->reference.back;
3412    }
3413    back_a |= common_a;
3414 
3415    /* TODO: Does this actually represent the ispb control word on stencil off?
3416     * If not, rename the variable.
3417     */
3418    pvr_csb_pack (&ispb_stencil_off, TA_STATE_ISPB, ispb) {
3419       ispb.sop3 = PVRX(TA_ISPB_STENCILOP_KEEP);
3420       ispb.sop2 = PVRX(TA_ISPB_STENCILOP_KEEP);
3421       ispb.sop1 = PVRX(TA_ISPB_STENCILOP_KEEP);
3422       ispb.scmpmode = PVRX(TA_CMPMODE_ALWAYS);
3423    }
3424 
3425    if (disable_stencil_test) {
3426       back_b = front_b = ispb_stencil_off;
3427    } else {
3428       pvr_csb_pack (&front_b, TA_STATE_ISPB, ispb) {
3429          ispb.swmask =
3430             (!disable_stencil_write) * dynamic_state->write_mask.front;
3431          ispb.scmpmask = dynamic_state->compare_mask.front;
3432 
3433          ispb.sop3 = pvr_ta_stencilop(gfx_pipeline->stencil_front.pass_op);
3434          ispb.sop2 =
3435             pvr_ta_stencilop(gfx_pipeline->stencil_front.depth_fail_op);
3436          ispb.sop1 = pvr_ta_stencilop(gfx_pipeline->stencil_front.fail_op);
3437 
3438          ispb.scmpmode = pvr_ta_cmpmode(gfx_pipeline->stencil_front.compare_op);
3439       }
3440 
3441       pvr_csb_pack (&back_b, TA_STATE_ISPB, ispb) {
3442          ispb.swmask =
3443             (!disable_stencil_write) * dynamic_state->write_mask.back;
3444          ispb.scmpmask = dynamic_state->compare_mask.back;
3445 
3446          ispb.sop3 = pvr_ta_stencilop(gfx_pipeline->stencil_back.pass_op);
3447          ispb.sop2 = pvr_ta_stencilop(gfx_pipeline->stencil_back.depth_fail_op);
3448          ispb.sop1 = pvr_ta_stencilop(gfx_pipeline->stencil_back.fail_op);
3449 
3450          ispb.scmpmode = pvr_ta_cmpmode(gfx_pipeline->stencil_back.compare_op);
3451       }
3452    }
3453 
3454    if (front_a != back_a || front_b != back_b) {
3455       if (cull_mode & VK_CULL_MODE_BACK_BIT) {
3456          /* Single face, using front state. */
3457       } else if (cull_mode & VK_CULL_MODE_FRONT_BIT) {
3458          /* Single face, using back state. */
3459 
3460          front_a = back_a;
3461          front_b = back_b;
3462       } else {
3463          /* Both faces. */
3464 
3465          emit_state->isp_ba = is_two_sided = true;
3466 
3467          if (gfx_pipeline->raster_state.front_face ==
3468              VK_FRONT_FACE_COUNTER_CLOCKWISE) {
3469             uint32_t tmp = front_a;
3470 
3471             front_a = back_a;
3472             back_a = tmp;
3473 
3474             tmp = front_b;
3475             front_b = back_b;
3476             back_b = tmp;
3477          }
3478 
3479          /* HW defaults to stencil off. */
3480          if (back_b != ispb_stencil_off)
3481             emit_state->isp_fb = emit_state->isp_bb = true;
3482       }
3483    }
3484 
3485    if (!disable_stencil_test && front_b != ispb_stencil_off)
3486       emit_state->isp_fb = true;
3487 
3488    pvr_csb_pack (&isp_control, TA_STATE_ISPCTL, ispctl) {
3489       ispctl.upass = pass_info->userpass_spawn;
3490 
3491       /* TODO: is bo ever NULL? Figure out what to do. */
3492       ispctl.tagwritedisable = raster_discard_enabled ||
3493                                !gfx_pipeline->fragment_shader_state.bo;
3494 
3495       ispctl.two_sided = is_two_sided;
3496       ispctl.bpres = emit_state->isp_fb || emit_state->isp_bb;
3497 
3498       ispctl.dbenable = !raster_discard_enabled &&
3499                         gfx_pipeline->raster_state.depth_bias_enable &&
3500                         obj_type == PVRX(TA_OBJTYPE_TRIANGLE);
3501       ispctl.scenable = !raster_discard_enabled;
3502 
3503       ppp_state->isp.control_struct = ispctl;
3504    }
3505 
3506    emit_state->isp = true;
3507 
3508    ppp_state->isp.control = isp_control;
3509    ppp_state->isp.front_a = front_a;
3510    ppp_state->isp.front_b = front_b;
3511    ppp_state->isp.back_a = back_a;
3512    ppp_state->isp.back_b = back_b;
3513 }
3514 
3515 static void pvr_get_viewport_scissor_overlap(const VkViewport *const viewport,
3516                                              const VkRect2D *const scissor,
3517                                              VkRect2D *const rect_out)
3518 {
3519    /* TODO: See if we can remove this struct. */
3520    struct pvr_rect {
3521       int32_t x0, y0;
3522       int32_t x1, y1;
3523    };
3524 
3525    /* TODO: Worry about overflow? */
3526    const struct pvr_rect scissor_rect = {
3527       .x0 = scissor->offset.x,
3528       .y0 = scissor->offset.y,
3529       .x1 = scissor->offset.x + scissor->extent.width,
3530       .y1 = scissor->offset.y + scissor->extent.height
3531    };
3532    struct pvr_rect viewport_rect = { 0 };
3533 
3534    assert(viewport->width >= 0.0f);
3535    assert(scissor_rect.x0 >= 0);
3536    assert(scissor_rect.y0 >= 0);
3537 
3538    if (scissor->extent.width == 0 || scissor->extent.height == 0) {
3539       *rect_out = (VkRect2D){ 0 };
3540       return;
3541    }
3542 
3543    viewport_rect.x0 = (int32_t)viewport->x;
3544    viewport_rect.x1 = (int32_t)viewport->x + (int32_t)viewport->width;
3545 
3546    /* TODO: Is there a mathematical way of doing all this and then clamping
3547     * at the end?
3548     */
3549    /* We flip the y0 and y1 when height is negative. */
3550    viewport_rect.y0 = (int32_t)viewport->y + MIN2(0, (int32_t)viewport->height);
3551    viewport_rect.y1 = (int32_t)viewport->y + MAX2(0, (int32_t)viewport->height);
3552 
3553    if (scissor_rect.x1 <= viewport_rect.x0 ||
3554        scissor_rect.y1 <= viewport_rect.y0 ||
3555        scissor_rect.x0 >= viewport_rect.x1 ||
3556        scissor_rect.y0 >= viewport_rect.y1) {
3557       *rect_out = (VkRect2D){ 0 };
3558       return;
3559    }
3560 
3561    /* Determine the overlapping rectangle. */
3562    viewport_rect.x0 = MAX2(viewport_rect.x0, scissor_rect.x0);
3563    viewport_rect.y0 = MAX2(viewport_rect.y0, scissor_rect.y0);
3564    viewport_rect.x1 = MIN2(viewport_rect.x1, scissor_rect.x1);
3565    viewport_rect.y1 = MIN2(viewport_rect.y1, scissor_rect.y1);
3566 
3567    /* TODO: Is this conversion safe? Is this logic right? */
3568    rect_out->offset.x = (uint32_t)viewport_rect.x0;
3569    rect_out->offset.y = (uint32_t)viewport_rect.y0;
3570    rect_out->extent.height = (uint32_t)(viewport_rect.y1 - viewport_rect.y0);
3571    rect_out->extent.width = (uint32_t)(viewport_rect.x1 - viewport_rect.x0);
3572 }
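/* Worked example: a viewport of x=0, y=0, width=100, height=100 against a
 * scissor of offset=(50,50), extent=(100,100) yields offset=(50,50),
 * extent=(50,50); disjoint or zero-area inputs yield an all-zero rect.
 */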
3573 
3574 static inline uint32_t
3575 pvr_get_geom_region_clip_align_size(struct pvr_device_info *const dev_info)
3576 {
3577    /* TODO: This should come from rogue_ppp.xml. */
3578    return 16U + 16U * (!PVR_HAS_FEATURE(dev_info, tile_size_16x16));
3579 }
3580 
3581 /* FIXME: Remove device param when PVR_HAS_FEATURE() accepts const dev_info */
3582 static void
3583 pvr_setup_isp_depth_bias_scissor_state(struct pvr_cmd_buffer *const cmd_buffer)
3584 {
3585    struct pvr_emit_state *const emit_state = &cmd_buffer->state.emit_state;
3586    struct pvr_ppp_state *const ppp_state = &cmd_buffer->state.ppp_state;
3587    const struct pvr_dynamic_state *const dynamic_state =
3588       &cmd_buffer->state.dynamic.common;
3589    const struct PVRX(TA_STATE_ISPCTL) *const ispctl =
3590       &ppp_state->isp.control_struct;
3591    struct pvr_device_info *const dev_info =
3592       &cmd_buffer->device->pdevice->dev_info;
3593 
3594    if (ispctl->dbenable)
3595       assert(!"Unimplemented");
3596 
3597    if (ispctl->scenable) {
3598       const uint32_t region_clip_align_size =
3599          pvr_get_geom_region_clip_align_size(dev_info);
3600       const VkViewport *const viewport = &dynamic_state->viewport.viewports[0];
3601       const VkRect2D *const scissor = &dynamic_state->scissor.scissors[0];
3602       VkRect2D overlap_rect;
3603       uint32_t scissor_words[2];
3604       uint32_t height;
3605       uint32_t width;
3606       uint32_t x;
3607       uint32_t y;
3608 
3609       /* For region clip. */
3610       uint32_t bottom;
3611       uint32_t right;
3612       uint32_t left;
3613       uint32_t top;
3614 
3615       /* We don't support multiple viewport calculations. */
3616       assert(dynamic_state->viewport.count == 1);
3617       /* We don't support multiple scissor calculations. */
3618       assert(dynamic_state->scissor.count == 1);
3619 
3620       pvr_get_viewport_scissor_overlap(viewport, scissor, &overlap_rect);
3621 
3622       x = overlap_rect.offset.x;
3623       y = overlap_rect.offset.y;
3624       width = overlap_rect.extent.width;
3625       height = overlap_rect.extent.height;
3626 
3627       pvr_csb_pack (&scissor_words[0], IPF_SCISSOR_WORD_0, word0) {
3628          word0.scw0_xmax = x + width;
3629          word0.scw0_xmin = x;
3630       }
3631 
3632       pvr_csb_pack (&scissor_words[1], IPF_SCISSOR_WORD_1, word1) {
3633          word1.scw1_ymax = y + height;
3634          word1.scw1_ymin = y;
3635       }
3636 
3637       if (cmd_buffer->scissor_array.size &&
3638           cmd_buffer->scissor_words[0] == scissor_words[0] &&
3639           cmd_buffer->scissor_words[1] == scissor_words[1]) {
3640          return;
3641       }
3642 
3643       cmd_buffer->scissor_words[0] = scissor_words[0];
3644       cmd_buffer->scissor_words[1] = scissor_words[1];
3645 
3646       /* Calculate region clip. */
3647 
3648       left = x / region_clip_align_size;
3649       top = y / region_clip_align_size;
3650 
3651       /* We guard the subtraction so right/bottom can't underflow to -1. */
3652       /* TODO: Is there a better way of doing this? */
3653       if ((x + width) != 0U)
3654          right = DIV_ROUND_UP(x + width, region_clip_align_size) - 1;
3655       else
3656          right = 0;
3657 
3658       if ((y + height) != 0U)
3659          bottom = DIV_ROUND_UP(y + height, region_clip_align_size) - 1;
3660       else
3661          bottom = 0U;
3662 
3663       /* Setup region clip to clip everything outside what was calculated. */
3664 
3665       /* FIXME: Should we mask to prevent writing over other words? */
3666       pvr_csb_pack (&ppp_state->region_clipping.word0, TA_REGION_CLIP0, word0) {
3667          word0.right = right;
3668          word0.left = left;
3669          word0.mode = PVRX(TA_REGION_CLIP_MODE_OUTSIDE);
3670       }
3671 
3672       pvr_csb_pack (&ppp_state->region_clipping.word1, TA_REGION_CLIP1, word1) {
3673          word1.bottom = bottom;
3674          word1.top = top;
3675       }
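      /* Worked example (assuming a region_clip_align_size of 32): an overlap
       * rect of x=10, y=0, width=200, height=100 gives left=0, top=0,
       * right = DIV_ROUND_UP(210, 32) - 1 = 6 and
       * bottom = DIV_ROUND_UP(100, 32) - 1 = 3.
       */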
3676 
3677       ppp_state->depthbias_scissor_indices.scissor_index =
3678          util_dynarray_num_elements(&cmd_buffer->scissor_array,
3679                                     __typeof__(cmd_buffer->scissor_words));
3680 
3681       memcpy(util_dynarray_grow_bytes(&cmd_buffer->scissor_array,
3682                                       1,
3683                                       sizeof(cmd_buffer->scissor_words)),
3684              cmd_buffer->scissor_words,
3685              sizeof(cmd_buffer->scissor_words));
3686 
3687       emit_state->isp_dbsc = true;
3688       emit_state->region_clip = true;
3689    }
3690 }
3691 
3692 static void
3693 pvr_setup_triangle_merging_flag(struct pvr_cmd_buffer *const cmd_buffer,
3694                                 struct PVRX(TA_STATE_ISPA) * ispa)
3695 {
3696    struct pvr_emit_state *const emit_state = &cmd_buffer->state.emit_state;
3697    struct pvr_ppp_state *const ppp_state = &cmd_buffer->state.ppp_state;
3698    uint32_t merge_word;
3699    uint32_t mask;
3700 
3701    pvr_csb_pack (&merge_word, TA_STATE_PDS_SIZEINFO2, size_info) {
3702       /* Disable merging for lines, for punch-through passes, and when depth
3703        * writes are disabled while the depth compare mode is ALWAYS.
3704        */
3705       if (ispa->objtype == PVRX(TA_OBJTYPE_LINE) ||
3706           ispa->passtype == PVRX(TA_PASSTYPE_PUNCH_THROUGH) ||
3707           (ispa->dwritedisable && ispa->dcmpmode == PVRX(TA_CMPMODE_ALWAYS))) {
3708          size_info.pds_tri_merge_disable = true;
3709       }
3710    }
3711 
3712    pvr_csb_pack (&mask, TA_STATE_PDS_SIZEINFO2, size_info) {
3713       size_info.pds_tri_merge_disable = true;
3714    }
3715 
3716    merge_word |= ppp_state->pds.size_info2 & ~mask;
3717 
3718    if (merge_word != ppp_state->pds.size_info2) {
3719       ppp_state->pds.size_info2 = merge_word;
3720       emit_state->pds_fragment_stateptr0 = true;
3721    }
3722 }
3723 
3724 static void
3725 pvr_setup_fragment_state_pointers(struct pvr_cmd_buffer *const cmd_buffer,
3726                                   struct pvr_sub_cmd_gfx *const sub_cmd)
3727 {
3728    struct pvr_cmd_buffer_state *const state = &cmd_buffer->state;
3729    const struct pvr_stage_allocation_descriptor_state *descriptor_shader_state =
3730       &state->gfx_pipeline->fragment_shader_state.descriptor_state;
3731    const struct pvr_pds_upload *pds_coeff_program =
3732       &state->gfx_pipeline->fragment_shader_state.pds_coeff_program;
3733    const struct pvr_pipeline_stage_state *fragment_state =
3734       &state->gfx_pipeline->fragment_shader_state.stage_state;
3735    const struct pvr_physical_device *pdevice = cmd_buffer->device->pdevice;
3736    struct pvr_emit_state *const emit_state = &state->emit_state;
3737    struct pvr_ppp_state *const ppp_state = &state->ppp_state;
3738 
3739    const uint32_t pds_uniform_size =
3740       DIV_ROUND_UP(descriptor_shader_state->pds_info.data_size_in_dwords,
3741                    PVRX(TA_STATE_PDS_SIZEINFO1_PDS_UNIFORMSIZE_UNIT_SIZE));
3742 
3743    const uint32_t pds_varying_state_size =
3744       DIV_ROUND_UP(pds_coeff_program->data_size,
3745                    PVRX(TA_STATE_PDS_SIZEINFO1_PDS_VARYINGSIZE_UNIT_SIZE));
3746 
3747    const uint32_t usc_varying_size =
3748       DIV_ROUND_UP(fragment_state->coefficient_size,
3749                    PVRX(TA_STATE_PDS_SIZEINFO1_USC_VARYINGSIZE_UNIT_SIZE));
3750 
3751    const uint32_t pds_temp_size =
3752       DIV_ROUND_UP(fragment_state->temps_count,
3753                    PVRX(TA_STATE_PDS_SIZEINFO1_PDS_TEMPSIZE_UNIT_SIZE));
3754 
3755    const uint32_t usc_shared_size =
3756       DIV_ROUND_UP(fragment_state->const_shared_reg_count,
3757                    PVRX(TA_STATE_PDS_SIZEINFO2_USC_SHAREDSIZE_UNIT_SIZE));
3758 
3759    const uint32_t max_tiles_in_flight =
3760       pvr_calc_fscommon_size_and_tiles_in_flight(
3761          pdevice,
3762          usc_shared_size *
3763             PVRX(TA_STATE_PDS_SIZEINFO2_USC_SHAREDSIZE_UNIT_SIZE),
3764          1);
3765    uint32_t size_info_mask;
3766    uint32_t size_info2;
3767 
3768    if (max_tiles_in_flight < sub_cmd->max_tiles_in_flight)
3769       sub_cmd->max_tiles_in_flight = max_tiles_in_flight;
3770 
3771    pvr_csb_pack (&ppp_state->pds.pixel_shader_base,
3772                  TA_STATE_PDS_SHADERBASE,
3773                  shader_base) {
3774       const struct pvr_pds_upload *const pds_upload =
3775          &state->gfx_pipeline->fragment_shader_state.pds_fragment_program;
3776 
3777       shader_base.addr = PVR_DEV_ADDR(pds_upload->data_offset);
3778    }
3779 
3780    if (descriptor_shader_state->pds_code.pvr_bo) {
3781       pvr_csb_pack (&ppp_state->pds.texture_uniform_code_base,
3782                     TA_STATE_PDS_TEXUNICODEBASE,
3783                     tex_base) {
3784          tex_base.addr =
3785             PVR_DEV_ADDR(descriptor_shader_state->pds_code.code_offset);
3786       }
3787    } else {
3788       ppp_state->pds.texture_uniform_code_base = 0U;
3789    }
3790 
3791    pvr_csb_pack (&ppp_state->pds.size_info1, TA_STATE_PDS_SIZEINFO1, info1) {
3792       info1.pds_uniformsize = pds_uniform_size;
3793       info1.pds_texturestatesize = 0U;
3794       info1.pds_varyingsize = pds_varying_state_size;
3795       info1.usc_varyingsize = usc_varying_size;
3796       info1.pds_tempsize = pds_temp_size;
3797    }
3798 
3799    pvr_csb_pack (&size_info_mask, TA_STATE_PDS_SIZEINFO2, mask) {
3800       mask.pds_tri_merge_disable = true;
3801    }
3802 
3803    ppp_state->pds.size_info2 &= size_info_mask;
3804 
3805    pvr_csb_pack (&size_info2, TA_STATE_PDS_SIZEINFO2, info2) {
3806       info2.usc_sharedsize = usc_shared_size;
3807    }
3808 
3809    ppp_state->pds.size_info2 |= size_info2;
3810 
3811    if (pds_coeff_program->pvr_bo) {
3812       state->emit_state.pds_fragment_stateptr1 = true;
3813 
3814       pvr_csb_pack (&ppp_state->pds.varying_base,
3815                     TA_STATE_PDS_VARYINGBASE,
3816                     base) {
3817          base.addr = PVR_DEV_ADDR(pds_coeff_program->data_offset);
3818       }
3819    } else {
3820       ppp_state->pds.varying_base = 0U;
3821    }
3822 
3823    pvr_csb_pack (&ppp_state->pds.uniform_state_data_base,
3824                  TA_STATE_PDS_UNIFORMDATABASE,
3825                  base) {
3826       base.addr = PVR_DEV_ADDR(state->pds_fragment_descriptor_data_offset);
3827    }
3828 
3829    emit_state->pds_fragment_stateptr0 = true;
3830    emit_state->pds_fragment_stateptr3 = true;
3831 }
3832 
3833 static void pvr_setup_viewport(struct pvr_cmd_buffer *const cmd_buffer)
3834 {
3835    struct pvr_cmd_buffer_state *const state = &cmd_buffer->state;
3836    struct pvr_emit_state *const emit_state = &state->emit_state;
3837    struct pvr_ppp_state *const ppp_state = &state->ppp_state;
3838 
3839    if (ppp_state->viewport_count != state->dynamic.common.viewport.count) {
3840       ppp_state->viewport_count = state->dynamic.common.viewport.count;
3841       emit_state->viewport = true;
3842    }
3843 
3844    if (state->gfx_pipeline->raster_state.discard_enable) {
3845       /* We don't want to emit any viewport data as it'll just get thrown
3846        * away. It's after the previous condition because we still want to
3847        * stash the viewport_count as it's our trigger for when
3848        * rasterizer discard gets disabled.
3849        */
3850       emit_state->viewport = false;
3851       return;
3852    }
3853 
3854    for (uint32_t i = 0; i < ppp_state->viewport_count; i++) {
3855       VkViewport *viewport = &state->dynamic.common.viewport.viewports[i];
3856       uint32_t x_scale = fui(viewport->width * 0.5f);
3857       uint32_t y_scale = fui(viewport->height * 0.5f);
3858       uint32_t z_scale = fui(viewport->maxDepth - viewport->minDepth);
3859       uint32_t x_center = fui(viewport->x + viewport->width * 0.5f);
3860       uint32_t y_center = fui(viewport->y + viewport->height * 0.5f);
3861       uint32_t z_center = fui(viewport->minDepth);
3862 
3863       if (ppp_state->viewports[i].a0 != x_center ||
3864           ppp_state->viewports[i].m0 != x_scale ||
3865           ppp_state->viewports[i].a1 != y_center ||
3866           ppp_state->viewports[i].m1 != y_scale ||
3867           ppp_state->viewports[i].a2 != z_center ||
3868           ppp_state->viewports[i].m2 != z_scale) {
3869          ppp_state->viewports[i].a0 = x_center;
3870          ppp_state->viewports[i].m0 = x_scale;
3871          ppp_state->viewports[i].a1 = y_center;
3872          ppp_state->viewports[i].m1 = y_scale;
3873          ppp_state->viewports[i].a2 = z_center;
3874          ppp_state->viewports[i].m2 = z_scale;
3875 
3876          emit_state->viewport = true;
3877       }
3878    }
3879 }
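/* Worked example: a viewport of x=0, y=0, width=1920, height=1080,
 * minDepth=0.0, maxDepth=1.0 gives x_scale=960.0, x_center=960.0,
 * y_scale=540.0, y_center=540.0, z_scale=1.0 and z_center=0.0 (all stored
 * as raw float bits via fui()).
 */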
3880 
3881 static void pvr_setup_ppp_control(struct pvr_cmd_buffer *const cmd_buffer)
3882 {
3883    struct pvr_cmd_buffer_state *const state = &cmd_buffer->state;
3884    const struct pvr_graphics_pipeline *const gfx_pipeline = state->gfx_pipeline;
3885    struct pvr_emit_state *const emit_state = &state->emit_state;
3886    struct pvr_ppp_state *const ppp_state = &state->ppp_state;
3887    uint32_t ppp_control;
3888 
3889    pvr_csb_pack (&ppp_control, TA_STATE_PPP_CTRL, control) {
3890       const struct pvr_raster_state *raster_state = &gfx_pipeline->raster_state;
3891       VkPrimitiveTopology topology = gfx_pipeline->input_asm_state.topology;
3892       control.drawclippededges = true;
3893       control.wclampen = true;
3894 
3895       if (topology == VK_PRIMITIVE_TOPOLOGY_TRIANGLE_FAN)
3896          control.flatshade_vtx = PVRX(TA_FLATSHADE_VTX_VERTEX_1);
3897       else
3898          control.flatshade_vtx = PVRX(TA_FLATSHADE_VTX_VERTEX_0);
3899 
3900       if (raster_state->depth_clamp_enable)
3901          control.clip_mode = PVRX(TA_CLIP_MODE_NO_FRONT_OR_REAR);
3902       else
3903          control.clip_mode = PVRX(TA_CLIP_MODE_FRONT_REAR);
3904 
3905       /* +--- FrontIsCCW?
3906        * | +--- Cull Front?
3907        * v v
3908        * 0|0 CULLMODE_CULL_CCW,
3909        * 0|1 CULLMODE_CULL_CW,
3910        * 1|0 CULLMODE_CULL_CW,
3911        * 1|1 CULLMODE_CULL_CCW,
3912        */
3913       switch (raster_state->cull_mode) {
3914       case VK_CULL_MODE_BACK_BIT:
3915       case VK_CULL_MODE_FRONT_BIT:
3916          if ((raster_state->front_face == VK_FRONT_FACE_COUNTER_CLOCKWISE) ^
3917              (raster_state->cull_mode == VK_CULL_MODE_FRONT_BIT)) {
3918             control.cullmode = PVRX(TA_CULLMODE_CULL_CW);
3919          } else {
3920             control.cullmode = PVRX(TA_CULLMODE_CULL_CCW);
3921          }
3922 
3923          break;
3924 
3925       case VK_CULL_MODE_NONE:
3926          control.cullmode = PVRX(TA_CULLMODE_NO_CULLING);
3927          break;
3928 
3929       default:
3930          unreachable("Unsupported cull mode!");
3931       }
3932    }
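   /* Worked example of the truth table above: VK_FRONT_FACE_COUNTER_CLOCKWISE
    * with VK_CULL_MODE_BACK_BIT gives 1 ^ 0 = 1, i.e. TA_CULLMODE_CULL_CW.
    */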
3933 
3934    if (ppp_control != ppp_state->ppp_control) {
3935       ppp_state->ppp_control = ppp_control;
3936       emit_state->ppp_control = true;
3937    }
3938 }
3939 
3940 /* Largest valid PPP State update in words = 31
3941  * 1 - Header
3942  * 3 - Stream Out Config words 0, 1 and 2
3943  * 1 - PPP Control word
3944  * 3 - Varying Config words 0, 1 and 2
3945  * 1 - Output Select
3946  * 1 - WClamp
3947  * 6 - Viewport Transform words
3948  * 2 - Region Clip words
3949  * 3 - PDS State for fragment phase (PDSSTATEPTR 1-3)
3950  * 4 - PDS State for fragment phase (PDSSTATEPTR0)
3951  * 6 - ISP Control Words
3952  */
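/* Sanity check: 1 + 3 + 1 + 3 + 1 + 1 + 6 + 2 + 3 + 4 + 6 = 31. */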
3953 #define PVR_MAX_PPP_STATE_DWORDS 31
3954 
3955 static VkResult pvr_emit_ppp_state(struct pvr_cmd_buffer *const cmd_buffer,
3956                                    struct pvr_sub_cmd_gfx *const sub_cmd)
3957 {
3958    struct pvr_cmd_buffer_state *const state = &cmd_buffer->state;
3959    struct pvr_emit_state *const emit_state = &state->emit_state;
3960    struct pvr_ppp_state *const ppp_state = &state->ppp_state;
3961    struct pvr_csb *const control_stream = &sub_cmd->control_stream;
3962    uint32_t ppp_state_words[PVR_MAX_PPP_STATE_DWORDS];
3963    uint32_t ppp_state_words_count;
3964    uint32_t ppp_state_header;
3965    bool deferred_secondary;
3966    struct pvr_bo *pvr_bo;
3967    uint32_t *buffer_ptr;
3968    VkResult result;
3969 
3970    buffer_ptr = ppp_state_words;
3971 
   pvr_csb_pack (&ppp_state_header, TA_STATE_HEADER, header) {
      header.view_port_count = (ppp_state->viewport_count == 0)
                                  ? 0U
                                  : (ppp_state->viewport_count - 1);

      /* Skip over header. */
      buffer_ptr++;

      /* Set ISP state. */
      if (emit_state->isp) {
         header.pres_ispctl = true;
         *buffer_ptr++ = ppp_state->isp.control;
         header.pres_ispctl_fa = true;
         *buffer_ptr++ = ppp_state->isp.front_a;

         if (emit_state->isp_fb) {
            header.pres_ispctl_fb = true;
            *buffer_ptr++ = ppp_state->isp.front_b;
         }

         if (emit_state->isp_ba) {
            header.pres_ispctl_ba = true;
            *buffer_ptr++ = ppp_state->isp.back_a;
         }

         if (emit_state->isp_bb) {
            header.pres_ispctl_bb = true;
            *buffer_ptr++ = ppp_state->isp.back_b;
         }
      }

      /* Depth bias / scissor.
       * If deferred_secondary is true then we do a separate state update
       * which gets patched in ExecuteDeferredCommandBuffer.
       */
      /* TODO: Update above comment when we port ExecuteDeferredCommandBuffer.
       */
      deferred_secondary =
         cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY &&
         cmd_buffer->usage_flags & VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT;

      if (emit_state->isp_dbsc && !deferred_secondary) {
         header.pres_ispctl_dbsc = true;

         pvr_csb_pack (buffer_ptr++, TA_STATE_ISPDBSC, ispdbsc) {
            ispdbsc.dbindex =
               ppp_state->depthbias_scissor_indices.depthbias_index;
            ispdbsc.scindex =
               ppp_state->depthbias_scissor_indices.scissor_index;
         }
      }

      /* PDS state. */
      if (emit_state->pds_fragment_stateptr0) {
         header.pres_pds_state_ptr0 = true;

         *buffer_ptr++ = ppp_state->pds.pixel_shader_base;
         *buffer_ptr++ = ppp_state->pds.texture_uniform_code_base;
         *buffer_ptr++ = ppp_state->pds.size_info1;
         *buffer_ptr++ = ppp_state->pds.size_info2;
      }

      if (emit_state->pds_fragment_stateptr1) {
         header.pres_pds_state_ptr1 = true;
         *buffer_ptr++ = ppp_state->pds.varying_base;
      }

      /* We don't use the pds_fragment_stateptr2 (texture state programs)
       * control word, but we don't need to zero it either: the hardware only
       * runs the texture state program when the pds_texture state field of
       * PDS_SIZEINFO1 is non-zero.
       */

      if (emit_state->pds_fragment_stateptr3) {
         header.pres_pds_state_ptr3 = true;
         *buffer_ptr++ = ppp_state->pds.uniform_state_data_base;
      }

      /* Region clip. */
      if (emit_state->region_clip) {
         header.pres_region_clip = true;
         *buffer_ptr++ = ppp_state->region_clipping.word0;
         *buffer_ptr++ = ppp_state->region_clipping.word1;
      }

      /* Viewport. */
      if (emit_state->viewport) {
         const uint32_t viewports = MAX2(1, ppp_state->viewport_count);

         header.pres_viewport = true;
         for (uint32_t i = 0; i < viewports; i++) {
            *buffer_ptr++ = ppp_state->viewports[i].a0;
            *buffer_ptr++ = ppp_state->viewports[i].m0;
            *buffer_ptr++ = ppp_state->viewports[i].a1;
            *buffer_ptr++ = ppp_state->viewports[i].m1;
            *buffer_ptr++ = ppp_state->viewports[i].a2;
            *buffer_ptr++ = ppp_state->viewports[i].m2;
         }
      }

      /* W clamp. */
      if (emit_state->wclamp) {
         const float wclamp = 0.00001f;

         header.pres_wclamp = true;
         *buffer_ptr++ = fui(wclamp);
      }

      /* Output selects. */
      if (emit_state->output_selects) {
         header.pres_outselects = true;
         *buffer_ptr++ = ppp_state->output_selects;
      }

      /* Varying words. */
      if (emit_state->varying_word0) {
         header.pres_varying_word0 = true;
         *buffer_ptr++ = ppp_state->varying_word[0];
      }

      if (emit_state->varying_word1) {
         header.pres_varying_word1 = true;
         *buffer_ptr++ = ppp_state->varying_word[1];
      }

      if (emit_state->varying_word2) {
         /* We only emit this on the first draw of a render job to prevent us
          * from inheriting a non-zero value set elsewhere.
          */
         header.pres_varying_word2 = true;
         *buffer_ptr++ = 0;
      }

      /* PPP control. */
      if (emit_state->ppp_control) {
         header.pres_ppp_ctrl = true;
         *buffer_ptr++ = ppp_state->ppp_control;
      }

      if (emit_state->stream_out) {
         /* We only emit this on the first draw of a render job to prevent us
          * from inheriting a non-zero value set elsewhere.
          */
         header.pres_stream_out_size = true;
         *buffer_ptr++ = 0;
      }
   }

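   /* A header of zero means no presence bits were set above, i.e. nothing
    * was marked for emission, so there is no PPP state update to write out.
    */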
   if (!ppp_state_header)
      return VK_SUCCESS;

   ppp_state_words_count = buffer_ptr - ppp_state_words;
   ppp_state_words[0] = ppp_state_header;
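   /* Illustrative example: if only the ISP control words (control + front A)
    * and the PPP control word were marked for emission above, the buffer now
    * holds:
    *   ppp_state_words[0] = header (pres_ispctl, pres_ispctl_fa,
    *                                pres_ppp_ctrl)
    *   ppp_state_words[1] = isp.control
    *   ppp_state_words[2] = isp.front_a
    *   ppp_state_words[3] = ppp_control
    * with ppp_state_words_count == 4.
    */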

   result = pvr_cmd_buffer_alloc_mem(cmd_buffer,
                                     cmd_buffer->device->heaps.general_heap,
                                     ppp_state_words_count * sizeof(uint32_t),
                                     PVR_BO_ALLOC_FLAG_CPU_MAPPED,
                                     &pvr_bo);
   if (result != VK_SUCCESS)
      return result;

   memcpy(pvr_bo->bo->map,
          ppp_state_words,
          ppp_state_words_count * sizeof(uint32_t));

   /* Write the VDM state update into the VDM control stream. The device
    * address of the state buffer is split across the two words: PPP_STATE0
    * carries the most significant bits (addrmsb) and PPP_STATE1 the least
    * significant bits (addrlsb).
    */
   pvr_csb_emit (control_stream, VDMCTRL_PPP_STATE0, state0) {
      state0.word_count = ppp_state_words_count;
      state0.addrmsb = pvr_bo->vma->dev_addr;
   }

   pvr_csb_emit (control_stream, VDMCTRL_PPP_STATE1, state1) {
      state1.addrlsb = pvr_bo->vma->dev_addr;
   }

   if (emit_state->isp_dbsc &&
       cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY) {
      pvr_finishme("Unimplemented path!!");
   }

   state->emit_state_bits = 0;

   return VK_SUCCESS;
}

static VkResult
pvr_emit_dirty_ppp_state(struct pvr_cmd_buffer *const cmd_buffer,
                         struct pvr_sub_cmd_gfx *const sub_cmd)
{
   struct pvr_cmd_buffer_state *const state = &cmd_buffer->state;
   const struct pvr_graphics_pipeline *const gfx_pipeline = state->gfx_pipeline;
   const bool dirty_stencil = state->dirty.compare_mask ||
                              state->dirty.write_mask || state->dirty.reference;
   VkResult result;

   if (!(dirty_stencil || state->dirty.depth_bias ||
         state->dirty.fragment_descriptors || state->dirty.line_width ||
         state->dirty.gfx_pipeline_binding || state->dirty.scissor ||
         state->dirty.userpass_spawn || state->dirty.viewport ||
         state->emit_state_bits)) {
      return VK_SUCCESS;
   }

   if (state->dirty.gfx_pipeline_binding) {
      struct PVRX(TA_STATE_ISPA) ispa;

      pvr_setup_output_select(cmd_buffer);
      pvr_setup_isp_faces_and_control(cmd_buffer, &ispa);
      pvr_setup_triangle_merging_flag(cmd_buffer, &ispa);
   } else if (dirty_stencil || state->dirty.line_width ||
              state->dirty.userpass_spawn) {
      pvr_setup_isp_faces_and_control(cmd_buffer, NULL);
   }

   if (!gfx_pipeline->raster_state.discard_enable &&
       state->dirty.fragment_descriptors &&
       gfx_pipeline->fragment_shader_state.bo) {
      pvr_setup_fragment_state_pointers(cmd_buffer, sub_cmd);
   }

   pvr_setup_isp_depth_bias_scissor_state(cmd_buffer);

   if (state->dirty.viewport)
      pvr_setup_viewport(cmd_buffer);

   pvr_setup_ppp_control(cmd_buffer);

   if (gfx_pipeline->raster_state.cull_mode == VK_CULL_MODE_FRONT_AND_BACK) {
      /* FIXME: Port SetNegativeViewport(). */
   }

   result = pvr_emit_ppp_state(cmd_buffer, sub_cmd);
   if (result != VK_SUCCESS)
      return result;

   return VK_SUCCESS;
}

static void
pvr_calculate_vertex_cam_size(const struct pvr_device_info *dev_info,
                              const uint32_t vs_output_size,
                              const bool raster_enable,
                              uint32_t *const cam_size_out,
                              uint32_t *const vs_max_instances_out)
{
   /* First work out the size of a vertex in the UVS and multiply by 4 for
    * column ordering.
    */
   const uint32_t uvs_vertex_vector_size_in_dwords =
      (vs_output_size + 1U + raster_enable * 4U) * 4U;
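   /* Worked example (illustrative): vs_output_size = 16 with rasterization
    * enabled gives (16 + 1 + 4) * 4 = 84 DWORDs per vertex vector.
    */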
   const uint32_t vdm_cam_size =
      PVR_GET_FEATURE_VALUE(dev_info, vdm_cam_size, 32U);

   /* This is a proxy for 8XE. */
   if (PVR_HAS_FEATURE(dev_info, simple_internal_parameter_format) &&
       vdm_cam_size < 96U) {
      /* Comparisons are based on size including scratch per vertex vector. */
      if (uvs_vertex_vector_size_in_dwords < (14U * 4U)) {
         *cam_size_out = MIN2(31U, vdm_cam_size - 1U);
         *vs_max_instances_out = 16U;
      } else if (uvs_vertex_vector_size_in_dwords < (20U * 4U)) {
         *cam_size_out = 15U;
         *vs_max_instances_out = 16U;
      } else if (uvs_vertex_vector_size_in_dwords < (28U * 4U)) {
         *cam_size_out = 11U;
         *vs_max_instances_out = 12U;
      } else if (uvs_vertex_vector_size_in_dwords < (44U * 4U)) {
         *cam_size_out = 7U;
         *vs_max_instances_out = 8U;
      } else if (PVR_HAS_FEATURE(dev_info,
                                 simple_internal_parameter_format_v2) ||
                 uvs_vertex_vector_size_in_dwords < (64U * 4U)) {
         *cam_size_out = 7U;
         *vs_max_instances_out = 4U;
      } else {
         *cam_size_out = 3U;
         *vs_max_instances_out = 2U;
      }
   } else {
      /* Comparisons are based on size including scratch per vertex vector. */
      if (uvs_vertex_vector_size_in_dwords <= (32U * 4U)) {
         /* output size <= 27 + 5 scratch. */
         *cam_size_out = MIN2(95U, vdm_cam_size - 1U);
         *vs_max_instances_out = 0U;
      } else if (uvs_vertex_vector_size_in_dwords <= (48U * 4U)) {
         /* output size <= 43 + 5 scratch. */
         *cam_size_out = 63U;
         if (PVR_GET_FEATURE_VALUE(dev_info, uvs_vtx_entries, 144U) < 288U)
            *vs_max_instances_out = 16U;
         else
            *vs_max_instances_out = 0U;
      } else if (uvs_vertex_vector_size_in_dwords <= (64U * 4U)) {
         /* output size <= 59 + 5 scratch. */
         *cam_size_out = 31U;
         if (PVR_GET_FEATURE_VALUE(dev_info, uvs_vtx_entries, 144U) < 288U)
            *vs_max_instances_out = 16U;
         else
            *vs_max_instances_out = 0U;
      } else {
         *cam_size_out = 15U;
         *vs_max_instances_out = 16U;
      }
   }
}

static void
pvr_emit_dirty_vdm_state(const struct pvr_cmd_buffer *const cmd_buffer,
                         struct pvr_sub_cmd_gfx *const sub_cmd)
{
   /* FIXME: Assume all state is dirty for the moment. */
   struct pvr_device_info *const dev_info =
      &cmd_buffer->device->pdevice->dev_info;
   ASSERTED const uint32_t max_user_vertex_output_components =
      pvr_get_max_user_vertex_output_components(dev_info);
   struct PVRX(VDMCTRL_VDM_STATE0)
      header = { pvr_cmd_header(VDMCTRL_VDM_STATE0) };
   const struct pvr_cmd_buffer_state *const state = &cmd_buffer->state;
   const struct pvr_graphics_pipeline *const gfx_pipeline = state->gfx_pipeline;
   struct pvr_csb *const csb = &sub_cmd->control_stream;
   uint32_t vs_output_size;
   uint32_t max_instances;
   uint32_t cam_size;

   assert(gfx_pipeline);

   /* CAM calculations and HW state take the vertex size aligned to DWORDs. */
   vs_output_size =
      DIV_ROUND_UP(gfx_pipeline->vertex_shader_state.vertex_output_size,
                   PVRX(VDMCTRL_VDM_STATE4_VS_OUTPUT_SIZE_UNIT_SIZE));

   assert(vs_output_size <= max_user_vertex_output_components);

   pvr_calculate_vertex_cam_size(dev_info,
                                 vs_output_size,
                                 true,
                                 &cam_size,
                                 &max_instances);

   pvr_csb_emit (csb, VDMCTRL_VDM_STATE0, state0) {
      state0.cam_size = cam_size;

      if (gfx_pipeline->input_asm_state.primitive_restart) {
         state0.cut_index_enable = true;
         state0.cut_index_present = true;
      }

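      /* Like the TA_FLATSHADE_VTX selection in pvr_setup_ppp_control():
       * triangle fans flat-shade from VERTEX_1, all other topologies from
       * VERTEX_0.
       */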
      switch (gfx_pipeline->input_asm_state.topology) {
      case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_FAN:
         state0.flatshade_control = PVRX(VDMCTRL_FLATSHADE_CONTROL_VERTEX_1);
         break;

      default:
         state0.flatshade_control = PVRX(VDMCTRL_FLATSHADE_CONTROL_VERTEX_0);
         break;
      }

      /* If we've bound a different vertex buffer, or this draw-call requires
       * a different PDS attrib data-section from the last draw call (changed
       * base_instance), then we need to specify a new data section. This is
       * also the case if we've switched pipeline or attrib program as the
       * data-section layout will be different.
       */
      state0.vs_data_addr_present =
         state->dirty.gfx_pipeline_binding || state->dirty.vertex_bindings ||
         state->dirty.draw_base_instance || state->dirty.draw_variant;

      /* We need to specify a new PDS Attrib program if we've bound a
       * different pipeline or we needed a different PDS Attrib variant for
       * this draw-call.
       */
      state0.vs_other_present = state->dirty.gfx_pipeline_binding ||
                                state->dirty.draw_variant;

      /* UVB_SCRATCH_SELECT_ONE with no rasterization is only valid when
       * stream output is enabled. We use UVB_SCRATCH_SELECT_FIVE because
       * Vulkan doesn't support stream output and the vertex position is
       * always emitted to the UVB.
       */
      state0.uvs_scratch_size_select =
         PVRX(VDMCTRL_UVS_SCRATCH_SIZE_SELECT_FIVE);

      header = state0;
   }

   if (header.cut_index_present) {
      pvr_csb_emit (csb, VDMCTRL_VDM_STATE1, state1) {
         switch (state->index_buffer_binding.type) {
         case VK_INDEX_TYPE_UINT32:
            /* FIXME: Defines for these? These values come from the Vulkan
             * spec. for VkPipelineInputAssemblyStateCreateInfo
             * primitiveRestartEnable: the cut index is the maximum value
             * representable in the bound index type.
             */
            state1.cut_index = 0xFFFFFFFF;
            break;

         case VK_INDEX_TYPE_UINT16:
            state1.cut_index = 0xFFFF;
            break;

         default:
            unreachable("Invalid index type");
         }
      }
   }

   if (header.vs_data_addr_present) {
      pvr_csb_emit (csb, VDMCTRL_VDM_STATE2, state2) {
         state2.vs_pds_data_base_addr =
            PVR_DEV_ADDR(state->pds_vertex_attrib_offset);
      }
   }

   if (header.vs_other_present) {
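      /* vertex_input_size is in DWORDs; the << 2 below converts it to bytes,
       * as the variable name indicates.
       */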
      const uint32_t usc_unified_store_size_in_bytes =
         gfx_pipeline->vertex_shader_state.vertex_input_size << 2;

      pvr_csb_emit (csb, VDMCTRL_VDM_STATE3, state3) {
         state3.vs_pds_code_base_addr =
            PVR_DEV_ADDR(state->pds_shader.code_offset);
      }

      pvr_csb_emit (csb, VDMCTRL_VDM_STATE4, state4) {
         state4.vs_output_size = vs_output_size;
      }

      pvr_csb_emit (csb, VDMCTRL_VDM_STATE5, state5) {
         state5.vs_max_instances = max_instances;
         state5.vs_usc_common_size = 0U;
         state5.vs_usc_unified_size = DIV_ROUND_UP(
            usc_unified_store_size_in_bytes,
            PVRX(VDMCTRL_VDM_STATE5_VS_USC_UNIFIED_SIZE_UNIT_SIZE));
         state5.vs_pds_temp_size =
            DIV_ROUND_UP(state->pds_shader.info->temps_required << 2,
                         PVRX(VDMCTRL_VDM_STATE5_VS_PDS_TEMP_SIZE_UNIT_SIZE));
         state5.vs_pds_data_size =
            DIV_ROUND_UP(state->pds_shader.info->data_size_in_dwords << 2,
                         PVRX(VDMCTRL_VDM_STATE5_VS_PDS_DATA_SIZE_UNIT_SIZE));
      }
   }
}

static VkResult pvr_validate_draw_state(struct pvr_cmd_buffer *cmd_buffer)
{
   struct pvr_cmd_buffer_state *const state = &cmd_buffer->state;
   const struct pvr_graphics_pipeline *const gfx_pipeline = state->gfx_pipeline;
   const struct pvr_pipeline_stage_state *const fragment_state =
      &gfx_pipeline->fragment_shader_state.stage_state;
   struct pvr_sub_cmd_gfx *sub_cmd;
   bool fstencil_writemask_zero;
   bool bstencil_writemask_zero;
   bool push_descriptors_dirty;
   bool fstencil_keep;
   bool bstencil_keep;
   VkResult result;

   result =
      pvr_cmd_buffer_start_sub_cmd(cmd_buffer, PVR_SUB_CMD_TYPE_GRAPHICS);
   if (result != VK_SUCCESS)
      return result;

   sub_cmd = &state->current_sub_cmd->gfx;
   sub_cmd->empty_cmd = false;

   /* Determine pipeline depth/stencil usage. If a pipeline uses depth or
    * stencil testing, those attachments are using their loaded values, and
    * the loadOps cannot be optimized out.
    */
   /* Pipeline uses depth testing. */
   if (sub_cmd->depth_usage == PVR_DEPTH_STENCIL_USAGE_UNDEFINED &&
       gfx_pipeline->depth_compare_op != VK_COMPARE_OP_ALWAYS) {
      sub_cmd->depth_usage = PVR_DEPTH_STENCIL_USAGE_NEEDED;
   }

   /* Pipeline uses stencil testing. */
   if (sub_cmd->stencil_usage == PVR_DEPTH_STENCIL_USAGE_UNDEFINED &&
       (gfx_pipeline->stencil_front.compare_op != VK_COMPARE_OP_ALWAYS ||
        gfx_pipeline->stencil_back.compare_op != VK_COMPARE_OP_ALWAYS)) {
      sub_cmd->stencil_usage = PVR_DEPTH_STENCIL_USAGE_NEEDED;
   }

   if (PVR_HAS_FEATURE(&cmd_buffer->device->pdevice->dev_info,
                       compute_overlap)) {
      uint32_t coefficient_size =
         DIV_ROUND_UP(fragment_state->coefficient_size,
                      PVRX(TA_STATE_PDS_SIZEINFO1_USC_VARYINGSIZE_UNIT_SIZE));

      if (coefficient_size >
          PVRX(TA_STATE_PDS_SIZEINFO1_USC_VARYINGSIZE_MAX_SIZE))
         sub_cmd->disable_compute_overlap = true;
   }

   sub_cmd->frag_uses_atomic_ops |= fragment_state->uses_atomic_ops;
   sub_cmd->frag_has_side_effects |= fragment_state->has_side_effects;
   sub_cmd->frag_uses_texture_rw |= fragment_state->uses_texture_rw;
   sub_cmd->vertex_uses_texture_rw |=
      gfx_pipeline->vertex_shader_state.stage_state.uses_texture_rw;

   fstencil_keep =
      (gfx_pipeline->stencil_front.fail_op == VK_STENCIL_OP_KEEP) &&
      (gfx_pipeline->stencil_front.pass_op == VK_STENCIL_OP_KEEP);
   bstencil_keep = (gfx_pipeline->stencil_back.fail_op == VK_STENCIL_OP_KEEP) &&
                   (gfx_pipeline->stencil_back.pass_op == VK_STENCIL_OP_KEEP);
   fstencil_writemask_zero = (state->dynamic.common.write_mask.front == 0);
   bstencil_writemask_zero = (state->dynamic.common.write_mask.back == 0);

   /* Set the stencil modified flag unless:
    * - Both the front and back-facing stencil states have fail_op and
    *   pass_op set to KEEP, or
    * - Both the front and back-facing stencil write masks are zero.
    */
   if (!(fstencil_keep && bstencil_keep) &&
       !(fstencil_writemask_zero && bstencil_writemask_zero)) {
      sub_cmd->modifies_stencil = true;
   }

   /* Set the depth modified flag if depth write is enabled. */
   if (!gfx_pipeline->depth_write_disable)
      sub_cmd->modifies_depth = true;

   /* If either the data or code changes for pds vertex attribs, regenerate
    * the data segment.
    */
   if (state->dirty.vertex_bindings || state->dirty.gfx_pipeline_binding ||
       state->dirty.draw_variant || state->dirty.draw_base_instance) {
      enum pvr_pds_vertex_attrib_program_type prog_type;
      const struct pvr_pds_attrib_program *program;

      if (state->draw_state.draw_indirect)
         prog_type = PVR_PDS_VERTEX_ATTRIB_PROGRAM_DRAW_INDIRECT;
      else if (state->draw_state.base_instance)
         prog_type = PVR_PDS_VERTEX_ATTRIB_PROGRAM_BASE_INSTANCE;
      else
         prog_type = PVR_PDS_VERTEX_ATTRIB_PROGRAM_BASIC;

      program =
         &gfx_pipeline->vertex_shader_state.pds_attrib_programs[prog_type];
      state->pds_shader.info = &program->info;
      state->pds_shader.code_offset = program->program.code_offset;

      state->max_shared_regs =
         MAX2(state->max_shared_regs, pvr_calc_shared_regs_count(gfx_pipeline));

      pvr_setup_vertex_buffers(cmd_buffer, gfx_pipeline);
   }

   /* TODO: Check for dirty push constants. */

   pvr_validate_push_descriptors(cmd_buffer, &push_descriptors_dirty);

   state->dirty.vertex_descriptors = push_descriptors_dirty ||
                                     state->dirty.gfx_pipeline_binding;
   state->dirty.fragment_descriptors = state->dirty.vertex_descriptors;

   if (state->dirty.fragment_descriptors) {
      result = pvr_setup_descriptor_mappings(
         cmd_buffer,
         PVR_STAGE_ALLOCATION_FRAGMENT,
         &state->gfx_pipeline->fragment_shader_state.descriptor_state,
         NULL,
         &state->pds_fragment_descriptor_data_offset);
      if (result != VK_SUCCESS) {
         mesa_loge("Could not setup fragment descriptor mappings.");
         return result;
      }
   }

   if (state->dirty.vertex_descriptors) {
      uint32_t pds_vertex_descriptor_data_offset;

      result = pvr_setup_descriptor_mappings(
         cmd_buffer,
         PVR_STAGE_ALLOCATION_VERTEX_GEOMETRY,
         &state->gfx_pipeline->vertex_shader_state.descriptor_state,
         NULL,
         &pds_vertex_descriptor_data_offset);
      if (result != VK_SUCCESS) {
         mesa_loge("Could not setup vertex descriptor mappings.");
         return result;
      }

      pvr_emit_dirty_pds_state(cmd_buffer,
                               sub_cmd,
                               pds_vertex_descriptor_data_offset);
   }

   result = pvr_emit_dirty_ppp_state(cmd_buffer, sub_cmd);
   if (result != VK_SUCCESS)
      return result;

   pvr_emit_dirty_vdm_state(cmd_buffer, sub_cmd);

   state->dirty.gfx_desc_dirty = false;
   state->dirty.blend_constants = false;
   state->dirty.compare_mask = false;
   state->dirty.depth_bias = false;
   state->dirty.draw_base_instance = false;
   state->dirty.draw_variant = false;
   state->dirty.fragment_descriptors = false;
   state->dirty.line_width = false;
   state->dirty.gfx_pipeline_binding = false;
   state->dirty.reference = false;
   state->dirty.scissor = false;
   state->dirty.userpass_spawn = false;
   state->dirty.vertex_bindings = false;
   state->dirty.viewport = false;
   state->dirty.write_mask = false;

   return VK_SUCCESS;
}

static uint32_t pvr_get_hw_primitive_topology(VkPrimitiveTopology topology)
{
   switch (topology) {
   case VK_PRIMITIVE_TOPOLOGY_POINT_LIST:
      return PVRX(VDMCTRL_PRIMITIVE_TOPOLOGY_POINT_LIST);
   case VK_PRIMITIVE_TOPOLOGY_LINE_LIST:
      return PVRX(VDMCTRL_PRIMITIVE_TOPOLOGY_LINE_LIST);
   case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP:
      return PVRX(VDMCTRL_PRIMITIVE_TOPOLOGY_LINE_STRIP);
   case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST:
      return PVRX(VDMCTRL_PRIMITIVE_TOPOLOGY_TRI_LIST);
   case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP:
      return PVRX(VDMCTRL_PRIMITIVE_TOPOLOGY_TRI_STRIP);
   case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_FAN:
      return PVRX(VDMCTRL_PRIMITIVE_TOPOLOGY_TRI_FAN);
   case VK_PRIMITIVE_TOPOLOGY_LINE_LIST_WITH_ADJACENCY:
      return PVRX(VDMCTRL_PRIMITIVE_TOPOLOGY_LINE_LIST_ADJ);
   case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP_WITH_ADJACENCY:
      return PVRX(VDMCTRL_PRIMITIVE_TOPOLOGY_LINE_STRIP_ADJ);
   case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST_WITH_ADJACENCY:
      return PVRX(VDMCTRL_PRIMITIVE_TOPOLOGY_TRI_LIST_ADJ);
   case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP_WITH_ADJACENCY:
      return PVRX(VDMCTRL_PRIMITIVE_TOPOLOGY_TRI_STRIP_ADJ);
   case VK_PRIMITIVE_TOPOLOGY_PATCH_LIST:
      return PVRX(VDMCTRL_PRIMITIVE_TOPOLOGY_PATCH_LIST);
   default:
      unreachable("Undefined primitive topology");
   }
}

static void pvr_emit_vdm_index_list(struct pvr_cmd_buffer *cmd_buffer,
                                    struct pvr_sub_cmd_gfx *const sub_cmd,
                                    VkPrimitiveTopology topology,
                                    uint32_t first_vertex,
                                    uint32_t vertex_count,
                                    uint32_t first_index,
                                    uint32_t index_count,
                                    uint32_t instance_count)
{
   struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
   struct pvr_csb *const csb = &sub_cmd->control_stream;
   struct PVRX(VDMCTRL_INDEX_LIST0)
      list_hdr = { pvr_cmd_header(VDMCTRL_INDEX_LIST0) };
   pvr_dev_addr_t index_buffer_addr = PVR_DEV_ADDR_INVALID;
   unsigned int index_stride = 0;

   pvr_csb_emit (csb, VDMCTRL_INDEX_LIST0, list0) {
      const bool vertex_shader_has_side_effects =
         cmd_buffer->state.gfx_pipeline->vertex_shader_state.stage_state
            .has_side_effects;

      list0.primitive_topology = pvr_get_hw_primitive_topology(topology);

      /* First instance is not handled in the VDM state, it's implemented as
       * an addition in the PDS vertex fetch.
       */
      list0.index_count_present = true;

      if (instance_count > 1)
         list0.index_instance_count_present = true;

      if (first_vertex != 0)
         list0.index_offset_present = true;

      if (state->draw_state.draw_indexed) {
         struct pvr_buffer *buffer = state->index_buffer_binding.buffer;

         switch (state->index_buffer_binding.type) {
         case VK_INDEX_TYPE_UINT32:
            list0.index_size = PVRX(VDMCTRL_INDEX_SIZE_B32);
            index_stride = 4;
            break;

         case VK_INDEX_TYPE_UINT16:
            list0.index_size = PVRX(VDMCTRL_INDEX_SIZE_B16);
            index_stride = 2;
            break;

         default:
            unreachable("Invalid index type");
         }

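         /* Fold the bound index buffer offset and firstIndex into the base
          * address so the VDM fetches start at the first index of this draw.
          */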
         list0.index_addr_present = true;
         index_buffer_addr = PVR_DEV_ADDR_OFFSET(
            buffer->dev_addr,
            state->index_buffer_binding.offset + first_index * index_stride);
         list0.index_base_addrmsb = index_buffer_addr;
      }

      list0.degen_cull_enable =
         PVR_HAS_FEATURE(&cmd_buffer->device->pdevice->dev_info,
                         vdm_degenerate_culling) &&
         !vertex_shader_has_side_effects;

      list_hdr = list0;
   }

   if (list_hdr.index_addr_present) {
      pvr_csb_emit (csb, VDMCTRL_INDEX_LIST1, list1) {
         list1.index_base_addrlsb = index_buffer_addr;
      }
   }

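   /* At most one of vertex_count and index_count is non-zero here:
    * pvr_CmdDraw() passes index_count == 0 and pvr_CmdDrawIndexed() passes
    * vertex_count == 0, so the OR below selects whichever count applies.
    */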
   if (list_hdr.index_count_present) {
      pvr_csb_emit (csb, VDMCTRL_INDEX_LIST2, list2) {
         list2.index_count = vertex_count | index_count;
      }
   }

   if (list_hdr.index_instance_count_present) {
      pvr_csb_emit (csb, VDMCTRL_INDEX_LIST3, list3) {
         list3.instance_count = instance_count - 1;
      }
   }

   if (list_hdr.index_offset_present) {
      pvr_csb_emit (csb, VDMCTRL_INDEX_LIST4, list4) {
         list4.index_offset = first_vertex;
      }
   }

   /* TODO: See if we need list_words[5-9]. */
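   /* Illustrative example: a non-indexed draw of 3 vertices and 1 instance
    * with first_vertex == 0 emits only INDEX_LIST0 (the header) and
    * INDEX_LIST2 (index_count == 3); INDEX_LIST1, 3 and 4 are all skipped.
    */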
}

void pvr_CmdDraw(VkCommandBuffer commandBuffer,
                 uint32_t vertexCount,
                 uint32_t instanceCount,
                 uint32_t firstVertex,
                 uint32_t firstInstance)
{
   PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
   struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
   struct pvr_cmd_buffer_draw_state draw_state;
   VkResult result;

   PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer);

   draw_state.base_vertex = firstVertex;
   draw_state.base_instance = firstInstance;
   draw_state.draw_indirect = false;
   draw_state.draw_indexed = false;
   pvr_update_draw_state(state, &draw_state);

   result = pvr_validate_draw_state(cmd_buffer);
   if (result != VK_SUCCESS)
      return;

   /* Write the VDM control stream for the primitive. */
   pvr_emit_vdm_index_list(cmd_buffer,
                           &state->current_sub_cmd->gfx,
                           state->gfx_pipeline->input_asm_state.topology,
                           firstVertex,
                           vertexCount,
                           0U,
                           0U,
                           instanceCount);
}

void pvr_CmdDrawIndexed(VkCommandBuffer commandBuffer,
                        uint32_t indexCount,
                        uint32_t instanceCount,
                        uint32_t firstIndex,
                        int32_t vertexOffset,
                        uint32_t firstInstance)
{
   PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
   struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
   struct pvr_cmd_buffer_draw_state draw_state;
   VkResult result;

   PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer);

   draw_state.base_vertex = vertexOffset;
   draw_state.base_instance = firstInstance;
   draw_state.draw_indirect = false;
   draw_state.draw_indexed = true;
   pvr_update_draw_state(state, &draw_state);

   result = pvr_validate_draw_state(cmd_buffer);
   if (result != VK_SUCCESS)
      return;

   /* Write the VDM control stream for the primitive. */
   pvr_emit_vdm_index_list(cmd_buffer,
                           &state->current_sub_cmd->gfx,
                           state->gfx_pipeline->input_asm_state.topology,
                           vertexOffset,
                           0,
                           firstIndex,
                           indexCount,
                           instanceCount);
}

void pvr_CmdDrawIndexedIndirect(VkCommandBuffer commandBuffer,
                                VkBuffer _buffer,
                                VkDeviceSize offset,
                                uint32_t drawCount,
                                uint32_t stride)
{
   assert(!"Unimplemented");
}

void pvr_CmdDrawIndirect(VkCommandBuffer commandBuffer,
                         VkBuffer _buffer,
                         VkDeviceSize offset,
                         uint32_t drawCount,
                         uint32_t stride)
{
   assert(!"Unimplemented");
}

static VkResult
pvr_resolve_unemitted_resolve_attachments(struct pvr_cmd_buffer *cmd_buffer)
{
   pvr_finishme("Add attachment resolve support!");
   return pvr_cmd_buffer_end_sub_cmd(cmd_buffer);
}

void pvr_CmdEndRenderPass2(VkCommandBuffer commandBuffer,
                           const VkSubpassEndInfo *pSubpassEndInfo)
{
   PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
   struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
   struct pvr_image_view **attachments;
   VkClearValue *clear_values;
   VkResult result;

   PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer);

   assert(state->render_pass_info.pass);
   assert(state->render_pass_info.framebuffer);

   /* TODO: Investigate why pvr_cmd_buffer_end_sub_cmd/EndSubCommand is called
    * twice in this path, once here and once from
    * pvr_resolve_unemitted_resolve_attachments.
    */
   result = pvr_cmd_buffer_end_sub_cmd(cmd_buffer);
   if (result != VK_SUCCESS)
      return;

   result = pvr_resolve_unemitted_resolve_attachments(cmd_buffer);
   if (result != VK_SUCCESS)
      return;

   /* Save the required fields before clearing the render_pass_info struct. */
   attachments = state->render_pass_info.attachments;
   clear_values = state->render_pass_info.clear_values;

   memset(&state->render_pass_info, 0, sizeof(state->render_pass_info));

   state->render_pass_info.attachments = attachments;
   state->render_pass_info.clear_values = clear_values;
}

void pvr_CmdExecuteCommands(VkCommandBuffer commandBuffer,
                            uint32_t commandBufferCount,
                            const VkCommandBuffer *pCommandBuffers)
{
   assert(!"Unimplemented");
}

void pvr_CmdNextSubpass2(VkCommandBuffer commandBuffer,
                         const VkSubpassBeginInfo *pSubpassBeginInfo,
                         const VkSubpassEndInfo *pSubpassEndInfo)
{
   assert(!"Unimplemented");
}

/* This is just enough to handle vkCmdPipelineBarrier().
 * TODO: Complete?
 */
void pvr_CmdPipelineBarrier2(VkCommandBuffer commandBuffer,
                             const VkDependencyInfo *pDependencyInfo)
{
   PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
   struct pvr_cmd_buffer_state *const state = &cmd_buffer->state;
   const struct pvr_render_pass *const render_pass =
      state->render_pass_info.pass;
   VkPipelineStageFlags vk_src_stage_mask = 0U;
   VkPipelineStageFlags vk_dst_stage_mask = 0U;
   uint32_t required_stage_mask = 0U;
   uint32_t src_stage_mask;
   uint32_t dst_stage_mask;
   bool is_barrier_needed;

   PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer);

   for (uint32_t i = 0; i < pDependencyInfo->memoryBarrierCount; i++) {
      vk_src_stage_mask |= pDependencyInfo->pMemoryBarriers[i].srcStageMask;
      vk_dst_stage_mask |= pDependencyInfo->pMemoryBarriers[i].dstStageMask;
   }

   for (uint32_t i = 0; i < pDependencyInfo->bufferMemoryBarrierCount; i++) {
      vk_src_stage_mask |=
         pDependencyInfo->pBufferMemoryBarriers[i].srcStageMask;
      vk_dst_stage_mask |=
         pDependencyInfo->pBufferMemoryBarriers[i].dstStageMask;
   }

   for (uint32_t i = 0; i < pDependencyInfo->imageMemoryBarrierCount; i++) {
      vk_src_stage_mask |=
         pDependencyInfo->pImageMemoryBarriers[i].srcStageMask;
      vk_dst_stage_mask |=
         pDependencyInfo->pImageMemoryBarriers[i].dstStageMask;
   }

   src_stage_mask = pvr_stage_mask_src(vk_src_stage_mask);
   dst_stage_mask = pvr_stage_mask_dst(vk_dst_stage_mask);

   for (uint32_t stage = 0U; stage != PVR_NUM_SYNC_PIPELINE_STAGES; stage++) {
      if (!(dst_stage_mask & BITFIELD_BIT(stage)))
         continue;

      required_stage_mask |= state->barriers_needed[stage];
   }

   src_stage_mask &= required_stage_mask;
   for (uint32_t stage = 0U; stage != PVR_NUM_SYNC_PIPELINE_STAGES; stage++) {
      if (!(dst_stage_mask & BITFIELD_BIT(stage)))
         continue;

      state->barriers_needed[stage] &= ~src_stage_mask;
   }
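   /* At this point src_stage_mask contains only those source stages for
    * which some destination stage still had an outstanding barrier; those
    * barriers have now been marked as satisfied in barriers_needed.
    */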

   if (src_stage_mask == 0 || dst_stage_mask == 0) {
      is_barrier_needed = false;
   } else if (src_stage_mask == PVR_PIPELINE_STAGE_GEOM_BIT &&
              dst_stage_mask == PVR_PIPELINE_STAGE_FRAG_BIT) {
      /* This dependency is implicit so there is no need to barrier. */
      is_barrier_needed = false;
   } else if (src_stage_mask == dst_stage_mask &&
              util_bitcount(src_stage_mask) == 1) {
      switch (src_stage_mask) {
      case PVR_PIPELINE_STAGE_FRAG_BIT:
         pvr_finishme("Handle fragment stage pipeline barrier.");
         is_barrier_needed = true;
         break;

      case PVR_PIPELINE_STAGE_COMPUTE_BIT: {
         struct pvr_sub_cmd *const current_sub_cmd = state->current_sub_cmd;

         is_barrier_needed = false;

         if (!current_sub_cmd ||
             current_sub_cmd->type != PVR_SUB_CMD_TYPE_COMPUTE) {
            break;
         }

         /* Multiple dispatches can be merged into a single job. When back to
          * back dispatches have a sequential dependency (CDM -> CDM pipeline
          * barrier) we need to do the following.
          *   - Dispatch a kernel which fences all previous memory writes and
          *     flushes the MADD cache.
          *   - Issue a CDM fence which ensures all previous tasks emitted by
          *     the CDM are completed before starting anything new.
          */

         /* Issue Data Fence, Wait for Data Fence (IDFWDF) makes the PDS wait
          * for data.
          */
         pvr_compute_generate_idfwdf(cmd_buffer, &current_sub_cmd->compute);

         pvr_compute_generate_fence(cmd_buffer,
                                    &current_sub_cmd->compute,
                                    false);
         break;
      }

      default:
         is_barrier_needed = false;
         break;
      }
   } else {
      is_barrier_needed = true;
   }

   if (render_pass) {
      pvr_finishme("Insert mid fragment stage barrier if needed.");
   } else {
      if (is_barrier_needed)
         pvr_finishme("Insert barrier if needed.");
   }
}

void pvr_CmdResetEvent2KHR(VkCommandBuffer commandBuffer,
                           VkEvent _event,
                           VkPipelineStageFlags2 stageMask)
{
   assert(!"Unimplemented");
}

void pvr_CmdSetEvent2KHR(VkCommandBuffer commandBuffer,
                         VkEvent _event,
                         const VkDependencyInfo *pDependencyInfo)
{
   assert(!"Unimplemented");
}

void pvr_CmdWaitEvents2KHR(VkCommandBuffer commandBuffer,
                           uint32_t eventCount,
                           const VkEvent *pEvents,
                           const VkDependencyInfo *pDependencyInfos)
{
   assert(!"Unimplemented");
}

void pvr_CmdWriteTimestamp2KHR(VkCommandBuffer commandBuffer,
                               VkPipelineStageFlags2 stage,
                               VkQueryPool queryPool,
                               uint32_t query)
{
   unreachable("Timestamp queries are not supported.");
}

VkResult pvr_EndCommandBuffer(VkCommandBuffer commandBuffer)
{
   PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
   struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
   VkResult result;

   /* From the Vulkan 1.0 spec:
    *
    *    commandBuffer must be in the recording state.
    */
   assert(cmd_buffer->status == PVR_CMD_BUFFER_STATUS_RECORDING);

   if (state->status != VK_SUCCESS)
      return state->status;

   result = pvr_cmd_buffer_end_sub_cmd(cmd_buffer);
   if (result != VK_SUCCESS)
      return result;

   cmd_buffer->status = PVR_CMD_BUFFER_STATUS_EXECUTABLE;

   return VK_SUCCESS;
}